In [1]:
import pandas as pd
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

In [None]:
# Mount Google Drive at /content/drive; later cells read their CSV and model
# files from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Load the test data on which the predictions will be made using our best model

In [None]:
# Read the Congress and BJP test tweets from CSV files on the mounted Drive.
# Later cells use the 'clean_text' column of each frame.
congress_test = pd.read_csv('/content/drive/MyDrive/Data/Data/congress_test.csv')
bjp_test = pd.read_csv('/content/drive/MyDrive/Data/Data/bjp_test.csv')

In [None]:
# Keep only the first 2000 rows of each party's test set.
congress_test, bjp_test = congress_test[:2000], bjp_test[:2000]

In [None]:
# Preview the first five rows of the Congress test set.
congress_test.head()

Preprocess the test tweets in the same manner as was done for the training data.

In [None]:
def tweet_to_words( raw_review ):
    """Normalise one raw tweet into a lowercase, space-separated string of words.

    Steps, in order:
      1. Replace every URL (``www.…`` or ``http(s)://…``) with the token ``URL``.
      2. Drop the standalone retweet marker ``RT``.
      3. Replace every non-letter character with a space.
      4. Lowercase and collapse runs of whitespace.

    Parameters
    ----------
    raw_review : any
        The raw tweet; it is converted with ``str()`` before processing.

    Returns
    -------
    str
        The cleaned tweet, words separated by single spaces.

    NOTE(review): the model was presumably trained on text cleaned by a
    matching function; if the training notebook still uses the earlier
    ordering (where the URL/RT steps had no effect), update both together.
    """
    text = str(raw_review)

    # URLs must be replaced *before* punctuation is stripped: once "://" and
    # "." have been turned into spaces the pattern can no longer match.
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', text)

    # Remove the retweet marker only when it is a standalone token, so that
    # words merely containing "RT" (e.g. "ARTICLE") are left intact.
    text = re.sub(r'\bRT\b', ' ', text)

    # Replace every remaining non-letter with a space.
    text = re.sub("[^a-zA-Z]", " ", text)

    # Convert to lower case, split into individual words and re-join them
    # with single spaces.
    return " ".join(text.lower().split())

In [None]:
# Upper bound on the number of tweets cleaned per party.
num_tweets = 2000


def clean_test(dataframe):
    """Clean every tweet in a column of raw tweet text.

    Parameters
    ----------
    dataframe : iterable of tweets
        The tweets to clean; the callers in this notebook pass a single
        DataFrame column.

    Returns
    -------
    list of str
        ``tweet_to_words`` applied to each of the first ``num_tweets``
        entries, in their original order.
    """
    # Iterate over the values positionally instead of looking up the labels
    # 0..num_tweets-1: this also works when the column holds fewer than
    # ``num_tweets`` rows or does not have a default 0-based index.
    tweets = list(dataframe)[:num_tweets]
    return [tweet_to_words(tweet) for tweet in tweets]

In [None]:
# Clean the 'clean_text' column of each party's test set; clean_test returns
# a list of cleaned tweet strings.
congress_inputs = clean_test(congress_test['clean_text'])
bjp_inputs = clean_test(bjp_test['clean_text'])

Tokenize the text data. The vocabulary size is kept at 2000 words (and, in the code below, every sequence is padded to a length of 200), because this is the same configuration that was used for the Bidirectional LSTM, which was our best model.

In [None]:
def tokenze_data(data_inputs, tokenizer=None):
    """Convert cleaned tweets into a padded integer matrix for the model.

    Parameters
    ----------
    data_inputs : list of str
        The cleaned tweets.
    tokenizer : Tokenizer, optional
        An already fitted tokenizer. When omitted, a new tokenizer limited to
        2000 words is fitted on ``data_inputs`` itself.
        NOTE(review): a tokenizer fitted here assigns word indices from the
        test tweets, which presumably differ from the indices the model was
        trained with; pass the training tokenizer if it is available.

    Returns
    -------
    numpy.ndarray
        The padded sequences, one row per tweet, 200 columns wide. Rows are
        in the same order as ``data_inputs`` so that predictions can be
        matched back to their tweets.
    """
    if tokenizer is None:
        # ``num_words`` is the current name of the deprecated ``nb_words``.
        tokenizer = Tokenizer(num_words=2000)
        tokenizer.fit_on_texts(data_inputs)
    sequences = tokenizer.texts_to_sequences(data_inputs)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    max_len = 200
    data = pad_sequences(sequences, maxlen=max_len)
    print('Shape of data tensor:', data.shape)

    # Rows are deliberately not shuffled: this is inference data, and a
    # random permutation would make runs non-reproducible and break the
    # correspondence between each prediction and its tweet.
    return data

In [None]:
# Tokenize the real cleaned tweets. This cell previously replaced them with
# four placeholder words per party, so the model never saw the test data.
# The cleaned text is rebuilt from the dataframes here so the cell gives the
# same result when it is re-run (congress_inputs / bjp_inputs are rebound to
# the tokenized arrays below).
congress_texts = [str(text) for text in clean_test(congress_test['clean_text'])]
bjp_texts = [str(text) for text in clean_test(bjp_test['clean_text'])]

congress_inputs = tokenze_data(congress_texts)
bjp_inputs = tokenze_data(bjp_texts)


**LOAD THE BEST MODEL (BIDIRECTIONAL LSTM)**

In [None]:
from keras.models import model_from_json
# Load the model architecture (JSON) and rebuild the model from it.
# NOTE(review): this path ends in ".h5" but is read as JSON text - confirm
# that the file really holds the architecture JSON and not an HDF5 model.
# The context manager closes the file even if reading it raises.
with open("/content/drive/MyDrive/Data/SavedModels/Model_Bidir_LSTM.h5", 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# Load the trained weights into the rebuilt model.
loaded_model.load_weights("/content/drive/MyDrive/Data/SavedModels/Weights_bidir_LSTM.h5")
print("Loaded model from disk")

**SENTIMENT PREDICTION USING THE MODEL**

In [None]:
# Run the loaded model on each party's tokenized tweets. The cells below
# threshold the result at 0.5 and read its second column.
congress_prediction = loaded_model.predict(congress_inputs)
bjp_prediction = loaded_model.predict(bjp_inputs)

If the predicted probability for a class is greater than 0.5, the tweet is assigned to that sentiment class. Since we are concerned only with the count of positive sentiments, we check the second column of the predictions for our inference.

In [None]:
# Boolean masks: True wherever the predicted value exceeds 0.5.
congress_pred = np.greater(congress_prediction, 0.5)
bjp_pred = np.greater(bjp_prediction, 0.5)

In [None]:
def get_predictions(party_pred):
    """Count the rows of ``party_pred`` whose second entry equals ``True``.

    Parameters
    ----------
    party_pred : iterable of indexable rows
        Each row must have at least two entries; only index 1 is inspected.

    Returns
    -------
    int
        The number of rows where ``row[1] == True``.
    """
    return sum(1 for row in party_pred if row[1] == True)

In [None]:
# Count, for each party, the rows whose second prediction column is True
# (the markdown above treats that column as the positive class) and report them.
congress_numbers = get_predictions(congress_pred)
bjp_numbers = get_predictions(bjp_pred)
print("Congress Positive Tweets:",congress_numbers)
print("BJP Positive Tweets:",bjp_numbers)

Just like in the training data, the majority of the tweets have a negative sentiment attached to them. After feeding in 2000 tweets each for the Congress and the BJP, the model predicted that the BJP has 660 positive tweets while the Congress has 416 positive tweets.<br><br> This indicates that the contest this year will be close and that the chances of the BJP winning with a majority, as in the 2015 elections, are lower. This has been corroborated by the poor performance of the BJP in the recent state elections, where they lost power in three major Hindi-speaking states: Rajasthan, Madhya Pradesh and Chhattisgarh. <br><br> The challenges faced in this project were the limited data availability and the fact that Twitter as a platform is only accessible to the elite urban population, so it is difficult to make any absolute prediction about a phenomenon like the Indian election. The coming month will tell how things unfold for both parties.