In [1]:
import pandas as pd 
import numpy as np
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
import itertools

Using TensorFlow backend.


## Dataset preparation

In [0]:
def shuffle(df, n=1, axis=0):  
    df = df.copy()
    for _ in range():
        df.apply(np.random.shuffle, axis=axis)
    return df

In [3]:
df_bjp = pd.read_csv('bjp_data.csv',encoding="ISO-8859-1")
df_bjp = df_bjp.rename(columns = {'commentText': 'SentimentText', 'Label': 'Sentiment'})
df_bjp.head()

Unnamed: 0,SentimentText,Sentiment
0,Banda apna 100 % best he,1
1,WE NEED THE KING MAKER ---- NARENDRA MODI,1
2,Awesome... Banda apna Best hai,1
3,Best PM ever,1
4,RAHUL GANDHI LEFT THE CHAT,1


In [0]:
# df_cong = pd.read_csv('CONGRESS.csv',encoding="ISO-8859-1")
# df_cong = df_cong.rename(columns = {'ï»¿commentText': 'SentimentText', 'Label': 'Sentiment'})

## Data Preprocessing

In [0]:
import re
#handling links
df_bjp['SentimentText'] = df_bjp['SentimentText'].str.replace("https?://\S*", " ")
#handling user tags
df_bjp['SentimentText'] = df_bjp['SentimentText'].str.replace("@[\w]*", " ")
#handling special character,punctuation,numbers
df_bjp['SentimentText'] = df_bjp['SentimentText'].str.replace("[^a-zA-Z#]", " ")
#fix misspelled words
df_bjp['SentimentText'] = df_bjp['SentimentText'].apply(lambda x: ''.join(''.join(s)[:2] for _, s in itertools.groupby(x)))

df_bjp['SentimentText'] = df_bjp['SentimentText'].apply(lambda x: ' '.join(re.sub("(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)", " ", x).split()))

df_bjp['SentimentText'] = df_bjp['SentimentText'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))

In [6]:
df_bjp.head()

Unnamed: 0,SentimentText,Sentiment
0,Banda apna best,1
1,NEED THE KING MAKER NARENDRA MODI,1
2,Awesome Banda apna Best hai,1
3,Best ever,1
4,RAHUL GANDHI LEFT THE CHAT,1


In [7]:
import nltk
# nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
stopwords = []
with open('hinglishStopWords.txt', 'r') as f:
    hinglishStopWords = f.read().split('\n')
stopwords+=hinglishStopWords

with open('englishStopWords','r') as f:
    englishStopWords = f.read().split('\n')
stopwords+=englishStopWords

In [0]:
from nltk.tokenize import word_tokenize

def tokenization(tweet):
    text = []
    for token in word_tokenize(tweet):
        if token in stopwords:
            continue
        else:
            text.append(token)
    return text

In [0]:
df_bjp['SentimentText'] = df_bjp['SentimentText'].apply(lambda x: tokenization(x))

In [0]:
df_bjp['SentimentText'] = df_bjp['SentimentText'].apply(lambda x: ' '.join(x) )

In [0]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(df_bjp['SentimentText'], df_bjp['Sentiment'])

In [43]:
count_vect = CountVectorizer(analyzer='word', tokenizer=tokenization)
count_vect.fit(df_bjp['SentimentText'])

xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)



In [30]:
classifier = naive_bayes.MultinomialNB()
# classifier = svm.SVC()
# classifier = xgboost.XGBClassifier()
classifier.fit(xtrain_count, train_y)
predictions = classifier.predict(xvalid_count)
print(predictions.shape)
metrics.accuracy_score(predictions, valid_y)

(488,)


0.8155737704918032

In [44]:
input_size = xtrain_count.shape[1]
input_layer = layers.Input((input_size, ),)
hidden_layer1 = layers.Dense(1024, activation="relu")(input_layer)
hidden_layer2 = layers.Dense(512, activation="relu")(hidden_layer1)
hidden_layer3 = layers.Dense(256, activation="relu")(hidden_layer2)
hidden_layer = layers.Dense(128, activation="relu")(hidden_layer3)
output_layer = layers.Dense(1, activation="sigmoid")(hidden_layer)
classifier = models.Model(inputs = input_layer, outputs = output_layer)
classifier.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
classifier.fit(xtrain_count, train_y,epochs=50)
predictions = classifier.predict(xvalid_count)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [0]:
predictionf = predictions[:,0] >= 0.5

In [49]:
metrics.accuracy_score(predictionf, valid_y)

0.8381147540983607

In [50]:
from sklearn.metrics import f1_score
f1_score(valid_y, predictionf)

0.8454011741682974

### Tfidf ngram


In [51]:
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', tokenizer=tokenization, ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(df_bjp['SentimentText'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)



In [55]:
# classifier = naive_bayes.MultinomialNB()
classifier = svm.SVC()
# classifier = xgboost.XGBClassifier()
classifier.fit(xtrain_tfidf_ngram, train_y)
predictions = classifier.predict(xvalid_tfidf_ngram)
print(predictions.shape)
metrics.accuracy_score(predictions, valid_y)

(488,)


0.7930327868852459

In [57]:
input_size = xtrain_tfidf_ngram.shape[1]
input_layer = layers.Input((input_size, ))
hidden_layer1 = layers.Dense(1024, activation="relu")(input_layer)
hidden_layer2 = layers.Dense(512, activation="relu")(hidden_layer1)
hidden_layer3 = layers.Dense(256, activation="relu")(hidden_layer2)
hidden_layer = layers.Dense(128, activation="relu")(hidden_layer3)
output_layer = layers.Dense(1, activation="sigmoid")(hidden_layer)
classifier = models.Model(inputs = input_layer, outputs = output_layer)
classifier.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
classifier.fit(xtrain_tfidf_ngram, train_y,epochs=50)
predictions = classifier.predict(xvalid_tfidf_ngram)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [58]:
predictionf = predictions[:,0] >= 0.5
metrics.accuracy_score(predictionf, valid_y)

0.7745901639344263

In [59]:
from sklearn.metrics import f1_score
f1_score(valid_y, predictionf)

0.808362369337979