<a href="https://colab.research.google.com/github/Pushkar-Bhuse/Political-Influence/blob/master/Sentimental_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Input
from keras.layers import LSTM, Bidirectional, Conv1D, GlobalMaxPool1D, Embedding, Dense, MaxPooling1D, GlobalMaxPooling1D

In [0]:
import pydrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [0]:
# Authenticate the Colab session, then build a PyDrive client that reuses the
# application-default credentials. `drive` is what the download cells below use.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
# Fetch the Drive file with this ID and write its content to the local file
# 'sentiments.csv', which is read with pd.read_csv further down.
downloaded = drive.CreateFile({'id':"1SVJBIsINVBj4ewqxr6hGmw2qC2obV2hm"}) 
downloaded.GetContentFile('sentiments.csv')  

In [0]:
# Fetch the Drive file with this ID and write its content to the local file
# 'glove.6B.300d.txt', which is parsed into `word2Vec` further down.
downloaded = drive.CreateFile({'id':"1CJCcOIF5-6ceTotmjnF2Lm7JgEMm4zvJ"}) 
downloaded.GetContentFile('glove.6B.300d.txt')  

In [0]:
# Load the downloaded CSV. header=None means no row is treated as a header,
# so the columns get integer labels (0, 1, 2, ...).
dataset = pd.read_csv(
    'sentiments.csv',
    header=None,
    engine='python',
)

In [19]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
dataset.shape

(1600000, 6)

In [0]:
# Keep only column 0 and column 5; everything else in the frame is dropped.
dataset = dataset.loc[:, [0, 5]].copy()

In [0]:
# Give the two remaining columns readable names (0 -> category, 5 -> sentiment).
# Rebinding instead of inplace=True yields the same renamed frame.
dataset = dataset.rename(columns={0: "category", 5: "sentiment"})

In [14]:
# Class-balance check: number of rows for each distinct label value.
dataset['category'].value_counts()

1    800000
0    800000
Name: category, dtype: int64

In [31]:
# Count missing values in each column.
dataset.isna().sum()

category     0
sentiment    0
dtype: int64

In [11]:
# Eyeball one example tweet (the row labelled 90).
print(dataset.loc[90, 'sentiment'])

I should have paid more attention when we covered photoshop in my webpage design class in undergrad 


In [0]:
# Scratch check of the mention-stripping rule on a single tweet: split on
# single spaces and drop every piece that begins with '@'.
txt = dataset['sentiment'][90]
txt = ' '.join(filter(lambda token: not token.startswith('@'), txt.split(' ')))

In [0]:
# Remove @mentions from every tweet: split on single spaces, discard the pieces
# that start with '@', and join the rest back with single spaces.
dataset['sentiment'] = dataset['sentiment'].apply(
    lambda tweet: ' '.join([tok for tok in tweet.split(' ') if not tok.startswith('@')])
)

In [0]:
def change_y(text):
  """Map a raw label to a binary target.

  Returns 1 when the label equals 4 and 0 for every other value.
  """
  return 1 if text == 4 else 0

In [0]:
# Convert the raw labels to 0/1 with change_y.
# NOTE: this cell is not re-runnable. After one pass no label equals 4 any
# more, so a second pass would turn every label into 0.
dataset['category'] = dataset['category'].apply(change_y)

In [0]:
# Features are the cleaned tweet texts; targets are the binary labels.
X, y = dataset['sentiment'], dataset['category']

In [0]:
# Hyper-parameters shared by both models in this notebook.
BATCH_SIZE = 4096        # samples per gradient step, passed to model.fit
VALIDATION_SPLIT = 0.2   # fraction of the training data held out by model.fit
EPOCHS = 20              # passes over the training data
LSTM_UNITS = 30          # size of the LSTM layer
MAX_VOCAB = 30000        # vocabulary cap given to the Tokenizer
DIMENSIONS = 300         # width of an embedding vector (matches glove.6B.300d)

In [0]:
# Parse the GloVe text file into {word: vector}. Each line holds the token
# followed by its space-separated vector components.
word2Vec = {}
# Decode explicitly as UTF-8: the vocabulary may contain non-ASCII tokens, and
# the platform default encoding is not guaranteed to handle them.
with open('glove.6B.300d.txt', encoding='utf-8') as f:
  for line in f:
    word, *values = line.rstrip().split(' ')
    # Store a float32 array instead of a list of strings: far less memory, and
    # the numbers are parsed once here rather than on every later use.
    word2Vec[word] = np.asarray(values, dtype='float32')

In [0]:
max_seq_len = max(len(s) for s in X)

In [0]:
tokenizer = Tokenizer(MAX_VOCAB)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)

In [0]:
# Word -> integer id mapping, i.e. the ids that texts_to_sequences emits.
# word_counts must not be used here: it maps each word to its frequency, and
# the embedding-matrix loop treats the mapped value as a row index.
inputs_words = tokenizer.word_index


In [0]:
# number_inputs_words

In [0]:
# Build the initial embedding weights: row i holds the GloVe vector of the word
# whose Tokenizer id is i. Row 0 stays all-zero because ids start at 1, with 0
# being the padding value. Words missing from GloVe also keep an all-zero row.
number_words = min(MAX_VOCAB, len(tokenizer.word_index) + 1)
embedding_matrix = np.zeros((number_words, DIMENSIONS))
# Iterate word -> id (word_index). Using word -> frequency would place vectors
# in rows unrelated to the ids the model actually receives.
for word, i in tokenizer.word_index.items():
  # Bound by the real number of rows so a small vocabulary cannot index past
  # the end of the matrix.
  if i < number_words:
    embedding_vector = word2Vec.get(word)
    if embedding_vector is not None:
      embedding_matrix[i] = embedding_vector

In [0]:
# Bring every sequence to the common length max_seq_len so they form one 2-D array.
X = pad_sequences(X, maxlen=max_seq_len)

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows as a test set; random_state=0 pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=0,
)

In [0]:
# Embedding for the LSTM model, initialised from embedding_matrix and left
# trainable so the vectors are fine-tuned during fitting.
embedding_layer = Embedding(
    input_dim=number_words,
    output_dim=DIMENSIONS,
    weights=[embedding_matrix],
    trainable=True,
)
embedding_layer_input = Input(shape=(max_seq_len,))
embedding_layer_x = embedding_layer(embedding_layer_input)

In [0]:
# The LSTM emits one output per time step (return_sequences=True), and the
# global max-pool then collapses the time axis into a single vector per tweet.
lstm = LSTM(units=LSTM_UNITS, return_sequences=True)
lstm_ouputs = lstm(embedding_layer_x)
after_lstm = GlobalMaxPool1D()(lstm_ouputs)

In [0]:
# A single sigmoid unit produces the output for the binary target.
dense_layer = Dense(units=1, activation='sigmoid')
output_category = dense_layer(after_lstm)

In [0]:
# Wire the LSTM classifier together: token ids in, sigmoid output out.
model = Model(inputs=embedding_layer_input, outputs=output_category)

In [0]:
# Binary cross-entropy matches the single sigmoid output; accuracy is reported per epoch.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [50]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 359)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 359, 300)          9000000   
_________________________________________________________________
lstm_8 (LSTM)                (None, 359, 30)           39720     
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 30)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 31        
Total params: 9,039,751
Trainable params: 9,039,751
Non-trainable params: 0
_________________________________________________________________


In [68]:
# Train the LSTM classifier. VALIDATION_SPLIT of the training data is held out
# for per-epoch validation.
model.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=VALIDATION_SPLIT,
)

Train on 768000 samples, validate on 192000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x7f7619e86710>

In [0]:
# Persist the trained LSTM model to an HDF5 file in the working directory.
model.save(filepath='rnn_on_sentimental.h5')

In [0]:
# Input and embedding for the CNN model. It gets its own Embedding layer,
# initialised from the same embedding_matrix as the LSTM model and trainable.
embedding_cnn_input = Input(shape=(max_seq_len,))
embedding_cnn_layer = Embedding(
    input_dim=number_words,
    output_dim=DIMENSIONS,
    weights=[embedding_matrix],
    trainable=True,
)
embedding_x = embedding_cnn_layer(embedding_cnn_input)

In [0]:
# CNN stack: two width-3 convolutions with a max-pool between them, a global
# max-pool over time, a dense hidden layer, and a sigmoid output unit.
x = Conv1D(filters=128, kernel_size=3, activation='relu')(embedding_x)
x = MaxPooling1D(pool_size=3)(x)
x = Conv1D(filters=128, kernel_size=3, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(units=128, activation='relu')(x)
output = Dense(units=1, activation='sigmoid')(x)

In [66]:
# Assemble the CNN classifier and print its architecture.
model2 = Model(inputs=embedding_cnn_input, outputs=output)
model2.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 359)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 359, 300)          9000000   
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 357, 128)          115328    
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 119, 128)          0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 117, 128)          49280     
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               1651

In [0]:
# Same optimizer, loss and metric as the LSTM model.
model2.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [0]:
# Train the CNN classifier with the same data, batch size, epoch count and
# validation split as the LSTM model.
model2.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=VALIDATION_SPLIT,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 768000 samples, validate on 192000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20