Loading Libraries

In [18]:
import numpy as np
import gensim
import pandas as pd
from sklearn import model_selection, preprocessing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Loading Dataset

In [19]:
df = pd.read_csv('Dataset.csv')

Finding the maximum length of our tweets

In [20]:
df['word_count'] = df['clean'].apply(lambda x: len(str(x).split()))
max(df['word_count'])

45

Spliting the dataset into 70% training, 30% testing. Notic that we shuffled the data so it won't have a sprcific order.

In [21]:
# split the dataset into training and test datasets 70% training, 30% testing
train_x, test_x, train_y, test_y = model_selection.train_test_split(df['clean'], df['sentiment'], test_size=0.3, random_state = 1)

Encoding the labels using preprocessing.LabelEncoder

In [23]:
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

Building the tokenizer to convert the tweets into sequences and pad them to the same length

In [24]:
#Tokenize the sentences
tokenizer = Tokenizer()

#preparing vocabulary
tokenizer.fit_on_texts(list(train_x))

#converting text into integer sequences
train_x_seq  = tokenizer.texts_to_sequences(train_x) 
test_x_seq = tokenizer.texts_to_sequences(test_x)

#padding to prepare sequences of same length
train_x_seq  = pad_sequences(train_x_seq, maxlen=45)
test_x_seq = pad_sequences(test_x_seq, maxlen=45)

In [25]:
size_of_vocabulary=len(tokenizer.word_index) + 1 #+1 for padding
print(size_of_vocabulary)

81074


# Word Embeddings Using Pre-Traind Model

We will use the pre-trained AraVec Model. AraVec Model converts words into 300 dimensional vectors

In [26]:
# load the AraVec model
trained_model = gensim.models.Word2Vec.load("tweet_cbow_300/tweets_cbow_300")
print("We've",len(trained_model.wv.index2word),"vocabularies")

We've 331679 vocabularies


Loading Word Embeddings

In [45]:
# create a weight matrix for words in training docs
out_of_vocab = []
embedding_matrix = np.zeros((size_of_vocabulary, 300))
for index, word in enumerate(tokenizer.word_index):
    if word in trained_model.wv:
        embedding_vector = trained_model.wv[word]
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector
    else:
        out_of_vocab.append(word)

In [47]:
len(set(out_of_vocab))

27777

Building the model. We will use LSTM model

In [48]:
#deep learning library
from keras.models import *
from keras.layers import *
from keras.callbacks import *

model=Sequential()

#embedding layer
model.add(Embedding(size_of_vocabulary,300,weights=[embedding_matrix],input_length=45,trainable=True)) 

#lstm layer
model.add(LSTM(128))


#Dense Layer
model.add(Dense(64,activation='relu')) 
model.add(Dense(1,activation='sigmoid')) 

#Add loss function, metrics, optimizer
model.compile(optimizer='adam', loss='binary_crossentropy',metrics=["acc"]) 

#Adding callbacks
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,patience=3)  
mc=ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', save_best_only=True,verbose=1)  

print(model.summary())

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 45, 300)           24322200  
_________________________________________________________________
lstm_8 (LSTM)                (None, 128)               219648    
_________________________________________________________________
dense_15 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_16 (Dense)             (None, 1)                 65        
Total params: 24,550,169
Trainable params: 24,550,169
Non-trainable params: 0
_________________________________________________________________
None


In [49]:
history = model.fit(np.array(train_x_seq),np.array(train_y),batch_size=256,epochs=10,
                    validation_data=(np.array(test_x_seq),np.array(test_y)),callbacks=[es,mc])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 29374 samples, validate on 12589 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.67345, saving model to best_model.h5
Epoch 2/10

Epoch 00002: val_acc did not improve from 0.67345
Epoch 3/10

Epoch 00003: val_acc improved from 0.67345 to 0.69577, saving model to best_model.h5
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.69577
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.69577
Epoch 6/10

Epoch 00006: val_acc did not improve from 0.69577
Epoch 7/10

Epoch 00007: val_acc did not improve from 0.69577
Epoch 8/10

Epoch 00008: val_acc did not improve from 0.69577
Epoch 00008: early stopping


In [50]:
#loading best model
from keras.models import load_model
model = load_model('best_model.h5')

#evaluation 
_,val_acc = model.evaluate(test_x_seq,test_y, batch_size=128)
print(val_acc)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


0.6957661509513855


# Word Embeddings From Scratch

In [42]:
#deep learning library
from keras.models import *
from keras.layers import *
from keras.callbacks import *

model=Sequential()

#embedding layer
model.add(Embedding(size_of_vocabulary,300,input_length=45,trainable=True)) 

#lstm layer
model.add(LSTM(128))


#Dense Layer
model.add(Dense(64,activation='relu')) 
model.add(Dense(1,activation='sigmoid')) 

#Add loss function, metrics, optimizer
model.compile(optimizer='adam', loss='binary_crossentropy',metrics=["acc"]) 

#Adding callbacks
es = EarlyStopping(monitor='val_loss', mode='min',patience=3)  
mc=ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', save_best_only=True)  

print(model.summary())

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 45, 300)           24322200  
_________________________________________________________________
lstm_7 (LSTM)                (None, 128)               219648    
_________________________________________________________________
dense_13 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 65        
Total params: 24,550,169
Trainable params: 24,550,169
Non-trainable params: 0
_________________________________________________________________
None


In [43]:
history = model.fit(np.array(train_x_seq),np.array(train_y),batch_size=256,epochs=10,
                    validation_data=(np.array(test_x_seq),np.array(test_y)),callbacks=[es,mc])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 29374 samples, validate on 12589 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.71141, saving model to best_model.h5
Epoch 2/10

Epoch 00002: val_acc did not improve from 0.71141
Epoch 3/10

Epoch 00003: val_acc did not improve from 0.71141
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.71141
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.71141
Epoch 6/10

Epoch 00006: val_acc did not improve from 0.71141
Epoch 7/10

Epoch 00007: val_acc did not improve from 0.71141
Epoch 8/10

Epoch 00008: val_acc did not improve from 0.71141
Epoch 9/10

Epoch 00009: val_acc did not improve from 0.71141
Epoch 00009: early stopping


In [44]:
#loading best model
from keras.models import load_model
model = load_model('best_model.h5')

#evaluation 
_,val_acc = model.evaluate(test_x_seq,test_y, batch_size=128)
print(val_acc)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


0.7114147543907166
