#### Importing Necessary Libraries

Dataset link: https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews/overview

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, RNN, LSTM, GRU, Bidirectional, Embedding

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset files are read from there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Reading data
# All three competition files live in the same Drive folder. Keeping the folder
# in one constant lets the notebook be pointed elsewhere with a single edit.
DATA_DIR = '/content/drive/MyDrive/Datasets/Sentiment Analysis'

train_df = pd.read_csv(f'{DATA_DIR}/train.tsv', sep='\t')     # tab-separated training phrases with labels
test_df = pd.read_csv(f'{DATA_DIR}/test.tsv', sep='\t')       # tab-separated phrases to predict
sample_df = pd.read_csv(f'{DATA_DIR}/sampleSubmission.csv')   # submission template filled in later

In [None]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [None]:
import contractions
from bs4 import BeautifulSoup 
import tqdm
import unicodedata
import re
import nltk
# Fetch the NLTK stop-word corpus that `stop_words` below is loaded from.
nltk.download('stopwords')
import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')
# English stop-word list; not used by the active preprocessing code below.
stop_words = nltk.corpus.stopwords.words('english')
def strip_html_tags(text):
    """Return ``text`` with HTML markup removed and line breaks collapsed.

    ``<iframe>`` and ``<script>`` elements are dropped together with their
    contents; for every other tag only the enclosed text is kept.

    Args:
        text: Raw document string, possibly containing HTML.

    Returns:
        The plain text, with each run of carriage returns / newlines replaced
        by a single newline character.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Detach non-content elements from the tree before extracting the text.
    for element in soup(['iframe', 'script']):
        element.extract()
    stripped_text = soup.get_text()
    # The previous character class also contained '|', so literal pipe
    # characters were rewritten as newlines; only line breaks belong here.
    stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
    return stripped_text


def remove_accented_chars(text):
    """Transliterate ``text`` to plain ASCII.

    The string is decomposed with Unicode NFKD normalisation, so accented
    letters split into a base letter plus combining marks. Every character
    that cannot be encoded as ASCII is then discarded.

    Args:
        text: Input string.

    Returns:
        The ASCII-only version of ``text``.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

def pre_process_corpus(docs):
    """Normalise an iterable of raw text documents.

    Each document goes through, in order: HTML stripping, accent removal,
    contraction expansion, removal of every character that is not an ASCII
    letter or whitespace, lower-casing, and collapsing of repeated spaces.

    Args:
        docs: Iterable of strings; a tqdm progress bar is shown while iterating.

    Returns:
        list[str]: One cleaned string per input document, in input order.
    """
    norm_docs = []
    for text in tqdm.tqdm(docs):
        text = strip_html_tags(text)
        # Transliterate accents and expand contractions *before* filtering
        # characters: both steps need the accented letters / apostrophes that
        # the filter below deletes.
        text = remove_accented_chars(text)
        text = contractions.fix(text)
        # The old call passed re.I as the fourth positional argument of
        # re.sub, which is `count`, not `flags`, so at most two characters
        # were removed per document. The class already lists both letter
        # cases, so no flag is needed at all.
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        text = text.lower()
        text = re.sub(" +", ' ', text)
        text = text.strip()
        norm_docs.append(text)
    return norm_docs

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Clean the training phrases; the result is a list with one normalised string per row.
train_data = pre_process_corpus(train_df['Phrase'])

100%|██████████| 156060/156060 [00:20<00:00, 7742.46it/s]


In [None]:
# Apply the same cleaning to the test phrases.
test_data = pre_process_corpus(test_df['Phrase'])

100%|██████████| 66292/66292 [00:08<00:00, 7967.39it/s]


In [None]:
def metrics(y_true,y_pred):
    """Print the confusion matrix, accuracy and classification report.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        None. The three reports are written to stdout in a fixed order.
    """
    # Each heading is paired with the scorer whose result is printed under it.
    sections = (
        ('Confusion Matrix:\n', confusion_matrix),
        ('\n\nAccuracy Score:\n', accuracy_score),
        ('\n\nClassification Report: \n', classification_report),
    )
    for heading, scorer in sections:
        print(heading, scorer(y_true, y_pred))

In [None]:
#Tokenizing the text
tokenzer = tf.keras.preprocessing.text.Tokenizer(oov_token = '<UNK>')
tokenzer.fit_on_texts(train_data)

In [None]:
# Encode both cleaned corpora with the tokenizer fitted on the training text.
train_sequences, test_sequences = (
    tokenzer.texts_to_sequences(corpus) for corpus in (train_data, test_data)
)

In [None]:
# Report the size of the fitted vocabulary and the number of training documents seen.
print(
    "Vocabulary size ={}".format(len(tokenzer.word_index)),
    "Number of Documents={}".format(tokenzer.document_count),
    sep="\n",
)

Vocabulary size =16692
Number of Documents=156060


In [None]:
# Longest cleaned training phrase, measured in whitespace-separated tokens.
pd.Series(train_data).str.split().str.len().max()

51

In [None]:
# Derive the padded length from the data instead of hard-coding the number read
# off the previous cell's output, so it stays correct if preprocessing changes.
MAX_SEQUENCE_LENGTH = int(max(len(doc.split()) for doc in train_data))


#Padding the sentence to the maximum length.
# padding='post' puts the padding after the tokens of each sequence.
train_pad_sequences = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, maxlen = MAX_SEQUENCE_LENGTH, padding='post')
# The misspelled name `test_pad_sequneces` is kept because later cells reference it.
test_pad_sequneces = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, maxlen = MAX_SEQUENCE_LENGTH, padding='post')

In [None]:
# Sanity check: (number of training documents, MAX_SEQUENCE_LENGTH).
train_pad_sequences.shape

(156060, 51)

### Modelling

In [None]:
 # Target labels: the Sentiment column of the training frame.
 y = train_df['Sentiment']

In [None]:
# Define the value before displaying it: on a fresh kernel VOCAB_SIZE was not
# assigned until a later cell, so this cell raised NameError.
VOCAB_SIZE = len(tokenzer.word_index)
VOCAB_SIZE

16692

In [None]:
def deep_model(net_layer):
  """Build and compile a two-layer bidirectional recurrent classifier.

  Seeds NumPy and TensorFlow with 42 before building, then stacks an embedding,
  two bidirectional recurrent layers and a 5-way softmax output.

  Args:
    net_layer: Recurrent layer class to instantiate; it is called as
      ``net_layer(units, ...)`` (for example the imported ``LSTM`` or ``GRU``).

  Returns:
    The compiled ``Sequential`` model.
  """
  seed = 42
  np.random.seed(seed)
  tf.random.set_seed(seed)

  embedding_dim = 300  # length of the dense vector learned for each token
  vocab_size = len(tokenzer.word_index)

  model = Sequential([
      Embedding(input_dim=vocab_size + 1, output_dim=embedding_dim,
                input_length=MAX_SEQUENCE_LENGTH),
      # First recurrent layer returns the full sequence so the second can consume it.
      Bidirectional(net_layer(128, return_sequences=True)),
      Bidirectional(net_layer(256, activation='relu', return_sequences=False)),
      Dense(5, activation='softmax'),
  ])

  model.compile(loss='sparse_categorical_crossentropy', optimizer="adam", metrics=['accuracy'])
  return model

In [None]:
# No earlier cell in this notebook defines `model`, so on a fresh kernel this
# cell raised NameError. Rebuild the baseline whose summary is recorded below:
# Embedding(300) -> LSTM(128) -> Dense(5).
# NOTE(review): loss/optimizer are assumed to match the other models in this
# notebook -- confirm against the original experiment.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

EMBEDDING_DIM = 300 #Dimension for dense embedding for each token
VOCAB_SIZE = len(tokenzer.word_index)
model = Sequential()
model.add(Embedding(input_dim=VOCAB_SIZE + 1, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(LSTM(128, return_sequences=False))
model.add(Dense(5, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer="adam", metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 51, 300)           5007900   
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               219648    
_________________________________________________________________
dense_4 (Dense)              (None, 5)                 645       
Total params: 5,228,193
Trainable params: 5,228,193
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for five epochs, holding out 10% of the training data for validation.
model.fit(x=train_pad_sequences, y=y, epochs=5, validation_split=0.1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f132cf90a10>

In [None]:
# `Sequential.predict_classes` is deprecated and absent from newer Keras
# releases; the argmax over the softmax outputs yields the same class ids.
test_pred = np.argmax(model.predict(test_pad_sequneces), axis=-1)

In [None]:
# Write the predicted classes into the submission template's Sentiment column.
sample_df['Sentiment'] = test_pred

In [None]:
# Save the submission file without the DataFrame index column.
sample_df.to_csv('Predictions.csv', index=False)

In [None]:
# Trigger a browser download of the saved submission file (Colab only).
from google.colab import files
files.download('Predictions.csv') 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Unidirectional GRU model: Embedding -> GRU(128) -> Dense(256, relu) -> 5-way softmax.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

EMBEDDING_DIM = 300  # length of the dense embedding learned for each token
VOCAB_SIZE = len(tokenzer.word_index)

model = Sequential([
    Embedding(input_dim=VOCAB_SIZE + 1, output_dim=EMBEDDING_DIM,
              input_length=MAX_SEQUENCE_LENGTH),
    GRU(128, return_sequences=False),
    Dense(256, activation='relu'),
    Dense(5, activation='softmax'),
])

model.compile(loss='sparse_categorical_crossentropy', optimizer="adam", metrics=['accuracy'])

In [None]:
# Print the layer-by-layer architecture and parameter counts of the current model.
model.summary()

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 51, 300)           5007900   
_________________________________________________________________
gru_2 (GRU)                  (None, 128)               165120    
_________________________________________________________________
dense_12 (Dense)             (None, 256)               33024     
_________________________________________________________________
dense_13 (Dense)             (None, 5)                 1285      
Total params: 5,207,329
Trainable params: 5,207,329
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for five epochs, holding out 10% of the training data for validation.
model.fit(x=train_pad_sequences, y=y, epochs=5, validation_split=0.1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f1321342090>

In [None]:
# `Sequential.predict_classes` is deprecated and absent from newer Keras
# releases; the argmax over the softmax outputs yields the same class ids.
test_pred = np.argmax(model.predict(test_pad_sequneces), axis=-1)
sample_df['Sentiment'] = test_pred
# Save without the DataFrame index, then trigger a browser download (Colab only).
sample_df.to_csv('Predictions1.csv', index=False)
from google.colab import files
files.download('Predictions1.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Bidirectional GRU model: Embedding -> BiGRU(128) -> Dense(256, relu) -> 5-way softmax.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

EMBEDDING_DIM = 300  # length of the dense embedding learned for each token
VOCAB_SIZE = len(tokenzer.word_index)

model = Sequential([
    Embedding(input_dim=VOCAB_SIZE + 1, output_dim=EMBEDDING_DIM,
              input_length=MAX_SEQUENCE_LENGTH),
    Bidirectional(GRU(128, return_sequences=False)),
    Dense(256, activation='relu'),
    Dense(5, activation='softmax'),
])

model.compile(loss='sparse_categorical_crossentropy', optimizer="adam", metrics=['accuracy'])

In [None]:
# Print the layer-by-layer architecture and parameter counts of the current model.
model.summary()

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 51, 300)           5007900   
_________________________________________________________________
bidirectional_6 (Bidirection (None, 256)               330240    
_________________________________________________________________
dense_14 (Dense)             (None, 256)               65792     
_________________________________________________________________
dense_15 (Dense)             (None, 5)                 1285      
Total params: 5,405,217
Trainable params: 5,405,217
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for two epochs, holding out 10% of the training data for validation.
model.fit(x=train_pad_sequences, y=y, epochs=2, validation_split=0.1)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f1322ba3610>

In [None]:
# `Sequential.predict_classes` is deprecated and absent from newer Keras
# releases; the argmax over the softmax outputs yields the same class ids.
test_pred = np.argmax(model.predict(test_pad_sequneces), axis=-1)
sample_df['Sentiment'] = test_pred
# Save without the DataFrame index, then trigger a browser download (Colab only).
sample_df.to_csv('Predictions2.csv', index=False)
from google.colab import files
files.download('Predictions2.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from gensim.models.fasttext import FastText
# Tokenise every cleaned training document into a list of words.
tokenized_corpus = [nltk.word_tokenize(doc) for doc in train_data]


# Set values for various parameters
feature_size = 49    # Word vector dimensionality
window_context = 20  # Context window size
min_word_count = 1   # Minimum word count
sample = 1e-3        # Downsample setting for frequent words
sg = 1               # skip-gram model

# `sg` was defined above but never handed to FastText, so the model was trained
# with the library's default algorithm instead of the skip-gram the settings
# describe. Pass it explicitly.
# NOTE(review): `size=` is the gensim 3.x keyword; gensim 4.x renamed it to
# `vector_size` -- confirm the installed version before upgrading.
ft_model = FastText(tokenized_corpus, size=feature_size,
                    window=window_context, min_count=min_word_count,
                    sample=sample, sg=sg)
ft_model

<gensim.models.fasttext.FastText at 0x7f131b550510>

In [None]:
def average_word_vectors(words, model, vocabulary, num_features):
    """Average the embedding vectors of the in-vocabulary words of one document.

    Args:
        words: Iterable of tokens making up the document.
        model: Object exposing ``model.wv[word]`` to look up a word vector.
        vocabulary: Container of known words; tokens outside it are skipped.
        num_features: Length of each word vector.

    Returns:
        A float64 array of length ``num_features`` holding the mean vector, or
        all zeros when none of the tokens is in ``vocabulary``.
    """
    total = np.zeros((num_features,), dtype="float64")
    known_tokens = [token for token in words if token in vocabulary]

    # Accumulate in document order, exactly one lookup per known token.
    for token in known_tokens:
        total = total + model.wv[token]

    if not known_tokens:
        return total
    return total / float(len(known_tokens))


def averaged_word_vectorizer(corpus, model, num_features):
    """Turn a tokenised corpus into a matrix of averaged word vectors.

    Args:
        corpus: Iterable of tokenised documents (each an iterable of tokens).
        model: Embedding model; its ``wv.index2word`` list defines the known
            vocabulary and ``wv[word]`` provides the vectors.
        num_features: Length of each word vector.

    Returns:
        A NumPy array with one averaged vector per document, in corpus order.
    """
    # A set gives constant-time membership checks inside the per-word loop.
    known_words = set(model.wv.index2word)
    doc_vectors = []
    for tokens in corpus:
        doc_vectors.append(
            average_word_vectors(tokens, model, known_words, num_features)
        )
    return np.array(doc_vectors)

In [None]:
# get document level embeddings
ft_doc_features = averaged_word_vectorizer(corpus=tokenized_corpus, model=ft_model,
                                             num_features=feature_size)
pd.DataFrame(ft_doc_features)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48
0,-0.441281,0.507160,0.770722,0.963138,-0.685655,0.536615,-0.616827,-1.348408,-0.762274,-1.615198,0.014602,0.081998,-0.091562,0.356291,0.123390,-0.110576,-0.063605,1.041040,0.611392,-0.686466,1.179276,-0.152338,0.238797,-0.731713,-1.017957,0.016979,-0.051265,0.116101,-0.264996,0.012833,-1.429424,1.251031,-0.107272,0.775669,0.222352,-0.233242,-1.387988,0.991425,-0.177540,1.042039,0.685612,0.359440,-0.834138,0.344184,0.400826,0.005398,-0.232304,-0.166370,-0.883356
1,-0.168005,0.801699,0.939617,0.809995,-0.776177,0.614697,-0.655630,-1.702683,-0.813289,-1.584852,0.201309,0.520248,-0.362777,0.524063,-0.192547,0.304437,-0.290311,1.401913,0.635816,-0.841760,1.360745,-0.234164,0.342971,-1.065689,-1.372238,-0.050935,0.356988,-0.212063,-0.353867,0.328919,-2.032985,1.435735,-0.343450,0.234330,-0.197409,-0.088049,-1.322677,1.230937,-0.785732,1.143249,0.915367,-0.002461,-0.887440,0.332560,-0.138139,0.092353,-0.327582,0.154532,-1.386927
2,1.082305,-1.190333,1.734503,0.663169,-2.296338,-1.948801,-1.246404,-2.080420,-0.272343,-4.219652,0.457873,0.341550,-1.632678,-1.808780,-4.219624,1.107930,-2.445264,2.444481,1.580593,-1.196576,0.178263,0.582394,-1.228230,-1.858072,-2.075518,0.149812,1.171370,-0.388425,2.112467,-2.141758,0.097670,0.673594,0.636769,-1.167665,-1.950518,0.608850,0.544809,-0.201297,0.027539,3.750437,0.539078,-0.917777,-0.550623,-0.415097,-2.100217,-3.467077,0.540397,1.917396,-0.900434
3,1.697802,-4.951401,2.966700,1.320312,-3.661474,-4.547944,-1.725259,-3.498343,1.237312,-4.477345,-0.792848,-1.869729,-0.596482,-3.945305,-9.632492,1.499200,-2.689227,2.345210,4.558747,-3.663603,1.086571,0.667651,-2.246895,-4.176065,-5.119275,1.032008,1.634493,-1.357201,3.614936,-4.288655,-1.689688,1.469756,1.896605,-4.248006,-2.581291,0.954061,0.388410,-0.904007,0.603925,6.281253,1.905752,-2.221319,2.029108,0.194640,-3.796067,-7.287859,3.721044,4.659659,-0.350704
4,0.466809,2.570734,0.502305,0.006026,-0.931203,0.650341,-0.767550,-0.662498,-1.781999,-3.961959,1.708593,2.552829,-2.668874,0.327746,1.193245,0.716660,-2.201302,2.543753,-1.397560,1.270452,-0.730045,0.497138,-0.209565,0.459922,0.968239,-0.732384,0.708248,0.580352,0.609998,0.005139,1.885028,-0.122569,-0.623067,1.912675,-1.319745,0.263640,0.701208,0.501412,-0.548847,1.219622,-0.827596,0.385765,-3.130353,-1.024835,-0.404367,0.353705,-2.640249,-0.824868,-1.450164
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156055,0.027831,-1.680416,-2.146590,3.390694,1.806233,-2.130845,3.805274,2.580997,-1.530487,-0.283533,0.712005,2.565607,2.812668,0.628975,0.440713,1.684730,0.379636,2.427021,2.849965,0.787010,-1.665470,-0.533214,-1.079587,-0.720003,-2.521146,-2.634955,0.061864,-1.760678,-1.765385,-1.314409,-1.507142,-0.756398,4.907889,-0.045379,0.754899,0.547549,-0.996275,2.882706,-1.665515,1.355999,2.346693,-1.072332,-0.395129,-0.140759,1.261219,-0.702689,1.237364,0.790287,-0.449721
156056,-0.435845,-0.241661,-0.797927,0.178291,-0.842554,0.078556,0.615375,-0.363099,0.598639,-0.745922,0.770274,-0.532758,-0.179107,0.463224,0.871964,0.509950,0.913565,0.530192,-0.365935,1.001296,-0.244665,0.656734,-0.325951,0.385162,0.265360,-0.563258,1.219538,0.361481,0.376103,-0.157878,0.430186,0.228215,-1.177875,-0.061690,0.238511,0.580967,0.063902,-0.360662,0.059176,0.456086,0.477954,0.725752,-1.703660,-0.449515,-0.125843,0.144414,-0.580625,-0.434410,0.029593
156057,0.290090,0.574622,-0.635888,-0.070264,-0.120952,0.122232,0.281980,0.310143,0.041001,-1.235314,0.733661,-0.016907,-0.353532,-0.192541,0.134693,0.447150,0.113940,0.704034,-0.360579,0.760385,-0.113371,0.059668,0.309657,0.063757,0.364116,-0.619797,0.283909,0.220115,0.506501,-0.107658,0.457066,0.260952,-0.180274,-0.295403,0.041801,0.553924,0.299237,-0.215105,-0.139460,0.717036,0.263782,0.450336,-1.210397,-0.127887,-0.479050,0.174391,-0.290044,-0.210351,0.119261
156058,0.215236,0.158858,-0.307407,-0.075581,0.313772,0.154443,0.233481,-0.041772,0.193552,-0.274997,-0.256786,-0.323671,-0.180062,-0.273472,-0.369169,0.308810,0.160816,0.226641,-0.061084,0.128424,-0.152972,0.088968,0.006102,-0.045651,0.295193,-0.253129,0.350407,-0.121626,0.264272,0.370952,0.198649,-0.155868,0.006966,-0.442073,0.597549,0.203542,0.067887,0.328318,-0.223108,0.476613,0.210406,0.241724,-0.407119,0.322760,-0.408473,0.015553,-0.029263,-0.056161,-0.006874


In [None]:
# Tokenise every cleaned test document into a list of words.
test_corpus = list(map(nltk.word_tokenize, test_data))

In [None]:
# `test_corpus` is already built by the preceding cell; tokenising the whole
# test set a second time here was redundant work, so only the feature
# extraction remains.
test_doc_features = averaged_word_vectorizer(corpus=test_corpus, model=ft_model,
                                             num_features=feature_size)

In [None]:
# Sanity check: (number of test documents, feature_size).
test_doc_features.shape

(66292, 49)

In [None]:
# Append a trailing axis of length 1 so the (documents, features) matrix
# becomes the 3-D input expected by the recurrent layers.
train_doc_features_re = np.expand_dims(ft_doc_features, axis=-1)

In [None]:
# Size the reshape from the test matrix itself; the feature dimension was
# previously read from the training matrix, which only worked because both
# happen to share the same width.
test_doc_features_re = test_doc_features.reshape(test_doc_features.shape[0], test_doc_features.shape[1], 1)


In [None]:
# Recurrent classifier trained on the averaged FastText document vectors:
# GRU(128) -> BiGRU(128) -> Dense(256, relu) -> 5-way softmax.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

model = Sequential()
# Each document vector is fed as `feature_size` steps of one value each. The
# input shape is tied to `feature_size` (previously a hard-coded 49) so it
# cannot drift from the FastText dimensionality. There is no Embedding layer
# here, so the unused EMBEDDING_DIM / VOCAB_SIZE reassignments were dropped.
model.add(GRU(128, return_sequences=True, input_shape=(feature_size, 1)))
model.add(Bidirectional(GRU(128, return_sequences=False)))
model.add(Dense(256, activation='relu'))
model.add(Dense(5, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer="adam", metrics=['accuracy'])

In [None]:
# Print the layer-by-layer architecture and parameter counts of the current model.
model.summary()

Model: "sequential_35"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_32 (GRU)                 (None, 49, 128)           50304     
_________________________________________________________________
bidirectional_22 (Bidirectio (None, 256)               198144    
_________________________________________________________________
dense_51 (Dense)             (None, 256)               65792     
_________________________________________________________________
dense_52 (Dense)             (None, 5)                 1285      
Total params: 315,525
Trainable params: 315,525
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train on the reshaped FastText features for five epochs, holding out 10% for validation.
model.fit(x=train_doc_features_re, y=y, epochs=5, validation_split=0.1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f131895fcd0>

In [None]:
# `Sequential.predict_classes` is deprecated and absent from newer Keras
# releases; the argmax over the softmax outputs yields the same class ids.
test_pred = np.argmax(model.predict(test_doc_features_re), axis=-1)
sample_df['Sentiment'] = test_pred
# Save without the DataFrame index, then trigger a browser download (Colab only).
sample_df.to_csv('Predictions_fast.csv', index=False)
from google.colab import files
files.download('Predictions_fast.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>