## Sentiment Analysis from the reviews

Dataset Link: https://datahack.analyticsvidhya.com/contest/linguipedia-codefest-natural-language-processing-1/

#### Importing Necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import wordninja #to split the words 
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, RNN, LSTM, GRU, Bidirectional, Embedding
from tensorflow.keras.callbacks import EarlyStopping
from google.colab import files

In [None]:
# All competition files live in one Google Drive folder; point DATA_DIR
# somewhere else to run the notebook outside this Colab/Drive setup.
DATA_DIR = '/content/drive/MyDrive/Datasets/Sentiment Analysis - Analytics Vidya'

train_df = pd.read_csv(f'{DATA_DIR}/train_2kmZucJ.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test_oJQbWVk.csv')
sample_df = pd.read_csv(f'{DATA_DIR}/sample_submission_LnhVWA4.csv')

In [None]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [None]:
import nltk
nltk.download('punkt')
import tqdm
from preprocess_engine import preprocessor_engine
#from preprocess_engine_colab import preprocessor_engine


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
def corpus_pre_processor(corpus):
    """Run ``preprocessor_engine`` over every document in ``corpus``.

    Each document is passed to ``preprocessor_engine`` with
    ``fix_contract``, ``remove_stop_words`` and ``remove_digits`` set to
    True and ``html_strip``, ``accent_characters`` and ``lower`` set to
    False. Iteration is wrapped in a tqdm progress bar.

    Args:
        corpus: iterable of raw documents.

    Returns:
        list holding one ``preprocessor_engine`` result per document, in
        input order.
    """
    return [
        preprocessor_engine(
            raw_doc,
            html_strip=False,
            accent_characters=False,
            fix_contract=True,
            remove_stop_words=True,
            remove_digits=True,
            lower=False,
        )
        for raw_doc in tqdm.tqdm(corpus)
    ]

In [None]:
# Normalise the raw training tweets.
train_data = corpus_pre_processor(train_df['tweet'])

100%|██████████| 7920/7920 [01:04<00:00, 122.53it/s]


In [None]:
# Inspect the first ten normalised training tweets.
train_data[:10]

['fingerprint pregnancy test android app beautiful cute health iger iphoneonly iphonesia iphone',
 'finally transparant silicon case thank uncle yay sony xperia sonyexperia',
 'love talk makememorie unplug relax iphone smartphone wifi connect',
 'wire know george way iphone cute daventry home',
 'amazing service apple talk question pay stupid support',
 'iphone software update fuck phone big time stupid iphones',
 'happy instapic instadaily sony xperia xperiaz',
 'new type charger cable uk bay amazon etsy new year rob cross toby young evemun mcmafia taylor spectre newyear starting recipe technology samsunggalaxys iphonex',
 'bout shop listen music iphone justme music likeforlike followforfollow',
 'photo fun selfie pool water sony camera sun instagood boy cute outdoor']

In [None]:
# Normalise the raw test tweets with the same settings as the training set.
test_data = corpus_pre_processor(test_df['tweet'])

100%|██████████| 1953/1953 [00:16<00:00, 115.90it/s]


In [None]:
def preprocess_wordninja(sentence):
  """Split run-together words in ``sentence`` using wordninja.

  Every whitespace-separated token is passed to ``wordninja.split``
  (e.g. ``whatisthis`` -> ``what is this``). Fragments of length one or
  less are discarded, the surviving fragments of a token are re-joined
  with single spaces, and the rebuilt tokens are joined with single
  spaces into the returned string.
  """
  rebuilt_tokens = []
  for token in sentence.split():
    # Keep only fragments longer than one character.
    pieces = [piece for piece in wordninja.split(token) if len(piece) > 1]
    rebuilt_tokens.append(' '.join(pieces))
  return ' '.join(rebuilt_tokens)
    

#text=preprocess_wordninja(train_data)

In [None]:
def split_the_data(text):
  """Apply ``preprocess_wordninja`` to each sentence in ``text``.

  Shows a tqdm progress bar and returns the processed sentences as a
  list in the original order.
  """
  return [preprocess_wordninja(sentence) for sentence in tqdm.tqdm(text)]

In [None]:
# Break up run-together words in both normalised corpora.
train_data_split = split_the_data(train_data)
test_data_split = split_the_data(test_data)

100%|██████████| 7920/7920 [00:02<00:00, 3078.28it/s]
100%|██████████| 1953/1953 [00:00<00:00, 3026.93it/s]


In [None]:
# Inspect the first ten tweets after word splitting.
train_data_split[:10]

['fingerprint pregnancy test android app beautiful cute health iger iphone only iphones iphone',
 'finally trans paran silicon case thank uncle yay sony peri sony ex peri',
 'love talk make memo rie unplug relax iphone smartphone wi fi connect',
 'wire know george way iphone cute daventry home',
 'amazing service apple talk question pay stupid support',
 'iphone software update fuck phone big time stupid iphones',
 'happy insta pic insta daily sony peri xp riaz',
 'new type charger cable uk bay amazon ets new year rob cross to by young eve mun mc mafia taylor spectre new year starting recipe technology samsung galaxy iphone',
 'bout shop listen music iphone just me music like for like follow for follow',
 'photo fun self ie pool water sony camera sun insta good boy cute outdoor']

In [None]:
# Count how many training tweets fall into each label.
train_df['label'].value_counts()

0    5894
1    2026
Name: label, dtype: int64

In [None]:
# Build the vocabulary from the training text only; '<UNK>' is the
# out-of-vocabulary token.
tokenzer_sp = tf.keras.preprocessing.text.Tokenizer(oov_token = '<UNK>')
tokenzer_sp.fit_on_texts(train_data_split)

# Encode both corpora as integer sequences with the fitted tokenizer.
train_sequences_sp, test_sequences_sp = (
    tokenzer_sp.texts_to_sequences(corpus)
    for corpus in (train_data_split, test_data_split)
)

print("Vocabulary size ={}".format(len(tokenzer_sp.word_index)))
print("Number of Documents={}".format(tokenzer_sp.document_count))

Vocabulary size =10117
Number of Documents=7920


In [None]:
# Longest training sentence, measured in whitespace-separated tokens.
pd.Series(train_data_split).str.split().str.len().max()

47

In [None]:
# 47 is the longest training sentence (in tokens) reported by the previous cell.
MAX_SEQUENCE_LENGTH = 47

# Bring both encoded corpora to the same fixed length, padding at the end.
train_pad_sequences_sp, test_pad_sequneces_sp = (
    tf.keras.preprocessing.sequence.pad_sequences(
        encoded, maxlen=MAX_SEQUENCE_LENGTH, padding='post'
    )
    for encoded in (train_sequences_sp, test_sequences_sp)
)

In [None]:
# Training target.
y = train_df['label']

###Computing class weights
from sklearn.utils import class_weight

# Pass everything by keyword: recent scikit-learn releases reject positional
# arguments here, and keywords also work on older releases.
weights = class_weight.compute_class_weight(
    class_weight='balanced', classes=np.unique(y), y=y
)

# Map each label to its weight; this dict is what fit_the_model hands to Keras.
class_weights = dict(zip(np.unique(y), weights))
print("Computed class weights: ", class_weights)

Computed class weights:  {0: 0.671869697997964, 1: 1.9545903257650543}


In [None]:
def deep_model(layer_name, filename=None, epochs=50, final_pred = True, class_weight= False):
    """Build, compile and train a single-recurrent-layer binary classifier.

    Architecture: Embedding(300) -> layer_name(128) -> Dense(128, relu)
    -> Dense(1, sigmoid), compiled with binary cross-entropy and Adam.
    NumPy and TensorFlow are re-seeded with 42 on every call. Training is
    delegated to ``fit_the_model``.

    Args:
        layer_name: recurrent layer class, instantiated with 128 units
            (the notebook passes ``LSTM`` or ``GRU``).
        filename: prefix for the prediction file written when
            ``final_pred`` is true.
        epochs: maximum number of training epochs.
        final_pred: when true, predict on the test set and export a CSV
            after training.
        class_weight: when true, train with the precomputed
            ``class_weights`` dict.

    Returns:
        None.
    """
    SEED = 42
    np.random.seed(SEED)
    tf.random.set_seed(SEED)

    EMBEDDING_DIM = 300 #Dimension for dense embedding for each token
    # Use the tokenizer that produced the padded sequences fed to fit();
    # the previous `tokenzer` name is not defined anywhere in the notebook.
    VOCAB_SIZE = len(tokenzer_sp.word_index)
    model = Sequential()
    # +1 because index 0 is reserved for padding and is not in word_index.
    model.add(Embedding(input_dim=VOCAB_SIZE + 1, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
    model.add(layer_name(128))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss=tf.keras.losses.BinaryCrossentropy(), optimizer="adam", metrics=['accuracy'])
    model.summary()
    fit_the_model(model, filename=filename, epochs=epochs, verbose=1, final_pred=final_pred, class_weight=class_weight)

In [None]:
def predictions(model, file_name):
    """Predict test-set labels with ``model`` and export them as a CSV.

    The labels are written into the ``label`` column of ``sample_df``,
    saved to ``f'{file_name}pred.csv'`` and then offered as a browser
    download through ``google.colab.files``.

    Args:
        model: trained Keras model that accepts ``test_pad_sequneces_sp``.
        file_name: prefix of the output file; it is also printed.
    """
    # Sequential.predict_classes is gone from recent TensorFlow releases, so
    # derive the class labels from the predicted probabilities directly.
    probabilities = model.predict(test_pad_sequneces_sp)
    if probabilities.shape[-1] > 1:
        # Multi-unit output: pick the most probable class.
        test_pred = probabilities.argmax(axis=-1)
    else:
        # Single sigmoid unit: threshold at 0.5 and flatten to one label per row.
        test_pred = (probabilities > 0.5).astype("int32").ravel()
    sample_df['label'] = test_pred
    print(file_name)
    output_path = f'{file_name}pred.csv'
    sample_df.to_csv(output_path, index = False)
    files.download(output_path)

def fit_the_model(model, filename, epochs=50, verbose=1, final_pred = True, class_weight=False):
    """Train ``model`` on the padded training sequences, then optionally predict.

    Training uses a 10% validation split and stops early once ``val_loss``
    has not improved for 5 epochs.

    Args:
        model: compiled Keras model.
        filename: prefix passed to ``predictions`` for the output CSV.
        epochs: maximum number of training epochs.
        verbose: verbosity level forwarded to ``model.fit``.
        final_pred: when true, call ``predictions(model, filename)`` after
            training.
        class_weight: when true, train with the precomputed
            ``class_weights`` dict. The default used to be the imported
            ``sklearn.utils.class_weight`` module, which is always truthy;
            it is now ``False`` to match the model-builder functions.
    """
    early_stop = EarlyStopping(monitor='val_loss', patience=5, mode='min')
    model.fit(
        train_pad_sequences_sp,
        y,
        epochs=epochs,
        validation_split=0.1,
        callbacks=[early_stop],
        verbose=verbose,
        # None is Keras' own default, i.e. unweighted training.
        class_weight=class_weights if class_weight else None,
    )
    if final_pred:
        predictions(model, filename)

In [None]:
# Single-layer LSTM with the default settings (up to 50 epochs, no class
# weighting); predictions are exported as 'LSTM pred.csv'.
deep_model(LSTM, filename='LSTM ',)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 37, 300)           4376700   
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               219648    
_________________________________________________________________
dense_2 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 129       
Total params: 4,612,989
Trainable params: 4,612,989
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50




LSTM 


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Same call as the previous cell; it exports to the same 'LSTM pred.csv' file name.
deep_model(LSTM, filename='LSTM ',)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 37, 300)           4376700   
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               219648    
_________________________________________________________________
dense_4 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 129       
Total params: 4,612,989
Trainable params: 4,612,989
Non-trainable params: 0
_________________________________________________________________
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch



LSTM 


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Single-layer GRU with the default settings; predictions go to 'GRU_wh pred.csv'.
deep_model(GRU, filename='GRU_wh ',)

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 37, 300)           4376700   
_________________________________________________________________
gru_3 (GRU)                  (None, 128)               165120    
_________________________________________________________________
dense_14 (Dense)             (None, 128)               16512     
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 129       
Total params: 4,558,461
Trainable params: 4,558,461
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50




GRU_wh 


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Single-layer GRU capped at 10 epochs; predictions go to 'GRU_new pred.csv'.
deep_model(GRU, filename='GRU_new ', epochs=10)

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 37, 300)           4376700   
_________________________________________________________________
gru_2 (GRU)                  (None, 128)               165120    
_________________________________________________________________
dense_12 (Dense)             (None, 128)               16512     
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 129       
Total params: 4,558,461
Trainable params: 4,558,461
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




GRU_new 


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
def stack_model(layer_name, filename, epochs=50, class_weight=False, final_pred=True):
    """Build, compile and train a two-layer stacked recurrent classifier.

    Architecture: Embedding(300) -> layer_name(256, return_sequences=True)
    -> layer_name(128) -> Dense(128, relu) -> Dense(1, sigmoid), compiled
    with binary cross-entropy and Adam. NumPy and TensorFlow are re-seeded
    with 42 on every call. Training is delegated to ``fit_the_model``.

    Args:
        layer_name: recurrent layer class used for both stacked layers
            (the notebook passes ``LSTM`` or ``GRU``).
        filename: prefix for the prediction file.
        epochs: maximum number of training epochs.
        class_weight: when true, train with the precomputed
            ``class_weights`` dict.
        final_pred: when true (the default, matching the previous
            behaviour), predict on the test set and export a CSV after
            training.

    Returns:
        None.
    """
    SEED = 42
    np.random.seed(SEED)
    tf.random.set_seed(SEED)

    EMBEDDING_DIM = 300 #Dimension for dense embedding for each token
    # Use the tokenizer that produced the padded sequences fed to fit();
    # the previous `tokenzer` name is not defined anywhere in the notebook.
    VOCAB_SIZE = len(tokenzer_sp.word_index)
    model = Sequential()
    # +1 because index 0 is reserved for padding and is not in word_index.
    model.add(Embedding(input_dim=VOCAB_SIZE + 1, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
    # The first recurrent layer must emit the full sequence for the second one.
    model.add(layer_name(256, return_sequences=True))
    model.add(layer_name(128, return_sequences=False))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss=tf.keras.losses.BinaryCrossentropy(), optimizer="adam", metrics=['accuracy'])
    model.summary()
    fit_the_model(model, filename, epochs=epochs, verbose=1, final_pred=final_pred, class_weight=class_weight)

In [None]:
# Two stacked LSTM layers, default settings; predictions go to 'Stack pred.csv'.
stack_model(LSTM, 'Stack ')

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 37, 300)           4376700   
_________________________________________________________________
lstm_4 (LSTM)                (None, 37, 256)           570368    
_________________________________________________________________
lstm_5 (LSTM)                (None, 128)               197120    
_________________________________________________________________
dense_16 (Dense)             (None, 128)               16512     
_________________________________________________________________
dense_17 (Dense)             (None, 1)                 129       
Total params: 5,160,829
Trainable params: 5,160,829
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/5



Stack 


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Two stacked LSTM layers trained with the class_weights dict;
# predictions go to 'Stack_class_LSTM pred.csv'.
stack_model(LSTM, 'Stack_class_LSTM ', class_weight=True)

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 37, 300)           4376700   
_________________________________________________________________
lstm_6 (LSTM)                (None, 37, 256)           570368    
_________________________________________________________________
lstm_7 (LSTM)                (None, 128)               197120    
_________________________________________________________________
dense_18 (Dense)             (None, 128)               16512     
_________________________________________________________________
dense_19 (Dense)             (None, 1)                 129       
Total params: 5,160,829
Trainable params: 5,160,829
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50




Stack_class_LSTM 


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Two stacked GRU layers without class weighting; predictions go to 'Stack_GRU pred.csv'.
stack_model(GRU, 'Stack_GRU ', class_weight=False)

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 37, 300)           4376700   
_________________________________________________________________
gru_4 (GRU)                  (None, 37, 256)           428544    
_________________________________________________________________
gru_5 (GRU)                  (None, 128)               148224    
_________________________________________________________________
dense_20 (Dense)             (None, 128)               16512     
_________________________________________________________________
dense_21 (Dense)             (None, 1)                 129       
Total params: 4,970,109
Trainable params: 4,970,109
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50




Stack_GRU 


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Two stacked GRU layers capped at 2 epochs; predictions go to 'Stack_GRU_2 pred.csv'.
stack_model(GRU, 'Stack_GRU_2 ', epochs=2)

Model: "sequential_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 37, 300)           4376700   
_________________________________________________________________
gru_8 (GRU)                  (None, 37, 256)           428544    
_________________________________________________________________
gru_9 (GRU)                  (None, 128)               148224    
_________________________________________________________________
dense_24 (Dense)             (None, 128)               16512     
_________________________________________________________________
dense_25 (Dense)             (None, 1)                 129       
Total params: 4,970,109
Trainable params: 4,970,109
Non-trainable params: 0
_________________________________________________________________
Epoch 1/2
Epoch 2/2




Stack_GRU_2 


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
def bidirect_model(layer_name, filename, epochs=50, final_pred = True, class_weight=False):
    """Build, compile and train a two-layer bidirectional recurrent classifier.

    Architecture: Embedding(300) -> Bidirectional(layer_name(256), full
    sequence) -> Bidirectional(layer_name(128)) -> Dense(256, relu) ->
    Dense(1, sigmoid), compiled with binary cross-entropy and Adam. NumPy
    and TensorFlow are re-seeded with 42 on every call. Training is
    delegated to ``fit_the_model``.

    Args:
        layer_name: recurrent layer class wrapped by ``Bidirectional``.
        filename: prefix for the prediction file.
        epochs: maximum number of training epochs.
        final_pred: forwarded to ``fit_the_model``.
        class_weight: forwarded to ``fit_the_model``.
    """
    seed = 42
    np.random.seed(seed)
    tf.random.set_seed(seed)

    embedding_dim = 300  # size of the dense vector learned for each token
    vocab_size = len(tokenzer_sp.word_index)

    network = Sequential()
    network.add(
        Embedding(
            input_dim=vocab_size + 1,
            output_dim=embedding_dim,
            input_length=MAX_SEQUENCE_LENGTH,
        )
    )
    # First bidirectional layer keeps the whole sequence for the second one.
    network.add(Bidirectional(layer_name(256, return_sequences=True)))
    network.add(Bidirectional(layer_name(128, return_sequences=False)))
    network.add(Dense(256, activation='relu'))
    network.add(Dense(1, activation='sigmoid'))

    network.compile(
        optimizer="adam",
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=['accuracy'],
    )
    network.summary()
    fit_the_model(
        network,
        filename=filename,
        epochs=epochs,
        verbose=1,
        final_pred=final_pred,
        class_weight=class_weight,
    )

In [None]:
# Bidirectional LSTM, default settings.
# NOTE(review): the 'LSTM ' prefix yields 'LSTM pred.csv', the same file name
# used by the deep_model LSTM runs — confirm the reuse is intended.
bidirect_model(LSTM, 'LSTM ')

Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (None, 37, 300)           4376700   
_________________________________________________________________
bidirectional (Bidirectional (None, 37, 512)           1140736   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               656384    
_________________________________________________________________
dense_26 (Dense)             (None, 256)               65792     
_________________________________________________________________
dense_27 (Dense)             (None, 1)                 257       
Total params: 6,239,869
Trainable params: 6,239,869
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50




LSTM 


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Bidirectional LSTM trained for a single epoch; predictions go to 'LSTM_1 pred.csv'.
bidirect_model(LSTM, 'LSTM_1 ', epochs=1)

Model: "sequential_18"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_17 (Embedding)     (None, 37, 300)           4376700   
_________________________________________________________________
bidirectional_4 (Bidirection (None, 37, 512)           1140736   
_________________________________________________________________
bidirectional_5 (Bidirection (None, 256)               656384    
_________________________________________________________________
dense_30 (Dense)             (None, 256)               65792     
_________________________________________________________________
dense_31 (Dense)             (None, 1)                 257       
Total params: 6,239,869
Trainable params: 6,239,869
Non-trainable params: 0
_________________________________________________________________




LSTM_1 


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Bidirectional GRU, default settings; predictions go to 'GRU_BI pred.csv'.
bidirect_model(GRU, 'GRU_BI ',)

Model: "sequential_19"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_18 (Embedding)     (None, 37, 300)           4376700   
_________________________________________________________________
bidirectional_6 (Bidirection (None, 37, 512)           857088    
_________________________________________________________________
bidirectional_7 (Bidirection (None, 256)               493056    
_________________________________________________________________
dense_32 (Dense)             (None, 256)               65792     
_________________________________________________________________
dense_33 (Dense)             (None, 1)                 257       
Total params: 5,792,893
Trainable params: 5,792,893
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50




GRU_BI 


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Bidirectional GRU trained for a single epoch; predictions go to 'GRU_BI_1 pred.csv'.
bidirect_model(GRU, 'GRU_BI_1 ',epochs=1)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 47, 300)           3035400   
_________________________________________________________________
bidirectional_4 (Bidirection (None, 47, 512)           857088    
_________________________________________________________________
bidirectional_5 (Bidirection (None, 256)               493056    
_________________________________________________________________
dense_4 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 257       
Total params: 4,451,593
Trainable params: 4,451,593
Non-trainable params: 0
_________________________________________________________________




GRU_BI_1 


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Fetch the spaCy model that is loaded below with spacy.load('en_core_web_md').
!python -m spacy download en_core_web_md

Collecting en_core_web_md==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.5/en_core_web_md-2.2.5.tar.gz (96.4 MB)
[K     |████████████████████████████████| 96.4 MB 1.2 MB/s 
Building wheels for collected packages: en-core-web-md
  Building wheel for en-core-web-md (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-md: filename=en_core_web_md-2.2.5-py3-none-any.whl size=98051302 sha256=9664e22f92ef00d7ddb4e3a860d5e46df2b1eb088cd026e50e228e520563a4f4
  Stored in directory: /tmp/pip-ephem-wheel-cache-_3wose7q/wheels/69/c5/b8/4f1c029d89238734311b3269762ab2ee325a42da2ce8edb997
Successfully built en-core-web-md
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


#### GloVe Vectors

In [None]:
import spacy

# Load the spaCy pipeline and count the entries in its word-vector table.
nlp = spacy.load('en_core_web_md')
total_vectors = len(nlp.vocab.vectors)

print('Total word vectors:', total_vectors)

Total word vectors: 20000


In [None]:
def glove_vector(text):
  """Return the spaCy document vector of every sentence in ``text``.

  Each sentence is run through the module-level ``nlp`` pipeline and the
  resulting ``Doc.vector`` is collected; the list preserves input order.
  """
  return [nlp(sentence).vector for sentence in text]

In [None]:
# One document vector per word-split training tweet.
train_vector = glove_vector(train_data_split)

In [None]:
# One document vector per word-split test tweet.
test_vector = glove_vector(test_data_split)

In [None]:
# Add a trailing feature axis for the recurrent layers. -1 lets NumPy infer the
# row count instead of hard-coding the training-set size.
train_vector_re = np.array(train_vector).reshape(-1, 300, 1)

In [None]:
# Same (rows, 300, 1) layout as the training vectors; the row count is inferred.
test_vector_re = np.reshape(np.array(test_vector), (-1, 300, 1))

In [None]:
# Re-seed NumPy and TensorFlow before building the network.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
model = Sequential()
# No Embedding layer here: the inputs are the precomputed document vectors.
# NOTE(review): input_shape=(300, 1) makes the GRU step through the 300 vector
# components one at a time with a single feature each — confirm this is intended.
model.add((GRU(256, return_sequences= True, input_shape = (300,1))))
model.add(Bidirectional(GRU(128, return_sequences= True)))
model.add(Bidirectional(GRU(128, return_sequences= False)))

# Dense head ending in a single sigmoid unit for the binary label.
model.add((Dense(256,activation = 'relu')))
model.add(Dense(1,activation = 'sigmoid'))

model.compile(loss = tf.keras.losses.BinaryCrossentropy(),optimizer="adam",metrics =['accuracy'])
model.summary()
    #fit_the_model(model, filename=filename, epochs=epochs, verbose=1, final_pred = final_pred, class_weight=class_weight)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_7 (GRU)                  (None, 300, 256)          198912    
_________________________________________________________________
bidirectional_4 (Bidirection (None, 300, 256)          296448    
_________________________________________________________________
bidirectional_5 (Bidirection (None, 256)               296448    
_________________________________________________________________
dense_6 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 257       
Total params: 857,857
Trainable params: 857,857
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Stop training once val_loss has gone 5 epochs without improving.
early_stop = EarlyStopping(monitor='val_loss', patience=5, mode='min')

# Train on the reshaped document vectors with a 10% validation split.
model.fit(train_vector_re, y, validation_split=0.1, epochs=50, callbacks = [early_stop])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50


<tensorflow.python.keras.callbacks.History at 0x7f26fc246f90>

In [None]:
# Sequential.predict_classes is gone from recent TensorFlow releases; threshold
# the sigmoid output at 0.5 instead and flatten to one label per test row.
test_pred = (model.predict(test_vector_re) > 0.5).astype("int32").ravel()



In [None]:
# Write the predicted labels into the sample-submission frame and save it.
sample_df['label'] = test_pred
sample_df.to_csv('glove_gru_pred.csv', index = False)
# Offer the CSV as a browser download from the Colab runtime.
files.download('glove_gru_pred.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>