### Importing all the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import os

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
from nltk.util import ngrams

from wordcloud import WordCloud

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.metrics import classification_report,confusion_matrix

from collections import defaultdict
from collections import Counter
plt.style.use('ggplot')
stop=set(stopwords.words('english'))

import re
from nltk.tokenize import word_tokenize
import gensim
import string

from tqdm import tqdm
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM,Dense, SpatialDropout1D, Dropout, Bidirectional
from keras.initializers import Constant
from keras.optimizers import Adam

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


### Read data

In [8]:
df_train = pd.read_excel('train.xlsx')
df_test = pd.read_excel('test.xlsx')

In [9]:
df_train.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700,26


### Creating embeddings dictionary by loading GloVe (6B tokens) with 100-D vectors

In [10]:
dictionary_embedding={}
with open('glove.6B.100d.txt','r', encoding="utf8") as f:
    for line in f:
        values=line.split()
        word=values[0]
        vectors=np.asarray(values[1:],'float32')
        dictionary_embedding[word]=vectors

### Building vocabulary from text

In [11]:
def build_vocab(texts):
    """
    :param sntcs: list of list of words
    :return: dictionary of words and their count
    """
    sntcs = texts.apply(lambda x: x.split()).values
    vocab = {}
    for sntc in tqdm(sntcs):
        for word in sntc:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

### Examine the extent to which the embeddings cover the vocabulary.

In [12]:
import operator

def check_coverage(vocab,embeddings_index):
    p = {}
    oov = {}
    q = 0
    i = 0
    for word in tqdm(vocab):
        try:
            p[word] = embeddings_index[word]
            q += vocab[word]
        except:

            oov[word] = vocab[word]
            i += vocab[word]
            pass

    print('Found embeddings for {:.2%} of vocab'.format(len(p) / len(vocab)))
    print('Found embeddings for  {:.2%} of all text'.format(q / (q + i)))
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

### Text preprocessing.

In [13]:
df = pd.concat([df_train,df_test])

def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',str(text))

def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)

def remove_emoji(text):
    pattern_of_emoji = re.compile("["
                           u"\U0001F600-\U0001F64F"
                           u"\U0001F300-\U0001F5FF"
                           u"\U0001F680-\U0001F6FF"
                           u"\U0001F1E0-\U0001F1FF"
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return pattern_of_emoji.sub(r'', text)

def remove_punct(text):
    table=str.maketrans('','',string.punctuation)
    return text.translate(table)

def lower(text):
    words = text.split(" ")
    lower = " ".join([w.lower() for w in words])
    return lower

df['text']=df['text'].apply(lambda x : remove_URL(x))
df['text']=df['text'].apply(lambda x : remove_html(x))
df['text']=df['text'].apply(lambda x: remove_emoji(x))
df['text']=df['text'].apply(lambda x : remove_punct(x))
df['text']=df['text'].apply(lambda x : lower(x))

### Label Encoding

In [14]:
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the "sentiment" column
df_train['sentiment_encoded'] = label_encoder.fit_transform(df_train['sentiment'])
df_test['sentiment_encoded'] = label_encoder.fit_transform(df_test['sentiment'])

### Converting the data into a format suitable for input to the LSTM network. If a sntc exceeds 32 words, truncate it. Later, creating an embedding matrix.

In [15]:
MAX_LEN = 32
obj_tokenizer = Tokenizer()
obj_tokenizer.fit_on_texts(df['text'].values.tolist())
sequences = obj_tokenizer.texts_to_sequences(df['text'].values)
text_pad = pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',padding='post')
word_index = obj_tokenizer.word_index

words_count = len(word_index)+1
dim = 100

words_unknown = np.random.uniform(-1,1,size=dim).astype('float32')
words_unknown = words_unknown.reshape(1,dim)

embedding_matrix = np.zeros((words_count, dim))
for word, i in tqdm(word_index.items()):
    if i > words_count:
        continue
    emb_vec=dictionary_embedding.get(word)
    if emb_vec is not None:
        embedding_matrix[i]=emb_vec
    else:
        embedding_matrix[i]=words_unknown

embedding_matrix.shape

100%|██████████| 30103/30103 [00:00<00:00, 406964.47it/s]


(30104, 100)

### Define model - Bidirectional LSTM layer

In [16]:
model=Sequential()

embedding = Embedding(words_count,dim,embeddings_initializer=Constant(embedding_matrix),
                   input_length=MAX_LEN,trainable=False)

model.add(embedding)
model.add(Bidirectional(LSTM(256, dropout=0.25, recurrent_dropout=0.2)))
model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
# model.add(Dense(3, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 32, 100)           3010400   
                                                                 
 bidirectional (Bidirection  (None, 512)               731136    
 al)                                                             
                                                                 
 dense (Dense)               (None, 1024)              525312    
                                                                 
 dropout (Dropout)           (None, 1024)              0         
                                                                 
 dense_1 (Dense)             (None, 512)               524800    
                                                                 
 dropout_1 (Dropout)         (None, 512)               0         
                                                        

#### - Specify callback functions for the LSTM model.
#### - Use the ModelCheckpoint callback to save the best performing model during training.
#### - Using the EarlyStopping callback to stop training early if there is no improvement in the model's performance.

In [17]:
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ReduceLROnPlateau
def get_callbacks():
    path_chckpt ='checkpoint_keras.h5'
    dir_log='logs'
    callback_chckpt = ModelCheckpoint(filepath=path_chckpt,
                                         monitor='val_accuracy',
                                         verbose=1,
                                         save_weights_only=False,
                                         save_best_only=True,
                                         mode='max',
                                         period=1)
    callbk_early_stopping = EarlyStopping(monitor='val_accuracy',
                                           patience=5,
                                           verbose=1)
    callbacks = [callback_chckpt, callbk_early_stopping]
    return callbacks

### Divide the data into training and validation (testing) sets, with a split ratio of 80:20. Train the model.

In [None]:
train = text_pad[:df_train.shape[0]]
test = text_pad[df_train.shape[0]:]

X_train,X_test,y_train,y_test=train_test_split(train,df_train['sentiment_encoded'].values,test_size=0.2,random_state=40)
print('Shape of train',X_train.shape)
print("Shape of Validation ",X_test.shape)

history=model.fit(X_train,y_train,
                  batch_size=64,
                  epochs=25,
                  validation_data=(X_test,y_test),
                  verbose=1,
                  callbacks = get_callbacks())

Shape of train (18271, 32)
Shape of Validation  (4568, 32)
Epoch 1/25
Epoch 1: val_accuracy improved from -inf to 0.78919, saving model to checkpoint_keras.h5
Epoch 2/25
Epoch 2: val_accuracy improved from 0.78919 to 0.80823, saving model to checkpoint_keras.h5
Epoch 3/25
Epoch 3: val_accuracy improved from 0.80823 to 0.81502, saving model to checkpoint_keras.h5
Epoch 4/25
Epoch 4: val_accuracy did not improve from 0.81502
Epoch 5/25
Epoch 5: val_accuracy improved from 0.81502 to 0.82662, saving model to checkpoint_keras.h5
Epoch 6/25
Epoch 6: val_accuracy improved from 0.82662 to 0.83800, saving model to checkpoint_keras.h5
Epoch 7/25
Epoch 7: val_accuracy improved from 0.83800 to 0.84764, saving model to checkpoint_keras.h5
Epoch 8/25
Epoch 8: val_accuracy did not improve from 0.84764
Epoch 9/25
Epoch 9: val_accuracy improved from 0.84764 to 0.85661, saving model to checkpoint_keras.h5
Epoch 10/25
Epoch 10: val_accuracy did not improve from 0.85661
Epoch 11/25
Epoch 11: val_accuracy 

### Display the evaluation metrics for the validation data.

In [None]:
from sklearn import metrics

pred_y = model.predict(X_test)
pred_y = pred_y.round().astype('int')

print(metrics.accuracy_score(y_test,pred_y))
print(metrics.confusion_matrix(y_test,pred_y))
print(metrics.classification_report(y_test,pred_y))

0.8892294220665499
[[2359  255]
 [ 251 1703]]
              precision    recall  f1-score   support

           0       0.90      0.90      0.90      2614
           1       0.87      0.87      0.87      1954

    accuracy                           0.89      4568
   macro avg       0.89      0.89      0.89      4568
weighted avg       0.89      0.89      0.89      4568



### Using the trained model to analyze the sentiment of the test data.

In [20]:
test_pred = model.predict(test)
test_pred_int = test_pred.round().astype('int')
df_test['pred'] = test_pred_int

In [None]:
df_test.to_csv('/content/output.csv', index=False)