In [207]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import  one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder

In [208]:
df_train = pd.read_csv('Corona_NLP_train.csv', encoding='latin1')
df_test = pd.read_csv('Corona_NLP_test.csv', encoding='latin1')

In [209]:
df = pd.concat([df_train, df_test])

In [210]:
df

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative
...,...,...,...,...,...,...
3793,3794,48746,Israel ??,16-03-2020,Meanwhile In A Supermarket in Israel -- People...,Positive
3794,3795,48747,"Farmington, NM",16-03-2020,Did you panic buy a lot of non-perishable item...,Negative
3795,3796,48748,"Haverford, PA",16-03-2020,Asst Prof of Economics @cconces was on @NBCPhi...,Neutral
3796,3797,48749,,16-03-2020,Gov need to do somethings instead of biar je r...,Extremely Negative


In [211]:
df = df.drop(columns=['UserName', 'ScreenName', 'TweetAt', 'Location'], axis=1)

In [212]:
df

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive
3,My food stock is not the only one which is emp...,Positive
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative
...,...,...
3793,Meanwhile In A Supermarket in Israel -- People...,Positive
3794,Did you panic buy a lot of non-perishable item...,Negative
3795,Asst Prof of Economics @cconces was on @NBCPhi...,Neutral
3796,Gov need to do somethings instead of biar je r...,Extremely Negative


In [213]:
df['Sentiment'].value_counts()

Sentiment
Positive              12369
Negative              10958
Neutral                8332
Extremely Positive     7223
Extremely Negative     6073
Name: count, dtype: int64

In [214]:
df.isnull().sum()

OriginalTweet    0
Sentiment        0
dtype: int64

In [215]:
df.duplicated().sum()

0

In [216]:
# Checking for imbalance dataset
df['Sentiment'].value_counts()

Sentiment
Positive              12369
Negative              10958
Neutral                8332
Extremely Positive     7223
Extremely Negative     6073
Name: count, dtype: int64

In [217]:
df

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive
3,My food stock is not the only one which is emp...,Positive
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative
...,...,...
3793,Meanwhile In A Supermarket in Israel -- People...,Positive
3794,Did you panic buy a lot of non-perishable item...,Negative
3795,Asst Prof of Economics @cconces was on @NBCPhi...,Neutral
3796,Gov need to do somethings instead of biar je r...,Extremely Negative


In [218]:
X = df.drop('Sentiment', axis=1)
y = df['Sentiment']

In [219]:
X

Unnamed: 0,OriginalTweet
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...
1,advice Talk to your neighbours family to excha...
2,Coronavirus Australia: Woolworths to give elde...
3,My food stock is not the only one which is emp...
4,"Me, ready to go at supermarket during the #COV..."
...,...
3793,Meanwhile In A Supermarket in Israel -- People...
3794,Did you panic buy a lot of non-perishable item...
3795,Asst Prof of Economics @cconces was on @NBCPhi...
3796,Gov need to do somethings instead of biar je r...


In [220]:
y

0                  Neutral
1                 Positive
2                 Positive
3                 Positive
4       Extremely Negative
               ...        
3793              Positive
3794              Negative
3795               Neutral
3796    Extremely Negative
3797    Extremely Positive
Name: Sentiment, Length: 44955, dtype: object

In [221]:
voc_size = 5000

In [222]:
messages = X.copy()

In [223]:
messages.reset_index(inplace=True)

In [224]:
ps = PorterStemmer()
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['OriginalTweet'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [225]:
corpus

['menyrbi phil gahan chrisitv http co ifz fan pa http co xx ghgfzcc http co nlzdxno',
 'advic talk neighbour famili exchang phone number creat contact list phone number neighbour school employ chemist gp set onlin shop account poss adequ suppli regular med order',
 'coronaviru australia woolworth give elderli disabl dedic shop hour amid covid outbreak http co binca vp p',
 'food stock one empti pleas panic enough food everyon take need stay calm stay safe covid franc covid covid coronaviru confin confinementot confinementgener http co zrlg z j',
 'readi go supermarket covid outbreak paranoid food stock litterali empti coronaviru seriou thing pleas panic caus shortag coronavirusfr restezchezv stayathom confin http co usmualq n',
 'news region first confirm covid case came sullivan counti last week peopl flock area store purchas clean suppli hand sanit food toilet paper good tim dodson report http co cfxch lu',
 'cashier groceri store share insight covid prove credibl comment civic class

In [226]:
onehot_repr = [one_hot(words, voc_size) for words in corpus]
print(onehot_repr)

[[4947, 2083, 1089, 3338, 2460, 1585, 1903, 4580, 3212, 2460, 1585, 4751, 2113, 2460, 1585, 479], [3885, 2961, 1133, 2865, 780, 362, 3365, 3734, 3407, 4424, 362, 3365, 1133, 2325, 2389, 2863, 3975, 2461, 1231, 2529, 81, 3486, 3783, 2432, 1524, 625, 2561], [1307, 1510, 1601, 1408, 4281, 2175, 4245, 2529, 4522, 2084, 4433, 1075, 2460, 1585, 1849, 3676, 3173], [1433, 1613, 121, 2449, 2837, 4664, 2914, 1433, 1094, 3084, 176, 1452, 766, 1452, 336, 4433, 4376, 4433, 4433, 1307, 1204, 4999, 3779, 2460, 1585, 4341, 545, 4469], [4906, 4564, 1430, 4433, 1075, 1592, 1433, 1613, 2447, 2449, 1307, 1861, 4400, 2837, 4664, 1386, 2793, 322, 1696, 3709, 1204, 2460, 1585, 3978, 1237], [4563, 3435, 2251, 2795, 4433, 1234, 4416, 2650, 4665, 1380, 2708, 3522, 2456, 3257, 4228, 149, 1124, 2432, 105, 3157, 1433, 4954, 4198, 4180, 1779, 2461, 3322, 2460, 1585, 4507, 460], [1592, 1078, 4228, 4894, 1094, 4433, 1487, 716, 1698, 4189, 1502, 4578, 2961, 2460, 1585, 1838], [1430, 3477, 3765, 4954, 4198, 3478, 2657,

In [227]:
sent_length = 30
embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=sent_length)

In [228]:
embedded_docs

array([[   0,    0,    0, ..., 2460, 1585,  479],
       [   0,    0,    0, ..., 1524,  625, 2561],
       [   0,    0,    0, ..., 1849, 3676, 3173],
       ...,
       [   0,    0,    0, ..., 1585, 4385, 3096],
       [   0,    0,    0, ..., 1648, 4683, 4433],
       [   0,    0,    0, ..., 2460, 1585, 1892]])

In [229]:
embedding_vector_features = 40
model = Sequential()
model.add(Embedding(voc_size, embedding_vector_features, input_length=sent_length))
model.add(Bidirectional(LSTM(100)))
model.add(Dense(5, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary)

<bound method Model.summary of <keras.engine.sequential.Sequential object at 0x0000026A14C322C0>>


In [230]:
X_final = np.array(embedded_docs)
y_final = np.array(y)

In [231]:
labelencoder = LabelEncoder()
labelencoder.fit(['Positive', 'Negative', 'Neutral', 'Extremely Positive', 'Extremely Negative'])
y_final = labelencoder.transform(y_final)

In [232]:
y_final

array([3, 4, 4, ..., 3, 0, 1])

In [233]:
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

In [234]:
early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1, restore_best_weights=True)

In [235]:
y_train

array([0, 0, 1, ..., 0, 3, 0])

In [236]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=32, callbacks=[early_stopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 7: early stopping


<keras.callbacks.History at 0x26ac6f3fb50>

In [237]:
y_pred = model.predict(X_test)



In [238]:
y_pred

array([[1.02481048e-03, 8.83996069e-01, 6.09048968e-03, 9.95215215e-03,
        9.89365056e-02],
       [6.99312752e-03, 2.22911891e-02, 1.41586065e-01, 3.37465137e-01,
        4.91664469e-01],
       [7.14212239e-01, 3.75486277e-02, 1.85737088e-01, 1.72932800e-02,
        4.52088006e-02],
       ...,
       [9.17232828e-04, 9.20756578e-01, 4.92743030e-03, 7.06404215e-03,
        6.63346648e-02],
       [8.31483211e-03, 6.86542094e-02, 1.10281006e-01, 2.13202089e-01,
        5.99547863e-01],
       [6.37689605e-04, 2.44894531e-03, 2.30297595e-02, 8.69877517e-01,
        1.04006104e-01]], dtype=float32)

In [239]:
y_pred = np.where(y_pred>=0.5, 1, 0)

In [240]:
y_pred

array([[0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       ...,
       [0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0]])