In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv
/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv


In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [3]:
df = pd.read_csv("/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv",encoding = "latin-1")

test = pd.read_csv("/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv",encoding = "latin-1")

In [4]:
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [5]:
df["Sentiment"].value_counts()

Positive              11422
Negative               9917
Neutral                7713
Extremely Positive     6624
Extremely Negative     5481
Name: Sentiment, dtype: int64

# Data Cleaning

In [6]:
 df["Sentiment"] =  df["Sentiment"].map({"Positive":"Positive", "Extremely Positive":"Positive",
"Negative":"Negative","Extremely Negative":"Negative",
                                        "Neutral":"Neutral"})

In [7]:
df["Sentiment"].value_counts()

Positive    18046
Negative    15398
Neutral      7713
Name: Sentiment, dtype: int64

In [8]:
#pip install contractions

In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from string import punctuation
from nltk.stem import WordNetLemmatizer
#import contractions

In [10]:
stop_word = set(stopwords.words("english"))

In [11]:
punt = punctuation.replace("!","").replace("?","")

## Tokenizes and normalizes

In [12]:
def tweet_tokenize(tweet):
    tokenizer = TweetTokenizer(reduce_len=True)
    lemmatizer = WordNetLemmatizer()
    tokens = tokenizer.tokenize(tweet)
    
    pruned = []
    
    for token in tokens:
        if token not in punctuation and not token.startswith("http"):
            if token.startswith("@"):
                token = token.replace("@","")
                pruned.append(lemmatizer.lemmatize(token.lower()))

            else:
                
                pruned.append(lemmatizer.lemmatize(token.lower()))
            
    return pruned

In [13]:
clean_tweet = df["OriginalTweet"].apply(lambda x: tweet_tokenize(x))

In [14]:
clean_tweet.head()

0           [menyrbie, phil_gahan, chrisitv, and, and]
1    [advice, talk, to, your, neighbour, family, to...
2    [coronavirus, australia, woolworth, to, give, ...
3    [my, food, stock, is, not, the, only, one, whi...
4    [me, ready, to, go, at, supermarket, during, t...
Name: OriginalTweet, dtype: object

## Split dataset into train and test

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
X_train,X_test,y_train,y_test = train_test_split(clean_tweet,df["Sentiment"],test_size=0.1, random_state=2023, stratify=df["Sentiment"])

In [17]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((37041,), (4116,), (37041,), (4116,))

## Lets do one hot encod to target 

In [18]:
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.preprocessing.text import Tokenizer

In [19]:
# Reshape column 
y_train = y_train.to_numpy().reshape(-1,1)
y_test = y_test.to_numpy().reshape(-1,1)

In [20]:
one_hot_encode = OneHotEncoder(sparse=False)

In [21]:
y_train = one_hot_encode.fit_transform(y_train)
y_test = one_hot_encode.transform(y_test)

# padding the training data

In [22]:
X_train = X_train.to_list()
X_test = X_test.to_list()

In [23]:
tokenizer = Tokenizer()

In [24]:
tokenizer.fit_on_texts(X_train)

In [25]:
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [26]:
max_input_length = np.max([len(x) for x in X_train])

In [27]:
X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, padding= "post", maxlen = max_input_length)
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, padding= "post", maxlen = max_input_length)

## Import LSTM**

In [28]:
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Bidirectional, SpatialDropout1D

In [29]:
max_vocab = len(tokenizer.word_index) + 1
input_data = Input((max_input_length,),name= "input_layer")
embedding_output = Embedding(max_vocab,100,mask_zero = True, name = "embedding_layer")(input_data)

In [30]:
dropout = SpatialDropout1D(0.5, name = "dropout_layer")(embedding_output)

In [31]:
lstm_output = Bidirectional(LSTM(100, dropout = 0.5, recurrent_dropout = 0.2, return_state = False, 
                                name = "lstm_layer"))(dropout)

In [32]:
output_data = Dense(3, activation = "softmax")(lstm_output)

define Learning rate

In [33]:
lr = tf.keras.optimizers.schedules.ExponentialDecay(0.01, decay_steps=50, decay_rate=0.9, staircase=False)

In [34]:
def get_lr_metric(optimizer):
    def lr(y_true, y_pred):
        return optimizer._decayed_lr(tf.float32)
    return lr

In [35]:
optimizer = tf.keras.optimizers.Adam(learning_rate = lr)
lr_metric = get_lr_metric(optimizer)

In [36]:
model = tf.keras.Model(input_data, output_data)

In [37]:
model.compile(optimizer = optimizer, loss = "categorical_crossentropy", metrics = ["accuracy", lr_metric])
model.summary

<bound method Model.summary of <keras.engine.functional.Functional object at 0x7bf2715f1110>>

In [38]:
model.fit(X_train,y_train, batch_size = 512, epochs = 10, validation_data = (X_test,y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7bf27231ced0>

In [39]:
 test["Sentiment"] =  test["Sentiment"].map({"Positive":"Positive", "Extremely Positive":"Positive",
"Negative":"Negative","Extremely Negative":"Negative",
                                        "Neutral":"Neutral"})

In [40]:
# normalize and toknize to test data
tweets = test["OriginalTweet"]
tweets = tweets.apply(lambda x : tweet_tokenize(x))
tweets = tweets.to_list()
tweets = tokenizer.texts_to_sequences(tweets)
tweets = tf.keras.preprocessing.sequence.pad_sequences(tweets, padding = "post", maxlen = max_input_length)

In [41]:
# predictions 
pred = model.predict(tweets)
sentiment = one_hot_encode.inverse_transform(pred)
test["Prediction"] = sentiment

In [42]:
test.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,Prediction
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Negative,Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Positive,Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral,Negative


In [43]:
accuracy = np.sum(test["Sentiment"] == test["Prediction"]) / len(test)
print("accuracy = {}".format(accuracy))

accuracy = 0.8433385992627699


In [44]:
# Wrong prediction 
test[test["Sentiment"] != test["Prediction"]]

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,Prediction
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral,Negative
5,6,44958,Los Angeles,03-03-2020,Do you remember the last time you paid $2.99 a...,Neutral,Positive
7,8,44960,"Geneva, Switzerland",03-03-2020,"@DrTedros ""We canÂt stop #COVID19 without pro...",Neutral,Negative
15,16,44968,Bengaluru,04-03-2020,#AirSewa \r\r\n\r\r\n@flyspicejet is not provi...,Negative,Positive
34,35,44987,"Angmering, West Sussex",08-03-2020,So not due to #Brexit at all then &gt;&gt; Sup...,Neutral,Negative
...,...,...,...,...,...,...,...
3770,3771,48723,"West Virginia, USA",16-03-2020,"PSA: Stop panicking about COVID-19, you don't ...",Negative,Positive
3779,3780,48732,,16-03-2020,Stuck inside? How about getting some reading ...,Positive,Negative
3780,3781,48733,"Kansas, USA",16-03-2020,@GovLauraKelly PLEASE CLOSE ALL RETAIL that is...,Positive,Negative
3788,3789,48741,Pakistan,16-03-2020,You never eaten the pigs cat dog or food from ...,Neutral,Positive
