In [1]:
#Importing all necessary libraries
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string

In [2]:
# making dataframe of traininig csv file
train_df=pd.read_csv("Corona_NLP_train.csv")
train_df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [3]:
# making dataframe of testing csv file
test_df=pd.read_csv("Corona_NLP_test.csv")
test_df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


In [4]:
# taking necessary columns from dataset
train_df=train_df[['OriginalTweet', 'Sentiment']]
test_df=test_df[['OriginalTweet', 'Sentiment']]

In [5]:
# checking for Null values in training data 
train_df.isnull().sum()

OriginalTweet    0
Sentiment        0
dtype: int64

In [6]:
# checking for Null values in testing data 
test_df.isnull().sum()

OriginalTweet    0
Sentiment        0
dtype: int64

In [7]:
# checking shape of data
print(train_df.shape)
print(test_df.shape)

(41157, 2)
(3798, 2)


In [8]:
# initializing lemmatization
lemmatizer = WordNetLemmatizer()

In [9]:
# storing punctuations in the variable 
puncs_ = string.punctuation.replace('@','')
puncs = puncs_.replace('#','')
puncs

# defining a function for preprocessing all the tweets data 
def textClean(text):
    # coversion of all words to lower case
    lower = [char.lower() for char in text if char not in puncs]
    lower = ''.join(lower)
    lower = ' '.join(lower.split())
    
    # deleting all @mentions and #tags from tweets
    for char in lower:
        if lower.find('@')==-1 and lower.find('#')==-1: # break loop once @ and # is over
            break
        if (char=='@' or char=='#'):
            try:
                char_index = lower.index(char)
            except ValueError:
                break
                
            del_word = ''
            while char not in string.whitespace:
                del_word = del_word+lower[char_index]
                char_index = char_index + 1
                try:
                    char = lower[char_index] 
                except IndexError:
                    char = ' '
                except:
                    print("Something else went wrong")
            lower = lower.replace(del_word,'',1)
    lower = [char for char in lower if char not in string.punctuation and char not in string.digits]
    lower = ''.join(lower)
    
#   converting words to tokens
    tokens = word_tokenize(lower)
    
#   deleting http words
    nohttp = [word for word in tokens if word[0:4]!='http']
    
#   eleminating stop words from tweets
    nostop = [word for word in nohttp if word not in stopwords.words('english')]
    
#   applying lemmatization 
    lemmatized = [lemmatizer.lemmatize(word) for word in nostop]
    return lemmatized 

In [10]:
# printing the first five tweets preprocessed words
tweet_words = train_df.OriginalTweet.head().apply(textClean)
for i in tweet_words:
    print(i)

[]
['advice', 'talk', 'neighbour', 'family', 'exchange', 'phone', 'number', 'create', 'contact', 'list', 'phone', 'number', 'neighbour', 'school', 'employer', 'chemist', 'gp', 'set', 'online', 'shopping', 'account', 'po', 'adequate', 'supply', 'regular', 'med', 'order']
['coronavirus', 'australia', 'woolworth', 'give', 'elderly', 'disabled', 'dedicated', 'shopping', 'hour', 'amid', 'covid', 'outbreak']
['food', 'stock', 'one', 'empty', 'please', 'dont', 'panic', 'enough', 'food', 'everyone', 'take', 'need', 'stay', 'calm', 'stay', 'safe']
['ready', 'go', 'supermarket', 'outbreak', 'im', 'paranoid', 'food', 'stock', 'litteraly', 'empty', 'serious', 'thing', 'please', 'dont', 'panic', 'cause', 'shortage']


In [11]:
# converting text to vector
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer=textClean)
x = vectorizer.fit_transform(train_df['OriginalTweet'])
y = train_df['Sentiment']
print(x.shape,y.shape)

(41157, 34673) (41157,)


In [27]:
# converting data into training and testing
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=0)

In [28]:
# initializing naive bayes model
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(x_train, y_train)

MultinomialNB()

In [29]:
# checking the accuracy and report on training data 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
train_pred = model.predict(x_train)
print(classification_report(y_train, train_pred))

print("\nConfusion Matrix: \n", confusion_matrix(y_train, train_pred))
print("\nAccuracy: \n", accuracy_score(y_train, train_pred))

                    precision    recall  f1-score   support

Extremely Negative       0.86      0.65      0.74      4387
Extremely Positive       0.81      0.69      0.75      5293
          Negative       0.68      0.77      0.72      7931
           Neutral       0.92      0.53      0.67      6187
          Positive       0.61      0.85      0.71      9127

          accuracy                           0.72     32925
         macro avg       0.78      0.70      0.72     32925
      weighted avg       0.75      0.72      0.72     32925


Confusion Matrix: 
 [[2835   30 1086   35  401]
 [  22 3660  210   27 1374]
 [ 306  159 6070  115 1281]
 [  56  147  900 3281 1803]
 [  71  512  686  123 7735]]

Accuracy: 
 0.7162034927866363


In [30]:
# checking the accuracy and report on testing data 
test_pred = model.predict(x_test)
print(classification_report(y_test, test_pred))

print("\nConfusion Matrix: \n", confusion_matrix(y_test, test_pred))
print("\nAccuracy: \n", accuracy_score(y_test, test_pred))

                    precision    recall  f1-score   support

Extremely Negative       0.60      0.39      0.47      1094
Extremely Positive       0.57      0.43      0.49      1331
          Negative       0.44      0.53      0.48      1986
           Neutral       0.69      0.35      0.46      1526
          Positive       0.41      0.61      0.49      2295

          accuracy                           0.48      8232
         macro avg       0.54      0.46      0.48      8232
      weighted avg       0.52      0.48      0.48      8232


Confusion Matrix: 
 [[ 426    7  515   21  125]
 [   8  572   80   20  651]
 [ 204   70 1048   83  581]
 [  25   40  304  532  625]
 [  45  321  416  120 1393]]

Accuracy: 
 0.48238581146744414


In [31]:
# converting text to vector
x_ = vectorizer.transform(test_df['OriginalTweet'])
y_=test_df['Sentiment']
print(x_.shape,y_.shape)

(3798, 34673) (3798,)


In [32]:
y_pred = model.predict(x_)
print(classification_report(y_, y_pred))

print("\nConfusion Matrix: \n", confusion_matrix(y_, y_pred))
print("\nFinal Testing Accuracy: \n", accuracy_score(y_, y_pred))

                    precision    recall  f1-score   support

Extremely Negative       0.61      0.32      0.42       592
Extremely Positive       0.64      0.36      0.46       599
          Negative       0.44      0.50      0.47      1041
           Neutral       0.68      0.22      0.33       619
          Positive       0.37      0.70      0.49       947

          accuracy                           0.45      3798
         macro avg       0.55      0.42      0.43      3798
      weighted avg       0.52      0.45      0.44      3798


Confusion Matrix: 
 [[188   4 308   5  87]
 [  3 218  38   3 337]
 [ 85  22 522  40 372]
 [  8  10 158 134 309]
 [ 24  87 160  16 660]]

Final Testing Accuracy: 
 0.45339652448657186
