<a href="https://colab.research.google.com/github/11AJ/ML_projects_TGC/blob/main/NLP_Covid_Tweets_Sentiment_Analysis_Classifier_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Covid Tweets Sentiment Classifier Project

1) Read the dataset "Corona_NLP_train.csv" with encode="latin1" like the code mentioned below <br>
pd.read_csv('Corona_NLP_train.csv', encode='latin1')

2) Remove/handle null values if required<br>
3) Preprocess the Covid tweets based on following parameter<br>
        
       a) Remove Punctuations
       b) Convert tweets to lower case.
       c) Word tokenize the tweets
       d) Remove stop words
       e) Apply stemming or lemmatization
       
       
4) Convert the 'Extremely Positive' and 'Extremely Negative' Sentiments to 'Positive' and 'Negative' sentiments respectively<br>
5) Apply TfIdfVectorizer and convert the tweets to an array with max_features=5000<br>
6) Split the dataset into train and test data with test size of 20%.<br>
7) Create a Multinomial NaiveBayes Classifier and apply it on train dataset and use the model to predict it on test dataset.<br>
8) Create a SVM Classifier with linear kernel and apply it on train dataset and use the model to predict it on test dataset.<br>
9) Create a RandomForest Classifier and apply it on train dataset and use the model to predict it on test dataset.<br>
10) Generate confusion matrix and classification_report for Q7, Q8 and Q9.<br>
11) Apply CountVectorizer and convert the tweets to an array with max_features=5000<br>
12) Repeat Q6 to Q10 with training data derived from array generated from Countvectorizer.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import nltk
from nltk import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
import re

In [3]:
df=pd.read_csv('/content/Corona_NLP_train.csv',encoding='latin1')

In [4]:
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [5]:
df.isnull().sum()

UserName            0
ScreenName          0
Location         8590
TweetAt             0
OriginalTweet       0
Sentiment           0
dtype: int64

In [6]:
df['Location'].shape

(41157,)

In [7]:
df['Location'].fillna('London',inplace=True)
df['Sentiment'].fillna(0,inplace=True)

In [8]:
df.isnull().sum()

UserName         0
ScreenName       0
Location         0
TweetAt          0
OriginalTweet    0
Sentiment        0
dtype: int64

In [9]:
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,London,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,London,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [10]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
sw=set(stopwords.words('english'))

In [12]:
from nltk.stem import WordNetLemmatizer
lm=WordNetLemmatizer()

In [13]:
filtered_text=[]
for i in df['OriginalTweet']:
  st=re.sub('[^a-zA-Z0-9]',' ',i)
  st=st.lower()
  st=word_tokenize(st)
  st=[i for i in st if i not in sw]
  st=[lm.lemmatize(i) for i in st]
  st=' '.join(st)
  filtered_text.append(st)

In [14]:
print(len(filtered_text))

41157


In [15]:
df['Sentiment'].value_counts()

Positive              11422
Negative               9917
Neutral                7713
Extremely Positive     6624
Extremely Negative     5481
Name: Sentiment, dtype: int64

In [16]:
df['Sentiment']=df['Sentiment'].replace({'Extremely Positive':'Positive','Extremely Negative':'Negative'})

In [17]:
df['Sentiment'].value_counts()

Positive    18046
Negative    15398
Neutral      7713
Name: Sentiment, dtype: int64

In [18]:
df['Sentiment']=df['Sentiment'].replace({'Positive':1,'Neutral':0,'Negative':-1})

In [19]:
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,0
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,1
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,1
3,3802,48754,London,16-03-2020,My food stock is not the only one which is emp...,1
4,3803,48755,London,16-03-2020,"Me, ready to go at supermarket during the #COV...",-1


In [20]:
x=filtered_text
y=df['Sentiment']
print(len(x))
print(y.shape)

41157
(41157,)


In [21]:
from sklearn.model_selection import train_test_split

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=5000)
x_cv=cv.fit_transform(x)
x_train,x_test,y_train,y_test=train_test_split(x_cv,y,test_size=.2)

In [23]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(32925, 5000)
(32925,)
(8232, 5000)
(8232,)


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer(max_features=5000)
tfidf_cv=tfidf.fit_transform(x)
a_train,a_test,b_train,b_test=train_test_split(tfidf_cv,y,test_size=.2)

In [25]:
print(a_train.shape)
print(b_train.shape)
print(a_test.shape)
print(b_test.shape)

(32925, 5000)
(32925,)
(8232, 5000)
(8232,)


**MULTINOMIAL NB**

In [26]:
from sklearn.naive_bayes import MultinomialNB
mnb=MultinomialNB()

In [27]:
mnb.fit(a_train,b_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [28]:
print(mnb.score(a_test,b_test))

0.685374149659864


In [43]:
mnb_pred=mnb.predict(a_test)

In [44]:
from sklearn.metrics import classification_report,confusion_matrix

In [45]:
print(confusion_matrix(b_test,mnb_pred))
print(classification_report(b_test,mnb_pred))

[[2402   39  669]
 [ 361  399  719]
 [ 431   42 3170]]
              precision    recall  f1-score   support

          -1       0.75      0.77      0.76      3110
           0       0.83      0.27      0.41      1479
           1       0.70      0.87      0.77      3643

    accuracy                           0.73      8232
   macro avg       0.76      0.64      0.65      8232
weighted avg       0.74      0.73      0.70      8232



Fitting x_train,y_train

In [46]:
mnb.fit(x_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [47]:
print(mnb.score(x_test,y_test))

0.6913265306122449


In [48]:
mnb_pred_x=mnb.predict(x_test)

In [49]:
print(confusion_matrix(y_test,mnb_pred_x))
print(classification_report(y_test,mnb_pred_x))

[[2255  307  613]
 [ 290  876  375]
 [ 582  374 2560]]
              precision    recall  f1-score   support

          -1       0.72      0.71      0.72      3175
           0       0.56      0.57      0.57      1541
           1       0.72      0.73      0.72      3516

    accuracy                           0.69      8232
   macro avg       0.67      0.67      0.67      8232
weighted avg       0.69      0.69      0.69      8232



**SVM**

In [50]:
from sklearn.svm import SVC

In [51]:
svm=SVC(kernel='linear')
svm.fit(a_train,b_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [52]:
print(svm.score(a_train,b_train))
print(svm.score(a_test,b_test))

0.8735307517084282
0.8041788143828961


In [53]:
svm_pred=svm.predict(a_test)

In [54]:
print(confusion_matrix(b_test,svm_pred))
print(classification_report(b_test,svm_pred))

[[2470  228  412]
 [ 220 1050  209]
 [ 349  194 3100]]
              precision    recall  f1-score   support

          -1       0.81      0.79      0.80      3110
           0       0.71      0.71      0.71      1479
           1       0.83      0.85      0.84      3643

    accuracy                           0.80      8232
   macro avg       0.79      0.79      0.79      8232
weighted avg       0.80      0.80      0.80      8232



 fiting x_train,y_train

In [55]:
svm.fit(x_train,y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [56]:
print(svm.score(x_train,y_train))
print(svm.score(x_test,y_test))

0.9201214882308276
0.7917881438289601


In [57]:
svm_pred_x=svm.predict(x_test)

In [58]:
print(confusion_matrix(y_test,svm_pred_x))
print(classification_report(y_test,svm_pred_x))

[[2564  238  373]
 [ 244 1068  229]
 [ 422  208 2886]]
              precision    recall  f1-score   support

          -1       0.79      0.81      0.80      3175
           0       0.71      0.69      0.70      1541
           1       0.83      0.82      0.82      3516

    accuracy                           0.79      8232
   macro avg       0.78      0.77      0.77      8232
weighted avg       0.79      0.79      0.79      8232

