<a href="https://colab.research.google.com/github/ShivaKumar011/Sentiment-analysis-covid-tweets/blob/main/ML_Major_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk import word_tokenize
from nltk.corpus import stopwords

###**1)Read the dataset with encoding parameter set to ‘latin1’**

In [None]:
df = pd.read_csv('/content/Corona_NLP (2).csv', encoding='latin1')
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [None]:
df.shape

(41157, 6)

###**2)Remove handle null values (if any)**

In [None]:
df.isnull().sum()

UserName            0
ScreenName          0
Location         8590
TweetAt             0
OriginalTweet       0
Sentiment           0
dtype: int64

In [None]:
df.dropna(subset=['Location'], inplace=True)

In [None]:
df.isnull()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
5,False,False,False,False,False,False
6,False,False,False,False,False,False
...,...,...,...,...,...,...
41147,False,False,False,False,False,False
41149,False,False,False,False,False,False
41150,False,False,False,False,False,False
41152,False,False,False,False,False,False


In [None]:
df.isnull().sum()

UserName         0
ScreenName       0
Location         0
TweetAt          0
OriginalTweet    0
Sentiment        0
dtype: int64

###**3. Preprocess the Covid tweets based on the following parameter:**
**a) Tokenizing words
b) Convert words to lower case
c) Removing Punctuations
d) Removing Stop words
e) Stemming or lemmatizing the words**

In [None]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import re

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report


In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
lb = WordNetLemmatizer()
sw = stopwords.words('english')
print(sw)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
from nltk import corpus
corpus = []
for i in df['OriginalTweet']:
    txt = re.sub('[^A-Za-z0-9]',' ',i)    #Removing punctuations
    txt = txt.lower()                     #Converting to lower case
    txt = word_tokenize(txt)              #Tokenizing the words
    txt = [i for i in txt if i not in sw] #Removing stop words
    txt = [lb.lemmatize(i) for i in txt]  #Lemmatization
    txt = " ".join(txt)
    corpus.append(txt)


In [None]:
print(corpus[:20])

['menyrbie phil gahan chrisitv http co ifz9fan2pa http co xx6ghgfzcc http co i2nlzdxno8', 'advice talk neighbour family exchange phone number create contact list phone number neighbour school employer chemist gp set online shopping account po adequate supply regular med order', 'coronavirus australia woolworth give elderly disabled dedicated shopping hour amid covid 19 outbreak http co binca9vp8p', 'news region first confirmed covid 19 case came sullivan county last week people flocked area store purchase cleaning supply hand sanitizer food toilet paper good tim dodson report http co cfxch7a2lu', 'cashier grocery store sharing insight covid 19 prove credibility commented civics class know talking http co iefdnehgdo', 'supermarket today buy toilet paper rebel toiletpapercrisis covid 19 http co evxkqlidaz', 'due covid 19 retail store classroom atlanta open walk business class next two week beginning monday march 16 continue process online phone order normal thank understanding http co kw

###**4)Convert the 'Extremely Positive' and 'Extremely Negative' Sentiments to 'Positive' and'Negative' sentiments respectively**

In [None]:
df['Sentiment'].value_counts()

Positive    14383
Negative    12012
Neutral      6172
Name: Sentiment, dtype: int64

In [None]:
df['Sentiment'] = df['Sentiment'].replace({'Extremely Positive':'Positive','Extremely Negative':'Negative'})
df['Sentiment'].value_counts()

Positive    14383
Negative    12012
Neutral      6172
Name: Sentiment, dtype: int64

###**5)Transform the words into vectors using TF-IDF Vectorizer**

In [None]:
tf = TfidfVectorizer(max_features=5000)
tf_res = tf.fit_transform(corpus).toarray()

In [None]:
print(tf_res[:10])

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [None]:
x = tf_res
y = df['Sentiment']
print(len(x))
print(y.shape)

32567
(32567,)


###**6)Split data into training and test data**

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(24425, 5000)
(8142, 5000)
(24425,)
(8142,)


###**7)Apply the following models on the training dataset and generate the predicted value for the test dataset**
**a) Multinomial Naïve Bayes Classification
b) RandomForest Classification
c) KNN Classification**

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
#a) Multinomial Naïve Bayes Classification
m1 = MultinomialNB()
m1.fit(x_train,y_train)

MultinomialNB()

In [None]:
print('Training Score',m1.score(x_train,y_train))
print('Testing Score',m1.score(x_test,y_test))

Training Score 0.7254042988741044
Testing Score 0.6584377302873987


In [None]:
#b) RandomForest Classification
m2 =RandomForestClassifier(n_estimators=20)
m2.fit(x_train, y_train)

RandomForestClassifier(n_estimators=20)

In [None]:
print('Training Score',m2.score(x_train,y_train))
print('Testing Score',m2.score(x_test,y_test))

Training Score 0.998157625383828
Testing Score 0.6864406779661016


In [None]:
#c) KNN Classification
m3 = KNeighborsClassifier(n_neighbors=20)
m3.fit(x_train,y_train)

KNeighborsClassifier(n_neighbors=20)

In [None]:
print('Training Score',m3.score(x_train,y_train))
print('Testing Score',m3.score(x_test,y_test))

Training Score 0.6419242579324462
Testing Score 0.5795873249815771


###**8)Predict the Sentiment for test data**

In [None]:
#8)Predict the Sentiment for test data
ypred_m1 = m1.predict(x_test)
print(ypred_m1)

['Positive' 'Negative' 'Negative' ... 'Positive' 'Negative' 'Negative']


In [None]:
ypred_m2 = m2.predict(x_test)
print(ypred_m2)

['Negative' 'Positive' 'Negative' ... 'Positive' 'Negative' 'Positive']


In [None]:
ypred_m3 = m3.predict(x_test)
print(ypred_m3)

['Positive' 'Positive' 'Negative' ... 'Positive' 'Negative' 'Negative']


###**9)Compute Confusion matrix and classification report for each of these models**

In [None]:
#a) Multinomial Naïve Bayes Classification
cm_m1 = confusion_matrix(y_test,ypred_m1)
print(cm_m1)
print(classification_report(y_test,ypred_m1))

[[2035   56  925]
 [ 389  261  890]
 [ 481   40 3065]]
              precision    recall  f1-score   support

    Negative       0.70      0.67      0.69      3016
     Neutral       0.73      0.17      0.28      1540
    Positive       0.63      0.85      0.72      3586

    accuracy                           0.66      8142
   macro avg       0.69      0.57      0.56      8142
weighted avg       0.67      0.66      0.63      8142



In [None]:
#b) RandomForest Classification
cm_m2 = confusion_matrix(y_test,ypred_m2)
print(cm_m2)
print(classification_report(y_test,ypred_m2))

[[2067  278  671]
 [ 335  824  381]
 [ 607  281 2698]]
              precision    recall  f1-score   support

    Negative       0.69      0.69      0.69      3016
     Neutral       0.60      0.54      0.56      1540
    Positive       0.72      0.75      0.74      3586

    accuracy                           0.69      8142
   macro avg       0.67      0.66      0.66      8142
weighted avg       0.68      0.69      0.68      8142



In [None]:
#c) KNN Classification
cm_m3 = confusion_matrix(y_test,ypred_m3)
print(cm_m3)
print(classification_report(y_test,ypred_m3))

[[1742  614  660]
 [ 306  869  365]
 [ 653  825 2108]]
              precision    recall  f1-score   support

    Negative       0.64      0.58      0.61      3016
     Neutral       0.38      0.56      0.45      1540
    Positive       0.67      0.59      0.63      3586

    accuracy                           0.58      8142
   macro avg       0.56      0.58      0.56      8142
weighted avg       0.61      0.58      0.59      8142



###**10)Report the model with the best accuracy.**
###*Out of Multinomial Naive Bayes Classification,RandomForest Classification ,KNN Classification may be RandomForest Classification is the best accuracy model*