In [3]:
import pandas as pd
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
tweets = pd.read_csv('/content/drive/MyDrive/Datascience/Coronavirus Tweets NLP/Corona_NLP_train.csv', engine= 'python')

In [5]:
tweets.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [6]:
tweets['Sentiment'].value_counts()

Positive              11422
Negative               9917
Neutral                7713
Extremely Positive     6624
Extremely Negative     5481
Name: Sentiment, dtype: int64

In [7]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [8]:
corpus = []
for i in range(0, len(tweets)):
  review = re.sub('[^a-zA-Z]', ' ', tweets['OriginalTweet'][i])
  review = review.lower()
  review = review.split()
  review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
  review = ' '. join(review)
  corpus.append(review)

In [9]:
corpus[0:20]

['menyrbi phil gahan chrisitv http co ifz fan pa http co xx ghgfzcc http co nlzdxno',
 'advic talk neighbour famili exchang phone number creat contact list phone number neighbour school employ chemist gp set onlin shop account poss adequ suppli regular med order',
 'coronaviru australia woolworth give elderli disabl dedic shop hour amid covid outbreak http co binca vp p',
 'food stock one empti pleas panic enough food everyon take need stay calm stay safe covid franc covid covid coronaviru confin confinementot confinementgener http co zrlg z j',
 'readi go supermarket covid outbreak paranoid food stock litterali empti coronaviru seriou thing pleas panic caus shortag coronavirusfr restezchezv stayathom confin http co usmualq n',
 'news region first confirm covid case came sullivan counti last week peopl flock area store purchas clean suppli hand sanit food toilet paper good tim dodson report http co cfxch lu',
 'cashier groceri store share insight covid prove credibl comment civic class

In [10]:
#Bag of Words
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features= 5000, stop_words = 'english')
X = cv.fit_transform(corpus).toarray()

In [11]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [12]:
X.shape

(41157, 5000)

In [13]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(tweets['Sentiment'])
tweets['Target'] = pd.DataFrame(y)

In [14]:
tweets.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,Target
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,3
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive,4
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive,4
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive,4
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative,0


In [15]:
y = tweets['Target']

In [16]:
from sklearn.model_selection import  train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [17]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB(alpha = 0.2).fit(X_train,y_train)

In [18]:
pred = model.predict(X_test)

In [19]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, pred)

In [20]:
cm

array([[593,  19, 360,  47,  75],
       [ 23, 714,  87,  46, 461],
       [360, 119, 843, 248, 416],
       [ 54,  62, 236, 888, 286],
       [115, 418, 424, 363, 975]])

In [21]:
test_score = accuracy_score(y_test, pred)
test_score

0.4874878522837707

In [22]:
train_score = model.score(X_train, y_train)
train_score

0.6264844343204252

In [23]:
from sklearn.metrics import classification_report
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.52      0.54      0.53      1094
           1       0.54      0.54      0.54      1331
           2       0.43      0.42      0.43      1986
           3       0.56      0.58      0.57      1526
           4       0.44      0.42      0.43      2295

    accuracy                           0.49      8232
   macro avg       0.50      0.50      0.50      8232
weighted avg       0.49      0.49      0.49      8232



In [25]:
#TF - iDF
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(max_features= 5000, stop_words = 'english')
x_new = tf.fit_transform(corpus).toarray()

In [26]:
x_new

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [27]:
from sklearn.model_selection import  train_test_split
X_train, X_test, y_train, y_test = train_test_split(x_new, y, test_size = 0.2, random_state = 0)

In [28]:
from sklearn.naive_bayes import MultinomialNB
model_tfidf = MultinomialNB(alpha= 0.1)
model_tfidf.fit(X_train, y_train)

pred_tfid = model_tfidf.predict(X_test)


from sklearn.metrics import confusion_matrix
print(confusion_matrix(pred_tfid,y_test))


[[ 332    3  124   15   39]
 [   4  383   36   30  159]
 [ 582   96 1023  285  428]
 [  31   30  149  640  229]
 [ 145  819  654  556 1440]]


In [29]:
test_score = accuracy_score(y_test, pred_tfid)
test_score

0.46379980563654033

In [30]:
from sklearn.metrics import classification_report
print(classification_report(y_test,pred_tfid))

              precision    recall  f1-score   support

           0       0.65      0.30      0.41      1094
           1       0.63      0.29      0.39      1331
           2       0.42      0.52      0.46      1986
           3       0.59      0.42      0.49      1526
           4       0.40      0.63      0.49      2295

    accuracy                           0.46      8232
   macro avg       0.54      0.43      0.45      8232
weighted avg       0.51      0.46      0.46      8232

