# Logistic regression

In [1]:
#import library packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
import warnings
warnings.filterwarnings("ignore") 

In [3]:
#Load given dataset
df = pd.read_csv('corona.csv')


In [4]:
df.head()

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,"Me, ready to go at supermarket during the #COV...",Extremely Negative
2,Was at the supermarket today. Didn't buy toile...,Neutral
3,All month there hasn't been crowding in the su...,Neutral
4,"Due to the Covid-19 situation, we have increas...",Extremely Positive
...,...,...
2412,"Oil prices at 2002 already, are we back almost...",Neutral
2413,Why is Government not transmitting benefits of...,Positive
2414,"""As long as we're not seeing markets I would c...",Extremely Positive
2415,Will school fees be refunded if the #coronavir...,Neutral


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2417 entries, 0 to 2416
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   OriginalTweet  2417 non-null   object
 1   Sentiment      2417 non-null   object
dtypes: object(2)
memory usage: 37.9+ KB


In [7]:

# Data cleaning and preprocessing

import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SPIRO11\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [9]:
corpus=[]
for i in range(0, len(df)):
  review=re.sub('[^a-zA-Z]0-9',' ', str(df['OriginalTweet'][i]))
  review=review.lower()
  review=review.split()

  review=[ps.stem(word) for word in review if not word in stopwords.words('english')]
  review=' '.join(review)
  corpus.append(review)

In [10]:
corpus

['@menyrbi @phil_gahan @chrisitv https://t.co/ifz9fan2pa https://t.co/xx6ghgfzcc https://t.co/i2nlzdxno8',
 "me, readi go supermarket #covid19 outbreak. i'm paranoid, food stock litterali empty. #coronaviru seriou thing, please, panic. caus shortage... #coronavirusfr #restezchezv #stayathom #confin https://t.co/usmualq72n",
 'supermarket today. buy toilet paper. #rebel #toiletpapercrisi #covid_19 https://t.co/evxkqlidaz',
 'month crowd supermarket restaurants, howev reduc hour close mall mean everyon use entranc depend singl supermarket. #manila #lockdown #covid2019 #philippin https://t.co/hxws9lanf9',
 'due covid-19 situation, increas demand food products. wait time may longer onlin orders, particularli beef share freezer packs. thank patienc time.',
 '#horningsea care community. let\x92 look less capabl villag ensur stay healthy. bring shop doors, help onlin shop self isol symptom expos somebodi has. https://t.co/lsgrxxhjhh',
 '@eyeonthearct 16mar20 russia consum surveil watchdog rep

In [11]:
df

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,"Me, ready to go at supermarket during the #COV...",Extremely Negative
2,Was at the supermarket today. Didn't buy toile...,Neutral
3,All month there hasn't been crowding in the su...,Neutral
4,"Due to the Covid-19 situation, we have increas...",Extremely Positive
...,...,...
2412,"Oil prices at 2002 already, are we back almost...",Neutral
2413,Why is Government not transmitting benefits of...,Positive
2414,"""As long as we're not seeing markets I would c...",Extremely Positive
2415,Will school fees be refunded if the #coronavir...,Neutral


In [12]:
# Creating the TFIDF model
from sklearn.feature_extraction.text import TfidfVectorizer
tv=TfidfVectorizer(max_features=2500,ngram_range=(1,2))
X=tv.fit_transform(corpus).toarray()
y=np.array(df['Sentiment'])

In [13]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [14]:
X.shape

(2417, 2500)

In [16]:
# Since data is imbalanced
# Trying over sampling

from imblearn.over_sampling import RandomOverSampler

rs=RandomOverSampler()
X,y=rs.fit_resample(X,y)

X.shape,y.shape

((3850, 2500), (3850,))

In [17]:

# Train Test Split

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

In [18]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train,y_train)

LogisticRegression()

In [19]:
predict = lr.predict(X_test)

In [20]:
from sklearn.metrics import accuracy_score
print('Accuracy of Logistic Regression',accuracy_score(y_test,predict)*100)

Accuracy of Logistic Regression 75.58441558441558


In [21]:
from sklearn.metrics import confusion_matrix
print('Confusuion matrix of Logistic Regression\n',confusion_matrix(y_test,predict))

Confusuion matrix of Logistic Regression
 [[105  12  23  14   5]
 [ 19 103   3  14   7]
 [ 19   6  96  18   2]
 [ 11  15  12 120   2]
 [  3   1   1   1 158]]


In [22]:
from sklearn.metrics import classification_report
print('Classification report of Logistic Regression\n\n',classification_report(y_test,predict))

Classification report of Logistic Regression

                     precision    recall  f1-score   support

Extremely Negative       0.67      0.66      0.66       159
Extremely Positive       0.75      0.71      0.73       146
          Negative       0.71      0.68      0.70       141
           Neutral       0.72      0.75      0.73       160
          Positive       0.91      0.96      0.93       164

          accuracy                           0.76       770
         macro avg       0.75      0.75      0.75       770
      weighted avg       0.75      0.76      0.75       770

