# Random Forest

In [1]:
#import library packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
import warnings
warnings.filterwarnings("ignore") 

In [3]:
#Load given dataset
df = pd.read_csv('corona-Copy1.csv')


In [4]:
df

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,"Me, ready to go at supermarket during the #COV...",Extremely Negative
2,Was at the supermarket today. Didn't buy toile...,Neutral
3,All month there hasn't been crowding in the su...,Neutral
4,"Due to the Covid-19 situation, we have increas...",Extremely Positive
...,...,...
2412,"Oil prices at 2002 already, are we back almost...",Neutral
2413,Why is Government not transmitting benefits of...,Positive
2414,"""As long as we're not seeing markets I would c...",Extremely Positive
2415,Will school fees be refunded if the #coronavir...,Neutral


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2417 entries, 0 to 2416
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   OriginalTweet  2417 non-null   object
 1   Sentiment      2417 non-null   object
dtypes: object(2)
memory usage: 37.9+ KB


In [6]:

# Data cleaning and preprocessing

import re
import nltk
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [7]:

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [8]:
corpus=[]
for i in range(0, len(df)):
    review=re.sub('[^a-zA-Z0-9]',' ', str(df['OriginalTweet'][i]))
    review=review.lower()
    review=review.split()

    review=[ps.stem(word) for word in review if not word in stopwords.words('english')]
    review=' '.join(review)
    corpus.append(review)

In [9]:
corpus

['menyrbi phil gahan chrisitv http co ifz9fan2pa http co xx6ghgfzcc http co i2nlzdxno8',
 'readi go supermarket covid19 outbreak paranoid food stock litterali empti coronaviru seriou thing pleas panic caus shortag coronavirusfr restezchezv stayathom confin http co usmualq72n',
 'supermarket today buy toilet paper rebel toiletpapercrisi covid 19 http co evxkqlidaz',
 'month crowd supermarket restaur howev reduc hour close mall mean everyon use entranc depend singl supermarket manila lockdown covid2019 philippin http co hxws9lanf9',
 'due covid 19 situat increas demand food product wait time may longer onlin order particularli beef share freezer pack thank patienc time',
 'horningsea care commun let look less capabl villag ensur stay healthi bring shop door help onlin shop self isol symptom expos somebodi http co lsgrxxhjhh',
 'eyeonthearct 16mar20 russia consum surveil watchdog report case high arctic man travel iran covid 19 101 observ http co 4wnrrk9okc http co ld05k5eyn',
 'amazon gl

In [10]:
df

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,"Me, ready to go at supermarket during the #COV...",Extremely Negative
2,Was at the supermarket today. Didn't buy toile...,Neutral
3,All month there hasn't been crowding in the su...,Neutral
4,"Due to the Covid-19 situation, we have increas...",Extremely Positive
...,...,...
2412,"Oil prices at 2002 already, are we back almost...",Neutral
2413,Why is Government not transmitting benefits of...,Positive
2414,"""As long as we're not seeing markets I would c...",Extremely Positive
2415,Will school fees be refunded if the #coronavir...,Neutral


In [11]:
# Creating the TFIDF model
from sklearn.feature_extraction.text import TfidfVectorizer
tv=TfidfVectorizer(max_features=2000)
X=tv.fit_transform(corpus).toarray()
y=np.array(df['Sentiment'])

In [12]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [13]:
X.shape

(2417, 2000)

In [14]:
# Since data is imbalanced
# Trying over sampling

from imblearn.over_sampling import RandomOverSampler

rs=RandomOverSampler()
X,y=rs.fit_resample(X,y)

X.shape,y.shape

((3850, 2000), (3850,))

In [21]:

# Train Test Split

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1, random_state=42, stratify=y)

In [22]:
from sklearn.ensemble import RandomForestClassifier
RFC = RandomForestClassifier(random_state=42)
RFC.fit(X_train,y_train)

In [23]:
predict = RFC.predict(X_test)

In [24]:
from sklearn.metrics import accuracy_score
print('Accuracy of Random Forest Classifier',accuracy_score(y_test,predict)*100)

Accuracy of Random Forest Classifier 85.97402597402596


In [43]:
from sklearn.metrics import confusion_matrix
print('Confusuion matrix of Random Forest Classifier\n',confusion_matrix(y_test,predict))

Confusuion matrix of Random Forest Classifier
 [[54 10  4  9  0]
 [10 58  0  9  0]
 [ 2  1 69  5  0]
 [ 3  6  0 68  0]
 [ 0  0  0  0 77]]


In [44]:
from sklearn.metrics import classification_report
print('Classification report of Random Forest Classifier\n\n',classification_report(y_test,predict))

Classification report of Random Forest Classifier

                     precision    recall  f1-score   support

Extremely Negative       0.78      0.70      0.74        77
Extremely Positive       0.77      0.75      0.76        77
          Negative       0.95      0.90      0.92        77
           Neutral       0.75      0.88      0.81        77
          Positive       1.00      1.00      1.00        77

          accuracy                           0.85       385
         macro avg       0.85      0.85      0.85       385
      weighted avg       0.85      0.85      0.85       385



In [19]:
import joblib
joblib.dump(tv,'tv2.pkl')

['tv2.pkl']

In [20]:
import joblib
joblib.dump(RFC,'rfc1.pkl')

['rfc1.pkl']