In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import explained_variance_score, r2_score, mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


In [2]:
train_df = pd.read_csv('COVID-19_Sentiments.csv')
test_df = pd.read_csv('COVID-19_test.csv')
df = pd.concat([train_df, test_df], ignore_index=True)

In [3]:
df.head()

Unnamed: 0,Text_Id,Text,Date,Location,Sentiments
0,1241032866567356417,RT @theskindoctor13: Shaheen Bagh is still on....,Fri Mar 20 16:04:27 +0000 2020,"Uttar Pradesh, India",0.0
1,1241032867699765249,RT @theskindoctor13: Shaheen Bagh is still on....,Fri Mar 20 16:04:27 +0000 2020,Mumbai,0.0
2,1241032875102703616,"RT @SmokingSkills_: Daughter of an IAS, son of...",Fri Mar 20 16:04:29 +0000 2020,"Jodhpur, India",0.35
3,1241032877099237379,RT @narendramodi: The young actors have someth...,Fri Mar 20 16:04:29 +0000 2020,"Gurugram, Bharat",0.125
4,1241032870405128192,RT @theskindoctor13: Shaheen Bagh is still on....,Fri Mar 20 16:04:28 +0000 2020,"New Delhi, India",0.0


In [4]:
df.tail()

Unnamed: 0,Text_Id,Text,Date,Location,Sentiments
678421,1270215754621661184,We are student not a testing kit.\nIf we had c...,Tue Jun 09 04:46:50 +0000 2020,"Bidar, India",0.0
678422,1270215743376961541,RT @sharmanagendar: Amazing views of Twitter C...,Tue Jun 09 04:46:47 +0000 2020,"delhi , india",0.5
678423,1270215758413361153,RT @SitaramYechury: When the crude prices fell...,Tue Jun 09 04:46:50 +0000 2020,"New Delhi, India",-0.7
678424,1270215743842344962,"RT @vimoh: Amit Shah: On Corona, we may have f...",Tue Jun 09 04:46:47 +0000 2020,India,0.0
678425,1270215747113857024,RT @ANI: Government of India issues fresh guid...,Tue Jun 09 04:46:48 +0000 2020,"Uttar Pradesh, India",0.15


In [5]:
df['text_len'] = df['Text'].apply(lambda x: len(x.split()))

In [6]:
df.head()

Unnamed: 0,Text_Id,Text,Date,Location,Sentiments,text_len
0,1241032866567356417,RT @theskindoctor13: Shaheen Bagh is still on....,Fri Mar 20 16:04:27 +0000 2020,"Uttar Pradesh, India",0.0,23
1,1241032867699765249,RT @theskindoctor13: Shaheen Bagh is still on....,Fri Mar 20 16:04:27 +0000 2020,Mumbai,0.0,23
2,1241032875102703616,"RT @SmokingSkills_: Daughter of an IAS, son of...",Fri Mar 20 16:04:29 +0000 2020,"Jodhpur, India",0.35,25
3,1241032877099237379,RT @narendramodi: The young actors have someth...,Fri Mar 20 16:04:29 +0000 2020,"Gurugram, Bharat",0.125,21
4,1241032870405128192,RT @theskindoctor13: Shaheen Bagh is still on....,Fri Mar 20 16:04:28 +0000 2020,"New Delhi, India",0.0,23


In [7]:
# Modifying Date
df['Date'] = pd.to_datetime(df['Date']).dt.date

In [8]:
df['month'] = df['Date'].apply(lambda x: x.month)
df['days'] = df['Date'].apply(lambda x: x.day)

In [9]:
# Creating new_sentiments column out of polarities in sentiments
New_Sentiments_ls = []

for senti in df['Sentiments']:
    if senti < 0:
        New_Sentiments_ls.append(-1)
    elif senti > 0:
        New_Sentiments_ls.append(1)
    else:
        New_Sentiments_ls.append(0)

Sentiments = pd.DataFrame(New_Sentiments_ls, columns=['New_Sentiments'])

df = pd.concat([df, Sentiments], axis=1)

In [10]:
df.head()

Unnamed: 0,Text_Id,Text,Date,Location,Sentiments,text_len,month,days,New_Sentiments
0,1241032866567356417,RT @theskindoctor13: Shaheen Bagh is still on....,2020-03-20,"Uttar Pradesh, India",0.0,23,3,20,0
1,1241032867699765249,RT @theskindoctor13: Shaheen Bagh is still on....,2020-03-20,Mumbai,0.0,23,3,20,0
2,1241032875102703616,"RT @SmokingSkills_: Daughter of an IAS, son of...",2020-03-20,"Jodhpur, India",0.35,25,3,20,1
3,1241032877099237379,RT @narendramodi: The young actors have someth...,2020-03-20,"Gurugram, Bharat",0.125,21,3,20,1
4,1241032870405128192,RT @theskindoctor13: Shaheen Bagh is still on....,2020-03-20,"New Delhi, India",0.0,23,3,20,0


In [11]:
X = df.drop(['Text', 'Date', 'Location', 'Sentiments'], axis=1)
y = df.New_Sentiments

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, shuffle=False)

In [13]:
classifier=DecisionTreeClassifier()
classifier.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [14]:
y_pred=classifier.predict(X_test)
classifier.score(X_train, y_train)*100

100.0

In [15]:
classifier.score(X_test, y_test)*100

100.0

In [16]:
explained_variance_score(y_test, y_pred)

1.0

In [17]:
from sklearn.ensemble import RandomForestClassifier
ran_classifier = RandomForestClassifier()
ran_classifier.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [18]:
y_pred=classifier.predict(X_test)
classifier.score(X_train, y_train)*100

100.0

In [19]:
classifier.score(X_test, y_test)*100

100.0

In [20]:
explained_variance_score(y_test, y_pred)

1.0