In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the two published splits (both decoded as latin1) and stack them into a
# single frame with a fresh 0..n-1 index.
train, test = (
    pd.read_csv(csv_name, encoding='latin1')
    for csv_name in ('Corona_NLP_train.csv', 'Corona_NLP_test.csv')
)
data = pd.concat([train, test], ignore_index=True)

In [3]:
# Preview the first rows of the combined frame to check which columns were loaded.
data.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [4]:
# (rows, columns) of the combined train + test frame.
data.shape

(44955, 6)

In [5]:
# Keep only the tweet text and its sentiment label; the user / location / date
# columns are not used as model features.
# `columns=` already selects the axis, so `axis=1` is unnecessary. Reassigning
# instead of `inplace=True`, together with errors='ignore', lets this cell be
# re-run after the columns are already gone without raising KeyError.
data = data.drop(
    columns=['UserName', 'ScreenName', 'Location', 'TweetAt'],
    errors='ignore',
)

In [6]:
# Show one raw tweet (row label 4) in full to see what needs cleaning.
data.loc[4, 'OriginalTweet']

"Me, ready to go at supermarket during the #COVID19 outbreak.\r\r\n\r\r\nNot because I'm paranoid, but because my food stock is litteraly empty. The #coronavirus is a serious thing, but please, don't panic. It causes shortage...\r\r\n\r\r\n#CoronavirusFrance #restezchezvous #StayAtHome #confinement https://t.co/usmuaLq72n"

In [7]:
# Count how many tweets carry each sentiment label.
data['Sentiment'].value_counts()

Sentiment
Positive              12369
Negative              10958
Neutral                8332
Extremely Positive     7223
Extremely Negative     6073
Name: count, dtype: int64

In [8]:
# Collapse the five text labels into three integer classes:
#   0 = Positive / Extremely Positive
#   1 = Negative / Extremely Negative
#   2 = Neutral
# The result is assigned back to the column rather than calling
# replace(inplace=True) on the `data['Sentiment']` selection: that chained
# in-place form only emits a (here silenced) FutureWarning on pandas 2.x and
# stops updating `data` once Copy-on-Write is the default.
# The explicit cast pins an integer dtype instead of relying on replace()'s
# implicit downcasting, and fails loudly if an unmapped label is left behind.
data['Sentiment'] = (
    data['Sentiment']
    .replace({
        'Positive': 0,
        'Extremely Positive': 0,
        'Negative': 1,
        'Extremely Negative': 1,
        'Neutral': 2,
    })
    .astype('int64')
)

In [9]:
# Re-inspect the frame after dropping the metadata columns and encoding the labels.
data.head()

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,2
1,advice Talk to your neighbours family to excha...,0
2,Coronavirus Australia: Woolworths to give elde...,0
3,My food stock is not the only one which is emp...,0
4,"Me, ready to go at supermarket during the #COV...",1


In [10]:
def cleaner(text):
    """Strip tweet noise from ``text``, replacing each match with one space.

    Removed, in this order: URLs, HTML tags, hashtags, @-mentions, digit
    runs, CR/LF runs and literal ``...`` ellipses.

    Hashtags and mentions are removed *before* digits. Stripping digits first
    breaks tokens such as ``#5G`` or ``@4chan`` into ``# G`` / ``@ chan``,
    which the hashtag and mention patterns can no longer match, leaving a
    stray symbol and a word fragment behind.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text. Whitespace is not collapsed, so runs of spaces may
        remain where content was removed.
    """
    noise_patterns = (
        r'http\S+',   # urls
        r'<.*?>',     # html tags
        r'#\w+',      # hashtags (must run before the digit pass)
        r'@\w+',      # mentions (must run before the digit pass)
        r'\d+',       # digits
        r'[\r\n]+',   # \r\r\n\r\r\n line-break runs
        r'\.\.\.',    # ellipsis
    )
    for pattern in noise_patterns:
        text = re.sub(pattern, ' ', text)

    return text

# Run the regex clean-up over every tweet and store the result back in the column.
data['OriginalTweet'] = data['OriginalTweet'].map(cleaner)

In [11]:
# Same tweet as before (row label 4), now after the regex clean-up.
data.loc[4, 'OriginalTweet']

"Me, ready to go at supermarket during the    outbreak. Not because I'm paranoid, but because my food stock is litteraly empty. The   is a serious thing, but please, don't panic. It causes shortage           "

In [12]:
def convert_lower(text):
    """Return ``text`` with all cased characters converted to lower case."""
    lowered = text.lower()
    return lowered

# Lower-case every tweet so later token comparisons are case-insensitive.
data['OriginalTweet'] = data['OriginalTweet'].map(convert_lower)

In [13]:
# Build the stopword lookup once. The original called stopwords.words('english')
# for every token, producing a new list and scanning it linearly each time;
# a frozenset gives constant-time membership tests.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def remove_stopwords(text):
    """Split ``text`` on whitespace and drop English stopwords.

    Parameters
    ----------
    text : str
        Tweet text; expected to be lower-cased already, since the comparison
        against the stopword list is an exact string match.

    Returns
    -------
    list of str
        The remaining tokens in their original order.

    Notes
    -----
    Tokens are compared as-is, so a stopword with attached punctuation
    (e.g. ``'me,'``) is kept.
    """
    return [token for token in text.split() if token not in _ENGLISH_STOPWORDS]

# Tokenise each tweet and drop stopwords; the column now holds lists of tokens.
data['OriginalTweet'] = data['OriginalTweet'].map(remove_stopwords)

In [14]:
# Same tweet (row label 4) as a token list after stopword removal.
data.loc[4, 'OriginalTweet']

['me,',
 'ready',
 'go',
 'supermarket',
 'outbreak.',
 "i'm",
 'paranoid,',
 'food',
 'stock',
 'litteraly',
 'empty.',
 'serious',
 'thing,',
 'please,',
 'panic.',
 'causes',
 'shortage']

In [15]:
ps = PorterStemmer()


def stem_words(text):
    """Return the Porter stem of every token in ``text``.

    The result is built in a local list. The previous version accumulated
    into a module-level list named ``y``, which a later cell rebinds to the
    label array, so calling the function again after that cell failed.

    Parameters
    ----------
    text : iterable of str
        Tokens of one tweet.

    Returns
    -------
    list of str
        The stemmed tokens, in the same order.
    """
    return [ps.stem(token) for token in text]

# Replace every token list with its stemmed counterpart.
data['OriginalTweet'] = data['OriginalTweet'].map(stem_words)

In [16]:
def list_to_string(text):
    """Join the tokens in ``text`` into one string separated by single spaces."""
    separator = ' '
    return separator.join(text)

# Turn the token lists back into plain strings, the input CountVectorizer expects.
data['OriginalTweet'] = data['OriginalTweet'].map(list_to_string)

In [17]:
# Bag-of-words features restricted to the 5000 most frequent terms.
cv = CountVectorizer(max_features=5000)

# Keep the document-term matrix in the sparse form fit_transform() returns.
# Calling .toarray() materialised a dense 44955 x 5000 integer array, almost
# all zeros, while train_test_split and the three estimators used below all
# accept sparse input directly.
# NOTE(review): the vocabulary is fitted on every row, including those later
# held out for testing.
X = cv.fit_transform(data['OriginalTweet'])

In [18]:
# (documents, vocabulary terms) of the bag-of-words matrix.
X.shape

(44955, 5000)

In [19]:
# Labels: the last column of the frame (the encoded Sentiment) as a NumPy array.
label_column = data.iloc[:, -1]
y = label_column.to_numpy()

In [20]:
# Length of the label vector; it should match the number of rows in X.
y.shape

(44955,)

In [21]:
# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so the reported metrics can be reproduced on a
# fresh kernel; stratify=y keeps the class proportions the same in both parts,
# since the three classes are not equally frequent.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [22]:
# Shapes of the four split parts, in the order (X_train, X_test, y_train, y_test).
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((35964, 5000), (8991, 5000), (35964,), (8991,))

In [23]:
# The three models to compare, all with default hyper-parameters.
# The tree and the forest draw random numbers while fitting, so they are
# seeded to make their scores repeatable between runs.
clf4 = SVC()
clf5 = DecisionTreeClassifier(random_state=42)
clf6 = RandomForestClassifier(random_state=42)

In [None]:
# Fit all three classifiers on the same training split
# (clf4 = SVC, clf5 = decision tree, clf6 = random forest).
# NOTE(review): this cell has no execution count in the saved notebook, so no
# fitted-model output is recorded here.
clf4.fit(X_train, y_train)
clf5.fit(X_train, y_train)
clf6.fit(X_train, y_train)

In [None]:
# Predict the held-out rows with each fitted model, in the order SVC, tree, forest.
y_pred4, y_pred5, y_pred6 = (
    fitted.predict(X_test) for fitted in (clf4, clf5, clf6)
)


In [None]:
# Print accuracy, confusion matrix and per-class report for every model, using
# one loop instead of three copy-pasted sections.
for heading, predictions in (
    ('Support Vector Classifier:', y_pred4),
    ('Decision Tree:', y_pred5),
    ('Random Forest:', y_pred6),
):
    print(heading)
    print('Accuracy Score: ', accuracy_score(y_test, predictions))
    print('Confusion Matrix:\n ', confusion_matrix(y_test, predictions))
    print('Classification Report:\n ', classification_report(y_test, predictions))
