## Importing Libraries

In [190]:
import pandas as pd 
import numpy as np 
import re
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import string 
import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from nltk.stem.porter import *
from sklearn.svm import SVC

## Importing Dataset

In [126]:
data = pd.read_csv('/project/Tweets.csv')
data.drop(['Unnamed: 0'], axis=1, inplace=True)
data.head(5)

Unnamed: 0,id,text,label
0,1381014933706526725,I got the same vaccine shot @MariahCarey got t...,0
1,1381014934469885963,OMFGGGG TWT IS THHE DUMBEST APP https://t.co/H...,0
2,1381014935434625024,@BabsR10 My husband and I received the first P...,0
3,1381014935887622147,Another quackcine... https://t.co/ZqAKzLSZOs,0
4,1381014936256536576,Nobody is going to boycott the Yankees or the ...,0


## Data Cleaning and Processing

In [127]:
#data cleaning function to remove pattern
remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
    return input_txt

## Remove twitter handles :@

In [128]:
data['text'] = np.vectorize(remove_pattern)(data['text'], "@[/w]*")
data.head()

Unnamed: 0,id,text,label
0,1381014933706526725,I got the same vaccine shot MariahCarey got th...,0
1,1381014934469885963,OMFGGGG TWT IS THHE DUMBEST APP https://t.co/H...,0
2,1381014935434625024,BabsR10 My husband and I received the first Pf...,0
3,1381014935887622147,Another quackcine... https://t.co/ZqAKzLSZOs,0
4,1381014936256536576,Nobody is going to boycott the Yankees or the ...,0


## Remove http

In [129]:
data['text'] = np.vectorize(remove_pattern)(data['text'], r"http\S+")
data.head()                                

Unnamed: 0,id,text,label
0,1381014933706526725,I got the same vaccine shot MariahCarey got th...,0
1,1381014934469885963,OMFGGGG TWT IS THHE DUMBEST APP,0
2,1381014935434625024,BabsR10 My husband and I received the first Pf...,0
3,1381014935887622147,Another quackcine...,0
4,1381014936256536576,Nobody is going to boycott the Yankees or the ...,0


## Remove special characters, numbers, panctuations

In [130]:
data['text'] = data['text'].str.replace("[^a-zA-Z#]", " ")
data.head()

Unnamed: 0,id,text,label
0,1381014933706526725,I got the same vaccine shot MariahCarey got th...,0
1,1381014934469885963,OMFGGGG TWT IS THHE DUMBEST APP,0
2,1381014935434625024,BabsR My husband and I received the first Pf...,0
3,1381014935887622147,Another quackcine,0
4,1381014936256536576,Nobody is going to boycott the Yankees or the ...,0


## Tokenize the tweets 

In [131]:
tokenized_tweet = data['text'].apply(lambda x: x.split())
tokenized_tweet.head()

0    [I, got, the, same, vaccine, shot, MariahCarey...
1               [OMFGGGG, TWT, IS, THHE, DUMBEST, APP]
2    [BabsR, My, husband, and, I, received, the, fi...
3                                 [Another, quackcine]
4    [Nobody, is, going, to, boycott, the, Yankees,...
Name: text, dtype: object

In [132]:
stemmer = PorterStemmer()
tokenized_tweet = tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x])
tokenized_tweet.head()

0    [I, got, the, same, vaccin, shot, mariahcarey,...
1               [omfgggg, twt, IS, thhe, dumbest, app]
2    [babsr, My, husband, and, I, receiv, the, firs...
3                                    [anoth, quackcin]
4    [nobodi, is, go, to, boycott, the, yanke, or, ...
Name: text, dtype: object

## Joining tokenized tweets in original data

In [133]:
for i in range(len(tokenized_tweet)):
    tokenized_tweet[i] =  ' '.join(tokenized_tweet[i])
data['text'] = tokenized_tweet
data.head()

Unnamed: 0,id,text,label
0,1381014933706526725,I got the same vaccin shot mariahcarey got the...,0
1,1381014934469885963,omfgggg twt IS thhe dumbest app,0
2,1381014935434625024,babsr My husband and I receiv the first pfizer...,0
3,1381014935887622147,anoth quackcin,0
4,1381014936256536576,nobodi is go to boycott the yanke or the met m...,0


## Creating two columns for the length and punctuation

In [134]:
def count_punc(text):
    count = sum([i for char in text if char in string.punctuation])
    return round((count/len(text) - text.count(" ")), 3)*100


In [135]:
data['body_len'] = data['text'].apply(lambda x: len(x) - x.count(" "))
data['punc%'] = data['text'].apply(lambda x: count_punc(x))
data.head()

Unnamed: 0,id,text,label,body_len,punc%
0,1381014933706526725,I got the same vaccin shot mariahcarey got the...,0,45,-1000.0
1,1381014934469885963,omfgggg twt IS thhe dumbest app,0,26,-500.0
2,1381014935434625024,babsr My husband and I receiv the first pfizer...,0,110,-2900.0
3,1381014935887622147,anoth quackcin,0,13,-100.0
4,1381014936256536576,nobodi is go to boycott the yanke or the met m...,0,191,-5000.0


# Feature Engineering 

##  CountVectorizer

In [189]:
CountVectorizer = CountVectorizer(stop_words = 'english')
cv = CountVectorizer.fit_transform(data['text'])
X_countdf_feat = pd.concat([data['body_len'], data['punc%'], pd.DataFrame(tfidf.toarray())], axis =1)
X_countdf_feat

Unnamed: 0,body_len,punc%,0,1,2,3,4,5,6,7,...,633,634,635,636,637,638,639,640,641,642
0,45,-1000.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,26,-500.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,110,-2900.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,13,-100.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,191,-5000.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.208617,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,142,-3000.0,0.221734,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
96,93,-2000.0,0.000000,0.0,0.0,0.0,0.337056,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
97,91,-2000.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
98,97,-2000.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


# ML models

##  Logistic Regression

In [183]:
X= X_countdf_feat
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [187]:
#fit Log Regression Model
clf_1= LogisticRegression()
clf_1.fit(X_train,y_train)
clf_1.score(X_test,y_test)
y_pred_1 = clf_1.predict(X_test)
print(classification_report(y_test, y_pred_1))

              precision    recall  f1-score   support

           0       0.86      1.00      0.92        24
           1       1.00      0.56      0.71         9

    accuracy                           0.88        33
   macro avg       0.93      0.78      0.82        33
weighted avg       0.90      0.88      0.87        33



##  Support Vector Classifier

In [186]:
#fit SVM model
clf_2 = SVC()
clf_2.fit(X_train,y_train)
clf_2.score(X_test,y_test)
y_pred_2 = clf_2.predict(X_test)
print(classification_report(y_test, y_pred_2))

              precision    recall  f1-score   support

           0       0.73      1.00      0.84        24
           1       0.00      0.00      0.00         9

    accuracy                           0.73        33
   macro avg       0.36      0.50      0.42        33
weighted avg       0.53      0.73      0.61        33



  _warn_prf(average, modifier, msg_start, len(result))


In [191]:
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(CountVectorizer , f)
    
with open('sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf_1, f)