# Different machine learning models


In [1]:
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline

# Reading data

In [2]:
# Load the tweet dataset from CSV; later cells read its 'Tweet' and 'Label' columns.
train  = pd.read_csv("2TwitterDataset250k.csv")

In [3]:
# Number of rows whose Label equals 0.
train["Label"].eq(0).sum()

125000

# Removing Twitter Handles (@user)

In [4]:
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    Args:
        input_txt: Text to clean.
        pattern: Regular expression (string or compiled pattern) describing
            the spans to delete.

    Returns:
        A copy of ``input_txt`` in which each non-overlapping match of
        ``pattern`` has been replaced by the empty string.
    """
    # Substitute with the pattern itself instead of collecting the matched
    # strings and re-using each one as a new regex. Matched text is data, not
    # a pattern: a short match such as "@" or "@ab" would otherwise also eat
    # into unrelated or longer tokens, and a match containing regex
    # metacharacters would be misinterpreted or fail to compile.
    return re.sub(pattern, '', input_txt)
# Remove Twitter handles (@user) from every tweet and store the result in a
# new 'tidy_tweet' column.
# The pattern is a raw string: "\w" inside a normal string literal is an
# invalid escape sequence, which recent Python versions report as a warning.
train['tidy_tweet'] = np.vectorize(remove_pattern)(train['Tweet'], r"@[\w]*")

In [5]:
# Preview the first rows of 'tidy_tweet' after handle removal.
train['tidy_tweet'].head()

0                   OMH.  I'm so sorry to hear that   
1                              ...i resized it though 
2    gotta wait till tomorrow to see the new moon t...
3    Dumb Fox announcers were saying Jeff was OK an...
4    All these BBQ's means I'm basically living on ...
Name: tidy_tweet, dtype: object

# Removing special characters, numbers, and punctuation

In [6]:
# Replace every character that is not an ASCII letter or '#' with a space
# (this drops digits, punctuation and other special characters).
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would turn this step into a silent no-op.
train['tidy_tweet'] = train['tidy_tweet'].str.replace(r"[^a-zA-Z#]", " ", regex=True)

In [7]:
# Keep only the words that are longer than three characters and re-join
# them with single spaces.
train['tidy_tweet'] = train['tidy_tweet'].apply(
    lambda tweet: ' '.join(word for word in tweet.split() if len(word) > 3)
)

# Tokenisation

In [8]:
# Split each cleaned tweet on whitespace into a list of tokens.
tokenized_tweet1 = train['tidy_tweet'].map(lambda tweet: tweet.split())
tokenized_tweet1.tail(5)

249995    [seriously, more, free, time, when, worked, of...
249996        [Maybe, bothered, revising, then, that, help]
249997                        [good, know, feeling, better]
249998    [#andyhurleyday, #andyhurleyday, #andyhurleyda...
249999    [Morning, Kids, slept, Praise, Went, sleep, Ee...
Name: tidy_tweet, dtype: object

# Stemming

In [9]:
# Import only the class that is used; a wildcard import would pull every
# public name of the module into the notebook namespace.
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

# Reduce every token of every tweet to its Porter stem.
tokenized_tweet1 = tokenized_tweet1.apply(
    lambda tokens: [stemmer.stem(token) for token in tokens]
)
tokenized_tweet1.tail()

249995    [serious, more, free, time, when, work, offic,...
249996              [mayb, bother, revis, then, that, help]
249997                           [good, know, feel, better]
249998    [#andyhurleyday, #andyhurleyday, #andyhurleyda...
249999    [morn, kid, slept, prais, went, sleep, eeeek, ...
Name: tidy_tweet, dtype: object

In [10]:
# Re-assemble each token list into one space-separated string.
# A single element-wise apply replaces a Python loop of per-row
# `series[i] = ...` assignments, which is slow on 250k rows and only correct
# when the index labels are exactly 0..n-1.
# NOTE: like any in-place reassignment, this cell should be run once per
# kernel session; re-running it would join the characters of the strings.
tokenized_tweet1 = tokenized_tweet1.apply(' '.join)

train['tidy_tweet'] = tokenized_tweet1

In [11]:
# Show the last rows of the frame, including the stemmed 'tidy_tweet' column.
train.tail()

Unnamed: 0,Tweet,Label,tidy_tweet
249995,@MizFitOnline seriously! I had more free time ...,1,serious more free time when work offic week ju...
249996,Maybe if no one bothered revising then that'll...,1,mayb bother revis then that help
249997,@missbossy good to know you're feeling better..,1,good know feel better
249998,@trohman it is #andyhurleyday #andyhurleyday ...,1,#andyhurleyday #andyhurleyday #andyhurleyday #...
249999,Morning. Kids slept in Til 7:45am!! Praise G...,1,morn kid slept prais went sleep eeeek suuuuper...


# TF-IDF Features

In [12]:
# Model input is the cleaned tweet text; the target is the 'Label' column.
X, y = train['tidy_tweet'], train['Label']


In [13]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for validation; the fixed seed keeps the split
# reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
X_train.shape, X_val.shape

((175000,), (75000,))

In [14]:
# Display the validation labels produced by the split.
y_val

38683     0
64939     0
3954      0
120374    0
172861    1
         ..
246904    1
213577    1
175132    1
75216     0
145446    1
Name: Label, Length: 75000, dtype: int64

In [15]:
# TF-IDF vectorizer configuration.
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

tfidf = TfidfVectorizer(
    max_df=.90,            # skip terms that occur in more than 90% of documents
    min_df=3,              # skip terms that occur in fewer than 3 documents
    stop_words='english',  # drop scikit-learn's built-in English stop words
    max_features=25000,    # cap the vocabulary size
)

In [16]:

# Fit the TF-IDF vectorizer on the training split only, then reuse that
# fitted vectorizer to transform the validation split.
# Note: `x_test_tfidf` holds the features of the validation split (X_val).
x_train_tfidf =  tfidf.fit_transform(X_train) 
x_test_tfidf = tfidf.transform(X_val)


In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [27]:
# Logistic regression baseline on the TF-IDF features, scored with F1 on the
# validation split.
logis = LogisticRegression(max_iter=1000).fit(x_train_tfidf, y_train)
y_pred = logis.predict(x_test_tfidf)
logr = f1_score(y_val, y_pred)
print("f1_score", logr)

f1_score 0.767049220811292


In [28]:
from sklearn import svm

# Linear support-vector classifier on the TF-IDF features.
# random_state is fixed so the fit is reproducible when the solver shuffles
# the data (dual coordinate descent).
clf = svm.LinearSVC(random_state=42)
clf = clf.fit(x_train_tfidf,y_train)

# Predict the validation split and score it with F1.
y_pred = clf.predict(x_test_tfidf)
sv=f1_score(y_val,y_pred)
print("f1_score",sv)

f1_score 0.759738568899073


In [24]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 10 trees on the TF-IDF features.
# The forest is randomised (bootstrap samples, feature sub-sampling), so the
# seed is fixed to make the reported F1 score reproducible across runs.
rf = RandomForestClassifier(n_estimators=10, random_state=42).fit(x_train_tfidf,y_train)
prediction = rf.predict(x_test_tfidf)
rft = f1_score(y_val, prediction)
print(f"F1 score : {rft}")

F1 score : 0.7283235425796036


In [22]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree on the TF-IDF features.
# Candidate features are permuted at each split, so the seed is fixed to
# make the reported F1 score reproducible across runs.
clf = DecisionTreeClassifier(random_state=42)

# Train the decision tree classifier.
clf = clf.fit(x_train_tfidf,y_train)

# Predict the validation split and score it with F1.
y_pred = clf.predict(x_test_tfidf)
dt = f1_score(y_val, y_pred)
print("f1_score",dt)

f1_score 0.6967355432146218


In [19]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier (k = 3) on the TF-IDF features, scored
# with F1 on the validation split.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train_tfidf, y_train)

y_pred = knn.predict(x_test_tfidf)
kn = f1_score(y_val, y_pred)
print("f1_score", kn)

f1_score 0.5774291185372783


In [21]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes on the TF-IDF features, scored with F1 on the
# validation split.
model = MultinomialNB()
model.fit(x_train_tfidf, y_train)

pred = model.predict(x_test_tfidf)
nbf = f1_score(y_val, pred)
print('f1_score :', nbf)

f1_score : 0.7448817198500257


In [29]:
# Summary table: one row per model with the F1 score computed in its cell.
# The two lists must stay in the same order (label i <-> score i).
d = {
    'Models': ['Logistic Regression', 'SVM', 'Decision Tree',
               'Random Forest', 'KNN', 'Naive Bayes'],
    'F1 Score': [logr, sv, dt, rft, kn, nbf],
}
# 1-based index derived from the number of models instead of a hard-coded 7.
dff = pd.DataFrame(d, index=range(1, len(d['Models']) + 1))
print("\nF1 score of different models \n\n",dff)


F1 score of different models 

                 Models  F1 Score
1  Logistic Regression  0.767049
2                  SVM  0.759739
3        Decision Tree  0.696736
4        Random Forest  0.728324
5                  KNN  0.577429
6          Navie Bayes  0.744882
