# Different machine learning models


In [4]:
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline

# reading data

In [5]:
# Load the labelled tweet dataset from the CSV file into a DataFrame
train = pd.read_csv(filepath_or_buffer="0TwitterDataset250k.csv")

In [6]:
# Number of rows whose Label equals 0
train["Label"].eq(0).sum()

125000

# Removing Twitter Handles (@user)

In [7]:
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    Args:
        input_txt: Text to clean.
        pattern: Regular expression describing the spans to delete.

    Returns:
        ``input_txt`` with all non-overlapping matches of ``pattern`` deleted.
    """
    # Substitute with the pattern itself. Re-using each matched substring as a
    # new regex (findall + one sub per match) misreads metacharacters inside
    # the match and also strips a short match out of longer, later matches
    # (e.g. "@a" followed by "@ab" used to leave a stray "b").
    return re.sub(pattern, '', input_txt)
# remove twitter handles (@user)
# The pattern is a raw string: "\w" inside a plain literal is an invalid
# escape sequence that newer Python versions warn about.
train['tidy_tweet'] = np.vectorize(remove_pattern)(train['Tweet'], r"@[\w]*")

In [8]:
# Preview the first five cleaned tweets
train['tidy_tweet'].head(n=5)

0    is upset that he can't update his Facebook by ...
1     I dived many times for the ball. Managed to s...
2      my whole body feels itchy and like its on fire 
3     no, it's not behaving at all. i'm mad. why am...
4                                  not the whole crew 
Name: tidy_tweet, dtype: object

# remove special characters, numbers, punctuations

In [9]:
# remove special characters, numbers, punctuations
# (every character that is not a letter or '#' becomes a space)
# regex=True is required: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave the text unchanged.
train['tidy_tweet'] = train['tidy_tweet'].str.replace("[^a-zA-Z#]", " ", regex=True)

In [10]:
def _keep_long_words(text):
    """Return ``text`` keeping only whitespace-separated words longer than 3 characters."""
    return ' '.join(word for word in text.split() if len(word) > 3)

# Drop short words (length <= 3) from every tweet
train['tidy_tweet'] = train['tidy_tweet'].apply(_keep_long_words)

# tokenisation 

In [11]:
# Split each cleaned tweet on whitespace into a list of tokens
tokenized_tweet1 = train['tidy_tweet'].apply(str.split)
tokenized_tweet1.tail()

249995                                    [could, probably]
249996    [Happy, Mothers, moms, Those, that, have, Moms...
249997                [Happy, Mama, GODmothers, gift, Lmao]
249998                                       [Happy, Momma]
249999    [Outside, vodka, craberry, full, heaven, Imbei...
Name: tidy_tweet, dtype: object

# stemming

In [12]:
# Explicit import instead of a wildcard, so it is clear where the name comes
# from and no unrelated names leak into the notebook namespace.
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

# Stem every token of every tweet
tokenized_tweet1 = tokenized_tweet1.apply(lambda tokens: [stemmer.stem(token) for token in tokens])
tokenized_tweet1.tail()

249995                                     [could, probabl]
249996    [happi, mother, mom, those, that, have, mom, t...
249997                   [happi, mama, godmoth, gift, lmao]
249998                                       [happi, momma]
249999    [outsid, vodka, craberri, full, heaven, imb, g...
Name: tidy_tweet, dtype: object

In [13]:
# Re-join each tweet's stemmed tokens into one space-separated string.
# A single apply replaces the per-row Python loop, which assigned element by
# element and only worked when the index labels were exactly 0..n-1.
tokenized_tweet1 = tokenized_tweet1.apply(' '.join)

train['tidy_tweet'] = tokenized_tweet1

In [14]:
# Show the last five rows, including the stemmed tidy_tweet column
train.tail(n=5)

Unnamed: 0,Tweet,Label,tidy_tweet
249995,@JoesGaGirl um.................no. I could p...,1,could probabl
249996,Happy Mothers Day to All the moms and to Thos...,1,happi mother mom those that have mom that mean...
249997,Happy Mama's Day ! do I get a GODmothers day ...,1,happi mama godmoth gift lmao
249998,@kissmeandcstars Happy Momma's Day,1,happi momma
249999,Outside we vodka n craberry full of ice - heav...,1,outsid vodka craberri full heaven imb good rea...


# TF-IDF Features

In [15]:
# Model input (cleaned tweet text) and target (label)
X, y = train['tidy_tweet'], train['Label']


In [16]:
from sklearn.model_selection import train_test_split

# Hold out half of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
)
X_train.shape, X_val.shape

((125000,), (125000,))

In [17]:
# Display the validation labels produced by the split above
y_val

38683     0
64939     0
3954      0
120374    0
172861    1
         ..
60372     0
182219    1
43490     0
99683     0
221577    1
Name: Label, Length: 125000, dtype: int64

In [18]:
# Import the text vectorizers and configure the TF-IDF vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

tfidf = TfidfVectorizer(
    stop_words='english',
    min_df=3,
    max_df=.90,
    max_features=25000,
)

In [19]:

# Fit TF-IDF on the training texts only (fit_transform), then apply the
# already-fitted vectorizer to the validation texts (transform) - the
# vectorizer is not refit on X_val.
x_train_tfidf =  tfidf.fit_transform(X_train) 
x_test_tfidf = tfidf.transform(X_val)


In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [21]:
# Logistic regression on the TF-IDF features, scored with F1 on the validation set
logis = LogisticRegression(max_iter=1000).fit(x_train_tfidf, y_train)  # fit() returns the estimator
y_pred = logis.predict(x_test_tfidf)
logr = f1_score(y_val, y_pred)
print("f1_score", logr)

f1_score 0.7540375670980156


In [22]:
from sklearn import svm

# Create a linear SVM classifier. random_state is fixed so that the solver's
# internal random number generation cannot change the score between runs.
clf = svm.LinearSVC(random_state=42)
clf = clf.fit(x_train_tfidf,y_train)

# Predict the labels of the validation set and score them with F1
y_pred = clf.predict(x_test_tfidf)
sv=f1_score(y_val,y_pred)
print("f1_score",sv)

f1_score 0.7446332702182535


In [23]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 10 trees. random_state is fixed so the reported F1 score
# is reproducible; an unseeded forest gives a different score on every run.
rf = RandomForestClassifier(n_estimators=10, random_state=42).fit(x_train_tfidf,y_train)
prediction = rf.predict(x_test_tfidf)
rft = f1_score(y_val, prediction)
print(f"F1 score : {rft}")

F1 score : 0.7137465074100096


In [24]:
from sklearn.tree import DecisionTreeClassifier

# random_state is fixed so the fitted tree, and therefore the reported F1
# score, is the same on every run.
# Note: this rebinds `clf`, which previously held the linear SVM.
clf = DecisionTreeClassifier(random_state=42)

# Train the decision tree classifier
clf = clf.fit(x_train_tfidf,y_train)

# Predict the labels of the validation set and score them with F1
y_pred = clf.predict(x_test_tfidf)
dt = f1_score(y_val, y_pred)
print("f1_score",dt)

f1_score 0.6901286335834542


In [25]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k = 3 on the TF-IDF features
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train_tfidf, y_train)

# Score the validation-set predictions with F1
y_pred = knn.predict(x_test_tfidf)
kn = f1_score(y_val, y_pred)
print("f1_score", kn)

f1_score 0.6777734602681174


In [26]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on the TF-IDF features, scored with F1 on the validation set
model = MultinomialNB()
model.fit(x_train_tfidf, y_train)
pred = model.predict(x_test_tfidf)
nbf = f1_score(y_val, pred)
print('f1_score :', nbf)

f1_score : 0.728711719860447


In [27]:
# Summary table: one row per model with its validation F1 score.
# The model names and the score variables are listed in the same order.
d = {'Models':['Logistic Regression','SVM','Decision Tree',
               'Random Forest','KNN','Naive Bayes'],
     'F1 Score':[logr,sv,dt,rft,kn,nbf]}
# 1-based row labels derived from the number of models instead of a hard-coded range
dff = pd.DataFrame(d ,index=range(1, len(d['Models']) + 1))
print("\nF1 score of different models \n\n",dff)


F1 score of different models 

                 Models  F1 Score
1  Logistic Regression  0.754038
2                  SVM  0.744633
3        Decision Tree  0.690129
4        Random Forest  0.713747
5                  KNN  0.677773
6          Navie Bayes  0.728712
