Basic Imports

In [3]:
%pip install wordcloud

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.




In [4]:
# importing neccessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

#import wordcloud for text viz.
from wordcloud import WordCloud

# importing NLTK for nlp
import nltk
from nltk.corpus import stopwords

# downloading NLTK data
nltk.download('stopwords') #stopword data
nltk.download('punkt') # tokenizer data

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prati\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prati\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
df = pd.read_csv("../experiments/spam.csv")

df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'], inplace=True) #dropping columns that are not required

df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
df.rename(columns= {'v1':'target','v2':'text'},inplace=True)
df.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Data Preprocessing

In [12]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df['target'] = encoder.fit_transform(df['target'])

df.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
#check and remove duplicates
df.duplicated().sum()

403

In [14]:
df.drop_duplicates(keep='first')
len(df)

5572

Feature Engineering

In [15]:
# Importing porter stemmer for stemming words
from nltk.stem.porter import PorterStemmer

# string Module for handling special characters
import string

ps = PorterStemmer()


In [22]:
# Lowercase the sentences and text preprocessing
def transform_text(text):
    text = text.lower()

    # tokenizing the text
    text = nltk.word_tokenize(text)

    # remove special char.
    y=[]
    for i in text:
        if i.isalnum():
            y.append(i)

    # remove stop words and punctuations
    text = y[:]
    y.clear()
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)

    # stemming 
    text = y[:]
    y.clear()
    for i in text:
        y.append(ps.stem(i))
    return " ".join(y)




In [23]:
transform_text("Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...")

'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

In [24]:
df['transformed_text'] = df['text'].apply(transform_text)
df.head()

Unnamed: 0,target,text,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [27]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
tfid = TfidfVectorizer(max_features = 500)

In [31]:
x = tfid.fit_transform(df['transformed_text']).toarray()
y = df['target'].values

Train Test Split

In [32]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size  = 0.20, random_state = 2)

Model Training

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [36]:
svc = SVC(kernel = 'sigmoid', gamma = 1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth = 5)
lrc = LogisticRegression(solver= 'liblinear' , penalty = 'l1')
rfc = RandomForestClassifier(n_estimators = 50, random_state = 2)
abc = AdaBoostClassifier(n_estimators = 50, random_state = 2)
bc = BaggingClassifier(n_estimators = 50, random_state = 2)
etc = ExtraTreesClassifier(n_estimators = 50, random_state = 2)
gbdt = GradientBoostingClassifier(n_estimators = 50, random_state = 2)
xgb = XGBClassifier(n_estimators = 50, random_state = 2)

In [41]:
clfs = {
    'SVC' : svc,
    'KNN' : knc,
    'NB' : mnb,
    'DT' : dtc,
    'LR' : lrc,
    'RF' : rfc,
    'Adaboost' : abc,
    'Bgc' : bc,
    'ETC' : etc,
    'GBDT' : gbdt,
    'xgb' : xgb
}

Model Evaluation

In [42]:
from sklearn.metrics import accuracy_score, precision_score
def train_classifier(clfs, x_train, y_train, x_test, y_test):
    clfs.fit(x_train,y_train)
    y_pred = clfs.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    return accuracy, precision

In [None]:
accuracy_scores = []
precision_scores = []
for name, clfs in clfs.items():
    current_accuracy, current_precision = train_classifier(clfs, x_train, y_train, x_test, y_test)
    print()
    print("For: ",name)
    print("Accuracy: ", current_accuracy)
    print("Precision: ", current_precision)

    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)



For:  SVC
Accuracy:  0.9632286995515695
Precision:  0.9606299212598425

For:  KNN
Accuracy:  0.9264573991031391
Precision:  0.9871794871794872

For:  NB
Accuracy:  0.9659192825112107
Precision:  0.9838709677419355

For:  DT
Accuracy:  0.9183856502242153
Precision:  0.845360824742268

For:  LR
Accuracy:  0.9596412556053812
Precision:  0.959349593495935

For:  RF
Accuracy:  0.9713004484304932
Precision:  0.9846153846153847

For:  Adaboost
Accuracy:  0.9112107623318386
Precision:  0.8933333333333333

For:  Bgc
Accuracy:  0.9623318385650225
Precision:  0.9142857142857143

For:  ETC
Accuracy:  0.9695067264573991
Precision:  0.9696969696969697

For:  GBDT
Accuracy:  0.9399103139013453
Precision:  0.941747572815534

For:  xgb
Accuracy:  0.9632286995515695
Precision:  0.968
