In [13]:
## Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## Word cloud for text visualization
from wordcloud import WordCloud

## Importing NLTK for Natural Language Processing
import nltk
from nltk.corpus import stopwords

## Downloading the NLTK data this notebook relies on:
##   - 'stopwords': the English stop-word list read via stopwords.words('english')
##   - 'punkt' / 'punkt_tab': tokenizer data, presumably what nltk.word_tokenize
##     loads (which of the two is needed likely depends on the NLTK version -- TODO confirm)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to C:\Users\Siddharth
[nltk_data]     Gupta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Siddharth
[nltk_data]     Gupta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Siddharth
[nltk_data]     Gupta\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [2]:
## Reading the csv file (the relative path is resolved against the notebook's
## working directory) and previewing the first three rows
df = pd.read_csv('spam.csv')
df.head(3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,


In [3]:
## Drop the three unnamed columns, which are empty in the previewed rows.
## Reassigning (rather than inplace=True) together with errors="ignore" keeps
## this cell safe to re-run: a second run finds the columns already gone and
## is a no-op instead of raising KeyError.
df = df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")
df.head(3)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [4]:
## Rename the columns: 'v1' holds the ham/spam label, 'v2' the message text
df = df.rename(columns= {'v1':'target','v2':'text'})
df.head(3)

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [5]:
## Number of distinct labels in the target column
df['target'].nunique()

2

# Data Preprocessing

In [6]:
from sklearn.preprocessing import LabelEncoder
## Encode the string labels as integers; in the preview below the rows that
## were 'ham' become 0 and the row that was 'spam' becomes 1
encoder = LabelEncoder()
df["target"] = encoder.fit_transform(df['target'])
df.head(3)

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...


In [7]:
## Count fully duplicated rows (this cell only counts; they are removed in a later cell)
df.duplicated().sum()

np.int64(403)

In [8]:
## Row count before de-duplication
len(df)

5572

In [9]:
## Drop duplicated rows, keeping the first occurrence of each, then re-check the row count
df = df.drop_duplicates(keep='first')
len(df)

5169

# Feature Engineering

In [10]:
## Imports and setup for text preprocessing

## Text Stemming
from nltk.stem.porter import PorterStemmer

## String module for special character
import string

## Instance of Porter Stemmer
ps = PorterStemmer()

In [14]:
## English stop words, held in a set for the per-token membership test in transform_text
stop_words = set(stopwords.words('english'))

## Text transformation
def transform_text(text):
    """Normalise one raw message into a space-separated string of stemmed tokens.

    Steps, applied in this order:
      1. lower-case the text;
      2. tokenize it with ``nltk.word_tokenize``;
      3. keep only tokens for which ``str.isalnum()`` is true, which discards
         punctuation-only tokens and any token containing a non-alphanumeric
         character;
      4. discard tokens found in the module-level ``stop_words`` set;
      5. stem each surviving token with the module-level Porter stemmer ``ps``.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces; an empty string when no
        token survives the filters.
    """
    tokens = nltk.word_tokenize(text.lower())

    ## A single pass does the filtering and stemming. No separate
    ## `not in string.punctuation` test is needed: a token that passes
    ## isalnum() contains no punctuation character, so it can never be a
    ## substring of string.punctuation.
    stems = [
        ps.stem(token)
        for token in tokens
        if token.isalnum() and token not in stop_words
    ]

    return " ".join(stems)

In [15]:
## Apply the preprocessing to every message and keep the result in a new column
df['transformed_text'] = df['text'].apply(transform_text)
df.head()

Unnamed: 0,target,text,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [16]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
## TF-IDF features with the vocabulary capped at 500 terms (max_features)
tfidf = TfidfVectorizer(max_features=500)

In [17]:
## Dense TF-IDF matrix (features) and encoded labels (target).
## NOTE(review): the vectorizer is fitted on the full corpus here, before the
## train/test split in a later cell, so its vocabulary and IDF weights are
## derived from the test rows too -- consider fitting on the training split only.
X = tfidf.fit_transform(df['transformed_text']).toarray()
y = df['target'].values

# Train test split

In [18]:
from sklearn.model_selection import train_test_split
## Hold out 30% of the rows for testing; random_state fixes the shuffle.
## NOTE(review): no stratify= is passed, so the class ratio of the two splits is not controlled here.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30, random_state=42)

# Model Training

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [20]:
## Candidate classifiers, each with fixed hyper-parameters.
## The decision tree and the liblinear logistic regression are seeded with the
## same random_state=42 as the ensemble models: per the scikit-learn docs the
## tree permutes features at each split and liblinear shuffles the data, so
## leaving them unseeded can make their scores vary between runs.
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5, random_state=42)
lrc = LogisticRegression(solver='liblinear', penalty='l1', random_state=42)
rfc = RandomForestClassifier(n_estimators=50, random_state=42)
abc = AdaBoostClassifier(n_estimators=50, random_state=42)
bc = BaggingClassifier(n_estimators=50, random_state=42)
gbdt = GradientBoostingClassifier(n_estimators=50, random_state=42)
xgb = XGBClassifier(n_estimators=50, random_state=42)

In [21]:
## Short display name -> estimator, in the order the models are evaluated
clfs = dict([
    ('SVC', svc),
    ('KNN', knc),
    ('NB', mnb),
    ('DT', dtc),
    ('LR', lrc),
    ('RF', rfc),
    ('Adb', abc),
    ('BGC', bc),
    ('GBDT', gbdt),
    ('XGB', xgb),
])

In [22]:
from sklearn.metrics import accuracy_score, precision_score
def train_classifier(clfs,X_train,y_train,X_test,y_test):
    """Fit one estimator on the training split and score it on the test split.

    Parameters
    ----------
    clfs : object
        A single estimator exposing ``fit(X, y)`` and ``predict(X)``
        (one model, despite the plural parameter name).
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Held-out features passed to ``predict`` and the labels the
        predictions are scored against.

    Returns
    -------
    tuple
        ``(accuracy, precision)`` as returned by ``accuracy_score`` and
        ``precision_score`` called with their default arguments.
    """
    clfs.fit(X_train, y_train)
    predictions = clfs.predict(X_test)
    return (
        accuracy_score(y_test, predictions),
        precision_score(y_test, predictions),
    )

In [23]:
## Train and score every model in `clfs`, printing each result and collecting
## the scores in dictionary order.
accuracy_scores = []
precision_scores = []
## The loop variable is deliberately NOT called `clfs`: reusing that name would
## rebind the dictionary to its last estimator, so re-running this cell (or any
## later use of `clfs` as a dict) would fail.
for name, clf in clfs.items():
    current_accuracy, current_precision = train_classifier(clf,X_train,y_train,X_test,y_test)
    print()
    print('For: ', name)
    print("Accuracy: ", current_accuracy)
    print("Precision: ", current_precision)

    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)


For:  SVC
Accuracy:  0.9696969696969697
Precision:  0.9576719576719577

For:  KNN
Accuracy:  0.9168278529980658
Precision:  0.9690721649484536

For:  NB
Accuracy:  0.9729206963249516
Precision:  0.9635416666666666

For:  DT
Accuracy:  0.9316569954867827
Precision:  0.8475609756097561

For:  LR
Accuracy:  0.9548678272082527
Precision:  0.9120879120879121





For:  RF
Accuracy:  0.9729206963249516
Precision:  0.9635416666666666

For:  Adb
Accuracy:  0.9129593810444874
Precision:  0.8102189781021898

For:  BGC
Accuracy:  0.9561573178594455
Precision:  0.8518518518518519

For:  GBDT
Accuracy:  0.9516441005802708
Precision:  0.9617834394904459

For:  XGB
Accuracy:  0.968407479045777
Precision:  0.9430051813471503
