## Required Imports

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Importing WordCloud for text visualization
from wordcloud import WordCloud

# Importing NLTK for natural language processing
import nltk
from nltk.corpus import stopwords

# Downloading NLTK data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [6]:
# Load the SMS dataset from its CSV file into a DataFrame
df = pd.read_csv('../Experiments/spam.csv')

In [7]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [8]:
# Dropping the unnamed columns.
# The frame is rebound instead of mutated with inplace=True, and
# errors='ignore' skips labels that are already gone, so this cell can be
# re-run without raising KeyError.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')


In [9]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
# Give the v1 / v2 columns descriptive names
column_names = {'v1': 'target', 'v2': 'text'}
df = df.rename(columns=column_names)
df.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Data Preprocessing


In [11]:
from sklearn.preprocessing import LabelEncoder

# Replace the string labels in `target` with integer codes
encoder = LabelEncoder()
encoded_target = encoder.fit_transform(df['target'])
df['target'] = encoded_target

df.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
# Checking duplicate values
print(df.duplicated().sum())

403


In [13]:
df.shape

(5572, 2)

In [14]:
# Remove duplicate rows; the first occurrence of each row is retained
# (keep='first' is the pandas default)
df = df.drop_duplicates()


In [15]:
df.shape

(5169, 2)

## Feature Engineering

In [16]:
# Importing the Porter Stemmer for text stemming
from nltk.stem.porter import PorterStemmer

# Importing the string module for handling special characters
import string

# Single Porter stemmer instance, reused by the text preprocessing function
ps = PorterStemmer()

In [17]:
# English stop words, built once as a set. Calling stopwords.words('english')
# for every token produces a new list each time and forces a linear scan for
# each membership test, which makes preprocessing needlessly slow.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def transform_text(text):
    """Normalize a raw message into a space-separated string of stemmed tokens.

    Steps: lowercase the text, tokenize it with NLTK, keep only alphanumeric
    tokens, drop English stop words, then apply the Porter stemmer.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The processed tokens joined by single spaces; an empty string when
        no token survives the filtering.
    """
    # Lowercase, then tokenize using NLTK
    tokens = nltk.word_tokenize(text.lower())

    # isalnum() already rejects every punctuation-only token, so no separate
    # string.punctuation check is needed after this filter.
    kept_tokens = [
        token for token in tokens
        if token.isalnum() and token not in _ENGLISH_STOPWORDS
    ]

    # Stemming using the Porter stemmer, then joining back into one string
    return " ".join(ps.stem(token) for token in kept_tokens)


    


In [18]:
print(transform_text('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'))

go jurong point crazi avail bugi n great world la e buffet cine got amor wat


In [19]:
# Store the preprocessed version of every message in a new column
transformed_messages = df['text'].apply(transform_text)
df['transformed_text'] = transformed_messages
df.head()

Unnamed: 0,target,text,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [53]:
# Vectorization
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
# TF-IDF vectorizer with the vocabulary capped at 500 features
tfid = TfidfVectorizer(max_features=500)

In [54]:
# NOTE(review): the vectorizer is fitted on the whole dataset here, before the
# train/test split that follows, so the learned vocabulary and IDF weights
# also reflect the held-out messages. Consider splitting first and fitting
# the vectorizer on the training portion only.
# Dense TF-IDF feature matrix built from the transformed messages.
X=tfid.fit_transform(df['transformed_text']).toarray()
# Target labels as a NumPy array.
y=df['target'].values

In [22]:
print(X)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [23]:
y

array([0, 0, 1, ..., 0, 0, 0], shape=(5169,))

## Train Test Split

In [24]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; the seed is fixed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [25]:
X_train.shape

(4135, 500)

In [26]:
X_test.shape  , y_test.shape

((1034, 500), (1034,))

## Model Training

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [28]:
# Kernel, neighbour-based, probabilistic, tree and linear models
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver='liblinear', penalty='l1')

# Ensemble models, all with 50 estimators and a fixed seed
rfc = RandomForestClassifier(n_estimators=50, random_state=1)
abc = AdaBoostClassifier(n_estimators=50, random_state=1)
bc = BaggingClassifier(n_estimators=50, random_state=1)
etc = ExtraTreesClassifier(n_estimators=50, random_state=1)
gbdt = GradientBoostingClassifier(n_estimators=50, random_state=1)
xgb = XGBClassifier(n_estimators=50, random_state=1)




In [29]:
# Display name -> classifier instance, in the order they are evaluated
clfs = dict(
    SVC=svc,
    KNN=knc,
    NB=mnb,
    DT=dtc,
    LR=lrc,
    RF=rfc,
    Adaboost=abc,
    Bgc=bc,
    ETC=etc,
    GBDT=gbdt,
    xgb=xgb,
)

## Model Evaluation

In [30]:
from sklearn.metrics import accuracy_score,precision_score

def train_classifier(clfs,X_train,y_train,X_test,y_test):
    """Fit a classifier on the training split and score it on the test split.

    Despite its plural name, ``clfs`` is a single estimator object that
    provides ``fit`` and ``predict``.

    Parameters
    ----------
    clfs : estimator
        Classifier to train; it is fitted in place.
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Test features passed to ``predict`` and the labels to score against.

    Returns
    -------
    tuple
        ``(accuracy, precision)`` of the test-split predictions.
    """
    clfs.fit(X_train, y_train)
    predictions = clfs.predict(X_test)

    return (
        accuracy_score(y_test, predictions),
        precision_score(y_test, predictions),
    )


In [31]:
# Scores are collected in the same order as the entries of `clfs`
accuracy_scores = []
precision_scores = []

# The loop variable must not be named `clfs`: that would rebind the dictionary
# name to a single classifier, so any later use of `clfs` (or re-running this
# cell) would fail.
for name, clf in clfs.items():
    current_accuracy, current_precision = train_classifier(clf, X_train, y_train, X_test, y_test)
    print()

    print("For : ", name)
    print("Accuracy : ", current_accuracy)
    print("Precision : ", current_precision)

    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)


For :  SVC
Accuracy :  0.9593810444874274
Precision :  0.926605504587156

For :  KNN
Accuracy :  0.9245647969052224
Precision :  1.0

For :  NB
Accuracy :  0.9671179883945842
Precision :  0.954954954954955

For :  DT
Accuracy :  0.9197292069632496
Precision :  0.8095238095238095

For :  LR
Accuracy :  0.9535783365570599
Precision :  0.8990825688073395

For :  RF
Accuracy :  0.9642166344294004
Precision :  0.9083333333333333

For :  Adaboost
Accuracy :  0.9042553191489362
Precision :  0.75

For :  Bgc
Accuracy :  0.9429400386847195
Precision :  0.8015873015873016

For :  ETC
Accuracy :  0.9671179883945842
Precision :  0.9173553719008265

For :  GBDT
Accuracy :  0.9381044487427466
Precision :  0.927710843373494

For :  xgb
Accuracy :  0.9555125725338491
Precision :  0.8938053097345132


0,1,2
,C,1.0
,kernel,'sigmoid'
,degree,3
,gamma,1.0
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


# Testing with a Message


In [70]:
# Fresh sigmoid-kernel SVC used for the single-message check
sv = SVC(kernel='sigmoid', gamma=1.0)

In [71]:
# Train the classifier on the training split
sv.fit(X_train, y_train)

0,1,2
,C,1.0
,kernel,'sigmoid'
,degree,3
,gamma,1.0
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [72]:
sv.predict(X_test)

array([0, 0, 0, ..., 0, 0, 1], shape=(1034,))

In [None]:
# Run the raw message through the same preprocessing that produced the
# training text. The vectorizer was fitted on stemmed, stop-word-filtered
# tokens, so raw words such as "Congratulations" may otherwise fail to match
# their vocabulary entries.
msg = [transform_text("Congratulations! You have won a free prize. Click now")]
m = tfid.transform(msg).toarray()
sv.predict(m)

array([1])