### Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import time
import spacy
import nltk
from nltk.tokenize import word_tokenize
nlp = spacy.load("en_core_web_lg")

import warnings
warnings.filterwarnings("ignore")


In [2]:
df=pd.read_csv("spam.csv")

In [3]:
df.shape

(5572, 2)

### Unique count

In [4]:
df["Category"].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

### Balancing Dataset

In [5]:
count_0,count_1 = df["Category"].value_counts()
df_0 = df[df["Category"] == "ham"]
df_1 = df[df["Category"] == "spam"]

df_1_over= df_1.sample(count_0,replace = True)
df = pd.concat([df_0,df_1_over],axis=0)
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
6,ham,Even my brother is not like to speak with me. ...


In [28]:
df["Category"].value_counts()

ham     4825
spam    4825
Name: Category, dtype: int64

### Converting text to numeric

In [29]:
df["Category"]=df["Category"].apply(lambda x: 1 if x=="spam" else 0)
df.head(10)

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
6,0,Even my brother is not like to speak with me. ...
7,0,As per your request 'Melle Melle (Oru Minnamin...
10,0,I'm gonna be home soon and i don't want to tal...
13,0,I've been searching for the right words to tha...
14,0,I HAVE A DATE ON SUNDAY WITH WILL!!
16,0,Oh k...i'm watching here:)


In [30]:
df["Category"].value_counts()

0    4825
1    4825
Name: Category, dtype: int64

### Data Preprocessing 

In [31]:
def preprocessing(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:    
        if token.is_stop or token.is_punct or token.is_space:
            continue
        filtered_tokens.append(token.lemma_)
    return " ".join(filtered_tokens)

In [32]:
df["Message"] = df["Message"].apply(preprocessing)

In [33]:
df

Unnamed: 0,Category,Message
0,0,jurong point crazy available bugis n great wor...
1,0,ok lar joke wif u oni
3,0,U dun early hor u c
4,0,nah think go usf live
6,0,brother like speak treat like aids patent
...,...,...
4348,1,U 447801259231 secret admirer look 2 contact u...
415,1,100 date service cal;l 09064012103 box334sk38ch
3998,1,bore housewife Chat n date 0871750.77.11 BT na...
3501,1,dorothy@kiefer.com Bank Granite issue Strong B...


### Using TF-IDF of words

In [34]:
sep = list(map(lambda x: (x, df[x].fillna(df[x].mode()[0], inplace=True)), df.columns))
col=df.select_dtypes(include="object").columns 

cv=TfidfVectorizer()
for i in col:
    cols=cv.fit_transform(df[i])
    arr=cols.toarray() 
    column_name = list(map(lambda x : f"{i}-{x}",cv.get_feature_names_out()))
    data=pd.DataFrame(arr,columns = column_name)
    df = df.drop(i,axis = 1)
    df.reset_index(inplace = True)
    df=pd.concat([df,data],axis=1)
df.head()

Unnamed: 0,index,Category,Message-00,Message-000,Message-000pes,Message-008704050406,Message-0089,Message-0121,Message-01223585236,Message-01223585334,...,Message-zhong,Message-zindgi,Message-zoe,Message-zogtorius,Message-zoom,Message-zouk,Message-zyada,Message-èn,Message-ú1,Message-〨ud
0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Spliting the dataset

In [35]:
X = df.drop("Category",axis=1) 
y = df["Category"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

### Decision Tree

In [36]:
print("-------------classification report for DecisionTree ----------------\n")
clf =  DecisionTreeClassifier()
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

-------------classification report for DecisionTree ----------------

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       965
           1       0.98      1.00      0.99       965

    accuracy                           0.99      1930
   macro avg       0.99      0.99      0.99      1930
weighted avg       0.99      0.99      0.99      1930



### Random Forest

In [37]:
print("-------------classification report for RandomForest ----------------\n")
clf = RandomForestClassifier()
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

-------------classification report for RandomForest ----------------

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       965
           1       1.00      1.00      1.00       965

    accuracy                           1.00      1930
   macro avg       1.00      1.00      1.00      1930
weighted avg       1.00      1.00      1.00      1930



### KMeans

In [38]:
print("-------------classification report for  KMeans ----------------\n")
clf = KMeans()
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

-------------classification report for  KMeans ----------------

              precision    recall  f1-score   support

           0       0.52      0.14      0.22       965
           1       0.47      0.12      0.19       965
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0

    accuracy                           0.13      1930
   macro avg       0.12      0.03      0.05      1930
weighted avg       0.49      0.13      0.20      1930



### KNeighbors Classifier

In [39]:
print("-------------classification report for KNeighborsClassifier ----------------\n")
clf = KNeighborsClassifier()
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

-------------classification report for KNeighborsClassifier ----------------

              precision    recall  f1-score   support

           0       0.89      0.63      0.74       965
           1       0.71      0.92      0.80       965

    accuracy                           0.77      1930
   macro avg       0.80      0.77      0.77      1930
weighted avg       0.80      0.77      0.77      1930



### Linear Discriminant Analysis

In [40]:
print("-------------classification report for LinearDiscriminantAnalysis ----------------\n")
clf = LinearDiscriminantAnalysis()
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

-------------classification report for LinearDiscriminantAnalysis ----------------

              precision    recall  f1-score   support

           0       1.00      0.99      1.00       965
           1       0.99      1.00      1.00       965

    accuracy                           1.00      1930
   macro avg       1.00      1.00      1.00      1930
weighted avg       1.00      1.00      1.00      1930



### Logistic Regression

In [41]:
print("-------------classification report for LogisticRegression ----------------\n")
clf = LogisticRegression()
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

-------------classification report for LogisticRegression ----------------

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       965
           1       0.99      0.98      0.99       965

    accuracy                           0.99      1930
   macro avg       0.99      0.99      0.99      1930
weighted avg       0.99      0.99      0.99      1930



### SVC

In [42]:
print("-------------classification report for SVC ----------------\n")
clf = SVC()
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

-------------classification report for SVC ----------------

              precision    recall  f1-score   support

           0       0.51      0.70      0.59       965
           1       0.51      0.31      0.39       965

    accuracy                           0.51      1930
   macro avg       0.51      0.51      0.49      1930
weighted avg       0.51      0.51      0.49      1930

