### Libraries

In [57]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import time
import spacy
import nltk
from nltk.tokenize import word_tokenize
nlp = spacy.load("en_core_web_lg")

import warnings
warnings.filterwarnings("ignore")


In [58]:
df=pd.read_csv("spam.csv")

In [59]:
df.shape

(5572, 2)

### Unique count

In [60]:
df["Category"].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

### Balancing Dataset

In [61]:
count_0,count_1 = df["Category"].value_counts()
df_0 = df[df["Category"] == "ham"]
df_1 = df[df["Category"] == "spam"]

df_1_over= df_1.sample(count_0,replace = True)
df = pd.concat([df_0,df_1_over],axis=0)
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
6,ham,Even my brother is not like to speak with me. ...


In [62]:
df["Category"].value_counts()

ham     4825
spam    4825
Name: Category, dtype: int64

### Converting text to numeric

In [63]:
df["Category"]=df["Category"].apply(lambda x: 1 if x=="spam" else 0)
df.head(10)

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
6,0,Even my brother is not like to speak with me. ...
7,0,As per your request 'Melle Melle (Oru Minnamin...
10,0,I'm gonna be home soon and i don't want to tal...
13,0,I've been searching for the right words to tha...
14,0,I HAVE A DATE ON SUNDAY WITH WILL!!
16,0,Oh k...i'm watching here:)


In [64]:
df["Category"].value_counts()

0    4825
1    4825
Name: Category, dtype: int64

### Data Preprocessing 

In [65]:
def preprocessing(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:    
        if token.is_stop or token.is_punct or token.is_space:
            continue
        filtered_tokens.append(token.lemma_)
    return " ".join(filtered_tokens)

In [66]:
df["Message"] = df["Message"].apply(preprocessing)

### Using Bag of words

In [67]:
sep = list(map(lambda x: (x, df[x].fillna(df[x].mode()[0], inplace=True)), df.columns))
col=df.select_dtypes(include="object").columns 
cv=CountVectorizer()

for i in col:
    msg=cv.fit_transform(df["Message"])
    arr=msg.toarray()
    column_names = list(map(lambda x: f"{i}_{x}",cv.get_feature_names_out()))
    data=pd.DataFrame(arr,columns=column_names)
    df = df.drop(i,axis = 1) 
    df.reset_index(drop=True,inplace=True)
    df=pd.concat([df,data],axis=1)
df.head()

Unnamed: 0,Category,Message_00,Message_000,Message_000pes,Message_008704050406,Message_0089,Message_0121,Message_01223585236,Message_01223585334,Message_0125698789,...,Message_zhong,Message_zindgi,Message_zoe,Message_zogtorius,Message_zoom,Message_zouk,Message_zyada,Message_èn,Message_ú1,Message_〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Spliting the dataset

In [68]:
X = df.drop("Category",axis=1) 
y = df["Category"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

###  Buliding a model using  Decision Tree

In [69]:
print("-------------classification report for DecisionTree ----------------\n")
clf =  DecisionTreeClassifier()
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

-------------classification report for DecisionTree ----------------

              precision    recall  f1-score   support

           0       1.00      0.96      0.98       965
           1       0.97      1.00      0.98       965

    accuracy                           0.98      1930
   macro avg       0.98      0.98      0.98      1930
weighted avg       0.98      0.98      0.98      1930



###  Buliding a model using  Random Forest

In [70]:
print("-------------classification report for RandomForest ----------------\n")
clf = RandomForestClassifier()
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

-------------classification report for RandomForest ----------------

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       965
           1       1.00      1.00      1.00       965

    accuracy                           1.00      1930
   macro avg       1.00      1.00      1.00      1930
weighted avg       1.00      1.00      1.00      1930



###  Buliding a model using KMeans

In [71]:
print("-------------classification report for  KMeans ----------------\n")
clf = KMeans()
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

-------------classification report for  KMeans ----------------

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       965
           1       0.98      0.13      0.22       965
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0

    accuracy                           0.06      1930
   macro avg       0.12      0.02      0.03      1930
weighted avg       0.49      0.06      0.11      1930



###  Buliding a model using  KNeighbors Classifier

In [72]:
print("-------------classification report for KNeighborsClassifier ----------------\n")
clf = KNeighborsClassifier()
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

-------------classification report for KNeighborsClassifier ----------------

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.96      0.98       965

    accuracy                           0.98      1930
   macro avg       0.98      0.98      0.98      1930
weighted avg       0.98      0.98      0.98      1930



###  Buliding a model using  Linear Discriminant Analysis

In [73]:
print("-------------classification report for LinearDiscriminantAnalysis ----------------\n")
clf = LinearDiscriminantAnalysis()
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

-------------classification report for LinearDiscriminantAnalysis ----------------

              precision    recall  f1-score   support

           0       1.00      0.96      0.98       965
           1       0.96      1.00      0.98       965

    accuracy                           0.98      1930
   macro avg       0.98      0.98      0.98      1930
weighted avg       0.98      0.98      0.98      1930



###  Buliding a model using Logistic Regression

In [74]:
print("-------------classification report for LogisticRegression ----------------\n")
clf = LogisticRegression()
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

-------------classification report for LogisticRegression ----------------

              precision    recall  f1-score   support

           0       1.00      0.99      1.00       965
           1       0.99      1.00      1.00       965

    accuracy                           1.00      1930
   macro avg       1.00      1.00      1.00      1930
weighted avg       1.00      1.00      1.00      1930



### Buliding a model using SVC

In [75]:
print("-------------classification report for SVC ----------------\n")
clf = SVC()
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test,y_pred))

-------------classification report for SVC ----------------

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       965
           1       1.00      1.00      1.00       965

    accuracy                           1.00      1930
   macro avg       1.00      1.00      1.00      1930
weighted avg       1.00      1.00      1.00      1930

