In [2]:
import numpy as numpy
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

#Importing 
from wordcloud import WordCloud

#Natural language processing
import nltk
from nltk.corpus import stopwords

#Downloading NLTK data
nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shero\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shero\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
df = pd.read_csv('spam.csv',encoding='latin-1')


# Display the first few rows of the DataFrame
print(df.head())


     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [6]:
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)
df.rename(columns={'v1': 'label', 'v2': 'message'}, inplace=True)

In [7]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Data preprocessing

In [8]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

df['label'] = le.fit_transform(df['label'])
df.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
#Check for duplicates
df.duplicated().sum()

403

In [11]:
df = df.drop_duplicates(keep='first')

In [12]:
len(df)

5169

Feature Engineering

In [13]:
from nltk.stem.porter import PorterStemmer

import string

ps = PorterStemmer()

In [14]:
#Function to Lower case the transformation and text preporcessing

def transform_text(text):
    text = text.lower()  # Convert to lowercase
    text = nltk.word_tokenize(text)  # Tokenize the text
    y = []
    for i in text:
        if i.isalnum():  # Check if the token is alphanumeric
            y.append(i)  # Append to the list if it is
    text = y[:]  # Create a copy of the list
    y.clear()  # Clear the original list
    
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)
    text = y[:]
    y.clear()
    for i in text:
        y.append(ps.stem(i))  # Apply stemming
    return " ".join(y)  # Join the list into a string

In [15]:
#checking the function if it works
print(transform_text('This is a sample message!'))  # Example usage of the function

sampl messag


In [16]:
#Applying the transformation to the 'message' column
df['transformed_text'] = df['message'].apply(transform_text)
df.head()

Unnamed: 0,label,message,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [20]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
tfidf = TfidfVectorizer(max_features=500)

In [23]:
x = tfidf.fit_transform(df['transformed_text']).toarray()
y= df['label'].values

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

Train Test Split

In [25]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y,test_size=0.2, random_state=2)

Model Training

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [28]:
svc = SVC(kernel='sigmoid', gamma=1.0)
knn = KNeighborsClassifier()
nb = MultinomialNB()
dt = DecisionTreeClassifier(max_depth=5)
lr = LogisticRegression(solver='liblinear',penalty="l1")
rf = RandomForestClassifier(n_estimators=50, random_state=2)
ab = AdaBoostClassifier(n_estimators=50, random_state=2)
bg = BaggingClassifier(n_estimators=50, random_state=2)
et = ExtraTreesClassifier(n_estimators=50, random_state=2)
gb = GradientBoostingClassifier(n_estimators=50, random_state=2)
xgb = XGBClassifier(n_estimators=50, random_state=2)

In [30]:
clfs = {
    "SVC": svc,
    "KNN": knn,         
    "NB": nb,
    "DT": dt,
    "LR": lr,
    "RF": rf,
    "Adaboost": ab,
    "Bgc": bg,
    "ETC": et,
    "GBC": gb,
    "XGB": xgb
    
}
from sklearn.metrics import accuracy_score, precision_score

Model Evaluation

In [31]:
def train_classifier(clf, x_train, y_train, x_test, y_test):
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    return accuracy, precision

In [32]:
accuracy_scores = []
precision_scores = []

for name, clf in clfs.items():
    print(f"Training {name}...")
    accuracy, precision = train_classifier(clf, x_train, y_train, x_test, y_test)
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    print(f"{name} - Accuracy: {accuracy}, Precision: {precision}\n")

Training SVC...
SVC - Accuracy: 0.9661508704061895, Precision: 0.9327731092436975

Training KNN...
KNN - Accuracy: 0.9274661508704062, Precision: 1.0

Training NB...
NB - Accuracy: 0.9709864603481625, Precision: 0.9655172413793104

Training DT...
DT - Accuracy: 0.9361702127659575, Precision: 0.9

Training LR...
LR - Accuracy: 0.9622823984526112, Precision: 0.9541284403669725

Training RF...
RF - Accuracy: 0.971953578336557, Precision: 0.943089430894309

Training Adaboost...




Adaboost - Accuracy: 0.9613152804642167, Precision: 0.9375

Training Bgc...
Bgc - Accuracy: 0.965183752417795, Precision: 0.9180327868852459

Training ETC...
ETC - Accuracy: 0.9729206963249516, Precision: 0.9296875

Training GBC...
GBC - Accuracy: 0.9506769825918762, Precision: 0.9393939393939394

Training XGB...
XGB - Accuracy: 0.9709864603481625, Precision: 0.9576271186440678

