# LOAD DATA AND SET UP

In [3]:
import pandas as pd
import numpy as np

In [4]:
data = pd.read_csv('/content/sample_data/spam_ham_dataset.csv')

In [5]:
data

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


# PREPROCESSING DATA

In [67]:
from sklearn.feature_extraction.text import CountVectorizer
import re
from sklearn.feature_extraction.text import TfidfVectorizer

In [59]:
data.dropna(inplace=True)
data.drop_duplicates(inplace=True)

In [64]:
def preprocess_text(text):

    text = text.lower()

    text = re.sub(r'\W', ' ', text)

    return text

In [65]:
data['cleaned_text'] = data['text'].apply(preprocess_text)

In [70]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['cleaned_text'])

y = data['label'].apply(lambda x: 1 if x == 'spam' else 0)

In [90]:
data_sampled = data.sample(frac=0.1, random_state=42)

# FEATURE SELECTION

In [22]:
from sklearn.feature_selection import RFECV, SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

In [86]:
X = vectorizer.fit_transform(data_sampled['text']).toarray()
y = data_sampled['label'].values

selector = SelectKBest(chi2, k=1000)
X_reduced = selector.fit_transform(X, y)

model = LogisticRegression(max_iter=1000)

rfecv = RFECV(estimator=model,
              step=1,
              cv=StratifiedKFold(5),
              scoring='accuracy')

rfecv.fit(X_reduced, y)

X_selected = rfecv.transform(X_reduced)

print(f"Optimal number of features: {rfecv.n_features_}")

Optimal number of features: 933


# APPLY SPAM FILTER ALGORITHMS

In [75]:
from sklearn.model_selection import train_test_split

In [76]:
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

In [77]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [78]:
mnb_model = MultinomialNB()
mnb_model.fit (X_train, y_train)

In [79]:
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)

In [80]:
gnb_model = GaussianNB()
gnb_model.fit(X_train, y_train)

### Single Prediction

In [92]:
new_email_text = ["Get approved for a loan with just one click. No credit check required. Apply now!"]

new_email_vectorized = vectorizer.transform(new_email_text).toarray()

new_email_selected = selector.transform(new_email_vectorized)
new_email_selected = rfecv.transform(new_email_selected)

single_prediction = gnb_model.predict(new_email_selected)

print("Predicted class:", "Not Spam" if single_prediction[0] == 0 else "Spam")

Predicted class: Spam


### Make Predictions and Evaluate Accuracy


In [82]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [83]:
mnb_predictions = mnb_model.predict(X_test)
dt_predictions = dt_model.predict(X_test)
gnb_predictions = gnb_model.predict(X_test)

In [84]:
mnb_accuracy = accuracy_score(y_test, mnb_predictions)
dt_accuracy = accuracy_score(y_test, dt_predictions)
gnb_accuracy = accuracy_score(y_test, gnb_predictions)

print(f'Multinomial Naive Bayes Accuracy: {mnb_accuracy}', '\n')

print(f'Decision Tree Accuracy: {dt_accuracy}', '\n')

print(f'Gaussian Naive Bayes Accuracy: {gnb_accuracy}')

Multinomial Naive Bayes Accuracy: 0.9423076923076923 

Decision Tree Accuracy: 0.8846153846153846 

Gaussian Naive Bayes Accuracy: 0.9743589743589743


### Confusion Matrix

In [85]:
mnb_matrix = confusion_matrix(y_test, mnb_predictions)
dt_matrix = confusion_matrix(y_test, dt_predictions)
gnb_matrix = confusion_matrix(y_test, gnb_predictions)

print('Multinomial Naive Bayes Confusion Matrix:')
print(mnb_matrix, '\n')

print('Decision Tree Confusion Matrix:')
print(dt_matrix, '\n')

print('Gaussian Naive Bayes Confusion Matrix:')
print(gnb_matrix)

Multinomial Naive Bayes Confusion Matrix:
[[123   0]
 [  9  24]] 

Decision Tree Confusion Matrix:
[[112  11]
 [  7  26]] 

Gaussian Naive Bayes Confusion Matrix:
[[120   3]
 [  1  32]]
