In [14]:
# Importing the required libraries
# (pandas must be imported on its own line: the notebook calls pd.read_csv below)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt

In [15]:
# Load the ham/spam message dataset from the local CSV file.
# The file is decoded as latin-1 (presumably because it is not valid UTF-8 -- confirm).
df = pd.read_csv(
    filepath_or_buffer='spam.csv',
    encoding='latin-1',
)

In [16]:
# Preview the first five rows to check the column layout (v1 = label, v2 = message text).
df.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [17]:
# Dropping rows that are missing either the label (v1) or the message text (v2).
# Reassign the result instead of using inplace=True: the outcome for `df` is the
# same, but the statement is chainable and does not mutate the frame object that
# the earlier cells loaded and displayed.
df = df.dropna(subset=['v1', 'v2'])

In [18]:
# Class balance: number of messages per label, most frequent label first.
df['v1'].value_counts(sort=True)

v1
ham     4825
spam     747
Name: count, dtype: int64

In [19]:
# Features are the raw message texts (v2); targets are the ham/spam labels (v1).
X, y = df['v2'], df['v1']

In [20]:
# Splitting the dataset into training and testing data.
# 20% of the rows are held out for testing; the split is stratified on the label
# so both parts keep the same ham/spam proportions, and random_state is fixed so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [21]:
# Feature extraction: convert the raw message text into TF-IDF vectors.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',  # use scikit-learn's built-in English stop-word list
    max_features=5000,     # cap the vocabulary at 5000 terms
)

# The vectorizer is fitted on the training messages only; the test messages are
# transformed with that already-fitted vectorizer.
# NOTE: X_train / X_test are rebound from text to the vectorized data here, so
# re-running this cell on its own would pass the already-vectorized data back
# into the vectorizer -- re-run the split cell first.
X_train = tfidf_vectorizer.fit_transform(X_train)
X_test = tfidf_vectorizer.transform(X_test)

In [22]:
# Train a Multinomial Naive Bayes classifier on the TF-IDF training features
# and predict labels for the held-out test features.
nbc = MultinomialNB()
nbc.fit(X_train, y_train)
y_pred = nbc.predict(X_test)

# Report accuracy, confusion matrix and per-class metrics on the test split.
# The matrix and the report are multi-line strings, so each label ends with a
# newline: otherwise the label is glued to the first line and the rows/columns
# no longer line up.
print("Model: Naive Bayes Classifier")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")

Model: Naive Bayes Classifier
Accuracy: 0.9704
Confusion Matrix:[[965   1]
 [ 32 117]]
Classification Report:              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       966
        spam       0.99      0.79      0.88       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [23]:
# Train a Logistic Regression classifier on the TF-IDF training features
# (max_iter raised to 1000, random_state fixed) and predict labels for the
# held-out test features.
lr = LogisticRegression(random_state=42, max_iter=1000)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Report accuracy, confusion matrix and per-class metrics on the test split.
# The matrix and the report are multi-line strings, so each label ends with a
# newline: otherwise the label is glued to the first line and the rows/columns
# no longer line up.
print("Model: Logistic Regression")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")

Model: Logistic Regression
Accuracy: 0.9686
Confusion Matrix:[[965   1]
 [ 34 115]]
Classification Report:              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       966
        spam       0.99      0.77      0.87       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [24]:
# Train a Support Vector Machine classifier with a linear kernel on the TF-IDF
# training features and predict labels for the held-out test features.
svc = SVC(kernel='linear', random_state=42)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)

# Report accuracy, confusion matrix and per-class metrics on the test split.
# The matrix and the report are multi-line strings, so each label ends with a
# newline: otherwise the label is glued to the first line and the rows/columns
# no longer line up.
print("Model: Support Vector Machine Classifier")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")

Model: Support Vector Machine Classifier
Accuracy: 0.9812
Confusion Matrix:[[964   2]
 [ 19 130]]
Classification Report:              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       966
        spam       0.98      0.87      0.93       149

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

