In [18]:
import pandas as pd
import zipfile
import requests
from io import BytesIO
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.metrics import roc_curve,auc

import matplotlib.pyplot as plt
import seaborn as sns

In [28]:
#Step-1: Download the zip file from the url
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
response = requests.get(url)
#step-2: Extract the file from the zip archive
with zipfile.ZipFile(BytesIO(response.content)) as z:
    with z.open('SMSSpamCollection') as file:
        df = pd.read_csv(file, sep='\t', header=None, names=['Label',' Message'])

#Check the DataSet
df.head()


Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [48]:
file_path = "SMSSpamCollection"

#read the file into a DataFrame
df = pd.read_csv(file_path, sep='\t', header=None, names=['Label', 'Message'])

In [50]:
df.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [52]:
#Convert labels to Binary
df['Label'] = df['Label'].map({'ham':0, 'spam': 1})

#Split into training and test set
X_train,X_test,y_train,y_test = train_test_split(df['Message'], df['Label'], test_size=0.3, random_state=42)

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [66]:
# Initiate TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words='english')

# fill and transform the training data
X_train_vec = vectorizer.fit_transform(X_train)

#Transform the test Data
X_test_vec = vectorizer.transform(X_test)

In [68]:
nb_classifier = MultinomialNB()

# train the model 
nb_classifier.fit(X_train_vec,X_train)

In [69]:
#Predict
y_pred = nb_classifier.predict(X_test_vec)

In [74]:
#Evaluate the Model 
accuracy = accuracy_score(y_test,y_pred)
print("Accuracy:", accuracy)
classification_report = classification_report(y_test, y_pred)
confusion_matrix = confusion_matrix(y_test, y_pred)

print(f"confusion Matrix: {confusion_matrix}")
print(f"classification Report: {classification_report}") 

Accuracy: 0.0


ValueError: Mix of label input types (string and number)

In [76]:
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=['Ham', 'Spam'],yticklabels=['Ham', 'Spam'])
plt.title('Confusion Matrix')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.show()

ValueError: Mix of label input types (string and number)