Importing necessary libraries and loading the dataset

In [None]:
import pandas as pd
import numpy as np

In [None]:
import pandas as pd
from google.colab import files
import io

# Open Colab's upload widget; the result is used below as a mapping whose keys
# identify the uploaded files and whose values are their raw bytes.
uploaded = files.upload()

# Fail with a clear message instead of an opaque IndexError when nothing was uploaded
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select the dataset CSV.")

# Get the key of the uploaded file (only the first entry is used)
key = next(iter(uploaded))

# Decode with errors='replace' to handle invalid bytes: any byte sequence that is
# not valid UTF-8 is replaced by U+FFFD, so the original character is lost.
# NOTE(review): if the file is actually in a single-byte encoding (e.g. latin-1),
# decoding with that encoding would preserve those characters — confirm against the data.
sms_data = pd.read_csv(io.StringIO(uploaded[key].decode('utf-8', errors='replace')))

Saving spam.csv to spam.csv


Exploring and preprocessing the data

In [None]:
# Preview the first five rows to see the raw column layout
sms_data.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
# Show the column labels exactly as they were read from the CSV
column_labels = sms_data.columns
print(column_labels)

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')


In [None]:
# Replace the terse CSV headers with descriptive column names
column_names = {'v1': 'label', 'v2': 'text'}
sms_data = sms_data.rename(columns=column_names)

In [None]:
# Encode the string labels as integers: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}

# Only encode while the column is still non-numeric. Mapping an already-encoded
# column through label_codes would turn every 0/1 into NaN, so without this
# guard the cell could not be safely re-run.
if not pd.api.types.is_numeric_dtype(sms_data['label']):
    encoded_labels = sms_data['label'].map(label_codes)

    # Values missing from label_codes come back as NaN; fail loudly here rather
    # than letting NaN labels reach the model.
    unmapped = encoded_labels.isna()
    if unmapped.any():
        unknown_values = sms_data.loc[unmapped, 'label'].unique().tolist()
        raise ValueError(f"Unexpected label values: {unknown_values!r}")

    sms_data['label'] = encoded_labels

In [None]:
import string

# Translation table that deletes every ASCII punctuation character.
# Built once here instead of once per row inside the lambda.
punctuation_table = str.maketrans('', '', string.punctuation)

# Remove punctuation from each message, then lower-case it
sms_data['text'] = sms_data['text'].apply(
    lambda message: message.translate(punctuation_table).lower()
)

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Tokenizer data for word_tokenize: 'punkt' is what older NLTK releases load,
# while NLTK 3.9+ looks for 'punkt_tab' and raises LookupError without it.
nltk.download('punkt')
nltk.download('punkt_tab')

# Split each message into word tokens. Rows that already hold a token list are
# passed through unchanged so re-running this cell does not fail on its own output.
sms_data['text'] = sms_data['text'].apply(
    lambda message: message if isinstance(message, list) else word_tokenize(message)
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Splitting the data into training and testing sets

In [None]:
from sklearn.model_selection import train_test_split

# Features are the tokenized messages; targets are the 0/1 labels
messages = sms_data['text']
labels = sms_data['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    messages,
    labels,
    test_size=0.2,
    random_state=42,
)

Creating a text classification model

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# TfidfVectorizer is given plain strings here, so glue each token list back
# together with single spaces
train_documents = list(map(' '.join, X_train))
test_documents = list(map(' '.join, X_test))

# Fit the TF-IDF vectorizer on the training messages only, then apply it to the test messages
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(train_documents)
X_test_tfidf = vectorizer.transform(test_documents)

# Train a logistic regression classifier on the TF-IDF features
log_reg = LogisticRegression()
log_reg.fit(X_train_tfidf, y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict labels for the held-out messages
y_pred = log_reg.predict(X_test_tfidf)

# Overall accuracy first, then per-class metrics, then the confusion matrix
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

Accuracy: 0.9659192825112107
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.85       150

    accuracy                           0.97      1115
   macro avg       0.98      0.87      0.92      1115
weighted avg       0.97      0.97      0.96      1115

Confusion Matrix:
[[965   0]
 [ 38 112]]


Evaluating the model

In [None]:
from sklearn.metrics import classification_report

# Predict again so this cell produces its own y_pred
y_pred = log_reg.predict(X_test_tfidf)

# Build the report with readable class names in place of the 0/1 codes
class_names = ["ham", "spam"]
named_report = classification_report(y_test, y_pred, target_names=class_names)

print("Classification Report:")
print(named_report)

Classification Report:
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       1.00      0.75      0.85       150

    accuracy                           0.97      1115
   macro avg       0.98      0.87      0.92      1115
weighted avg       0.97      0.97      0.96      1115



Comparing the model's spam/ham prediction with the actual label for each test message

In [None]:
def _label_name(value):
    """Return "spam" for a label equal to 1 and "ham" for any other value."""
    return "spam" if value == 1 else "ham"


# Walk the true and predicted labels side by side; messages are numbered from 1
for i, (actual_value, predicted_value) in enumerate(zip(y_test, y_pred)):
    actual = _label_name(actual_value)
    predicted = _label_name(predicted_value)
    result = "Correct" if actual == predicted else "Misclassified"
    print(f"Message {i+1}\t{actual}\t{predicted}\t{result}")

Message 1	ham	ham	Correct
Message 2	ham	ham	Correct
Message 3	spam	ham	Misclassified
Message 4	ham	ham	Correct
Message 5	spam	spam	Correct
Message 6	ham	ham	Correct
Message 7	ham	ham	Correct
Message 8	ham	ham	Correct
Message 9	ham	ham	Correct
Message 10	ham	ham	Correct
Message 11	ham	ham	Correct
Message 12	spam	spam	Correct
Message 13	ham	ham	Correct
Message 14	ham	ham	Correct
Message 15	ham	ham	Correct
Message 16	ham	ham	Correct
Message 17	ham	ham	Correct
Message 18	spam	spam	Correct
Message 19	ham	ham	Correct
Message 20	ham	ham	Correct
Message 21	ham	ham	Correct
Message 22	ham	ham	Correct
Message 23	ham	ham	Correct
Message 24	ham	ham	Correct
Message 25	spam	spam	Correct
Message 26	ham	ham	Correct
Message 27	ham	ham	Correct
Message 28	ham	ham	Correct
Message 29	ham	ham	Correct
Message 30	ham	ham	Correct
Message 31	ham	ham	Correct
Message 32	ham	ham	Correct
Message 33	spam	ham	Misclassified
Message 34	ham	ham	Correct
Message 35	ham	ham	Correct
Message 36	ham	ham	Correct
Message 37	ham	