<a href="https://colab.research.google.com/github/OATS2001/OATS2001/blob/main/sms_spam_classification_empty.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 📘 SMS Spam Classification Notebook

## Import libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

## Load the dataset

In [None]:
# Download the tab-separated SMS corpus. The file has no header row,
# so the two column names are supplied explicitly.
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
column_names = ['label', 'message']
df = pd.read_csv(url, sep='\t', header=None, names=column_names)
display(df.head())

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Convert labels to binary

In [None]:
# Encode the text labels as integers: ham -> 0, spam -> 1.
label_to_int = {'ham': 0, 'spam': 1}

# Only map while the column still holds text. Re-running this cell on the
# already-converted 0/1 column would otherwise turn every label into NaN,
# because 0 and 1 are not keys of the mapping.
if not pd.api.types.is_numeric_dtype(df['label']):
    # Fail early on labels the mapping does not cover instead of letting
    # `.map` silently replace them with NaN.
    unknown_labels = set(df['label'].unique()) - set(label_to_int)
    if unknown_labels:
        raise ValueError(f"Unexpected label values: {unknown_labels}")
    df['label'] = df['label'].map(label_to_int)

display(df.head())

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


## Features and labels

In [None]:
# X holds the raw message text, y the matching 0/1 labels.
X, y = df['message'], df['label']

# Show the first rows of both series.
display(X.head(), y.head())

Unnamed: 0,message
0,"Go until jurong point, crazy.. Available only ..."
1,Ok lar... Joking wif u oni...
2,Free entry in 2 a wkly comp to win FA Cup fina...
3,U dun say so early hor... U c already then say...
4,"Nah I don't think he goes to usf, he lives aro..."


Unnamed: 0,label
0,0
1,0
2,1
3,0
4,0


## Split into train/test

In [None]:
# Hold out a quarter of the messages for testing; the fixed seed makes the
# split repeatable.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report how many samples landed in each part.
print(
    f"Shape of X_train: {X_train.shape}",
    f"Shape of X_test: {X_test.shape}",
    f"Shape of y_train: {y_train.shape}",
    f"Shape of y_test: {y_test.shape}",
    sep="\n",
)

Shape of X_train: (4179,)
Shape of X_test: (1393,)
Shape of y_train: (4179,)
Shape of y_test: (1393,)


## Vectorize the text data using TF-IDF

In [None]:
# Fit the TF-IDF vectorizer on the training messages only, then apply that
# same fitted vectorizer to both splits so the test set never influences
# the learned features.
tfidf_vectorizer = TfidfVectorizer().fit(X_train)
X_train_tfidf, X_test_tfidf = (
    tfidf_vectorizer.transform(messages) for messages in (X_train, X_test)
)

print(
    "TF-IDF Vectorized Data Shapes:",
    f"X_train_tfidf shape: {X_train_tfidf.shape}",
    f"X_test_tfidf shape: {X_test_tfidf.shape}",
    sep="\n",
)

TF-IDF Vectorized Data Shapes:
X_train_tfidf shape: (4179, 7490)
X_test_tfidf shape: (1393, 7490)


## Define models

In [None]:
# Define the three classifiers to compare.
# The two estimators that use randomness get the same seed as the
# train/test split, so a Restart & Run All reproduces the same models.
mnb = MultinomialNB()

# liblinear suits a small binary problem; it shuffles the data internally,
# hence the explicit random_state.
lr = LogisticRegression(solver='liblinear', random_state=42)

# probability=True enables predict_proba through an internal
# cross-validation, which makes fitting slower. GridSearchCV does not
# require it. random_state fixes the shuffling used for that step.
svm = SVC(probability=True, random_state=42)

## Train the models

In [None]:
# Fit each classifier on the TF-IDF training matrix, in the order the
# models were defined.
for classifier in (mnb, lr, svm):
    classifier.fit(X_train_tfidf, y_train)

print("Models trained successfully!")

Models trained successfully!


## View best model

In [None]:
# Find the best model based on test-set accuracy.
#
# The accuracies are computed here, directly from the fitted models. The
# `results` dict is only built in a later cell, so reading it at this point
# raises NameError when the notebook is run top to bottom on a fresh kernel.
trained_models = {
    'Multinomial Naive Bayes': mnb,
    'Logistic Regression': lr,
    'Support Vector Machine': svm,
}
model_accuracies = {
    model_name: accuracy_score(y_test, trained_model.predict(X_test_tfidf))
    for model_name, trained_model in trained_models.items()
}

# max() keeps the first entry on ties, i.e. the order of `trained_models`.
best_model_name = max(model_accuracies, key=model_accuracies.get)
best_model_accuracy = model_accuracies[best_model_name]

print(f"The best performing model is: {best_model_name}")
print(f"Accuracy of the best model: {best_model_accuracy}")

The best performing model is: Support Vector Machine
Accuracy of the best model: 0.9849246231155779


## Evaluation and prediction

In [None]:
# Score every fitted classifier on the held-out TF-IDF features and keep
# the metrics in `results`, keyed by the model's display name.
models = dict([
    ('Multinomial Naive Bayes', mnb),
    ('Logistic Regression', lr),
    ('Support Vector Machine', svm),
])
results = dict()

for name, model in models.items():
    # One prediction pass per model feeds all three metrics.
    y_pred = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred)

    results[name] = dict([
        ('accuracy', accuracy),
        ('classification_report', classification_rep),
        ('confusion_matrix', confusion_mat),
    ])

    # Print the per-model summary, one item per line.
    print(
        f"--- {name} ---",
        f"Accuracy: {accuracy}",
        "Classification Report:",
        classification_rep,
        "Confusion Matrix:",
        confusion_mat,
        "-" * 20,
        sep="\n",
    )

--- Multinomial Naive Bayes ---
Accuracy: 0.9641062455132807
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1207
           1       1.00      0.73      0.84       186

    accuracy                           0.96      1393
   macro avg       0.98      0.87      0.91      1393
weighted avg       0.97      0.96      0.96      1393

Confusion Matrix:
[[1207    0]
 [  50  136]]
--------------------
--- Logistic Regression ---
Accuracy: 0.9734386216798278
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1207
           1       1.00      0.80      0.89       186

    accuracy                           0.97      1393
   macro avg       0.99      0.90      0.94      1393
weighted avg       0.97      0.97      0.97      1393

Confusion Matrix:
[[1207    0]
 [  37  149]]
--------------------
--- Support Vector Machine ---
Accuracy: 0.9849246

In [None]:
# Example of predicting on a new message
new_message = "Congratulations! You have won a free prize. Claim it now!"

# Vectorize the new message using the *same* fitted TF-IDF vectorizer
new_message_tfidf = tfidf_vectorizer.transform([new_message])

# Pick the classifier with the highest test accuracy recorded in `results`
# instead of hard-coding one, so this cell stays correct if the ranking
# changes (different data, split, or model settings).
best_model = models[
    max(results, key=lambda model_name: results[model_name]['accuracy'])
]
prediction = best_model.predict(new_message_tfidf)

# Convert the prediction back to the original label (ham or spam)
predicted_label = 'spam' if prediction[0] == 1 else 'ham'

print(f"The message: '{new_message}' is predicted as: {predicted_label}")

The message: 'Congratulations! You have won a free prize. Claim it now!' is predicted as: spam
