# Import Important Libraries

In [15]:
import pandas as pd
import string
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score





Shape: (5572, 2)


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Loading dataset

In [None]:
# Load the dataset (it's tab-separated, not comma-separated!)
df = pd.read_csv("../data/sms.tsv.txt", sep = "\t", header=None)
df.columns = ['label', 'message']  # Rename for clarity

# Show basic info
print("Shape:", df.shape)
df.head()

# Checking  for missing values and class balance

In [14]:
# Check for nulls
print("Missing values:\n", df.isnull().sum())

# Class distribution
df['label'].value_counts()


Missing values:
 label      0
message    0
dtype: int64


label
ham     4825
spam     747
Name: count, dtype: int64

# Cleaning text

In [16]:
# Basic preprocessing function
def clean_text(text):
    text = text.lower()                          # lowercase
    text = re.sub(r'\d+', '', text)              # remove digits
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    text = text.strip()
    return text

# Apply to all messages
df['cleaned'] = df['message'].apply(clean_text)
df[['message', 'cleaned']].head()


Unnamed: 0,message,cleaned
0,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup final...
3,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...


# Converting to vectors

In [17]:
# Create Bag-of-Words features
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['cleaned'])  # this becomes our feature matrix

# Target variable
y = df['label']


#  Spliting into Train & Test

In [18]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)


Train size: (4457, 8608)
Test size: (1115, 8608)


# Training Model

In [19]:
# Create and train the model
model = MultinomialNB()
model.fit(X_train, y_train)


#  Making  Predictions

In [20]:
# Predict on test data
y_pred = model.predict(X_test)


# Evaluating Model

In [21]:
# Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.9739910313901345

Confusion Matrix:
[[947  19]
 [ 10 139]]

Classification Report:
              precision    recall  f1-score   support

         ham       0.99      0.98      0.98       966
        spam       0.88      0.93      0.91       149

    accuracy                           0.97      1115
   macro avg       0.93      0.96      0.95      1115
weighted avg       0.97      0.97      0.97      1115



In [26]:
import joblib

# Save model and vectorizer
joblib.dump(model, "../model/spam_classifier_model.joblib")
joblib.dump(vectorizer, "../model/vectorizer.joblib")


['../model/vectorizer.joblib']