In [142]:
# 📘 HamiSkills Week 1 - Spam Email Classifier
# Author: Sabirin Mire Abukar
# Project: Spam/Ham Detection for Hami MiniMarket

#  1. Import Libraries 
import os
import re

import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


In [143]:
#  2. Load Dataset
# Read the labelled messages; the path is resolved relative to the current
# working directory.
df = pd.read_csv('../dataset/spam.csv')
# Confirmation message followed by a preview of the first five rows.
print("Dataset loaded successfully!", df.head(), sep="\n")

Dataset loaded successfully!
  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


In [144]:
#  3. Data Cleaning
# Drop rows without message text *before* casting to str; otherwise a missing
# value would become the literal string "nan" and be trained on as a message.
df = df.dropna(subset=["Message"]).copy()

# Lower-case the text and strip everything except ASCII letters and whitespace.
df["Message"] = df["Message"].astype(str).apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x.lower()))

# Encode labels: spam → 0, ham → 1.
# The "0"/"1" entries keep this cell idempotent: on a re-run the column already
# holds the encoded integers, which would otherwise all map to NaN.
label_codes = {"spam": 0, "ham": 1, "0": 0, "1": 1}
labels = df["Category"].fillna("").astype(str).str.lower().str.strip()
df["Category"] = labels.map(label_codes)

# Rows whose label is missing or unrecognised map to NaN and would make the
# classifier's fit() fail, so remove them explicitly and say how many went.
unlabelled = df["Category"].isna()
if unlabelled.any():
    print(f"Dropping {int(unlabelled.sum())} row(s) with a missing or unrecognised label")
    df = df.loc[~unlabelled].copy()

# With no NaN left the column can be stored as a plain integer dtype.
df["Category"] = df["Category"].astype(int)


In [145]:
#  4. Split Data
# Inputs are the message texts, targets are the encoded category labels.
X, y = df["Message"], df["Category"]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [146]:

#  5. Text Vectorization (TF-IDF)
# English stop words are removed, text is lower-cased, and a term only needs to
# appear in one document to be kept.
tfidf = TfidfVectorizer(
    stop_words="english",
    lowercase=True,
    min_df=1,
)

# The vectorizer is fitted on the training split only; the test split is then
# transformed with that same fitted vectorizer.
X_train_features = tfidf.fit_transform(X_train)
X_test_features = tfidf.transform(X_test)

In [147]:

#  6. Train the Model
# Logistic Regression with a fixed seed, fitted on the TF-IDF features of the
# training split.
lr = LogisticRegression(random_state=42)
lr.fit(X_train_features, y_train)

In [148]:
#  7. Model Evaluation Functions 
def print_metrics(name, y_true, y_pred):
    """Print accuracy, precision, recall and F1 for one set of predictions.

    Precision, recall and F1 are computed with label 0 as the positive class,
    which is the code this notebook assigns to spam.

    Args:
        name: Heading printed above the scores.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
    """
    spam_label = 0
    accuracy = accuracy_score(y_true, y_pred)
    # Same call shape for all three scorers, evaluated in this order.
    precision, recall, f1 = (
        scorer(y_true, y_pred, pos_label=spam_label)
        for scorer in (precision_score, recall_score, f1_score)
    )

    print(f"\n{name}")
    print(f"Accuracy : {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall   : {recall:.2f}")
    print(f"F1-score : {f1:.2f}")

def print_confmat(name, y_true, y_pred):
    """Print a labelled 2x2 confusion matrix for one set of predictions.

    Rows are actual classes and columns are predicted classes, both ordered
    ham (1) first and spam (0) second.

    Args:
        name: Heading printed above the matrix.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
    """
    class_order = [1, 0]
    row_names = ["Actual: Ham (1)", "Actual: Spam (0)"]
    col_names = ["Pred: Ham (1)", "Pred: Spam (0)"]

    counts = confusion_matrix(y_true, y_pred, labels=class_order)
    table = pd.DataFrame(counts, index=row_names, columns=col_names)
    print(f"\n{name} Confusion Matrix:\n{table}")


In [149]:

#  8. Evaluate Model
# Predict on the held-out features, then print the scores followed by the
# confusion matrix under the same heading.
lr_pred = lr.predict(X_test_features)

for report in (print_metrics, print_confmat):
    report("Logistic Regression", y_test, lr_pred)



Logistic Regression
Accuracy : 0.96
Precision: 1.00
Recall   : 0.72
F1-score : 0.84

Logistic Regression Confusion Matrix:
                  Pred: Ham (1)  Pred: Spam (0)
Actual: Ham (1)             966               0
Actual: Spam (0)             41             108


In [150]:
#  9. Save Best Model and Vectorizer
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first (it may be absent on a fresh checkout).
os.makedirs("../models", exist_ok=True)

# The fitted vectorizer is saved alongside the classifier because new text has
# to be transformed with the same vectorizer before it can be classified.
joblib.dump(lr, "../models/spam_classifier.pkl")
joblib.dump(tfidf, "../models/tfidf_vectorizer.pkl")
print("\n Model and vectorizer saved successfully!")


 Model and vectorizer saved successfully!


In [151]:
# 10. CLI Tool for User Input
print("\n Spam Detection CLI - Type 'exit' to quit\n")

# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code. Only load artifacts written by this notebook or another trusted source.
loaded_model = joblib.load("../models/spam_classifier.pkl")
loaded_vectorizer = joblib.load("../models/tfidf_vectorizer.pkl")

while True:
    try:
        user_input = input("Enter a message: ")
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupt ends the session cleanly instead of
        # leaving a traceback in the cell output.
        print("Goodbye")
        break

    # Ignore surrounding whitespace so that e.g. "exit " still quits.
    if user_input.strip().lower() == "exit":
        print("Goodbye")
        break

    # A blank line carries no text to classify; ask again instead of
    # reporting a prediction for it.
    if not user_input.strip():
        print("Please enter a non-empty message.\n")
        continue

    # Apply the same normalisation that was used on the training messages.
    processed = re.sub(r'[^a-zA-Z\s]', '', user_input.lower())
    features = loaded_vectorizer.transform([processed])
    prediction = loaded_model.predict(features)[0]
    # Label 0 is spam, anything else is treated as ham.
    label = "SPAM" if prediction == 0 else "NOT SPAM"
    print(f"Prediction: {label}\n")



 Spam Detection CLI - Type 'exit' to quit



FileNotFoundError: [Errno 2] No such file or directory: 'models/spam_classifier.pkl'