Imports

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt


Bringing in the data

In [None]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

Load and Clean Training Data

In [None]:
# Read the training split: tab-separated, no header row, columns named here.
# The path comes from the download cell's constant instead of a repeated
# literal, so the file name is defined in exactly one place.
train_data = pd.read_csv(
    train_file_path,
    sep="\t",
    header=None,
    names=["label", "message"],
)
# Trim whitespace and lower-case the labels so they compare cleanly against
# the "ham"/"spam" strings used by the later cells.
train_data['label'] = train_data['label'].str.strip().str.lower()


Train Model

In [None]:
# Two-step text classifier: TF-IDF features feeding a logistic regression.
# max_iter is raised to 1000 for the logistic-regression solver.
model = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', LogisticRegression(max_iter=1000)),
    ]
)
# Fit on the raw message text; kept as the last expression so the fitted
# pipeline is displayed as the cell output.
model.fit(X=train_data['message'], y=train_data['label'])


Prediction Function

In [None]:
def predict_message(message,threshold=0.3):
    """Classify a single text message as "spam" or "ham".

    Args:
        message: The message text to classify.
        threshold: Minimum spam probability (inclusive) at which the message
            is labelled "spam". Defaults to 0.3.

    Returns:
        A two-element list ``[spam_probability, label]`` where ``label`` is
        either "spam" or "ham".
    """
    # predict_proba expects a batch, so wrap the message and take row 0.
    class_probabilities = model.predict_proba([message])[0]
    # Look the column up by name rather than assuming a fixed class order.
    spam_column = list(model.classes_).index("spam")
    spam_prob = class_probabilities[spam_column]
    if spam_prob >= threshold:
        return [spam_prob, "spam"]
    return [spam_prob, "ham"]


Evaluation

In [None]:
# Score the classifier on the validation split.
# The path comes from the download cell's constant instead of a repeated literal.
test_data = pd.read_csv(test_file_path, sep="\t", header=None, names=["label", "message"])
# Same label normalisation as the training data.
test_data['label'] = test_data['label'].str.strip().str.lower()
y_true = test_data['label']
# Route every message through predict_message so the threshold applied there
# is what gets evaluated.
y_pred = [predict_message(msg)[1] for msg in test_data['message']]

# Summary metrics, treating "spam" as the positive class.
acc = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred, pos_label='spam')
rec = recall_score(y_true, y_pred, pos_label='spam')
f1 = f1_score(y_true, y_pred, pos_label='spam')
print("📊 Model Evaluation:")
print(f"Accuracy :{acc:.4f}")
print(f"Precision:{prec:.4f}")
print(f"Recall   :{rec:.4f}")
print(f"F1 Score :{f1:.4f}")
print("\nDetailed Report:\n")
# labels= pins which class each target_names entry refers to, so the display
# names cannot be mis-assigned if a class is missing from the data.
print(classification_report(y_true, y_pred, labels=["ham", "spam"], target_names=["ham", "spam"]))


Confusion Matrix

In [None]:
# Heatmap of validation-set outcomes, with both axes in ham-then-spam order.
class_order = ["ham", "spam"]
cm = confusion_matrix(y_true, y_pred, labels=class_order)
ax = sns.heatmap(
    cm,
    annot=True,   # write the count inside each cell
    fmt='d',      # format counts as integers
    cmap='Blues',
    xticklabels=class_order,
    yticklabels=class_order,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()

In [None]:
def test_predictions():
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    print(predict_message(msg))
    prediction=predict_message(msg)
    if prediction[1] != ans:
      passed = False

  if passed:
    print("Passed")
  else:
    print("Failed")

# Run the spot-check: prints each prediction followed by "Passed" or "Failed".
test_predictions()
