In [5]:
# Step 1: Install dependencies
!pip install pandas scikit-learn joblib

# Step 2: Download and load the SMS Spam dataset
import pandas as pd

# Download dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
!wget $url -O smsspamcollection.zip
!unzip -o smsspamcollection.zip

# Load into DataFrame
df = pd.read_csv("SMSSpamCollection", sep='\t', names=["label", "message"])
df['label'] = df['label'].map({'ham': 0, 'spam': 1})  # Convert labels to 0 and 1
df.head()


--2025-05-14 14:27:28--  https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘smsspamcollection.zip’

smsspamcollection.z     [  <=>               ] 198.65K   756KB/s    in 0.3s    

2025-05-14 14:27:29 (756 KB/s) - ‘smsspamcollection.zip’ saved [203415]

Archive:  smsspamcollection.zip
  inflating: SMSSpamCollection       
  inflating: readme                  


Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(df['message'], df['label'], test_size=0.2, random_state=42)

# TF-IDF vectorizer
vectorizer = TfidfVectorizer(lowercase=True, stop_words='english', max_features=3000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Logistic Regression with class imbalance handling
model = LogisticRegression(class_weight='balanced')
model.fit(X_train_vec, y_train)

# Evaluate
y_pred = model.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9838565022421525
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       966
           1       0.93      0.95      0.94       149

    accuracy                           0.98      1115
   macro avg       0.96      0.97      0.97      1115
weighted avg       0.98      0.98      0.98      1115



In [7]:
# Save both model and vectorizer
joblib.dump(model, 'spam_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')
print("✅ Model and vectorizer saved.")


✅ Model and vectorizer saved.


In [9]:
# Load
model = joblib.load('spam_model.pkl')
vectorizer = joblib.load('vectorizer.pkl')

# Predict function with confidence
def predict_spam(message):
    message = message.lower().strip()
    vec = vectorizer.transform([message])
    proba = model.predict_proba(vec)[0][1]  # Probability of spam
    prediction = model.predict(vec)[0]
    return f"{'SPAM' if prediction == 1 else 'HAM'} (Confidence: {proba:.2f})"

# Test
print(predict_spam("Congratulations! You've won a $1000 Walmart gift card. Call now!"))
print(predict_spam("Hey, are we still meeting for lunch today?"))


SPAM (Confidence: 0.57)
HAM (Confidence: 0.04)


In [11]:
# SPAM messages
print(predict_spam("Congratulations! You've won a free iPhone. Click here to claim now!"))
print(predict_spam("URGENT! Your account has been suspended. Verify at www.fakebank.com."))
print(predict_spam("You have been selected for a $1000 Walmart gift card. Reply YES to claim."))
print(predict_spam("FREE entry into our $500 weekly prize draw! Text WIN to 80085 now!"))
print(predict_spam("Get cheap meds now! No prescription needed. Visit www.pharmacydealz.ru"))

# HAM messages
print(predict_spam("Hey, are we still on for dinner tonight?"))
print(predict_spam("Please bring the documents to the office tomorrow."))
print(predict_spam("Happy birthday! Hope you have a great day."))
print(predict_spam("I’m running late, I’ll be there in 15 mins."))
print(predict_spam("Can you send me the notes from class?"))


SPAM (Confidence: 0.89)
SPAM (Confidence: 0.96)
SPAM (Confidence: 0.83)
SPAM (Confidence: 0.98)
HAM (Confidence: 0.41)
HAM (Confidence: 0.06)
HAM (Confidence: 0.11)
HAM (Confidence: 0.05)
HAM (Confidence: 0.15)
HAM (Confidence: 0.24)
