In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import zipfile
import os

# Path to your uploaded zip file in Drive
zip_path = "/content/drive/MyDrive/spam_dataset/archive (1).zip"  # change folder name if different
extract_path = "/content/spam_dataset"

# Extract
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# List extracted files
os.listdir(extract_path)


['spam.csv']

In [3]:
import pandas as pd

# Load dataset
csv_path = os.path.join(extract_path, "spam.csv")
df = pd.read_csv(csv_path, encoding='latin-1')

# Keep only label + message columns
df = df[['v1', 'v2']]
df.columns = ['label', 'message']

print("Dataset shape:", df.shape)
print(df['label'].value_counts())
df.head()


Dataset shape: (5572, 2)
label
ham     4825
spam     747
Name: count, dtype: int64


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
import re
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')   # <-- add this line
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

# Function to clean text
def clean_text(text):
    text = str(text).lower()                  # lowercase
    text = re.sub(r'[^\w\s]', ' ', text)      # remove punctuation
    text = re.sub(r'\d+', ' ', text)          # remove numbers
    tokens = word_tokenize(text)              # tokenize
    tokens = [t for t in tokens if t not in stop_words and len(t) > 1]
    return " ".join(tokens)

# Apply cleaning
df['clean'] = df['message'].apply(clean_text)
df[['message', 'clean']].head()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,message,clean
0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis great wo...
1,Ok lar... Joking wif u oni...,ok lar joking wif oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...
3,U dun say so early hor... U c already then say...,dun say early hor already say
4,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Split data
X = df['clean']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Vectorize (Bag-of-Words)
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

print("Train shape:", X_train_vec.shape, "Test shape:", X_test_vec.shape)


Train shape: (4457, 6744) Test shape: (1115, 6744)


In [7]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Train
clf = MultinomialNB()
clf.fit(X_train_vec, y_train)

# Predict
y_pred = clf.predict(X_test_vec)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9856502242152466

Classification Report:
               precision    recall  f1-score   support

         ham       0.99      1.00      0.99       966
        spam       0.98      0.91      0.94       149

    accuracy                           0.99      1115
   macro avg       0.98      0.95      0.97      1115
weighted avg       0.99      0.99      0.99      1115


Confusion Matrix:
 [[963   3]
 [ 13 136]]


In [8]:
import joblib

OUT_DIR = '/content/drive/MyDrive/spam_project'
os.makedirs(OUT_DIR, exist_ok=True)

joblib.dump(clf, os.path.join(OUT_DIR, 'spam_clf.joblib'))
joblib.dump(vectorizer, os.path.join(OUT_DIR, 'spam_vectorizer.joblib'))

print("Saved model & vectorizer to:", OUT_DIR)


Saved model & vectorizer to: /content/drive/MyDrive/spam_project


In [9]:
  # Reload model/vectorizer (just to show persistence works)
  clf = joblib.load(os.path.join(OUT_DIR, 'spam_clf.joblib'))
  vectorizer = joblib.load(os.path.join(OUT_DIR, 'spam_vectorizer.joblib'))

  # Function for new predictions
  def predict_text(msg):
      cleaned = clean_text(msg)
      vec = vectorizer.transform([cleaned])
      return clf.predict(vec)[0]

  # Test examples
  examples = [
      "Congratulations! You've won a free lottery. Call now to claim.",
      "Hey bro, are we meeting at 7 pm tomorrow?",
      "URGENT! Your account has been compromised. Reset your password immediately."
  ]

  for e in examples:
      print(f"{e} --> {predict_text(e)}")


Congratulations! You've won a free lottery. Call now to claim. --> spam
Hey bro, are we meeting at 7 pm tomorrow? --> ham
URGENT! Your account has been compromised. Reset your password immediately. --> spam


In [11]:
# Reload model/vectorizer
import joblib
import os

clf = joblib.load(os.path.join(OUT_DIR, 'spam_clf.joblib'))
vectorizer = joblib.load(os.path.join(OUT_DIR, 'spam_vectorizer.joblib'))

# Function for predictions
def predict_text(msg):
    cleaned = clean_text(msg)
    vec = vectorizer.transform([cleaned])
    return clf.predict(vec)[0]

# --- Custom input from user ---
while True:
    msg = input("Enter a message (or type 'quit' to stop): ")
    if msg.lower() == "quit":
        break
    print("Prediction:", predict_text(msg))


KeyboardInterrupt: Interrupted by user

In [12]:
msg = ["you won a lottery"]
cleaned = [clean_text(m) for m in msg]
vec = vectorizer.transform(cleaned)

print("Prediction:", clf.predict(vec))
print("Probabilities (ham, spam):", clf.predict_proba(vec))


Prediction: ['ham']
Probabilities (ham, spam): [[0.86582903 0.13417097]]
