In [4]:
# ==============================
# EMAIL / SMS SPAM DETECTION
# ==============================

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib

# ------------------------------
# 1. LOAD DATASET
# ------------------------------
# Location of the raw CSV; kept in one named constant so it is easy to change.
DATA_PATH = r"C:\Users\LENOVO\Downloads\archive (33)\spam.csv"

# Text label -> integer class used by the classifier.
LABEL_MAP = {"ham": 0, "spam": 1}

data = pd.read_csv(DATA_PATH, encoding="latin-1")

# ------------------------------
# 2. KEEP REQUIRED COLUMNS
# ------------------------------
# Only "v1" (label) and "v2" (message text) are used; rename them by name
# rather than by position so the mapping is explicit.
data = data[["v1", "v2"]].rename(columns={"v1": "label", "v2": "text"})

print("Dataset Preview:")
print(data.head())
print("\nColumns:", data.columns)

# ------------------------------
# 3. LABEL ENCODING
# ------------------------------
# Series.map yields NaN for any value missing from LABEL_MAP. Fail loudly here
# instead of letting NaN labels reach the split / training steps.
encoded_labels = data["label"].map(LABEL_MAP)
unmapped = encoded_labels.isna()
if unmapped.any():
    unknown = sorted(data.loc[unmapped, "label"].astype(str).unique())
    raise ValueError(f"Unexpected label values in dataset: {unknown}")
data["label"] = encoded_labels

# ------------------------------
# 4. SPLIT DATA
# ------------------------------
X = data["text"]
y = data["label"]

# stratify=y keeps the ham/spam ratio the same in the train and test splits;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# ------------------------------
# 5. TEXT VECTORIZATION
# ------------------------------
# Fit the TF-IDF vocabulary on the training messages only, then reuse the
# fitted vectorizer to transform the held-out messages.
vectorizer = TfidfVectorizer(stop_words="english")
Xv_train = vectorizer.fit_transform(X_train)
Xv_test = vectorizer.transform(X_test)

# ------------------------------
# 6. MODEL TRAINING
# ------------------------------
model = MultinomialNB()
model.fit(Xv_train, y_train)

# ------------------------------
# 7. EVALUATION
# ------------------------------
y_pred = model.predict(Xv_test)

print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# ------------------------------
# 8. SAVE MODEL
# ------------------------------
# The classifier and the fitted vectorizer are saved as two separate files;
# both are needed to score new messages.
joblib.dump(model, "spam_model.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")

print("\n✅ Model saved successfully!")


Dataset Preview:
  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...

Columns: Index(['label', 'text'], dtype='object')

Accuracy: 0.968609865470852

Confusion Matrix:
 [[966   0]
 [ 35 114]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       966
           1       1.00      0.77      0.87       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115


✅ Model saved successfully!
