In [10]:
# Step 1: Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# Step 2: Load Dataset
df = pd.read_csv('../data/spam.csv', encoding='latin-1')

# Step 3: Clean and Rename Columns
df = df[['v1', 'v2']]
df.columns = ['label', 'text']

# Optional: Check for nulls
df.dropna(inplace=True)

# Step 4: Convert labels to binary
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Step 5: Vectorize text using TF-IDF
tfidf = TfidfVectorizer(stop_words='english')
X = tfidf.fit_transform(df['text'])
y = df['label']

# Step 6: Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Train Logistic Regression Model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Step 8: Evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Step 9: Save Model and Vectorizer
joblib.dump(model, '../notebooks/spam_classifier.pkl')
joblib.dump(tfidf, '../notebooks/vectorizer.pkl')

print("✅ Model and Vectorizer saved successfully.")




Accuracy: 0.9443946188340807

Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97       965
           1       0.97      0.61      0.75       150

    accuracy                           0.94      1115
   macro avg       0.96      0.80      0.86      1115
weighted avg       0.95      0.94      0.94      1115


Confusion Matrix:
 [[962   3]
 [ 59  91]]
✅ Model and Vectorizer saved successfully.
