In [296]:
# Import necessary libraries
import pandas as pd                              # data manipulation
from sklearn.model_selection import train_test_split  # train-test split
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF vectorization
from sklearn.naive_bayes import MultinomialNB      # Naive Bayes classifier
from sklearn.pipeline import Pipeline             # to create a text processing pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix  # evaluation metrics
import pickle                                     # saving the trained model

In [297]:
# Load the dataset correctly
data = pd.read_csv(
    r"C:\Users\karan\SPAM EMAIL DETECTION\Dataset\spam.csv",
    encoding='windows-1252'  # or 'ISO-8859-1'
)

## Explanation: 
The dataset shape tells us how many emails we have. The value_counts() of v1 shows how many ham vs spam examples are present.
This is important to see if the classes are imbalanced.

In [298]:
# Check class distribution before cleaning
print("\nClass distribution (raw):\n", data['v1'].value_counts())


Class distribution (raw):
 v1
ham     4825
spam     747
Name: count, dtype: int64


In [299]:
# Clean data
data = data[['v1', 'v2']]
data.columns = ['label', 'text']
data = data.dropna()
data['label'] = data['label'].map({'ham': 0, 'spam': 1})
data['text'] = data['text'].str.lower()

In [300]:
# Verify changes
print("After cleaning, dataset shape:", data.shape)
print(data.head())
print("\nClass distribution (cleaned):\n", data['label'].value_counts())

After cleaning, dataset shape: (5572, 2)
   label                                               text
0      0  go until jurong point, crazy.. available only ...
1      0                      ok lar... joking wif u oni...
2      1  free entry in 2 a wkly comp to win fa cup fina...
3      0  u dun say so early hor... u c already then say...
4      0  nah i don't think he goes to usf, he lives aro...

Class distribution (cleaned):
 label
0    4825
1     747
Name: count, dtype: int64


In [301]:
print(data.head())
print(data.isnull().sum())

   label                                               text
0      0  go until jurong point, crazy.. available only ...
1      0                      ok lar... joking wif u oni...
2      1  free entry in 2 a wkly comp to win fa cup fina...
3      0  u dun say so early hor... u c already then say...
4      0  nah i don't think he goes to usf, he lives aro...
label    0
text     0
dtype: int64


In [302]:
# Define feature and target
X = data['text']
y = data['label']

# Split into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

print("Training set size:", X_train.shape[0], "emails")
print("Test set size:", X_test.shape[0], "emails")

Training set size: 4457 emails
Test set size: 1115 emails


In [303]:
# Build a pipeline: TF-IDF vectorizer followed by Multinomial Naive Bayes
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),  # convert text to TF-IDF features
    ('nb', MultinomialNB())        # train a Multinomial Naive Bayes classifier
])

# Train the pipeline on the training data
pipeline.fit(X_train, y_train)
print("Pipeline training completed.")

Pipeline training completed.


In [304]:
#pipeline
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('nb', MultinomialNB())
])

pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('tfidf', ...), ('nb', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


## Evaluation

With the model trained, we predict on the test data and evaluate performance. We compute:

**Accuracy: proportion of correctly predicted emails.**

**Classification report: precision, recall, and F1-score for each class.**

**Confusion matrix: a 2x2 table showing true vs. predicted class.**

In [305]:
# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report (precision, recall, F1-score)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Print confusion matrix
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9668161434977578

Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115


Confusion Matrix:
 [[965   0]
 [ 37 113]]


In [306]:
# Save the trained pipeline to a file
model_filename = 'spam_pipeline.pkl'
with open(model_filename, 'wb') as f:
    pickle.dump(pipeline, f)
print(f"Model saved to {model_filename}")

Model saved to spam_pipeline.pkl


In [307]:
# Final confirmation message
print("✅ Spam email detection model is trained and saved successfully.")
print("Model file path:", model_filename)

✅ Spam email detection model is trained and saved successfully.
Model file path: spam_pipeline.pkl


In [308]:
# 4. Encode labels: ham -> 0, spam -> 1
data['label_num'] = data['label'].map({'ham': 0, 'spam': 1})
X = data['text']
y = data['label_num']

In [309]:
# Encode labels
data['label_num'] = data['label'].map({'ham': 0, 'spam': 1})

# REMOVE invalid rows (VERY IMPORTANT)
data = data.dropna(subset=['label_num', 'text'])

# Convert explicitly to int
data['label_num'] = data['label_num'].astype(int)

In [310]:
# 6. Split into training and test sets (e.g., 80% train, 20% test)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [311]:
print(X_train.shape)
print(y_train.shape)

(4457,)
(4457,)


In [313]:
print(f"Accuracy: {acc:.3f}")
print(f"Precision: {prec:.3f}")
print(f"Recall: {rec:.3f}")
print(f"F1-score: {f1:.3f}")
print("Confusion Matrix:")
print(cm)

Accuracy: 0.969
Precision: 1.000
Recall: 0.767
F1-score: 0.868
Confusion Matrix:
[[965   0]
 [ 35 115]]


In [315]:
import pickle

with open("spam_pipeline.pkl", "wb") as f:
    pickle.dump(pipeline, f)

print("✅ spam_pipeline.pkl saved successfully")

✅ spam_pipeline.pkl saved successfully
