In [1]:
## Step 1: Import Required Libraries

import pandas as pd
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [2]:
## Step 2: Load the Dataset

df = pd.read_csv("Spam.csv.csv", encoding='latin-1')

# Display first few rows to understand the structure
print("Dataset Preview:")
display(df.head())

Dataset Preview:


Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [3]:
## Step 3: Data Cleaning and Preprocessing

def clean_text(text):
    """Function to clean text data"""
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Apply text cleaning
df['clean_text'] = df['text'].apply(clean_text)

# Display cleaned text preview
print("Cleaned Text Preview:")
display(df[['text', 'clean_text']].head())

Cleaned Text Preview:


Unnamed: 0,text,clean_text
0,Subject: naturally irresistible your corporate...,subject naturally irresistible your corporate ...
1,Subject: the stock trading gunslinger fanny i...,subject the stock trading gunslinger fanny is ...
2,Subject: unbelievable new homes made easy im ...,subject unbelievable new homes made easy im wa...
3,Subject: 4 color printing special request add...,subject color printing special request additio...
4,"Subject: do not have money , get software cds ...",subject do not have money get software cds fro...


In [4]:
## Step 4: Convert Text into Numerical Format using TF-IDF

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(df['clean_text'])
y = df['spam']

# Display shape of transformed data
print("TF-IDF Feature Matrix Shape:", X.shape)

TF-IDF Feature Matrix Shape: (5728, 5000)


In [5]:
## Step 5: Split Data into Training and Testing Sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display data split sizes
print("Training Set Size:", X_train.shape[0])
print("Testing Set Size:", X_test.shape[0])

Training Set Size: 4582
Testing Set Size: 1146


In [6]:
## Step 6: Train a Machine Learning Model (Random Forest Classifier)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Print model training completion
print("Model training completed.")

Model training completed.


In [7]:
## Step 7: Make Predictions

y_pred = model.predict(X_test)

# Print sample predictions
print("Sample Predictions:")
display(pd.DataFrame({'Actual': y_test[:10].values, 'Predicted': y_pred[:10]}))

Sample Predictions:


Unnamed: 0,Actual,Predicted
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,1,1
6,0,0
7,0,0
8,0,0
9,1,1


In [8]:
## Step 8: Evaluate the Model

accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print evaluation results
print("Model Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

Model Accuracy: 0.9842931937172775
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       856
           1       1.00      0.94      0.97       290

    accuracy                           0.98      1146
   macro avg       0.99      0.97      0.98      1146
weighted avg       0.98      0.98      0.98      1146



In [9]:
## Step 9: Save the Model

joblib.dump(model, "spam_classifier.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

print("Model and Vectorizer saved successfully.")

Model and Vectorizer saved successfully.
