In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the raw CSV; the path is relative to the notebook's working directory.
# The encoding is passed explicitly instead of relying on pandas' default.
file_path = 'spam.csv'
data = pd.read_csv(
    filepath_or_buffer=file_path,
    encoding='ISO-8859-1',
)

In [3]:
# Step 1: Data Cleaning
# Remove the three 'Unnamed' columns; `data` itself is left untouched because
# drop() returns a new frame.
cleaned_data = data.drop(
    labels=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],
    axis='columns',
)

In [4]:
# Rename, de-duplicate and add the length feature in one chain that is safe to
# re-run. Assigning a two-element list to `.columns` raises a length-mismatch
# ValueError once Message_Length has been added, so the first two columns are
# renamed by position instead (a no-op when they are already named).
cleaned_data = (
    cleaned_data
    .rename(columns=dict(zip(cleaned_data.columns[:2], ['Label', 'Message'])))
    # Judge duplicates on label + text only, so a re-run with the extra
    # Message_Length column present compares exactly the same fields.
    .drop_duplicates(subset=['Label', 'Message'])
    # New feature: message length in characters. The vectorized .str.len()
    # yields NaN for a missing message where apply(len) would raise TypeError.
    .assign(Message_Length=lambda frame: frame['Message'].str.len())
)

In [5]:
# Step 2: Data Exploration (EDA)
# Fraction of messages carrying each label (normalize=True gives proportions,
# not raw counts).
class_distribution = (
    cleaned_data['Label']
    .value_counts(normalize=True)
)
print("Class Distribution:\n", class_distribution, sep=' ')

Class Distribution:
 Label
ham     0.87367
spam    0.12633
Name: proportion, dtype: float64


In [6]:
# Describe Message_Length separately for each label
# (count, mean, std, min, quartiles, max).
length_stats = (
    cleaned_data
    .groupby(by='Label')['Message_Length']
    .describe()
)
print("\nMessage Length Statistics by Label:\n", length_stats, sep=' ')


Message Length Statistics by Label:
         count        mean        std   min    25%    50%    75%    max
Label                                                                 
ham    4516.0   70.459256  56.358207   2.0   34.0   52.0   90.0  910.0
spam    653.0  137.891271  30.137753  13.0  132.0  149.0  157.0  224.0


In [7]:
# Step 3: Data Preparation
# Features are the message text; the target is the (still string-valued) label.
X, y = cleaned_data['Message'], cleaned_data['Label']

In [8]:
# Converting labels to binary (ham = 0, spam = 1).
# The encoding is derived from cleaned_data['Label'] instead of re-mapping `y`
# in place: `y = y.map(...)` turns an already-encoded y into all-NaN when this
# cell is executed a second time, which then breaks the stratified split.
LABEL_TO_ID = {'ham': 0, 'spam': 1}
y = cleaned_data['Label'].map(LABEL_TO_ID)
# Series.map yields NaN for values missing from the dict; fail loudly here
# rather than handing NaN labels to the classifiers.
assert y.notna().all(), "Found labels other than 'ham'/'spam'"

# One seed shared by the split and the forest so a full re-run is reproducible.
RANDOM_STATE = 42

# Train-test split (80-20), stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Applying TF-IDF Vectorization: the vocabulary is fitted on the training
# split only and then reused to transform the test split.
tfidf = TfidfVectorizer(max_features=3000, ngram_range=(1, 2), stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


def fit_and_report(name, model, train_features, train_labels, test_features, test_labels):
    """Fit `model`, print its test-set metrics, and return the test predictions.

    Args:
        name: Human-readable model name used as the prefix of the printed headings.
        model: Estimator exposing `fit(X, y)` and `predict(X)`.
        train_features: Feature matrix used for fitting.
        train_labels: Labels matching `train_features`.
        test_features: Feature matrix used for prediction.
        test_labels: True labels matching `test_features`.

    Returns:
        The predictions produced by `model.predict(test_features)`.
    """
    model.fit(train_features, train_labels)
    predictions = model.predict(test_features)
    print(f"\n{name} Classification Report:\n",
          classification_report(test_labels, predictions, target_names=['Ham', 'Spam']))
    print(f"{name} Confusion Matrix:\n", confusion_matrix(test_labels, predictions))
    return predictions


# Step 4: Baseline Model - Naive Bayes
nb_model = MultinomialNB()
y_pred_nb = fit_and_report(
    "Naive Bayes", nb_model, X_train_tfidf, y_train, X_test_tfidf, y_test
)

# Step 5: Advanced Model - Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
y_pred_rf = fit_and_report(
    "Random Forest", rf_model, X_train_tfidf, y_train, X_test_tfidf, y_test
)


Naive Bayes Classification Report:
               precision    recall  f1-score   support

         Ham       0.97      1.00      0.99       903
        Spam       0.99      0.80      0.89       131

    accuracy                           0.97      1034
   macro avg       0.98      0.90      0.94      1034
weighted avg       0.97      0.97      0.97      1034

Naive Bayes Confusion Matrix:
 [[902   1]
 [ 26 105]]

Random Forest Classification Report:
               precision    recall  f1-score   support

         Ham       0.98      1.00      0.99       903
        Spam       0.98      0.84      0.91       131

    accuracy                           0.98      1034
   macro avg       0.98      0.92      0.95      1034
weighted avg       0.98      0.98      0.98      1034

Random Forest Confusion Matrix:
 [[901   2]
 [ 21 110]]
