# Working with the mail_data dataset to detect spam messages 

### Importing packages

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split

In [3]:
# Read the mail dataset CSV (path is relative to the notebook's working directory).
df = pd.read_csv(
    '../Data/mail_data - mail_data.csv'
)

In [4]:
# Per-column count of missing values; shown as the cell's output.
df.isna().sum()

Category    0
Message     0
dtype: int64

In [6]:

# Report the (rows, columns) shape of the loaded frame.
dataset_shape = df.shape
print("Dimensions of the dataset:", dataset_shape)

# Plain-text preview of the leading rows (DataFrame.head() with its default row count).
leading_rows = df.head()
print(leading_rows)

Dimensions of the dataset: (5572, 2)
  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


In [8]:
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Text normalisation: lowercase each message, then strip every character that
# is neither a word character nor whitespace. This overwrites the raw
# 'Message' column of `df` in place, so later cells see the cleaned text.
non_word_chars = re.compile(r'[^\w\s]')
df['Message'] = df['Message'].apply(
    lambda message: non_word_chars.sub('', message.lower())
)

# Features are the cleaned message text; the target is the ham/spam label.
X, y = df['Message'], df['Category']

# 80/20 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Bag-of-words counts: the vocabulary is learned from the training messages
# only, and the test messages are encoded with that same vocabulary.
vectorizer = CountVectorizer()
X_train_transformed = vectorizer.fit_transform(X_train)
X_test_transformed = vectorizer.transform(X_test)


In [9]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Multinomial Naive Bayes on the bag-of-words counts; fit() returns the
# estimator itself, so construction and training are chained.
nb_classifier = MultinomialNB().fit(X_train_transformed, y_train)

# Predict labels for the held-out messages.
y_pred = nb_classifier.predict(X_test_transformed)

# Overall accuracy, followed by the per-class precision/recall/F1 table.
accuracy = accuracy_score(y_test, y_pred)
nb_report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(nb_report)


Accuracy: 0.9874439461883409
Classification Report:
              precision    recall  f1-score   support

         ham       0.99      1.00      0.99       966
        spam       1.00      0.91      0.95       149

    accuracy                           0.99      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.99      0.99      0.99      1115



Undersampling reduces the number of ham (majority-class) rows until it equals the number of spam (minority-class) rows.

In [None]:
# Split the frame by label: one frame of ham rows and one of spam rows.
ham_messages = df.loc[df['Category'].eq('ham')]
spam_messages = df.loc[df['Category'].eq('spam')]


In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Seeded random under-sampler with otherwise default settings.
undersampler = RandomUnderSampler(random_state=42)

# Only the training split is resampled; the test split is left untouched.
X_resampled, y_resampled = undersampler.fit_resample(
    X_train_transformed,
    y_train,
)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# 100-tree random forest with a fixed seed, trained on the under-sampled
# training data; fit() returns the estimator, so the two steps are chained.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_resampled, y_resampled
)

# Predict on the original test split (it was not resampled).
y_pred = rf_classifier.predict(X_test_transformed)

# Overall accuracy, followed by the per-class precision/recall/F1 table.
accuracy = accuracy_score(y_test, y_pred)
rf_report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(rf_report)


In [13]:

# Partition the frame by label.
ham_data = df.loc[df['Category'].eq('ham')]
spam_data = df.loc[df['Category'].eq('spam')]

# The smaller class dictates how many rows are kept from each class.
num_samples = min(map(len, (ham_data, spam_data)))

# Draw the same number of rows from each class with a fixed seed
# (ham rows first, then spam rows) and stack them into one frame.
balanced_data = pd.concat(
    [subset.sample(n=num_samples, random_state=42) for subset in (ham_data, spam_data)]
)

# Shuffle all rows so the classes are interleaved, then renumber the index from 0.
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Write the balanced frame next to the notebook, without the index column.
balanced_data.to_csv('balanced_df.csv', index=False)

# Report the resulting (rows, columns) shape.
print("Dimensions of the balanced data:", balanced_data.shape)


Dimensions of the balanced data: (1494, 2)


In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Features (message text) and labels come from the class-balanced frame.
X_balanced = balanced_data['Message']
y_balanced = balanced_data['Category']

# Hold out 20% of the balanced rows for evaluation; the seed fixes the split.
X_train_balanced, X_test_balanced, y_train_balanced, y_test_balanced = train_test_split(
    X_balanced, y_balanced, test_size=0.2, random_state=42
)

# Every object built from here on carries a `_balanced` suffix. Reusing the
# names `vectorizer`, `X_train_transformed`, `X_test_transformed` and
# `rf_classifier` would silently rebind the full-dataset pipeline, so
# re-running an earlier cell would pair balanced feature matrices with the
# full-dataset labels (`y_train` / `y_test`) and fail on mismatched row counts.

# Bag-of-words vocabulary learned from the balanced training messages only.
vectorizer_balanced = CountVectorizer()
X_train_balanced_transformed = vectorizer_balanced.fit_transform(X_train_balanced)

# Encode the balanced test messages with that same vocabulary.
X_test_balanced_transformed = vectorizer_balanced.transform(X_test_balanced)

# 100-tree random forest with a fixed seed, trained on the balanced training split.
rf_classifier_balanced = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier_balanced.fit(X_train_balanced_transformed, y_train_balanced)

# Predict labels for the balanced test split.
y_pred_balanced = rf_classifier_balanced.predict(X_test_balanced_transformed)

# Overall accuracy, followed by the per-class precision/recall/F1 table.
accuracy_balanced = accuracy_score(y_test_balanced, y_pred_balanced)
print("Accuracy:", accuracy_balanced)
print("Classification Report:")
print(classification_report(y_test_balanced, y_pred_balanced))


Accuracy: 0.9632107023411371
Classification Report:
              precision    recall  f1-score   support

         ham       0.93      1.00      0.97       155
        spam       1.00      0.92      0.96       144

    accuracy                           0.96       299
   macro avg       0.97      0.96      0.96       299
weighted avg       0.97      0.96      0.96       299

