<a href="https://colab.research.google.com/github/SJDL123/CVIP-DataScience/blob/main/Email_Spam_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Read the mail dataset from the CSV file (path is relative to the working directory).
data = pd.read_csv(filepath_or_buffer="mail_data.csv")

In [3]:
# Feature column: the message text. Preview the first rows as the cell output.
X = data.loc[:, "Message"]
X.head(n=5)

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: Message, dtype: object

In [4]:
# Target column: the category label of each message. Preview the first rows.
y = data.loc[:, "Category"]
y.head(n=5)

0     ham
1     ham
2    spam
3     ham
4     ham
Name: Category, dtype: object

In [5]:
# Hold out 20% of the messages for testing; the fixed seed keeps the split repeatable.
# No `stratify` argument is passed, so class proportions may differ between the parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [6]:
# Turn the message text into token-count features.
# The vocabulary is learned from the training messages only; the held-out
# messages are then encoded with that same fitted vectorizer.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(raw_documents=X_train)
X_test_vectorized = vectorizer.transform(raw_documents=X_test)

In [7]:
# Fit a multinomial Naive Bayes model to the training count features and labels.
# The bare fit(...) expression is left as the last statement of the cell.
classifier = MultinomialNB()
classifier.fit(X=X_train_vectorized, y=y_train)

In [8]:
# Predict a label for every held-out message with the fitted classifier
y_pred = classifier.predict(X=X_test_vectorized)

In [9]:
# Score the count-based model on the held-out messages.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
# Concatenate instead of passing two arguments: print() would otherwise insert
# its separator space right after the newline and push the report's header row
# one column to the right of the per-class rows.
print("Classification Report:\n" + report)

Accuracy: 0.9919282511210762
Classification Report:
               precision    recall  f1-score   support

         ham       0.99      1.00      1.00       966
        spam       1.00      0.94      0.97       149

    accuracy                           0.99      1115
   macro avg       1.00      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [10]:
# Experiment with different classifiers, hyperparameters, and feature extraction methods
# For instance, try using TF-IDF instead of CountVectorizer
# (TfidfVectorizer is imported in the notebook's import cell, alongside the other imports.)
tfidf_vectorizer = TfidfVectorizer()

In [11]:
# Learn the TF-IDF vocabulary and weights from the training messages only,
# then encode the held-out messages with the same fitted vectorizer.
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = tfidf_vectorizer.transform(raw_documents=X_test)

In [12]:
# Fit a second Naive Bayes model on the TF-IDF features, then label the test messages
classifier_tfidf = MultinomialNB()
classifier_tfidf.fit(X=X_train_tfidf, y=y_train)
y_pred_tfidf = classifier_tfidf.predict(X=X_test_tfidf)

In [13]:
# Accuracy of the TF-IDF model on the held-out labels
accuracy_tfidf = accuracy_score(y_true=y_test, y_pred=y_pred_tfidf)

In [14]:
# Per-class precision / recall / f1 text report for the TF-IDF model
report_tfidf = classification_report(y_true=y_test, y_pred=y_pred_tfidf)

In [15]:
# Show the TF-IDF model's accuracy and per-class report.
print("TF-IDF Accuracy:", accuracy_tfidf)
# Concatenate instead of passing two arguments: print() would otherwise insert
# its separator space right after the newline and push the report's header row
# one column to the right of the per-class rows.
print("TF-IDF Classification Report:\n" + report_tfidf)

TF-IDF Accuracy: 0.9650224215246637
TF-IDF Classification Report:
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98       966
        spam       1.00      0.74      0.85       149

    accuracy                           0.97      1115
   macro avg       0.98      0.87      0.91      1115
weighted avg       0.97      0.97      0.96      1115

