## **SANJAY RAJ S**
## **TASK 3 - SMS SPAM DETECTION**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score

In [None]:
# Load the raw SMS dataset from the working directory. The file is decoded
# as Latin-1 (presumably it is not valid UTF-8 -- confirm against the data).
df = pd.read_csv(
    'spam.csv',
    encoding='LATIN-1',
)

In [None]:
# Remove the three 'Unnamed' columns so only the label (v1) and message
# text (v2) remain.
# Rebinding instead of inplace=True, together with errors='ignore', keeps
# this cell idempotent: re-running it after the columns are already gone
# no longer raises KeyError.
df = df.drop(
    columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],
    errors='ignore',
)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Count missing values per column (isna is the canonical name for isnull).
df.isna().sum()

v1    0
v2    0
dtype: int64

In [None]:
# Features come from column v2 and targets from column v1.
X, y = df['v2'], df['v1']

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The vectorizer is fitted on the training split only and then applied to
# both splits, so the test messages do not influence the fitted vectorizer.
# X_train / X_test are rebound from the split outputs to the transformed
# feature matrices.
tfidf = TfidfVectorizer()
X_train, X_test = tfidf.fit_transform(X_train), tfidf.transform(X_test)


In [None]:
import pickle

from sklearn.model_selection import cross_val_score

# Fixed seed for the randomized estimators (trees and ensembles) so that
# repeated runs of this cell report the same scores.
SEED = 42

# Candidate classifiers, each paired with the label used in the report.
models = [
    ('LogisticRegression', LogisticRegression()),
    ('SVC', SVC()),
    ('DecisionTreeClassifier', DecisionTreeClassifier(random_state=SEED)),
    ('RandomForestClassifier', RandomForestClassifier(random_state=SEED)),
    ('AdaBoostClassifier', AdaBoostClassifier(random_state=SEED)),
    ('GradientBoostingClassifier', GradientBoostingClassifier(random_state=SEED)),
    ('MultinomialNB', MultinomialNB()),
    ('BernoulliNB', BernoulliNB())
]

# Track the winner's name alongside the fitted pipeline and its score; the
# loop variable `name` always ends up holding the LAST model in the list,
# so it must not be used to report the best one.
best_model = None
best_name = None
best_accuracy = 0

# Iterate over the models and evaluate their performance.
# X_train / X_test are the TF-IDF feature matrices produced by the
# vectorization cell, so each pipeline contains only the classifier.
for name, model in models:
    pipeline = Pipeline([
        ('model', model)
    ])

    # 5-fold cross-validation on the training split. cross_val_score fits
    # clones of the pipeline, so `pipeline` itself is still unfitted after
    # this call.
    scores = cross_val_score(pipeline, X_train, y_train, cv=5)
    mean_accuracy = scores.mean()

    # Fit on the full training split and score on the held-out test split.
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Print the performance metrics
    print("Model:", name)
    print("Cross-validation Accuracy:", mean_accuracy)
    print("Test Accuracy:", accuracy)
    print()

    # NOTE(review): the winner is chosen by test accuracy, which makes the
    # reported best score optimistic; consider selecting on the
    # cross-validation mean and reserving the test split for a final check.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = pipeline
        best_name = name

# Save the best model. The context manager guarantees the file is flushed
# and closed even if pickling fails.
# NOTE(review): the saved pipeline holds only the classifier; the fitted
# `tfidf` vectorizer is not saved by this cell and is required to turn raw
# text into features before calling predict on the loaded model.
with open('email_spam_classifier.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)
print("-----------------------------------------------------------")
print(f"Best model: {best_name} with accuracy score: {best_accuracy}")

Model: LogisticRegression
Cross-validation Accuracy: 0.9569202236616288
Test Accuracy: 0.9659192825112107

Model: SVC
Cross-validation Accuracy: 0.9746450554372826
Test Accuracy: 0.9820627802690582

Model: DecisionTreeClassifier
Cross-validation Accuracy: 0.968589733911109
Test Accuracy: 0.9650224215246637

Model: RandomForestClassifier
Cross-validation Accuracy: 0.9771131846617646
Test Accuracy: 0.9766816143497757

Model: AdaBoostClassifier
Cross-validation Accuracy: 0.9741973798774994
Test Accuracy: 0.9695067264573991

Model: GradientBoostingClassifier
Cross-validation Accuracy: 0.9703821976617194
Test Accuracy: 0.9695067264573991

Model: MultinomialNB
Cross-validation Accuracy: 0.9555756871152985
Test Accuracy: 0.9623318385650225

Model: BernoulliNB
Cross-validation Accuracy: 0.9809276119440543
Test Accuracy: 0.9748878923766816

-----------------------------------------------------------
Best model: BernoulliNB with accuracy score: 0.9820627802690582
