# Spam Mail Classification

In this notebook, I implement two classifiers, Multinomial Naive Bayes and a linear Support Vector Machine, to detect spam messages.

## Necessary libraries

In [None]:
import numpy as np
import pandas as pd
from collections import Counter

import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
from scikitplot.metrics import plot_confusion_matrix as plt_con_mat
import seaborn as sns

from sklearn import feature_extraction, model_selection, naive_bayes, metrics, svm

import warnings
warnings.filterwarnings("ignore")

## Loading the Data

In [None]:
# Load the spam dataset from the CSV file, decoding it as latin-1.
data = pd.read_csv(
    filepath_or_buffer="../input/sms-spam-collection-dataset/spam.csv",
    encoding='latin-1',
)
# Show the first rows as the cell output.
data.head()

## Class distribution

In [None]:
# Count how many rows carry each label in column "v1".
# The Series method is used because the top-level pd.value_counts helper
# is deprecated in recent pandas releases.
counts = data["v1"].value_counts()
counts.plot(kind='bar', color=['g', 'r'])
plt.title('Ham/Spam')
plt.show()

## Text preprocessing and feature engineering

In [None]:
# Binary labels as a column vector: 'spam' -> 1, 'ham' -> 0.
Y = np.array(data["v1"].map({'spam':1,'ham':0}))
Y = np.expand_dims(Y, axis = -1)

# Loop-invariant helpers: build the punctuation-stripping table and the
# lemmatizer once instead of once per message.
killpunctuation = str.maketrans('', '', string.punctuation)
lemmatizer = WordNetLemmatizer()

clean_data = []
for doc in data["v2"]:
    # Lowercase, drop punctuation, then lemmatize each space-separated token.
    tokens = doc.lower().translate(killpunctuation).split(' ')
    clean_data.append(" ".join(lemmatizer.lemmatize(word) for word in tokens))

# TF-IDF features with English stop words removed, converted to a dense array.
# NOTE(review): the vectorizer is fit on every message before the train/test
# split performed later in the notebook; confirm this is intended.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_vectors = tfidf_vectorizer.fit_transform(clean_data)
X = tfidf_vectors.toarray()

print(X.shape, Y.shape)

## Splitting the Data into train and test sets

In [None]:
# Hold out 20% of the samples for testing; the shuffle is seeded with 1.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    shuffle=True,
    random_state=1,
)

# Report the shapes of the train split, then the test split.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, labels.shape)

# Multinomial naive bayes classifier

In [None]:
# Candidate smoothing values: start at 1e-5 and advance in steps of 0.001,
# stopping before 1.
alphas = np.arange(1/100000, 1, 0.001)
train_acc = []
test_acc = []
recall = []
precision = []

for alpha in alphas:

    bayes = naive_bayes.MultinomialNB(alpha = alpha)
    bayes.fit(X_train, y_train)

    # Predict on the test split once and reuse the result for both
    # recall and precision instead of re-running predict for each metric.
    test_preds = bayes.predict(X_test)

    train_acc.append(bayes.score(X_train, y_train))
    test_acc.append(bayes.score(X_test, y_test))
    recall.append(metrics.recall_score(y_test, test_preds))
    precision.append(metrics.precision_score(y_test, test_preds))

print("Training complete")

## Model training performance

In [None]:
sns.set()
fig = plt.figure(num=0, figsize=(12, 4))

# x and y are passed by keyword: recent seaborn releases no longer accept
# them as positional arguments to lineplot.

# Left panel: train and test accuracy for each alpha.
ax = plt.subplot(1, 2, 1)
sns.lineplot(x=alphas, y=train_acc, label='train', ax=ax)
sns.lineplot(x=alphas, y=test_acc, label='test', ax=ax)
plt.title('Accuracy')
plt.tight_layout()

# Right panel: test precision and recall for each alpha.
ax = plt.subplot(1, 2, 2)
sns.lineplot(x=alphas, y=precision, label='precision', ax=ax)
sns.lineplot(x=alphas, y=recall, label='recall', ax=ax)
plt.title('Precision and Recall')
plt.tight_layout()

plt.show()

In [None]:
# Stack the sweep results column-wise (one row per alpha). np.c_ already
# yields a regular 2-D array, so the discouraged np.matrix wrapper is dropped.
temp = np.c_[alphas, train_acc, test_acc, recall, precision]
nb_model_metrics = pd.DataFrame(
    data=temp,
    columns=['alphas', 'Train Accuracy', 'Test Accuracy', 'Test Recall', 'Test Precision'],
)
nb_model_metrics.head()

## Choosing the best model according to precision and accuracy

In [None]:
# Keep only the rows that reach the highest test precision, then pick the
# one with the best test accuracy among them.
top_precision = nb_model_metrics['Test Precision'].max()
candidates = nb_model_metrics[nb_model_metrics['Test Precision'] == top_precision]
idx = candidates['Test Accuracy'].idxmax()

# Refit a classifier with the selected alpha and show that row's metrics.
nb_model = naive_bayes.MultinomialNB(alpha=alphas[idx])
nb_model.fit(X_train, y_train)
print(nb_model_metrics.iloc[idx, :])

## Confusion matrix

In [None]:
# Predict the test split with the selected Naive Bayes model and plot
# the predictions against the true labels as a confusion matrix.
preds = nb_model.predict(X_test)

plt_con_mat(y_true=y_test, y_pred=preds, figsize=(10, 10))
plt.show()

# Support Vector Machine (SVM)

In [None]:
# Candidate values for C: 500, 600, ..., 1900.
C_vals = np.arange(500, 2000, 100)
train_acc = []
test_acc = []
recall = []
precision = []

for C in C_vals:

    svc = svm.LinearSVC(C = C)
    svc.fit(X_train, y_train)

    # Predict on the test split once and reuse the result for both
    # recall and precision instead of re-running predict for each metric.
    test_preds = svc.predict(X_test)

    train_acc.append(svc.score(X_train, y_train))
    test_acc.append(svc.score(X_test, y_test))
    recall.append(metrics.recall_score(y_test, test_preds))
    precision.append(metrics.precision_score(y_test, test_preds))

print("Training complete")

## Model training performance

In [None]:
sns.set()
fig = plt.figure(num=0, figsize=(12, 4))

# x and y are passed by keyword: recent seaborn releases no longer accept
# them as positional arguments to lineplot.

# Left panel: train and test accuracy for each C value.
ax = plt.subplot(1, 2, 1)
sns.lineplot(x=C_vals, y=train_acc, label='train', ax=ax)
sns.lineplot(x=C_vals, y=test_acc, label='test', ax=ax)
plt.title('Accuracy')
plt.tight_layout()

# Right panel: test precision and recall for each C value.
ax = plt.subplot(1, 2, 2)
sns.lineplot(x=C_vals, y=precision, label='precision', ax=ax)
sns.lineplot(x=C_vals, y=recall, label='recall', ax=ax)
plt.title('Precision and Recall')
plt.tight_layout()

plt.show()

In [None]:
# Stack the sweep results column-wise (one row per C value). np.c_ already
# yields a regular 2-D array, so the discouraged np.matrix wrapper is dropped.
temp = np.c_[C_vals, train_acc, test_acc, recall, precision]
svm_model_metrics = pd.DataFrame(
    data=temp,
    columns=['C values', 'Train Accuracy', 'Test Accuracy', 'Test Recall', 'Test Precision'],
)
svm_model_metrics.head()

## Choosing the best model

In [None]:
# Keep only the rows that reach the highest test precision, then pick the
# one with the best test accuracy among them.
top_precision = svm_model_metrics['Test Precision'].max()
candidates = svm_model_metrics[svm_model_metrics['Test Precision'] == top_precision]
idx = candidates['Test Accuracy'].idxmax()

# Refit a classifier with the selected C and show that row's metrics.
svm_model = svm.LinearSVC(C=C_vals[idx])
svm_model.fit(X_train, y_train)
print(svm_model_metrics.iloc[idx, :])

## Confusion Matrix

In [None]:
# Predict the test split with the selected SVM model and plot the
# predictions against the true labels as a confusion matrix.
preds = svm_model.predict(X_test)

plt_con_mat(y_true=y_test, y_pred=preds, figsize=(10, 10))
plt.show()