In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import classification_report
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
from sklearn.metrics import precision_recall_fscore_support
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix

# Fetch every NLTK resource the preprocessing step relies on:
#   'punkt' / 'punkt_tab' - tokenizer models looked up by word_tokenize
#                           (newer NLTK releases use 'punkt_tab')
#   'stopwords'           - corpus read by stopwords.words('english')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')


In [None]:
# Load the raw e-mail dataset from a path relative to the working directory.
data = pd.read_csv('./data/email.csv')
# Bare last expression: the notebook renders the loaded frame as the cell output.
data

In [None]:
# Count how many rows carry each distinct value of the 'Category' column.
data['Category'].value_counts()

In [None]:
# Keep only rows labelled 'ham' or 'spam'; rows with any other 'Category' value are dropped.
data = data[data['Category'].isin(['ham', 'spam'])]
data

In [None]:
# Boolean Series flagging rows that repeat an earlier row across all columns.
duplicate_rows = data.duplicated()
# Summing the booleans counts the flagged (duplicate) rows.
num_duplicates = duplicate_rows.sum()
print(num_duplicates)

In [None]:
# Remove fully duplicated rows and rebind `data` to the de-duplicated frame.
data=data.drop_duplicates()
data

In [None]:

# Resources shared by every call to preprocessed_text(). Building them once
# avoids re-reading the stop-word corpus and re-creating the stemmer and the
# punctuation table for each individual message.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_STOP_WORDS = None  # filled on first call, so defining the function needs no corpus access
_STEMMER = None     # created on first call


def preprocessed_text(text):
    """Normalise a raw message into a space-separated string of stemmed tokens.

    Steps, in order: lowercase, strip the characters in ``string.punctuation``,
    tokenize with NLTK's ``word_tokenize``, drop English stop words,
    Porter-stem the remaining tokens, and join them with single spaces.

    Parameters
    ----------
    text : str
        The message to clean.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' when no token survives).

    Notes
    -----
    Punctuation is removed before stop-word filtering, so stop-word entries
    that contain an apostrophe can never match a token.
    """
    global _STOP_WORDS, _STEMMER
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    if _STEMMER is None:
        _STEMMER = PorterStemmer()

    # Lowercase first so the stop-word comparison below is case-insensitive.
    normalised = text.lower().translate(_PUNCTUATION_TABLE)
    tokens = word_tokenize(normalised)

    # Filter stop words, then stem what is left (same order as filtering
    # the whole list first and stemming afterwards).
    stems = [_STEMMER.stem(token) for token in tokens if token not in _STOP_WORDS]
    return ' '.join(stems)

# Data Preprocessing 

In [None]:
# Drop rows with missing values, then derive the cleaned columns with
# .assign(), which returns a new frame instead of writing into a frame that
# came out of earlier filtering (avoids SettingWithCopyWarning).
cleaned = data.dropna().assign(
    Message=lambda frame: frame['Message'].apply(preprocessed_text),
    Category=lambda frame: frame['Category'].map({'ham': 0, 'spam': 1}),
)

# .map() yields NaN for any label other than 'ham'/'spam' -- which is what
# happens if this cell is run again on already-encoded labels. Fail loudly
# and leave `data` untouched rather than carrying NaN labels forward.
if cleaned['Category'].isna().any():
    raise ValueError(
        "Unmapped values in 'Category': expected only 'ham' or 'spam' "
        "(was this cell already run on this data?)"
    )

data = cleaned

In [None]:
# Bare expression so the notebook renders the frame with its rich display,
# matching how the frame is shown in the other cells.
data

# Vectorization

In [None]:
# Bag-of-words features: token counts per message.
# NOTE(review): the vocabulary is fitted on the full dataset, before any
# train/test split, so terms that occur only in held-out messages still
# shape the feature space -- consider fitting inside each split if an
# unbiased estimate matters.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['Message'])
y = data['Category']

# Sanity-check the target. These describe the full label vector `y`;
# no train/test split has been made at this point.
print("Unique labels in y:", y.unique())
print("Data type of y:", y.dtype)

# Model

In [None]:
def MultinomialNB_model(X_train, y_train, X_test):
    """Train a fresh multinomial naive Bayes model and predict the test rows.

    A new ``MultinomialNB`` with default settings is fitted on
    ``(X_train, y_train)``; the return value is its predictions for ``X_test``.
    """
    # fit() returns the estimator itself, so the calls can be chained.
    fitted_model = MultinomialNB().fit(X_train, y_train)
    return fitted_model.predict(X_test)

# Cross Validation

In [1]:
def cross_validate(X, y, num_iter=10, test_size=0.2, random_state=None):
    """Score the model over repeated random hold-out splits.

    Every iteration draws an independent random train/test split, obtains
    predictions from ``MultinomialNB_model`` and records precision, recall
    and F1 for the positive class (label 1). Because each split is drawn
    independently, this is repeated hold-out rather than k-fold: the test
    sets of different iterations may overlap.

    Parameters
    ----------
    X, y
        Feature matrix and label vector accepted by ``train_test_split``.
    num_iter : int, default 10
        Number of random splits to evaluate.
    test_size : float or int, default 0.2
        Passed through to ``train_test_split``.
    random_state : int or None, default None
        ``None`` leaves every split unseeded. An int makes the run
        reproducible; iteration ``i`` uses seed ``random_state + i`` so the
        splits still differ from one another.

    Returns
    -------
    dict
        ``{'precision': [...], 'recall': [...], 'f1': [...]}`` with one
        entry per iteration, in iteration order.
    """
    results = {'precision': [], 'recall': [], 'f1': []}
    for iteration in range(num_iter):
        seed = None if random_state is None else random_state + iteration
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=seed
        )
        y_pred = MultinomialNB_model(X_train, y_train, X_test)
        # average='binary' reports the scores of pos_label only; the fourth
        # return value (support) is not used.
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_test, y_pred, pos_label=1, average='binary'
        )
        results['precision'].append(precision)
        results['recall'].append(recall)
        results['f1'].append(f1)
    return results
            

# Model Evaluation

In [None]:
# Run the repeated evaluation, then report the per-split scores followed by
# their arithmetic means, metric by metric.
results = cross_validate(X, y)

for metric in ('precision', 'recall', 'f1'):
    print(f'{metric} scores:', results[metric])

print()

for metric in ('precision', 'recall', 'f1'):
    print(f'{metric} average score:', sum(results[metric]) / len(results[metric]))

# Graph Of Model Performance

In [None]:
# Single random hold-out split used for the confusion-matrix figure.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit once and reuse the fitted model for prediction.
classifier = MultinomialNB()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# confusion_matrix(y_true, y_pred) puts true labels on rows and predictions
# on columns; the heat map draws rows along y and columns along x, hence
# y = actual and x = predicted. fmt='d' prints the counts as plain integers.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', xticklabels=["Not Spam", "Spam"], yticklabels=["Not Spam", "Spam"])
plt.xlabel('Predicted label')
plt.ylabel('Actual label');

# Multiple Graphs Of Model Performance

In [None]:
def display_cross_validate(X, y, num_iter=9):
    """Plot one confusion-matrix heat map per random train/test split.

    For each of ``num_iter`` iterations an independent 80/20 split is drawn,
    a new ``MultinomialNB`` is fitted on the training part, and the confusion
    matrix of its test predictions is drawn in its own panel. Panels are laid
    out three per row, with as many rows as needed; panels left over in the
    last row are hidden.

    Parameters
    ----------
    X, y
        Feature matrix and label vector accepted by ``train_test_split``.
    num_iter : int, default 9
        Number of random splits (and panels) to draw.

    Returns
    -------
    None
        The figure is shown via ``plt.show()``.
    """
    n_cols = 3
    n_rows = max(1, (num_iter + n_cols - 1) // n_cols)  # ceiling division
    # 10 x 20/3 inches per panel reproduces a 30 x 20 figure for a 3 x 3 grid.
    # squeeze=False keeps `axs` two-dimensional even when there is one row.
    fig, axs = plt.subplots(
        n_rows, n_cols, figsize=(10 * n_cols, 20 * n_rows / 3), squeeze=False
    )
    plt.suptitle('Heat Map for Each Fold of Cross-Validation')
    panels = axs.ravel()  # row-major: panel i sits at row i // 3, column i % 3

    for i in range(num_iter):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        classifier = MultinomialNB()
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)

        ax = panels[i]
        # Rows of the matrix are true labels (drawn on y), columns are
        # predictions (drawn on x); fmt='d' prints counts as plain integers.
        cm = confusion_matrix(y_test, y_pred)
        sns.heatmap(cm, annot=True, fmt='d', xticklabels=["Not Spam", "Spam"], yticklabels=["Not Spam", "Spam"], ax=ax)
        ax.set_title(f'Fold {i+1}')
        ax.set_xlabel('Predicted label')
        ax.set_ylabel('Actual label')

    # Hide grid cells that received no heat map.
    for unused in panels[max(num_iter, 0):]:
        unused.set_visible(False)
    plt.show()

# Draw the grid of confusion matrices using the function's default num_iter.
display_cross_validate(X, y)