In [None]:
import pandas as pd
from functools import partial
import sys

# Add the sibling Handlers directory to the import search path.
# Presumably this is where the `preprocessing` and `traintest` modules live — confirm.
sys.path.append("../Handlers")

# Write mode for JSON report output.
# NOTE(review): keep this in agreement with the `mode=` argument passed to
# add_to_json_array in the final cell.
JSON_WRITE_MODE = "overwrite"

import preprocessing

In [None]:
# Load the dataset from the CSV file that sits next to this notebook.
# Later cells read its "Body" (text) and "Is_spam" (label) columns.
csv_file = pd.read_csv("./SpamAssassin.csv")

# Preview the first rows as a quick sanity check of the load.
csv_file.head()

In [None]:
from traintest import ClassificationModel, models, add_to_json_array

# Accumulates the metrics object returned by every model evaluation, across all
# feature variants; the final cell converts this list and writes it to JSON.
# NOTE: re-running a training cell without re-running this cell appends duplicates.
metric_results = []

def train_and_evaluate_model(X, y, dataset_name):
    """Validate and evaluate every entry of `models` on one feature set.

    For each entry of the imported `models` collection, a `ClassificationModel`
    is built for `dataset_name`, its `validation` method is called on `X` and
    `y` with `save_model=True`, and the result of `evaluate(detailed=True)` is
    printed and appended to the module-level `metric_results` list.

    Args:
        X: Feature data handed to `ClassificationModel.validation`.
        y: Target labels handed to `ClassificationModel.validation`.
        dataset_name: Label printed in the header and passed to
            `ClassificationModel`.

    Returns:
        None. Results are reported through stdout and `metric_results`.
    """
    print(f"{dataset_name} classification report")
    print("=========================================")

    for estimator in models:
        runner = ClassificationModel(estimator, dataset_name)
        runner.validation(X, y, save_model=True)

        estimator_name = estimator.__class__.__name__
        print(f"{estimator_name} classification report")

        report = runner.evaluate(detailed=True)
        # Keep the report so the final cell can serialize every run together.
        metric_results.append(report)
        print(report)
        print("\n")

In [None]:
# Bind remove_numbers=True onto preprocessing.preprocess_text so the resulting
# callable can be applied element-wise with a single argument.
preprocession = partial(
    preprocessing.preprocess_text,
    remove_numbers=True
)

# Run the bound preprocessor over every value of the "Body" column.
# NOTE(review): assumes every Body value is a string — confirm there are no missing bodies.
preprocessed_data = csv_file["Body"].apply(preprocession)
preprocessed_data

## Stemming + CountVectorizer

In [None]:
# Apply the project's `preprocessing.stemming` helper to each preprocessed body.
# This result is reused by the TF-IDF cell further down.
spam_assassin_stemming = preprocessed_data.apply(preprocessing.stemming)
# Vectorize the stemmed text with the "countvectorizer" option of the project helper.
# Presumably this is a bag-of-words count matrix — confirm in the preprocessing module.
spam_assassin_stemming_countvec = preprocessing.vectorizing(spam_assassin_stemming, "countvectorizer")

In [None]:
# The dataset label uses the "stemmed" form so it matches the naming of the other
# three runs (stemmed_tfidf, lemmatized_countvec, lemmatized_tfidf); the label is
# passed on to ClassificationModel and printed in the report header.
train_and_evaluate_model(
    spam_assassin_stemming_countvec,
    csv_file["Is_spam"],
    "spam_assassin_stemmed_countvec",
)

## Stemming + TF-IDF Vectorizer

In [None]:
# Vectorize the stemmed text (computed in the CountVectorizer section above) with
# the "tfidf" option of the project helper.
spam_assassin_stemming_tfidf = preprocessing.vectorizing(spam_assassin_stemming, "tfidf")

In [None]:
# Train and evaluate every model on the stemmed TF-IDF features.
train_and_evaluate_model(
    X=spam_assassin_stemming_tfidf,
    y=csv_file["Is_spam"],
    dataset_name="spam_assassin_stemmed_tfidf",
)

## Lemmatizing + CountVectorizer

In [None]:
# Apply the project's `preprocessing.lemmatizing` helper to each preprocessed body.
# This result is reused by the TF-IDF cell further down.
spam_assassin_lemmatizing = preprocessed_data.apply(preprocessing.lemmatizing)
# Vectorize the lemmatized text with the "countvectorizer" option of the project helper.
spam_assassin_lemmatizing_countvec = preprocessing.vectorizing(spam_assassin_lemmatizing, "countvectorizer")

In [None]:
# Train and evaluate every model on the lemmatized count-vector features.
train_and_evaluate_model(
    X=spam_assassin_lemmatizing_countvec,
    y=csv_file["Is_spam"],
    dataset_name="spam_assassin_lemmatized_countvec",
)

## Lemmatizing + TF-IDF Vectorizer

In [None]:
# Vectorize the lemmatized text (computed in the CountVectorizer section above)
# with the "tfidf" option of the project helper.
spam_assassin_lemmatizing_tfidf = preprocessing.vectorizing(spam_assassin_lemmatizing, "tfidf")

In [None]:
# Train and evaluate every model on the lemmatized TF-IDF features.
train_and_evaluate_model(
    X=spam_assassin_lemmatizing_tfidf,
    y=csv_file["Is_spam"],
    dataset_name="spam_assassin_lemmatized_tfidf",
)

In [None]:
# Show the collected metrics as they are before the JSON conversion below.
print(metric_results)

def convert(dic):
    """Return a JSON-serializable copy of a metrics dict.

    The "confusion_matrix" entry is turned into nested Python lists through its
    `tolist()` method. A value without `tolist()` (for example a list produced
    by an earlier call) is kept as is, so re-running the cell that applies this
    function does not fail.

    Args:
        dic: Metrics mapping that contains a "confusion_matrix" key.

    Returns:
        A shallow copy of `dic` with a list-based "confusion_matrix". The input
        mapping itself is left unmodified.

    Raises:
        KeyError: If `dic` has no "confusion_matrix" key.
    """
    converted = dict(dic)
    matrix = converted["confusion_matrix"]
    # Array-like objects expose tolist(); already-converted lists pass through.
    if hasattr(matrix, "tolist"):
        converted["confusion_matrix"] = matrix.tolist()
    return converted

# Replace each metrics dict with its JSON-ready form (confusion matrix as nested lists).
metric_results = [convert(entry) for entry in metric_results]

# Use the JSON_WRITE_MODE constant from the configuration cell at the top of the
# notebook rather than repeating its literal value here.
add_to_json_array(
    "./spam_assassin_grid_classification_report.json",
    metric_results,
    mode=JSON_WRITE_MODE,
)