# Importing necessary libraries

In [65]:
import re
import csv
import nltk
import pandas
import mlflow
import sklearn
import numpy as np
from sklearn.metrics import average_precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix, auc, precision_recall_curve, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, train_test_split, learning_curve
from sklearn.tree import DecisionTreeClassifier
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from mlflow.models import infer_signature

# Preprocessing data

In [5]:
# Dowloading NLTK stopwords
nltk.download("stopwords")

def preprocess_data(data):

    # Remove characters other than English letters and digits
    data['text'] = data['text'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))

    # Convert to lowercase
    data['text'] = data['text'].apply(lambda x: x.lower())

    # Remove stopwords
    s = set(stopwords.words("english"))
    data['text'] = data['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in s and word]))

    return data

texts = pandas.read_csv("raw_data.csv")
texts = preprocess_data(texts)

[nltk_data] Downloading package stopwords to /home/tumon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Splitting the preprocessed data

In [6]:
# Data splitting: 80% to training data, 5% to validation, 15% to test data
def split_data(data, test_size = 0.2, validation_size = 0.25, output_path = './'):

    # Split the data into train and test sets
    train_data, test_data = train_test_split(data, test_size = test_size, random_state = 1)

    # Further split the test data into validation and test sets
    validation_data, test_data = train_test_split(test_data, test_size = validation_size, random_state = 1)

    train_data.to_csv(f'{output_path}/train.csv', index = False)
    validation_data.to_csv(f'{output_path}/validation.csv', index = False)
    test_data.to_csv(f'{output_path}/test.csv', index = False)

split_data(texts)

# Loading the preprocessed train, validation, and test data

In [7]:
# Loading preprocessed train data
X_train = pandas.read_csv("train.csv")
y_train = X_train['spam']
X_train_text = X_train['text']

# Loading preprocessed validation data
X_validation = pandas.read_csv("validation.csv") 
y_validation = X_validation['spam']
X_validation_text = X_validation['text']

# Loading preprocessed test data
X_test = pandas.read_csv("test.csv")
y_test = X_test['spam']
X_test_text = X_test['text']

# Data Vectorization

In [8]:
tfidf_vectorizer = TfidfVectorizer(max_features = 50000)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_text)
X_validation_tfidf = tfidf_vectorizer.transform(X_validation_text)
X_test_tfidf = tfidf_vectorizer.transform(X_test_text)

# Setting up MLflow tracking

In [70]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")

# The models

In [71]:
models = [
    ("Logistic Regression", LogisticRegression()),
    ("Random Forest Classifier", RandomForestClassifier()),
    ("Support Vector Machine", SVC(probability=True))
]

# Building, tracking, and registering the three models

In [72]:
for model_name, model in models:
    with mlflow.start_run() as run:

        model.fit(X_train_tfidf, y_train)

        y_pred = model.predict(X_validation_tfidf)
        signature = infer_signature(X_test, y_pred)

        mlflow.log_params(model.get_params())
        mlflow.log_metrics({"mse": mean_squared_error(y_validation, y_pred)})

        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path="sklearn-model",
            signature=signature,
            registered_model_name=model_name,
        )

Successfully registered model 'Logistic Regression'.
2024/02/20 23:18:02 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Logistic Regression, version 1
Created version '1' of model 'Logistic Regression'.
Successfully registered model 'Random Forest Classifier'.
2024/02/20 23:18:07 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Random Forest Classifier, version 1
Created version '1' of model 'Random Forest Classifier'.
Successfully registered model 'Support Vector Machine'.
2024/02/20 23:18:51 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Support Vector Machine, version 1
Created version '1' of model 'Support Vector Machine'.


# Printing AUCPR for the three models

In [79]:
# Define the run IDs and model names for the three models
run_info = [
    {"run_id": "bda6b42d1c4e42fc882ea84c39c7fcd0", "model_name": "Logistic Regression"},
    {"run_id": "637ff480f61846c5b6218465ea136fc6", "model_name": "Random Forest Classifier"},
    {"run_id": "985c9edd9b34412598b217427d73188a", "model_name": "SVC"}
]

for info in run_info:
    # Load the model artifact for the current run ID
    model = mlflow.sklearn.load_model(f"runs:/{info['run_id']}/sklearn-model")

    y_proba_val = model.predict_proba(X_validation_tfidf)[:, 1]
    precision_val, recall_val, _ = precision_recall_curve(y_validation, y_proba_val)
    aucpr_val = auc(recall_val, precision_val)

    print(f"Model Name: {info['model_name']}, Validation AUCPR = {aucpr_val}")


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Model Name: Logistic Regression, Validation AUCPR = 0.999770345798363


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Model Name: Random Forest Classifier, Validation AUCPR = 0.9974920067388685


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Model Name: SVC, Validation AUCPR = 0.9997981905712987
