In [3]:
import csv
from sklearn.feature_extraction.text import TfidfVectorizer

train_x = []
train_y = []
with open('train.csv', newline='') as f:
    reader = csv.reader(f, delimiter=',', quotechar='|')
    for [line, label] in reader:
        train_x.append(line)
        train_y.append(int(label))

vectorizer = TfidfVectorizer()
train_x = vectorizer.fit_transform(train_x)

validation_x = []
validation_y = []
with open('validation.csv', newline='') as f:
    reader = csv.reader(f, delimiter=',', quotechar='|')
    for [line, label] in reader:
        validation_x.append(line)
        validation_y.append(int(label))

validation_x = vectorizer.transform(validation_x)

test_x = []
test_y = []
with open('test.csv', newline='') as f:
    reader = csv.reader(f, delimiter=',', quotechar='|')
    for [line, label] in reader:
        test_x.append(line)
        test_y.append(int(label))

test_x = vectorizer.transform(test_x)

In [4]:
# Tutorial used for MLFlow - https://mlflow.org/docs/latest/tutorials-and-examples/tutorial.html (mentioned in the notes)
# I searched AUCPR on Google as I didn't know what it was.
# I also referred to Scikit learn docs.

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, precision_recall_curve, auc
import mlflow
import mlflow.sklearn
from urllib.parse import urlparse

def evaluate(model):
    train_score = model.score(train_x, train_y)
    validation_score = model.score(validation_x, validation_y)
    test_score = model.score(test_x, test_y)
    test_predict = model.predict(test_x)
    print('Training accuracy: ', train_score)
    print('Validation accuracy: ', validation_score)
    print('Test accuracy: ', test_score)
    print('Test precision: ', precision_score(test_y, test_predict))
    print('Test recall: ', recall_score(test_y, test_predict))
    precision, recall, _ = precision_recall_curve(test_y, test_predict)
    test_auc = auc(precision, recall)
    print("Test auc: ", test_auc)
    return [
        train_score, 
        validation_score, 
        test_score, 
        precision_score(test_y, test_predict), 
        recall_score(test_y, test_predict),
        test_auc
    ]

with mlflow.start_run():
    print('Naive Bayes:')
    model_naive_bayes = MultinomialNB()
    model_naive_bayes.fit(train_x, train_y)
    [
        train_score,
        validation_score,
        test_score,
        test_precision,
        test_recall,
        test_auc
    ] = evaluate(model_naive_bayes)
    print('')

    mlflow.log_metric("train_score", train_score)
    mlflow.log_metric("validation_score", validation_score)
    mlflow.log_metric("test_score", test_score)
    mlflow.log_metric("test_precision", test_precision)
    mlflow.log_metric("test_recall", test_recall)
    mlflow.log_metric("test_auc", test_auc)

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    if tracking_url_type_store != 'file':
        mlflow.sklearn.log_model(model_naive_bayes, "model", registered_model_name="NaiveBayesModel")
    else:
        mlflow.sklearn.log_model(model_naive_bayes, "model")

with mlflow.start_run():
    print('Logistic Regression:')
    model_logistic_regression = LogisticRegression(random_state=0)
    model_logistic_regression.fit(train_x, train_y)
    [
        train_score,
        validation_score,
        test_score,
        test_precision,
        test_recall,
        test_auc
    ] = evaluate(model_logistic_regression)
    print('')

    mlflow.log_metric("train_score", train_score)
    mlflow.log_metric("validation_score", validation_score)
    mlflow.log_metric("test_score", test_score)
    mlflow.log_metric("test_precision", test_precision)
    mlflow.log_metric("test_recall", test_recall)
    mlflow.log_metric("test_auc", test_auc)

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    if tracking_url_type_store != 'file':
        mlflow.sklearn.log_model(model_naive_bayes, "model", registered_model_name="LogisticRegressionModel")
    else:
        mlflow.sklearn.log_model(model_naive_bayes, "model")

with mlflow.start_run():
    print('Random Forest:')
    model_random_forest = RandomForestClassifier(random_state=0)
    model_random_forest.fit(train_x, train_y)
    [
        train_score,
        validation_score,
        test_score,
        test_precision,
        test_recall,
        test_auc
    ] = evaluate(model_random_forest)

    mlflow.log_metric("train_score", train_score)
    mlflow.log_metric("validation_score", validation_score)
    mlflow.log_metric("test_score", test_score)
    mlflow.log_metric("test_precision", test_precision)
    mlflow.log_metric("test_recall", test_recall)
    mlflow.log_metric("test_auc", test_auc)

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    if tracking_url_type_store != 'file':
        mlflow.sklearn.log_model(model_naive_bayes, "model", registered_model_name="RandomForestModel")
    else:
        mlflow.sklearn.log_model(model_naive_bayes, "model")

Naive Bayes:
Training accuracy:  0.9694950012817226
Validation accuracy:  0.9521531100478469
Test accuracy:  0.9581839904420549
Test precision:  1.0
Test recall:  0.7265625
Test auc:  0.7312621341099164

Logistic Regression:
Training accuracy:  0.9728274801332991
Validation accuracy:  0.9712918660287081
Test accuracy:  0.9689366786140979
Test precision:  1.0
Test recall:  0.796875
Test auc:  0.7610420400238949

Random Forest:
Training accuracy:  1.0
Validation accuracy:  0.9748803827751196
Test accuracy:  0.973715651135006
Test precision:  1.0
Test recall:  0.828125
Test auc:  0.7742775537634409
