# In [2]:
import argparse
import os
import subprocess
import sys

import joblib
import matplotlib.pyplot as plt
import pandas as pd
import scipy as sp
from codecarbon import EmissionsTracker
from joblib import dump, load
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    GridSearchCV,
    ShuffleSplit,
    permutation_test_score,
    train_test_split,
)

# In [None]:



def emissions_tracker(tracker_outpath):
    """
    Build the EmissionsTracker used to measure the carbon emissions of this script.

    The tracker is only constructed here, not started. It is configured with
    ``tracker_outpath`` as its output directory and "emissions_portfolio2.csv"
    as its output file. The results of this can be found in portfolio 5.
    """
    tracker_settings = {
        "project_name": "portfolio 2",
        "experiment_id": "portfolio_2",
        "output_dir": tracker_outpath,
        "output_file": "emissions_portfolio2.csv",
    }
    return EmissionsTracker(**tracker_settings)


def parser():
    """
    Parse the command-line options that switch the optional steps on or off.

    Both ``--GridSearch``/``-gs`` and ``--PermutationTest``/``-pt`` are
    required. Their values are returned lower-cased so that callers can
    compare them against 'yes' regardless of how the user typed them.
    """
    arg_parser = argparse.ArgumentParser()

    # (long flag, short flag, help text) for every yes/no switch of the script.
    yes_no_flags = (
        ("--GridSearch", "-gs", "Perform GridSearch (yes or no)"),
        ("--PermutationTest", "-pt", "Perform permutation test (yes or no)"),
    )
    for long_flag, short_flag, help_text in yes_no_flags:
        arg_parser.add_argument(long_flag, short_flag, required=True, help=help_text)

    parsed = arg_parser.parse_args()

    # Normalise the answers in place on the returned namespace.
    for attribute in ("GridSearch", "PermutationTest"):
        setattr(parsed, attribute, getattr(parsed, attribute).lower())
    return parsed


def load_vectorised_data():
    """
    Load the vectorised train/test split, generating it first if it is missing.

    The split is read from 'models/vectorized_data.pkl'. When that file does
    not exist, 'vectorizer.py' is run as a subprocess (it is expected to
    create the file) and the data is then loaded from disk.

    Returns:
        tuple: (X_train_features, y_train, X_test_features, y_test)

    Raises:
        subprocess.CalledProcessError: if 'vectorizer.py' exits with an error.
        FileNotFoundError: if the data file is still missing after the run.
    """
    data_path = 'models/vectorized_data.pkl'

    if not os.path.isfile(data_path):
        # Use the interpreter running this script so the same environment is
        # used; check=True makes a failing vectoriser stop the pipeline here
        # instead of surfacing later as a confusing load error.
        subprocess.run([sys.executable, 'vectorizer.py'], check=True)

    # NOTE: joblib.load unpickles arbitrary objects - only load trusted files.
    X_train_features, y_train, X_test_features, y_test = joblib.load(data_path)

    return X_train_features, y_train, X_test_features, y_test


def define_classifier():
    """
    Create the baseline (unfitted) logistic regression classifier.

    The classifier uses an L2 penalty with the 'lbfgs' solver, a fixed
    random_state of 123 and verbose output enabled.
    """
    baseline_params = dict(
        penalty='l2',
        solver='lbfgs',
        tol=0.0001,
        max_iter=100,
        random_state=123,
        verbose=True,
    )
    return LogisticRegression(**baseline_params)


def grid_search(classifier, X_train, y_train):
    """
    Tune the classifier's hyperparameters with an exhaustive grid search.

    Runs a 5-fold cross-validated GridSearchCV over tolerance, iteration
    limit, solver and penalty, prints the mean and standard deviation of the
    test score for every parameter combination, and returns the best
    estimator found.
    """
    tol = [0.01, 0.001, 0.0001, 0.00001]
    max_iter = [100, 200, 300]

    # One sub-grid per penalty, so each penalty gets its own solver list.
    # Every value must be a list: a parenthesised string such as ('l1') is
    # still a plain string and is rejected by the parameter grid.
    param_grid = [
        {
            'tol': tol,
            'max_iter': max_iter,
            'solver': ['saga', 'liblinear'],
            'penalty': ['l1'],
        },
        {
            'tol': tol,
            'max_iter': max_iter,
            'solver': ['saga', 'liblinear', 'lbfgs', 'newton-cg'],
            'penalty': ['l2'],
        },
    ]

    searcher = GridSearchCV(estimator=classifier, param_grid=param_grid, cv=5, n_jobs=-1)
    grid_result = searcher.fit(X_train, y_train)

    print(f'Best Accuracy for {grid_result.best_score_} using the parameters {grid_result.best_params_}')

    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print(f' mean={mean:.4}, std={stdev:.4} using {param}')

    return grid_result.best_estimator_


def fit_classifier(classifier, X_train, y_train):
    """
    Train the given classifier on the training data and return the result.

    The classifier may be the baseline one or the estimator selected by
    GridSearch; whatever its ``fit`` method returns is handed back.
    """
    return classifier.fit(X_train, y_train)


def evaluate_classifier(classifier, X_train_features, y_train, X_test_features, y_test, outpath):
    """
    Evaluate the trained classifier on the test split.

    Plots a confusion matrix for the test data, prints a classification
    report and writes that report to ``outpath``.

    ``X_train_features`` and ``y_train`` are not used by the evaluation; they
    are kept in the signature so existing callers continue to work.
    """
    y_pred = classifier.predict(X_test_features)

    # The confusion matrix must pair the test features with the test labels;
    # passing the training labels here mismatches samples and targets.
    metrics.ConfusionMatrixDisplay.from_estimator(classifier,
                                                  X_test_features, y_test,
                                                  cmap=plt.cm.Blues,
                                                  labels=["FAKE", "REAL"])

    classifier_metrics = metrics.classification_report(y_test, y_pred, target_names=["FAKE", "REAL"])
    print(classifier_metrics)

    with open(outpath, 'w') as file:
        file.write(classifier_metrics)

    print("The classification report has been saved to the out folder")


def permutation_test(classifier, X_train_features, y_train, outpath):
    """
    Run a permutation test to assess the significance of the classifier's score.

    The classifier is scored with 5-fold cross-validation on the given data
    and on 100 label permutations. A histogram of the permutation scores,
    together with the actual score and the chance level, is saved to
    ``outpath`` and then shown.
    """
    # scoring=None lets the estimator's own default scorer be used.
    score, permutation_scores, pvalue = permutation_test_score(classifier, X_train_features, y_train,
                                                               cv=5, n_permutations=100,
                                                               n_jobs=-1, random_state=123,
                                                               verbose=True, scoring=None)
    # Binary task (FAKE vs REAL): the chance level is drawn at 1 / n_classes.
    n_classes = 2

    plt.figure(figsize=(8, 6))
    plt.hist(permutation_scores, 20, label='Permutation scores', edgecolor='black')
    # Remember the histogram's y-range so the vertical marker lines span it.
    ylim = plt.ylim()
    plt.plot(2 * [score], ylim, '--g', linewidth=3, label='Classification Score (pvalue %s)' % pvalue)
    plt.plot(2 * [1. / n_classes], ylim, '--k', linewidth=3, label='Chance level')
    plt.title("Permutation test logistic regression classifier")
    plt.ylim(ylim)
    plt.legend()
    plt.xlabel('Score')
    # Save before show(): showing first can leave an empty figure to save.
    plt.savefig(outpath)
    plt.show()

    print("The permutation test has been saved to the out folder")


def main():
    """
    Run the logistic regression pipeline from the command line.

    Steps: parse the arguments, load the vectorised data, define the
    classifier (optionally tuned with GridSearch), fit it, write the
    classification report and, if requested, run the permutation test.
    """
    args = parser()

    X_train_features, y_train, X_test_features, y_test = load_vectorised_data()

    classifier = define_classifier()

    if args.GridSearch == 'yes':
        classifier = grid_search(classifier, X_train_features, y_train)

    classifier = fit_classifier(classifier, X_train_features, y_train)

    evaluate_classifier(classifier, X_train_features, y_train, X_test_features, y_test,
                        "out/LR_classification_report.txt")

    if args.PermutationTest == 'yes':
        # The features passed here must be the ones paired with y_test.
        permutation_test(classifier, X_test_features, y_test, "out/LR_permutation.png")


if __name__ == "__main__":
    main()