In [1]:
import glob
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

sys.path.append("../../../")

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

from helpers.assess import make_classification_report, make_confusion_matrix
from helpers.split import make_train_test_split, tag_label_feature_split

In [2]:
def logistic_regression_experiment(
    dataset, optimizer='lbfgs', penalty='l2', max_iteration=100, verbose=0,
    random_state=10,
):
    """Fit and evaluate a class-balanced logistic regression on one dataset.

    The dataset is loaded from a pickle file, split into train and test
    partitions (80/20, stratified on the encoded labels), and a
    ``LogisticRegression`` with ``C=10.0`` and ``class_weight="balanced"``
    is fitted on the training partition. Train and test accuracy are
    printed, a classification report is produced for each partition, and a
    confusion matrix is produced for the test partition.

    Parameters
    ----------
    dataset : str
        Path to a pickle file readable by ``pandas.read_pickle``. The file
        name up to its first ``.`` is used to label the experiment.
    optimizer : str, default 'lbfgs'
        Forwarded to ``LogisticRegression`` as ``solver``.
    penalty : str, default 'l2'
        Forwarded to ``LogisticRegression`` as ``penalty``.
    max_iteration : int, default 100
        Forwarded to ``LogisticRegression`` as ``max_iter``.
    verbose : int, default 0
        Forwarded to ``LogisticRegression`` as ``verbose``.
    random_state : int, default 10
        Seed used for both the train/test split and the estimator, so that
        solvers which shuffle the data (``liblinear``, ``sag``, ``saga``)
        give repeatable results across runs.

    Returns
    -------
    sklearn.linear_model.LogisticRegression
        The fitted model.

    Notes
    -----
    Both classification reports are requested with ``save_result=True`` and
    ``result_filename="rich_results.json"``; presumably the helper appends
    them to that file -- confirm against ``helpers.assess``.
    """

    dataset_name = os.path.basename(dataset).split(".")[0]
    experiment_label = "logistic_regression_" + dataset_name
    print(f"\n\nExperiment: {experiment_label}\n")

    # load data and separate label strings from features,
    # then encode label strings
    # NOTE(review): unpickling can execute arbitrary code -- only pass
    # dataset files from a trusted source.
    df = pd.read_pickle(dataset)
    _, (y, le), X = tag_label_feature_split(df, label_format="encoded")

    # divide into train and test data sets
    # (x_scaler="standard" presumably standardizes the features -- see
    # helpers.split for the exact behavior)
    X_train_std, X_test_std, y_train, y_test = make_train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=random_state,
        stratify=y,
        x_scaler="standard",
    )

    model = LogisticRegression(
        C=10.0,
        solver=optimizer,
        penalty=penalty,
        multi_class="auto",
        class_weight="balanced",
        max_iter=max_iteration,
        verbose=verbose,
        # Seed the estimator as well as the split: without this the
        # shuffling solvers produce slightly different fits on every run.
        random_state=random_state,
    )

    model.fit(X_train_std, y_train)

    predictions = model.predict(X_test_std)
    train_accuracy = model.score(X_train_std, y_train)
    test_accuracy = model.score(X_test_std, y_test)

    # The "name = value" text in the two prints below comes from the
    # f-string "=" specifier, so the variable names are part of the output.
    print(f"Train: {train_accuracy = :f}\n")
    make_classification_report(
        y_train,
        model=model,
        x=X_train_std,
        digits=4,
        label_encoder=le,
        print_report=True,
        save_result=True,
        result_filename="rich_results.json",
        model_name=experiment_label+"_train",
        repeat=True
    )
    print(f"\nTest: {test_accuracy = :f}\n")
    make_classification_report(
        y_test,
        y_pred=predictions,
        digits=4,
        label_encoder=le,
        print_report=True,
        save_result=True,
        result_filename="rich_results.json",
        model_name=experiment_label+"_test",
        repeat=True
    )

    make_confusion_matrix(
        y_test,
        y_pred=predictions,
        label_encoder=le,
        title=f"{dataset_name} test (row normalized)",
    )

    return model

In [None]:
# Run the L1-penalized (liblinear) experiment on every pickled dataset,
# visiting the files in sorted path order.
dataset_folder = "../../../datasets/"
datasets = sorted(glob.glob(dataset_folder + "dataset_*.pickle"))
for dataset in datasets:
    logistic_regression_experiment(
        dataset,
        penalty='l1',
        optimizer='liblinear',
    )



Experiment: logistic_regression_dataset_00_all

