In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import os

In [2]:
# Relative paths of the training CSVs to evaluate, One-Hot directory first and
# IntClasses second. The PCA entries within each directory differ only by the
# number embedded in the file name, so they are generated from that number.
train_files = [
    "One-Hot/Train_Orig_OH.csv",
    "One-Hot/Scaled/Train_Scaled_All_OH.csv",
    "One-Hot/Scaled/Train_Scaled_Cont_OH.csv",
    "One-Hot/MinMax/Train_MM_OH.csv",
    *(f"One-Hot/MinMax/train_OH_MM_PCA{k}.csv" for k in (15, 20, 25, 30, 35)),
    "IntClasses/Train_Orig_Int.csv",
    "IntClasses/Scaled/Train_Scaled_All_Int.csv",
    "IntClasses/Scaled/Train_Scaled_Cont_Int.csv",
    "IntClasses/MinMax/Train_MM_Int.csv",
    *(f"IntClasses/MinMax/train_Int_MM_PCA{k}.csv" for k in (10, 15, 20, 25)),
]

In [3]:
results = []

def evaluate_model(name, model, X, y, dataset_type):
    """Score ``model`` on ``(X, y)``, print a short report and return the metrics.

    Args:
        name: Label shown in the report header.
        model: Object exposing ``predict(X)``.
        X: Features handed to ``model.predict``.
        y: True labels the predictions are compared against.
        dataset_type: Split label shown in the header (e.g. "Validation").

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``. Precision, recall and F1
        are computed with ``average='binary'``.
    """
    predictions = model.predict(X)

    # Insertion order doubles as both the print order and the return order.
    scores = {
        "Accuracy": accuracy_score(y, predictions),
        "Precision": precision_score(y, predictions, average='binary'),
        "Recall": recall_score(y, predictions, average='binary'),
        "F1 Score": f1_score(y, predictions, average='binary'),
    }

    print(f"\n{name} - {dataset_type} Evaluation")
    print("Confusion Matrix:\n", confusion_matrix(y, predictions))
    for label, value in scores.items():
        print(f"{label}:", value)

    return tuple(scores.values())

In [4]:
# Start from an empty list so that re-running this cell rebuilds the results
# table instead of appending a second copy of every row to the previous run.
results = []

for path in train_files:
    print("\n" + "="*80)
    print(f"Processing dataset: {path}")

    # Derive the sibling validation/test paths by renaming the file name only,
    # so a directory component containing "train" is never rewritten.
    data_dir, train_name = os.path.split(path)
    valid_path = os.path.join(
        data_dir, train_name.replace("Train", "Valid").replace("train", "valid")
    )
    test_path = os.path.join(
        data_dir, train_name.replace("Train", "Test").replace("train", "test")
    )

    train_df = pd.read_csv(path)
    valid_df = pd.read_csv(valid_path)
    test_df = pd.read_csv(test_path)

    # PCA files: the last column is the target.
    # Other files: the second-to-last column is the target and the final
    # column is left out of both X and y.
    # NOTE(review): what that trailing column holds is not visible here —
    # confirm it is meant to be dropped.
    is_pca = 'PCA' in path
    target_col = -1 if is_pca else -2

    X_train = train_df.iloc[:, :target_col]
    y_train = train_df.iloc[:, target_col]

    X_valid = valid_df.iloc[:, :target_col]
    y_valid = valid_df.iloc[:, target_col]

    X_test = test_df.iloc[:, :target_col]
    y_test = test_df.iloc[:, target_col]

    # The saved output shows an "iterations reached limit" warning on the
    # unscaled Train_Orig_OH.csv data. max_iter is kept at 1000 so the numbers
    # stay comparable with earlier runs.
    # NOTE(review): consider raising it or restricting the run to scaled inputs.
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    name = os.path.basename(path)
    # Validation metrics are printed only; the test metrics are what get stored.
    evaluate_model(name, model, X_valid, y_valid, "Validation")
    acc, prec, rec, f1 = evaluate_model(name, model, X_test, y_test, "Test")

    results.append({
        "Dataset": name,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1": f1
    })

results_df = pd.DataFrame(results)
results_df


Processing dataset: One-Hot/Train_Orig_OH.csv


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Train_Orig_OH.csv - Validation Evaluation
Confusion Matrix:
 [[2894  150]
 [ 260  696]]
Accuracy: 0.8975
Precision: 0.8226950354609929
Recall: 0.7280334728033473
F1 Score: 0.7724750277469479

Train_Orig_OH.csv - Test Evaluation
Confusion Matrix:
 [[2900  144]
 [ 235  721]]
Accuracy: 0.90525
Precision: 0.8335260115606936
Recall: 0.75418410041841
F1 Score: 0.7918725974739154

Processing dataset: One-Hot/Scaled/Train_Scaled_All_OH.csv

Train_Scaled_All_OH.csv - Validation Evaluation
Confusion Matrix:
 [[2980   64]
 [  82  874]]
Accuracy: 0.9635
Precision: 0.9317697228144989
Recall: 0.9142259414225942
F1 Score: 0.9229144667370643

Train_Scaled_All_OH.csv - Test Evaluation
Confusion Matrix:
 [[2979   65]
 [  68  888]]
Accuracy: 0.96675
Precision: 0.9317943336831059
Recall: 0.9288702928870293
F1 Score: 0.930330015715034

Processing dataset: One-Hot/Scaled/Train_Scaled_Cont_OH.csv

Train_Scaled_Cont_OH.csv - Validation Evaluation
Confusion Matrix:
 [[2979   65]
 [  81  875]]
Accuracy: 0.9635

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Train_Orig_Int.csv - Validation Evaluation
Confusion Matrix:
 [[2903  141]
 [ 258  698]]
Accuracy: 0.90025
Precision: 0.831942789034565
Recall: 0.7301255230125523
F1 Score: 0.777715877437326

Train_Orig_Int.csv - Test Evaluation
Confusion Matrix:
 [[2900  144]
 [ 234  722]]
Accuracy: 0.9055
Precision: 0.8337182448036952
Recall: 0.7552301255230126
F1 Score: 0.7925356750823271

Processing dataset: IntClasses/Scaled/Train_Scaled_All_Int.csv

Train_Scaled_All_Int.csv - Validation Evaluation
Confusion Matrix:
 [[2972   72]
 [ 102  854]]
Accuracy: 0.9565
Precision: 0.9222462203023758
Recall: 0.893305439330544
F1 Score: 0.9075451647183848

Train_Scaled_All_Int.csv - Test Evaluation
Confusion Matrix:
 [[2959   85]
 [  85  871]]
Accuracy: 0.9575
Precision: 0.9110878661087866
Recall: 0.9110878661087866
F1 Score: 0.9110878661087866

Processing dataset: IntClasses/Scaled/Train_Scaled_Cont_Int.csv

Train_Scaled_Cont_Int.csv - Validation Evaluation
Confusion Matrix:
 [[2973   71]
 [ 102  854]]
Accu

Unnamed: 0,Dataset,Accuracy,Precision,Recall,F1
0,Train_Orig_OH.csv,0.90525,0.833526,0.754184,0.791873
1,Train_Scaled_All_OH.csv,0.96675,0.931794,0.92887,0.93033
2,Train_Scaled_Cont_OH.csv,0.967,0.931866,0.929916,0.93089
3,Train_MM_OH.csv,0.91,0.854762,0.751046,0.799555
4,train_OH_MM_PCA15.csv,0.783,0.670543,0.180962,0.285008
5,train_OH_MM_PCA20.csv,0.78225,0.588727,0.294979,0.393031
6,train_OH_MM_PCA25.csv,0.92875,0.872364,0.822176,0.846527
7,train_OH_MM_PCA30.csv,0.9345,0.886414,0.832636,0.858684
8,train_OH_MM_PCA35.csv,0.9985,0.997904,0.995816,0.996859
9,Train_Orig_Int.csv,0.9055,0.833718,0.75523,0.792536


In [5]:
# results_df stores the "Test" evaluation of each dataset, so every "best" row
# below is the arg-max of a test-split metric.
best_accuracy, best_precision, best_recall, best_f1 = (
    results_df.loc[results_df[metric].idxmax()]
    for metric in ("Accuracy", "Precision", "Recall", "F1")
)

print("\n" + "="*80)
print("BEST PERFORMING DATASETS")
# Labels are padded by hand so the dataset names line up in the printout.
for label, metric, best_row in (
    ("Highest Accuracy: ", "Accuracy", best_accuracy),
    ("Highest Precision:", "Precision", best_precision),
    ("Highest Recall:   ", "Recall", best_recall),
    ("Highest F1 Score: ", "F1", best_f1),
):
    print(label, best_row['Dataset'], f"({best_row[metric]:.4f})")


BEST PERFORMING DATASETS
Highest Accuracy:  train_Int_MM_PCA25.csv (0.9988)
Highest Precision: train_Int_MM_PCA25.csv (0.9979)
Highest Recall:    train_Int_MM_PCA25.csv (0.9969)
Highest F1 Score:  train_Int_MM_PCA25.csv (0.9974)
