In [7]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import os
from sklearn.model_selection import GridSearchCV

In [2]:
train_files = [
    "One-Hot/Train_Orig_OH.csv",
    "One-Hot/Scaled/Train_Scaled_All_OH.csv",
    "One-Hot/Scaled/Train_Scaled_Cont_OH.csv",
    "One-Hot/MinMax/Train_MM_OH.csv",
    "One-Hot/MinMax/train_OH_MM_PCA15.csv",
    "One-Hot/MinMax/train_OH_MM_PCA20.csv",
    "One-Hot/MinMax/train_OH_MM_PCA25.csv",
    "One-Hot/MinMax/train_OH_MM_PCA30.csv",
    "One-Hot/MinMax/train_OH_MM_PCA35.csv",
    "IntClasses/Train_Orig_Int.csv",
    "IntClasses/Scaled/Train_Scaled_All_Int.csv",
    "IntClasses/Scaled/Train_Scaled_Cont_Int.csv",
    "IntClasses/MinMax/Train_MM_Int.csv",
    "IntClasses/MinMax/train_Int_MM_PCA10.csv",
    "IntClasses/MinMax/train_Int_MM_PCA15.csv",
    "IntClasses/MinMax/train_Int_MM_PCA20.csv",
    "IntClasses/MinMax/train_Int_MM_PCA25.csv",
]

In [3]:
results = []

def evaluate_model(name, model, X, y, dataset_type):
    y_pred = model.predict(X)
    acc = accuracy_score(y, y_pred)
    prec = precision_score(y, y_pred, average='binary')
    rec = recall_score(y, y_pred, average='binary')
    f1 = f1_score(y, y_pred, average='binary')

    print(f"\n{name} - {dataset_type} Evaluation")
    print("Confusion Matrix:\n", confusion_matrix(y, y_pred))
    print("Accuracy:", acc)
    print("Precision:", prec)
    print("Recall:", rec)
    print("F1 Score:", f1)

    return acc, prec, rec, f1

In [5]:
for path in train_files:
    print("\n================================================================================")
    print(f"Processing dataset: {path}")
    
    train_df = pd.read_csv(path)
    valid_path = path.replace("Train", "Valid").replace("train", "valid")
    test_path = path.replace("Train", "Test").replace("train", "test")

    valid_df = pd.read_csv(valid_path)
    test_df = pd.read_csv(test_path)

    X_train = train_df.iloc[:, :-2]
    y_train = train_df.iloc[:, -2]

    X_valid = valid_df.iloc[:, :-2]
    y_valid = valid_df.iloc[:, -2]

    X_test = test_df.iloc[:, :-2]
    y_test = test_df.iloc[:, -2]

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    name = os.path.basename(path)
    acc, prec, rec, f1 = evaluate_model(name, model, X_valid, y_valid, "Validation")

    results.append({
        "Dataset": name,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1": f1
    })

results_df = pd.DataFrame(results)
results_df


Processing dataset: One-Hot/Train_Orig_OH.csv

Train_Orig_OH.csv - Validation Evaluation
Confusion Matrix:
 [[2944  100]
 [ 201  755]]
Accuracy: 0.92475
Precision: 0.8830409356725146
Recall: 0.7897489539748954
F1 Score: 0.8337934842628382

Processing dataset: One-Hot/Scaled/Train_Scaled_All_OH.csv

Train_Scaled_All_OH.csv - Validation Evaluation
Confusion Matrix:
 [[2944  100]
 [ 201  755]]
Accuracy: 0.92475
Precision: 0.8830409356725146
Recall: 0.7897489539748954
F1 Score: 0.8337934842628382

Processing dataset: One-Hot/Scaled/Train_Scaled_Cont_OH.csv

Train_Scaled_Cont_OH.csv - Validation Evaluation
Confusion Matrix:
 [[2944  100]
 [ 201  755]]
Accuracy: 0.92475
Precision: 0.8830409356725146
Recall: 0.7897489539748954
F1 Score: 0.8337934842628382

Processing dataset: One-Hot/MinMax/Train_MM_OH.csv

Train_MM_OH.csv - Validation Evaluation
Confusion Matrix:
 [[2943  101]
 [ 202  754]]
Accuracy: 0.92425
Precision: 0.8818713450292398
Recall: 0.7887029288702929
F1 Score: 0.83268912203202

Unnamed: 0,Dataset,Accuracy,Precision,Recall,F1
0,Train_Orig_OH.csv,0.92475,0.883041,0.789749,0.833793
1,Train_Scaled_All_OH.csv,0.92475,0.883041,0.789749,0.833793
2,Train_Scaled_Cont_OH.csv,0.92475,0.883041,0.789749,0.833793
3,Train_MM_OH.csv,0.92425,0.881871,0.788703,0.832689
4,train_OH_MM_PCA15.csv,0.7715,0.548387,0.248954,0.342446
5,train_OH_MM_PCA20.csv,0.7665,0.528497,0.213389,0.304024
6,train_OH_MM_PCA25.csv,0.78625,0.616092,0.280335,0.385334
7,train_OH_MM_PCA30.csv,0.864,0.835505,0.536611,0.653503
8,train_OH_MM_PCA35.csv,0.8835,0.934397,0.551255,0.693421
9,Train_Orig_Int.csv,0.92425,0.876586,0.794979,0.83379


In [6]:
best_accuracy = results_df.loc[results_df['Accuracy'].idxmax()]
best_precision = results_df.loc[results_df['Precision'].idxmax()]
best_recall = results_df.loc[results_df['Recall'].idxmax()]
best_f1 = results_df.loc[results_df['F1'].idxmax()]

print("BEST PERFORMING DATASETS")
print("Highest Accuracy: ", best_accuracy['Dataset'], f"({best_accuracy['Accuracy']:.4f})")
print("Highest Precision:", best_precision['Dataset'], f"({best_precision['Precision']:.4f})")
print("Highest Recall:   ", best_recall['Dataset'], f"({best_recall['Recall']:.4f})")
print("Highest F1 Score: ", best_f1['Dataset'], f"({best_f1['F1']:.4f})")

BEST PERFORMING DATASETS
Highest Accuracy:  Train_Orig_OH.csv (0.9247)
Highest Precision: train_OH_MM_PCA35.csv (0.9344)
Highest Recall:    Train_Orig_Int.csv (0.7950)
Highest F1 Score:  Train_Orig_OH.csv (0.8338)


In [10]:
param_grid = {
    'n_estimators': [200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_leaf': [2, 3, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False],
    'class_weight': [None, 'balanced']
}

model = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=2, n_jobs=-1, verbose=2)

In [11]:
path = "One-Hot/Train_Orig_OH.csv"

train_df = pd.read_csv(path)
valid_path = path.replace("Train", "Valid").replace("train", "valid")
test_path = path.replace("Train", "Test").replace("train", "test")

valid_df = pd.read_csv(valid_path)
test_df = pd.read_csv(test_path)

X_train = train_df.iloc[:, :-2]
y_train = train_df.iloc[:, -2]

X_valid = valid_df.iloc[:, :-2]
y_valid = valid_df.iloc[:, -2]

X_test = test_df.iloc[:, :-2]
y_test = test_df.iloc[:, -2]

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

Fitting 2 folds for each of 144 candidates, totalling 288 fits
Best Parameters: {'bootstrap': False, 'class_weight': 'balanced', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 200}


In [12]:
model = RandomForestClassifier(random_state=42, bootstrap=False, class_weight='balanced', max_depth=None, max_features='sqrt', min_samples_leaf=2, n_estimators=200)

model.fit(X_train, y_train)

evaluate_model(name, model, X_valid, y_valid, "Validation")
evaluate_model(name, model, X_test, y_test, "Test")


train_Int_MM_PCA25.csv - Validation Evaluation
Confusion Matrix:
 [[2919  125]
 [ 165  791]]
Accuracy: 0.9275
Precision: 0.8635371179039302
Recall: 0.8274058577405857
F1 Score: 0.8450854700854701

train_Int_MM_PCA25.csv - Test Evaluation
Confusion Matrix:
 [[2899  145]
 [ 158  798]]
Accuracy: 0.92425
Precision: 0.8462354188759279
Recall: 0.8347280334728033
F1 Score: 0.8404423380726699


(0.92425, 0.8462354188759279, 0.8347280334728033, 0.8404423380726699)

In [17]:
path = "IntClasses/RobustScaled/train_Int_Rscaled.csv"

train_df = pd.read_csv(path)
valid_path = path.replace("Train", "Valid").replace("train", "valid")
test_path = path.replace("Train", "Test").replace("train", "test")

valid_df = pd.read_csv(valid_path)
test_df = pd.read_csv(test_path)

X_train = train_df.iloc[:, :-2]
y_train = train_df.iloc[:, -2]

X_valid = valid_df.iloc[:, :-2]
y_valid = valid_df.iloc[:, -2]

X_test = test_df.iloc[:, :-2]
y_test = test_df.iloc[:, -2]

model = RandomForestClassifier(random_state=42, bootstrap=False, class_weight='balanced', max_depth=None, max_features='sqrt', min_samples_leaf=2, n_estimators=200)

model.fit(X_train, y_train)

name = os.path.basename(path)

evaluate_model(name, model, X_valid, y_valid, "Validation")
evaluate_model(name, model, X_test, y_test, "Test")


train_Int_Rscaled.csv - Validation Evaluation
Confusion Matrix:
 [[2922  122]
 [ 166  790]]
Accuracy: 0.928
Precision: 0.8662280701754386
Recall: 0.8263598326359832
F1 Score: 0.8458244111349037

train_Int_Rscaled.csv - Test Evaluation
Confusion Matrix:
 [[2890  154]
 [ 159  797]]
Accuracy: 0.92175
Precision: 0.8380651945320715
Recall: 0.8336820083682008
F1 Score: 0.8358678552700577


(0.92175, 0.8380651945320715, 0.8336820083682008, 0.8358678552700577)

In [18]:
path = "One-Hot/RobustScaled/train_OH_Rscaled.csv"

train_df = pd.read_csv(path)
valid_path = path.replace("Train", "Valid").replace("train", "valid")
test_path = path.replace("Train", "Test").replace("train", "test")

valid_df = pd.read_csv(valid_path)
test_df = pd.read_csv(test_path)

X_train = train_df.iloc[:, :-2]
y_train = train_df.iloc[:, -2]

X_valid = valid_df.iloc[:, :-2]
y_valid = valid_df.iloc[:, -2]

X_test = test_df.iloc[:, :-2]
y_test = test_df.iloc[:, -2]

model = RandomForestClassifier(random_state=42, bootstrap=False, class_weight='balanced', max_depth=None, max_features='sqrt', min_samples_leaf=2, n_estimators=200)

model.fit(X_train, y_train)

name = os.path.basename(path)

evaluate_model(name, model, X_valid, y_valid, "Validation")
evaluate_model(name, model, X_test, y_test, "Test")


train_OH_Rscaled.csv - Validation Evaluation
Confusion Matrix:
 [[2919  125]
 [ 164  792]]
Accuracy: 0.92775
Precision: 0.8636859323882224
Recall: 0.8284518828451883
F1 Score: 0.8457020822210358

train_OH_Rscaled.csv - Test Evaluation
Confusion Matrix:
 [[2899  145]
 [ 158  798]]
Accuracy: 0.92425
Precision: 0.8462354188759279
Recall: 0.8347280334728033
F1 Score: 0.8404423380726699


(0.92425, 0.8462354188759279, 0.8347280334728033, 0.8404423380726699)