In [31]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, ConfusionMatrixDisplay, make_scorer
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
import time
from sklearn.model_selection import GridSearchCV

In [38]:
def cross_validate_model(model, X, y, cv, bivariate, printer = True):
    """Cross-validate ``model`` on ``(X, y)`` and optionally print the mean fold scores.

    Scorer selection depends on ``bivariate``:

    * ``bivariate == False``: precision, recall and F1 are built with
      ``make_scorer(..., average='weighted', zero_division=1)``.
    * otherwise: the scorer names ``'accuracy'``, ``'precision'``, ``'recall'``
      and ``'f1'`` are passed to ``cross_validate`` as plain strings.

    Parameters
    ----------
    model : estimator forwarded to ``cross_validate``.
    X, y : features and labels forwarded to ``cross_validate``.
    cv : forwarded as ``cv=``.
    bivariate : selects the scorer set (see above).
    printer : when ``== True``, print the mean of each test metric and the
        wall-clock time spent inside ``cross_validate``.

    Returns
    -------
    The mapping returned by ``cross_validate`` (keys ``test_accuracy``,
    ``test_precision``, ``test_recall``, ``test_f1`` are read for printing).
    """
    if bivariate == False:
        weighted = dict(average='weighted', zero_division=1)
        scoring = {
            'accuracy': 'accuracy',
            'precision': make_scorer(precision_score, **weighted),
            'recall': make_scorer(recall_score, **weighted),
            'f1': make_scorer(f1_score, **weighted),
        }
        # The two scorer modes historically use different timing labels; keep both.
        timing_label = "Prediction time: "
    else:
        scoring = ['accuracy', 'precision', 'recall', 'f1']
        timing_label = "Time:"

    t0 = time.time()
    fold_scores = cross_validate(model, X, y, cv=cv, scoring=scoring)
    elapsed = time.time() - t0

    if printer == True:
        report = (
            ("Accuracy", 'test_accuracy'),
            ("Precision", 'test_precision'),
            ("Recall", 'test_recall'),
            ("F1 Score", 'test_f1'),
        )
        for label, key in report:
            print(f"{label}: {fold_scores[key].mean():.4f}")
        print(timing_label, elapsed)
    return fold_scores

def scale_data(X_train, X_test):
    """Standardize the numeric columns of both frames with one ``StandardScaler``.

    The numeric columns are taken from ``X_train`` (``select_dtypes(include=['number'])``).
    The scaler is fitted on the training frame only and then applied to the test
    frame, so no test statistics leak into the fit. Non-numeric columns are
    passed through untouched, and the inputs are not modified (copies are returned).

    Returns
    -------
    (X_train_scaled, X_test_scaled) : copies of the inputs with the numeric
        columns replaced by their standardized values.
    """
    num_cols = X_train.select_dtypes(include=['number']).columns
    standardizer = StandardScaler()

    train_out, test_out = X_train.copy(), X_test.copy()
    # Fit on train, reuse the fitted parameters for test.
    train_out[num_cols] = standardizer.fit_transform(X_train[num_cols])
    test_out[num_cols] = standardizer.transform(X_test[num_cols])
    return train_out, test_out

def get_dummies_all(X_train, X_test):
    """
    One-hot encode the categorical columns of the train and test frames consistently.

    Both frames are encoded with ``pd.get_dummies`` (``drop_first=False``). The
    encoded test frame is then aligned to the encoded training frame so that both
    share exactly the same columns in the same order:

    * dummy columns for categories seen only in training are added to the test
      frame, filled with zeros (cast to the training column's dtype);
    * dummy columns for categories seen only in the test frame are dropped,
      because a model fitted on the training columns cannot use them.

    Encoding the two frames independently (the previous behaviour) produced
    mismatched feature sets whenever a category was missing from one split.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames. Columns of dtype ``object`` or ``category`` are encoded.

    Returns
    -------
    (X_train_encoded, X_test_encoded) : tuple of pandas.DataFrame

    Raises
    ------
    ValueError
        If a non-categorical training column is absent from ``X_test``; such a
        column cannot be reconstructed and must not be silently zero-filled.
    """
    categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns
    X_train_encoded = pd.get_dummies(X_train, columns=categorical_cols, drop_first=False)

    absent = X_train.columns.difference(categorical_cols).difference(X_test.columns)
    if len(absent) > 0:
        raise ValueError(f"X_test is missing non-categorical training columns: {list(absent)}")

    # Encode everything that is categorical in the test frame, plus any column that
    # was treated as categorical in training (even if its dtype differs in test).
    test_categorical = X_test.select_dtypes(include=['object', 'category']).columns
    test_cols = test_categorical.union(categorical_cols.intersection(X_test.columns))
    X_test_encoded = pd.get_dummies(X_test, columns=test_cols, drop_first=False)

    # Align the test columns to the training layout.
    missing = X_train_encoded.columns.difference(X_test_encoded.columns)
    X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)
    if len(missing) > 0:
        # Newly created all-zero dummies should carry the same dtype as in training.
        X_test_encoded = X_test_encoded.astype(
            {col: X_train_encoded[col].dtype for col in missing}
        )
    return X_train_encoded, X_test_encoded

def oversampling(X, y):
    """Balance the classes of ``(X, y)`` with ``RandomOverSampler(random_state=42)``.

    Prints the class counts of ``y`` before resampling and of the resampled
    labels afterwards.

    Returns
    -------
    (X_resampled, y_resampled) exactly as returned by ``fit_resample``.
    """
    sampler = RandomOverSampler(random_state=42)
    X_balanced, y_balanced = sampler.fit_resample(X, y)

    # Report the label distribution before and after resampling.
    print(f"Original class distribution: {y.value_counts()}")
    print(f"Resampled class distribution: {pd.Series(y_balanced).value_counts()}")

    return X_balanced, y_balanced

def find_optimal_k(min, max, X, y, cv, bivariate, save_path=None):
    """Sweep ``n_neighbors`` over ``range(min, max)`` and return the k with the best mean CV accuracy.

    For every candidate k a fresh ``KNeighborsClassifier(n_neighbors=k)`` is
    cross-validated through ``cross_validate_model`` (with printing disabled) and
    the mean of ``test_accuracy`` is recorded. Ties are resolved in favour of the
    smallest k (first maximum).

    An accuracy-vs-k line plot is drawn. It is written to ``save_path`` when one
    is given and the figure is closed afterwards; it is never shown interactively.

    Parameters
    ----------
    min, max : int
        Bounds of the sweep; ``max`` is exclusive. (The names shadow the builtins
        inside this function; they are kept because callers may pass them by keyword.)
    X, y : training features and labels, forwarded to ``cross_validate_model``.
    cv : cross-validation setting, forwarded unchanged.
    bivariate : scorer-mode flag, forwarded unchanged.
    save_path : optional path for the saved figure.

    Returns
    -------
    int : the best-scoring k.

    Raises
    ------
    ValueError
        If ``range(min, max)`` is empty.
    """
    k_values = range(min, max)
    if len(k_values) == 0:
        raise ValueError(f"Empty k range: range({min}, {max}) has no candidates")

    start_k = time.time()
    accuracy_scores = []
    for k in k_values:
        knn = KNeighborsClassifier(n_neighbors=k)
        cv_results = cross_validate_model(knn, X, y, cv=cv, bivariate=bivariate, printer=False)
        accuracy_scores.append(cv_results['test_accuracy'].mean())
    elapsed = time.time() - start_k

    optimal_k = k_values[int(np.argmax(accuracy_scores))]
    print("Optimal k:", optimal_k)
    # Print the elapsed seconds as a plain number (previously wrapped in a set literal).
    print("Time: ", elapsed)

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(k_values, accuracy_scores, marker='o')
    ax.set_title('Accuracy vs. Number of Neighbors (k)')
    ax.set_xlabel('Number of Neighbors (k)')
    ax.set_ylabel('Accuracy')
    ax.grid(True)
    if save_path:
        fig.savefig(save_path, bbox_inches="tight")
    # Close this specific figure so repeated sweeps do not accumulate open figures.
    plt.close(fig)
    return optimal_k

def hyperparameterTuning(min, max, X, y, cv, bivariate):
    """Grid-search a ``KNeighborsClassifier`` and report the best configuration.

    The grid covers ``n_neighbors`` in ``range(min, max)``, both ``weights``
    options and four distance metrics. Candidates are ranked by accuracy
    (``refit='accuracy'``); precision, recall and F1 are tracked alongside.

    The averaging mode of the extra scorers depends on ``bivariate``:
    ``'macro'`` when ``bivariate == True``, ``'weighted'`` otherwise. The scorer
    keys keep their ``*_weighted`` names in both cases because the printed
    report reads ``cv_results_`` through those keys.

    Parameters
    ----------
    min, max : int
        Bounds of the ``n_neighbors`` sweep; ``max`` is exclusive.
    X, y : training features and labels passed to ``GridSearchCV.fit``.
    cv : cross-validation setting forwarded to ``GridSearchCV``.
    bivariate : selects the averaging mode (see above).

    Returns
    -------
    (best_model, best_params) : ``best_estimator_`` and ``best_params_`` of the
        fitted search.
    """
    search_space = {
        'n_neighbors': range(min, max),
        'weights': ['uniform', 'distance'],
        'metric': ['minkowski', 'euclidean', 'manhattan', 'cosine'],
    }

    averaging = 'macro' if bivariate == True else 'weighted'
    scorer_kwargs = dict(average=averaging, zero_division=1)
    scoring = {
        'accuracy': 'accuracy',
        'precision_weighted': make_scorer(precision_score, **scorer_kwargs),
        'recall_weighted': make_scorer(recall_score, **scorer_kwargs),
        'f1_weighted': make_scorer(f1_score, **scorer_kwargs),
    }

    t0 = time.time()
    grid_search = GridSearchCV(
        KNeighborsClassifier(),
        search_space,
        cv=cv,
        scoring=scoring,
        refit='accuracy',
        n_jobs=-1,
    )
    grid_search.fit(X, y)

    # Report the mean CV scores of the winning candidate.
    print(f"Best parameters: {grid_search.best_params_}")
    report = (
        ("accuracy", 'mean_test_accuracy'),
        ("precision", 'mean_test_precision_weighted'),
        ("recall", 'mean_test_recall_weighted'),
        ("F1 Score", 'mean_test_f1_weighted'),
    )
    for label, key in report:
        print(f"Best cross-validation {label}: {grid_search.cv_results_[key][grid_search.best_index_]:.4f}")
    print("Time for Hypertuning: ", time.time()-t0)

    return grid_search.best_estimator_, grid_search.best_params_


def predict_on_testset(best_model, X_test, y_test):
    """Predict ``X_test`` with ``best_model`` and print accuracy plus macro-averaged metrics.

    Precision, recall and F1 are computed with ``average="macro"`` and
    ``zero_division=1``. All values are printed with four decimals; nothing is
    returned.
    """
    predictions = best_model.predict(X_test)
    macro = dict(average="macro", zero_division=1)

    # Compute every metric first, then print them in a fixed order.
    report = [
        ("Accuracy", accuracy_score(y_test, predictions)),
        ("Precision", precision_score(y_test, predictions, **macro)),
        ("Recall", recall_score(y_test, predictions, **macro)),
        ("F1 Score", f1_score(y_test, predictions, **macro)),
    ]
    for label, value in report:
        print(f"{label}: {value:.4f}")

In [None]:
print("----------------------------------------------------------------------------------------------")
print("-------------------------------------- CREDIT CARD DEFAULT -----------------------------------")
print("----------------------------------------------------------------------------------------------")
print("")

print("-------------------------------------- ORIGINAL DATASET --------------------------------------")
print("----------------------------------------------------------------------------------------------")
# Load the original credit-card-default train/test CSVs and split the features from
# the 'default_payment_next_month' target.
# NOTE(review): these are absolute, machine-specific paths; the cell only runs where they exist.
CCD_train = pd.read_csv("C:/Users/ameli/OneDrive/Studium/TU Wien/WS2024/ML/Exercise 3/Datasets/CCD_train.csv")
CCD_test = pd.read_csv("C:/Users/ameli/OneDrive/Studium/TU Wien/WS2024/ML/Exercise 3/Datasets/CCD_test.csv")
X_train = CCD_train.drop('default_payment_next_month', axis=1)
y_train = CCD_train['default_payment_next_month']
X_test = CCD_test.drop('default_payment_next_month', axis=1)
y_test = CCD_test['default_payment_next_month']

# Standardize (scaler fitted on train only), then balance the training classes.
X_train_scaled, X_test_scaled = scale_data(X_train, X_test)
X_train_oversampled, y_train_oversampled = oversampling(X_train_scaled, y_train)

# Baseline: default KNN, 3-fold CV on the raw (unscaled, imbalanced) training data.
# NOTE(review): the baseline uses unscaled features while every later step uses the
# scaled frame, so the comparison mixes the effect of scaling and of oversampling - confirm intent.
print("CCD:")
knn = KNeighborsClassifier()
cross_validate_model(knn, X_train, y_train, cv=3, bivariate = True)

# NOTE(review): the data is oversampled before cross-validation, so duplicated minority
# rows can fall into both the fitting and the validation folds; the CV scores from here
# on are presumably optimistic - confirm.
print("----------------------------------------------------------------------------------------------")
print("CCD - Oversampled:")
knn = KNeighborsClassifier()
cross_validate_model(knn, X_train_oversampled, y_train_oversampled, cv=3, bivariate = True)

print("----------------------------------------------------------------------------------------------")
print("Finding optimal k:")
optimal_k = find_optimal_k(1, 25, X_train_oversampled, y_train_oversampled, cv=3, bivariate = True, save_path=None)
print(f"Optimal k={optimal_k}")
k_knn = KNeighborsClassifier(n_neighbors=optimal_k)
cross_validate_model(k_knn, X_train_oversampled, y_train_oversampled, cv=3, bivariate = True)

print("----------------------------------------------------------------------------------------------")
print("CCD - Hyperparameter Tuning:")
best_model, best_params = hyperparameterTuning(1,15,X_train_oversampled, y_train_oversampled, cv=5, bivariate = True)

print("----------------------------------------------------------------------------------------------")
print("CCD - Predict on testset:")
# The tuned model was fitted on standardized features, so it must be evaluated on the
# test frame transformed by the same scaler (previously the raw X_test was passed).
predict_on_testset(best_model, X_test_scaled, y_test)


print("")
print("----------------------------------- AFTER MICROAGGREGATION -----------------------------------")
print("----------------------------------------------------------------------------------------------")

print("CCD - mdav")
# MDAV-microaggregated training data; evaluation still uses the X_test / y_test
# loaded in the original-dataset section above.
CCD_train_mdav = pd.read_csv("C:/Users/ameli/OneDrive/Studium/TU Wien/WS2024/ML/Exercise 3/modifiedDatasets/CCD_mdav.csv")
X_train_mdav = CCD_train_mdav.drop('default_payment_next_month', axis=1)
y_train_mdav = CCD_train_mdav['default_payment_next_month']

# Standardize with a scaler fitted on the MDAV training data and actually use the
# result: previously the scaled frames were computed and then discarded, so the
# model was tuned and tested on unscaled features, unlike the original-dataset pipeline.
X_train_mdav_scaled, X_test_mdav_scaled = scale_data(X_train_mdav, X_test)
X_train_mdav_oversampled, y_train_mdav_oversampled = oversampling(X_train_mdav_scaled, y_train_mdav)

# Baseline: default KNN on the raw MDAV training data.
print("CCD MDAV:")
knn = KNeighborsClassifier()
cross_validate_model(knn, X_train_mdav, y_train_mdav, cv=5, bivariate = True)

print("----------------------------------------------------------------------------------------------")
print("CCD MDAV - Oversampled:")
knn = KNeighborsClassifier()
cross_validate_model(knn, X_train_mdav_oversampled, y_train_mdav_oversampled, cv=5, bivariate = True)

# print("----------------------------------------------------------------------------------------------")
# print("Finding optimal k:")
# optimal_k = find_optimal_k(1, 25, X_train_mdav_oversampled, y_train_mdav_oversampled, cv=3, bivariate = True, save_path=None)
# print(f"Optimal k={optimal_k}")
# k_knn = KNeighborsClassifier(n_neighbors=optimal_k)
# cross_validate_model(k_knn, X_train_mdav_oversampled, y_train_mdav_oversampled, cv=3, bivariate = True)

print("----------------------------------------------------------------------------------------------")
print("CCD MDAV - Hyperparameter Tuning:")
best_model, best_params = hyperparameterTuning(1,20,X_train_mdav_oversampled, y_train_mdav_oversampled, cv=5, bivariate = True)

print("----------------------------------------------------------------------------------------------")
print("CCD MDAV - Predict on testset:")
# Evaluate in the same (standardized) feature space the model was fitted in.
predict_on_testset(best_model, X_test_mdav_scaled, y_test)


print("CCD - PCA")
# PCA-microaggregated training data; evaluation still uses the X_test / y_test
# loaded in the original-dataset section above.
CCD_train_pca = pd.read_csv("C:/Users/ameli/OneDrive/Studium/TU Wien/WS2024/ML/Exercise 3/modifiedDatasets/CCD_pca.csv")
X_train_pca = CCD_train_pca.drop('default_payment_next_month', axis=1)
y_train_pca = CCD_train_pca['default_payment_next_month']

# Standardize with a scaler fitted on this training set and actually use the result
# (previously the scaled frames were computed and then discarded).
X_train_pca_scaled, X_test_pca_scaled = scale_data(X_train_pca, X_test)
X_train_pca_oversampled, y_train_pca_oversampled = oversampling(X_train_pca_scaled, y_train_pca)

# Baseline: default KNN on the raw PCA-microaggregated training data.
print("CCD PCA:")
knn = KNeighborsClassifier()
cross_validate_model(knn, X_train_pca, y_train_pca, cv=5, bivariate=True)

print("----------------------------------------------------------------------------------------------")
print("CCD PCA - Oversampled:")
knn = KNeighborsClassifier()
cross_validate_model(knn, X_train_pca_oversampled, y_train_pca_oversampled, cv=5, bivariate=True)

# print("----------------------------------------------------------------------------------------------")
# print("Finding optimal k:")
# optimal_k = find_optimal_k(1, 25, X_train_pca_oversampled, y_train_pca_oversampled, cv=3, bivariate = True, save_path=None)
# print(f"Optimal k={optimal_k}")
# k_knn = KNeighborsClassifier(n_neighbors=optimal_k)
# cross_validate_model(k_knn, X_train_pca_oversampled, y_train_pca_oversampled, cv=3, bivariate = True)

print("----------------------------------------------------------------------------------------------")
print("CCD PCA - Hyperparameter Tuning:")
best_model, best_params = hyperparameterTuning(1,20,X_train_pca_oversampled, y_train_pca_oversampled, cv=5, bivariate=True)

print("----------------------------------------------------------------------------------------------")
print("CCD PCA - Predict on testset:")
# Evaluate in the same (standardized) feature space the model was fitted in.
predict_on_testset(best_model, X_test_pca_scaled, y_test)


print("CCD - onedims")
# "onedims"-microaggregated training data; evaluation still uses the X_test / y_test
# loaded in the original-dataset section above.
CCD_train_onedims = pd.read_csv("C:/Users/ameli/OneDrive/Studium/TU Wien/WS2024/ML/Exercise 3/modifiedDatasets/CCD_onedims.csv")
X_train_onedims = CCD_train_onedims.drop('default_payment_next_month', axis=1)
y_train_onedims = CCD_train_onedims['default_payment_next_month']

# Standardize with a scaler fitted on this training set and actually use the result
# (previously the scaled frames were computed and then discarded).
X_train_onedims_scaled, X_test_onedims_scaled = scale_data(X_train_onedims, X_test)
X_train_onedims_oversampled, y_train_onedims_oversampled = oversampling(X_train_onedims_scaled, y_train_onedims)

# Baseline: default KNN on the raw onedims training data.
print("CCD onedims:")
knn = KNeighborsClassifier()
cross_validate_model(knn, X_train_onedims, y_train_onedims, cv=5, bivariate=True)

print("----------------------------------------------------------------------------------------------")
print("CCD onedims - Oversampled:")
knn = KNeighborsClassifier()
cross_validate_model(knn, X_train_onedims_oversampled, y_train_onedims_oversampled, cv=5, bivariate=True)

# print("----------------------------------------------------------------------------------------------")
# print("Finding optimal k:")
# optimal_k = find_optimal_k(1, 25, X_train_onedims_oversampled, y_train_onedims_oversampled, cv=3, bivariate = True, save_path=None)
# print(f"Optimal k={optimal_k}")
# k_knn = KNeighborsClassifier(n_neighbors=optimal_k)
# cross_validate_model(k_knn, X_train_onedims_oversampled, y_train_onedims_oversampled, cv=3, bivariate = True)

print("----------------------------------------------------------------------------------------------")
print("CCD onedims - Hyperparameter Tuning:")
best_model, best_params = hyperparameterTuning(1,20,X_train_onedims_oversampled, y_train_onedims_oversampled, cv=5, bivariate=True)

print("----------------------------------------------------------------------------------------------")
print("CCD onedims - Predict on testset:")
# Evaluate in the same (standardized) feature space the model was fitted in.
predict_on_testset(best_model, X_test_onedims_scaled, y_test)

print("")
print("----------------------------------- AFTER GLOBAL TRANSFORMATION -----------------------------------")
print("---------------------------------------------------------------------------------------------------")
# Globally-transformed CCD data: both the train and the test split are read from the
# relative "data/" directory, and X_train / X_test / y_train / y_test are rebound here
# (the frames from the earlier sections are no longer reachable under these names).
CCD_train_GT = pd.read_csv("data/CCD_train_GT.csv")
CCD_test_GT = pd.read_csv("data/CCD_test_GT.csv")
X_train = CCD_train_GT.drop('default_payment_next_month', axis=1); y_train = CCD_train_GT['default_payment_next_month']
X_test = CCD_test_GT.drop('default_payment_next_month', axis=1); y_test = CCD_test_GT['default_payment_next_month']
# One-hot encode object/category columns, then standardize and oversample the training set.
X_train, X_test = get_dummies_all(X_train, X_test)
X_train_scaled, X_test_scaled = scale_data(X_train, X_test)
X_train_oversampled, y_train_oversampled = oversampling(X_train_scaled, y_train)

# Baseline: default KNN, 3-fold CV on the encoded but unscaled training data.
print("CCD - Global Transformation:")
rf = KNeighborsClassifier()
cross_validate_model(rf, X_train, y_train, cv=3, bivariate = True)

# Same model on the scaled + oversampled training data.
print("----------------------------------------------------------------------------------------------")
print("CCD Global Transformation - Oversampled:")
rf = KNeighborsClassifier()
cross_validate_model(rf, X_train_oversampled, y_train_oversampled, cv=3, bivariate = True)

# The k sweep is disabled for this dataset variant.
#print("----------------------------------------------------------------------------------------------")
#print("Finding optimal k:")
#optimal_k = find_optimal_k(1, 25, X_train_oversampled, y_train_oversampled, cv=3, bivariate = True, save_path=None)
#print(f"Optimal k={optimal_k}")
#k_knn = KNeighborsClassifier(n_neighbors=optimal_k)
#cross_validate_model(k_knn, X_train_oversampled, y_train_oversampled, cv=3, bivariate = True)

# NOTE(review): tuning and the test-set prediction below use X_train / X_test (encoded,
# but neither scaled nor oversampled), whereas the other sections tune on the oversampled
# frame; X_train_scaled / X_test_scaled are only used for the CV run above - confirm intent.
print("----------------------------------------------------------------------------------------------")
print("CCD Global Transformation - Hyperparameter Tuning:")
best_model, best_params = hyperparameterTuning(1,15,X_train, y_train, cv=5, bivariate = True)

print("----------------------------------------------------------------------------------------------")
print("CCD Global Transformation - Predict on testset:")
predict_on_testset(best_model, X_test, y_test)


print("")
print("----------------------------------- AFTER LOCAL TRANSFORMATION -----------------------------------")
print("---------------------------------------------------------------------------------------------------")
# Locally-transformed CCD data (the *_LT.csv files).
# NOTE(review): the frames are stored in variables named CCD_train_GT / CCD_test_GT
# although they hold the LT files; this also overwrites the global-transformation frames.
CCD_train_GT = pd.read_csv("data/CCD_train_LT.csv")
CCD_test_GT = pd.read_csv("data/CCD_test_LT.csv")
X_train = CCD_train_GT.drop('default_payment_next_month', axis=1); y_train = CCD_train_GT['default_payment_next_month']
X_test = CCD_test_GT.drop('default_payment_next_month', axis=1); y_test = CCD_test_GT['default_payment_next_month']
# One-hot encode object/category columns, then standardize and oversample the training set.
X_train, X_test = get_dummies_all(X_train, X_test)
X_train_scaled, X_test_scaled = scale_data(X_train, X_test)
X_train_oversampled, y_train_oversampled = oversampling(X_train_scaled, y_train)

# Baseline: default KNN, 3-fold CV on the encoded but unscaled training data.
print("CCD - Local Transformation:")
rf = KNeighborsClassifier()
cross_validate_model(rf, X_train, y_train, cv=3, bivariate = True)

# Same model on the scaled + oversampled training data.
print("----------------------------------------------------------------------------------------------")
print("CCD Local Transformation - Oversampled:")
rf = KNeighborsClassifier()
cross_validate_model(rf, X_train_oversampled, y_train_oversampled, cv=3, bivariate = True)

# The k sweep is disabled for this dataset variant.
#print("----------------------------------------------------------------------------------------------")
#print("Finding optimal k:")
#optimal_k = find_optimal_k(1, 25, X_train_oversampled, y_train_oversampled, cv=3, bivariate = True, save_path=None)
#print(f"Optimal k={optimal_k}")
#k_knn = KNeighborsClassifier(n_neighbors=optimal_k)
#cross_validate_model(k_knn, X_train_oversampled, y_train_oversampled, cv=3, bivariate = True)

# NOTE(review): tuning and the test-set prediction below use X_train / X_test (encoded,
# but neither scaled nor oversampled); the scaled / oversampled frames are only used for
# the CV run above - confirm intent.
print("----------------------------------------------------------------------------------------------")
print("CCD Local Transformation - Hyperparameter Tuning:")
best_model, best_params = hyperparameterTuning(1,15,X_train, y_train, cv=5, bivariate = True)

print("----------------------------------------------------------------------------------------------")
print("CCD Local Transformation - Predict on testset:")
predict_on_testset(best_model, X_test, y_test)

----------------------------------------------------------------------------------------------
-------------------------------------- CREDIT CARD DEFAULT -----------------------------------
----------------------------------------------------------------------------------------------

-------------------------------------- ORIGINAL DATASET --------------------------------------
----------------------------------------------------------------------------------------------
Original class distribution: default_payment_next_month
0    18538
1     5212
Name: count, dtype: int64
Resampled class distribution: default_payment_next_month
1    18538
0    18538
Name: count, dtype: int64
CCD:
Accuracy: 0.7531
Precision: 0.3720
Recall: 0.1798
F1 Score: 0.2420
Time: 50.849504709243774
----------------------------------------------------------------------------------------------
CCD - Oversampled:
Accuracy: 0.7580
Precision: 0.7177
Recall: 0.8504
F1 Score: 0.7784
Time: 89.14964318275452
-------------

PicklingError: Could not pickle the task to send it to the workers.

In [None]:
print("----------------------------------------------------------------------------------------------")
print("------------------------------------ PIMA INDIANS DIABETES  ----------------------------------")
print("----------------------------------------------------------------------------------------------")
print("")

print("-------------------------------------- ORIGINAL DATASET --------------------------------------")
print("----------------------------------------------------------------------------------------------")
# Load the original Pima-Indians-Diabetes train/test CSVs and split the features from
# the 'Outcome' target.
# NOTE(review): these are absolute, machine-specific paths; the cell only runs where they exist.
PID_train = pd.read_csv("C:/Users/ameli/OneDrive/Studium/TU Wien/WS2024/ML/Exercise 3/Datasets/PID_train.csv")
PID_test = pd.read_csv("C:/Users/ameli/OneDrive/Studium/TU Wien/WS2024/ML/Exercise 3/Datasets/PID_test.csv")
X_train = PID_train.drop('Outcome', axis=1)
y_train = PID_train['Outcome']
X_test = PID_test.drop('Outcome', axis=1)
y_test = PID_test['Outcome']

# Standardize (scaler fitted on train only), then balance the training classes.
X_train_scaled, X_test_scaled = scale_data(X_train, X_test)
X_train_oversampled, y_train_oversampled = oversampling(X_train_scaled, y_train)

# Baseline: default KNN, 5-fold CV on the raw (unscaled, imbalanced) training data.
# NOTE(review): the baseline uses unscaled features while every later step uses the
# scaled frame, so the comparison mixes the effect of scaling and of oversampling - confirm intent.
print("PID:")
knn = KNeighborsClassifier()
cross_validate_model(knn, X_train, y_train, cv=5, bivariate = True)

# NOTE(review): the data is oversampled before cross-validation, so duplicated minority
# rows can fall into both the fitting and the validation folds; the CV scores from here
# on are presumably optimistic - confirm.
print("----------------------------------------------------------------------------------------------")
print("PID - Oversampled:")
knn = KNeighborsClassifier()
cross_validate_model(knn, X_train_oversampled, y_train_oversampled, cv=5, bivariate = True)

print("----------------------------------------------------------------------------------------------")
print("Finding optimal k:")
optimal_k = find_optimal_k(1, 25, X_train_oversampled, y_train_oversampled, cv=3, bivariate = True, save_path=None)
print(f"Optimal k={optimal_k}")
k_knn = KNeighborsClassifier(n_neighbors=optimal_k)
cross_validate_model(k_knn, X_train_oversampled, y_train_oversampled, cv=3, bivariate = True)

print("----------------------------------------------------------------------------------------------")
print("PID - Hyperparameter Tuning:")
best_model, best_params = hyperparameterTuning(1,15,X_train_oversampled, y_train_oversampled, cv=5, bivariate = True)

print("----------------------------------------------------------------------------------------------")
print("PID - Predict on testset:")
# The tuned model was fitted on standardized features, so it must be evaluated on the
# test frame transformed by the same scaler (previously the raw X_test was passed).
predict_on_testset(best_model, X_test_scaled, y_test)


print("")
print("----------------------------------- AFTER MICROAGGREGATION -----------------------------------")
print("----------------------------------------------------------------------------------------------")

print("PID - mdav")
# MDAV-microaggregated training data; evaluation still uses the X_test / y_test
# loaded in the original-dataset section above.
PID_train_mdav = pd.read_csv("C:/Users/ameli/OneDrive/Studium/TU Wien/WS2024/ML/Exercise 3/modifiedDatasets/PID_mdav.csv")
X_train_mdav = PID_train_mdav.drop('Outcome', axis=1)
y_train_mdav = PID_train_mdav['Outcome']

# Standardize with a scaler fitted on the MDAV training data and actually use the
# result: previously the scaled frames were computed and then discarded, so the
# model was tuned and tested on unscaled features, unlike the original-dataset pipeline.
X_train_mdav_scaled, X_test_mdav_scaled = scale_data(X_train_mdav, X_test)
X_train_mdav_oversampled, y_train_mdav_oversampled = oversampling(X_train_mdav_scaled, y_train_mdav)

# Baseline: default KNN on the raw MDAV training data.
print("PID MDAV:")
knn = KNeighborsClassifier()
cross_validate_model(knn, X_train_mdav, y_train_mdav, cv=5, bivariate = True)

print("----------------------------------------------------------------------------------------------")
print("PID MDAV - Oversampled:")
knn = KNeighborsClassifier()
cross_validate_model(knn, X_train_mdav_oversampled, y_train_mdav_oversampled, cv=5, bivariate = True)

print("----------------------------------------------------------------------------------------------")
print("Finding optimal k:")
optimal_k = find_optimal_k(1, 25, X_train_mdav_oversampled, y_train_mdav_oversampled, cv=3, bivariate = True, save_path=None)
print(f"Optimal k={optimal_k}")
k_knn = KNeighborsClassifier(n_neighbors=optimal_k)
cross_validate_model(k_knn, X_train_mdav_oversampled, y_train_mdav_oversampled, cv=3, bivariate = True)

print("----------------------------------------------------------------------------------------------")
print("PID MDAV - Hyperparameter Tuning:")
best_model, best_params = hyperparameterTuning(1,15,X_train_mdav_oversampled, y_train_mdav_oversampled, cv=5, bivariate = True)

print("----------------------------------------------------------------------------------------------")
print("PID MDAV - Predict on testset:")
# Evaluate in the same (standardized) feature space the model was fitted in.
predict_on_testset(best_model, X_test_mdav_scaled, y_test)


print("PID - PCA")
# PCA-microaggregated training data; evaluation still uses the X_test / y_test
# loaded in the original-dataset section above.
PID_train_pca = pd.read_csv("C:/Users/ameli/OneDrive/Studium/TU Wien/WS2024/ML/Exercise 3/modifiedDatasets/PID_pca.csv")
X_train_pca = PID_train_pca.drop('Outcome', axis=1)
y_train_pca = PID_train_pca['Outcome']

# Standardize with a scaler fitted on this training set and actually use the result
# (previously the scaled frames were computed and then discarded).
X_train_pca_scaled, X_test_pca_scaled = scale_data(X_train_pca, X_test)
X_train_pca_oversampled, y_train_pca_oversampled = oversampling(X_train_pca_scaled, y_train_pca)

# Baseline: default KNN on the raw PCA-microaggregated training data.
print("PID PCA:")
knn = KNeighborsClassifier()
cross_validate_model(knn, X_train_pca, y_train_pca, cv=5, bivariate=True)

print("----------------------------------------------------------------------------------------------")
print("PID PCA - Oversampled:")
knn = KNeighborsClassifier()
cross_validate_model(knn, X_train_pca_oversampled, y_train_pca_oversampled, cv=5, bivariate=True)

print("----------------------------------------------------------------------------------------------")
print("Finding optimal k:")
optimal_k = find_optimal_k(1, 25, X_train_pca_oversampled, y_train_pca_oversampled, cv=3, bivariate = True, save_path=None)
print(f"Optimal k={optimal_k}")
k_knn = KNeighborsClassifier(n_neighbors=optimal_k)
cross_validate_model(k_knn, X_train_pca_oversampled, y_train_pca_oversampled, cv=3, bivariate = True)

print("----------------------------------------------------------------------------------------------")
print("PID PCA - Hyperparameter Tuning:")
best_model, best_params = hyperparameterTuning(1,15,X_train_pca_oversampled, y_train_pca_oversampled, cv=5, bivariate=True)

print("----------------------------------------------------------------------------------------------")
print("PID PCA - Predict on testset:")
# Evaluate in the same (standardized) feature space the model was fitted in.
predict_on_testset(best_model, X_test_pca_scaled, y_test)


print("PID - onedims")
# "onedims"-microaggregated training data; evaluation still uses the X_test / y_test
# loaded in the original-dataset section above.
PID_train_onedims = pd.read_csv("C:/Users/ameli/OneDrive/Studium/TU Wien/WS2024/ML/Exercise 3/modifiedDatasets/PID_onedims.csv")
X_train_onedims = PID_train_onedims.drop('Outcome', axis=1)
y_train_onedims = PID_train_onedims['Outcome']

# Standardize with a scaler fitted on this training set and actually use the result
# (previously the scaled frames were computed and then discarded).
X_train_onedims_scaled, X_test_onedims_scaled = scale_data(X_train_onedims, X_test)
X_train_onedims_oversampled, y_train_onedims_oversampled = oversampling(X_train_onedims_scaled, y_train_onedims)

# Baseline: default KNN on the raw onedims training data.
print("PID onedims:")
knn = KNeighborsClassifier()
cross_validate_model(knn, X_train_onedims, y_train_onedims, cv=5, bivariate=True)

print("----------------------------------------------------------------------------------------------")
print("PID onedims - Oversampled:")
knn = KNeighborsClassifier()
cross_validate_model(knn, X_train_onedims_oversampled, y_train_onedims_oversampled, cv=5, bivariate=True)

print("----------------------------------------------------------------------------------------------")
print("Finding optimal k:")
optimal_k = find_optimal_k(1, 25, X_train_onedims_oversampled, y_train_onedims_oversampled, cv=3, bivariate = True, save_path=None)
print(f"Optimal k={optimal_k}")
k_knn = KNeighborsClassifier(n_neighbors=optimal_k)
cross_validate_model(k_knn, X_train_onedims_oversampled, y_train_onedims_oversampled, cv=3, bivariate = True)

print("----------------------------------------------------------------------------------------------")
print("PID onedims - Hyperparameter Tuning:")
best_model, best_params = hyperparameterTuning(1,15,X_train_onedims_oversampled, y_train_onedims_oversampled, cv=5, bivariate=True)

print("----------------------------------------------------------------------------------------------")
print("PID onedims - Predict on testset:")
# Evaluate in the same (standardized) feature space the model was fitted in.
predict_on_testset(best_model, X_test_onedims_scaled, y_test)

print("")
print("----------------------------------- AFTER Global Transformation -----------------------------------")
print("---------------------------------------------------------------------------------------------------")

print("PID - Global Transformation")
# Globally-transformed PID data: both the train and the test split are read from the
# relative "data/" directory, and X_train / X_test / y_train / y_test are rebound here
# (the frames from the earlier sections are no longer reachable under these names).
PID_train_GT = pd.read_csv("data/PID_train_GT.csv")
PID_test_GT = pd.read_csv("data/PID_test_GT.csv")
X_train = PID_train_GT.drop('Outcome', axis=1); y_train = PID_train_GT['Outcome']
X_test = PID_test_GT.drop('Outcome', axis=1); y_test = PID_test_GT['Outcome']
# One-hot encode object/category columns, then standardize and oversample the training set.
X_train, X_test = get_dummies_all(X_train, X_test)
X_train_scaled, X_test_scaled = scale_data(X_train, X_test)
X_train_oversampled, y_train_oversampled = oversampling(X_train_scaled, y_train)

# Baseline: default KNN, 5-fold CV on the encoded but unscaled training data.
print("PID Global Transformation:")
rf = KNeighborsClassifier()
cross_validate_model(rf, X_train, y_train, cv=5, bivariate = True)

# Same model on the scaled + oversampled training data.
print("----------------------------------------------------------------------------------------------")
print("PID Global Transformation - Oversampled:")
rf = KNeighborsClassifier()
cross_validate_model(rf, X_train_oversampled, y_train_oversampled, cv=5, bivariate = True)

# The k sweep is disabled for this dataset variant.
# NOTE(review): the disabled lines reference X_train_GT_oversampled / y_train_GT_oversampled,
# which are not assigned anywhere in this cell; they would need renaming before re-enabling.
#print("----------------------------------------------------------------------------------------------")
#print("Finding optimal k:")
#optimal_k = find_optimal_k(1, 25, X_train_GT_oversampled, y_train_GT_oversampled, cv=3, bivariate = True, save_path=None)
#print(f"Optimal k={optimal_k}")
#k_knn = KNeighborsClassifier(n_neighbors=optimal_k)
#cross_validate_model(k_knn, X_train_GT_oversampled, y_train_GT_oversampled, cv=3, bivariate = True)

# NOTE(review): tuning and the test-set prediction below use X_train / X_test (encoded,
# but neither scaled nor oversampled), whereas the other sections tune on the oversampled
# frame; X_train_scaled / X_test_scaled are only used for the CV run above - confirm intent.
print("----------------------------------------------------------------------------------------------")
print("PID Global Transformation - Hyperparameter Tuning:")
best_model, best_params = hyperparameterTuning(1,15,X_train, y_train, cv=5, bivariate = True)

print("----------------------------------------------------------------------------------------------")
print("PID Global Transformation - Predict on testset:")
predict_on_testset(best_model, X_test, y_test)

print("")
print("----------------------------------- AFTER Local Transformation -----------------------------------")
print("---------------------------------------------------------------------------------------------------")

# Section label fixed: this block processes the *_LT.csv (local transformation) files,
# but previously announced itself as "PID - Global Transformation".
print("PID - Local Transformation")
# Locally-transformed PID data, stored under LT-specific names (previously the
# LT frames were assigned to the *_GT variables, overwriting the global-transformation data).
PID_train_LT = pd.read_csv("data/PID_train_LT.csv")
PID_test_LT = pd.read_csv("data/PID_test_LT.csv")
X_train = PID_train_LT.drop('Outcome', axis=1)
y_train = PID_train_LT['Outcome']
X_test = PID_test_LT.drop('Outcome', axis=1)
y_test = PID_test_LT['Outcome']
# One-hot encode object/category columns, then standardize and oversample the training set.
X_train, X_test = get_dummies_all(X_train, X_test)
X_train_scaled, X_test_scaled = scale_data(X_train, X_test)
X_train_oversampled, y_train_oversampled = oversampling(X_train_scaled, y_train)

# Baseline: default KNN, 5-fold CV on the encoded but unscaled training data.
print("PID Local Transformation:")
knn = KNeighborsClassifier()
cross_validate_model(knn, X_train, y_train, cv=5, bivariate = True)

# Same model on the scaled + oversampled training data.
print("----------------------------------------------------------------------------------------------")
print("PID Local Transformation - Oversampled:")
knn = KNeighborsClassifier()
cross_validate_model(knn, X_train_oversampled, y_train_oversampled, cv=5, bivariate = True)

# The k sweep is disabled for this dataset variant.
#print("----------------------------------------------------------------------------------------------")
#print("Finding optimal k:")
#optimal_k = find_optimal_k(1, 25, X_train_GT_oversampled, y_train_GT_oversampled, cv=3, bivariate = True, save_path=None)
#print(f"Optimal k={optimal_k}")
#k_knn = KNeighborsClassifier(n_neighbors=optimal_k)
#cross_validate_model(k_knn, X_train_GT_oversampled, y_train_GT_oversampled, cv=3, bivariate = True)

# NOTE(review): tuning and the test-set prediction below use X_train / X_test (encoded,
# but neither scaled nor oversampled); the scaled / oversampled frames are only used for
# the CV run above - confirm intent.
print("----------------------------------------------------------------------------------------------")
print("PID Local Transformation - Hyperparameter Tuning:")
best_model, best_params = hyperparameterTuning(1,15,X_train, y_train, cv=5, bivariate = True)

print("----------------------------------------------------------------------------------------------")
print("PID Local Transformation - Predict on testset:")
predict_on_testset(best_model, X_test, y_test)


----------------------------------------------------------------------------------------------
------------------------------------ PIMA INDIANS DIABETES  ----------------------------------
----------------------------------------------------------------------------------------------

-------------------------------------- ORIGINAL DATASET --------------------------------------
----------------------------------------------------------------------------------------------
Original class distribution: Outcome
0    388
1    208
Name: count, dtype: int64
Resampled class distribution: Outcome
1    388
0    388
Name: count, dtype: int64
PID:
Accuracy: 0.7148
Precision: 0.6241
Recall: 0.5531
F1 Score: 0.5757
Time: 0.28379154205322266
----------------------------------------------------------------------------------------------
PID - Oversampled:
Accuracy: 0.7732
Precision: 0.7549
Recall: 0.8146
F1 Score: 0.7829
Time: 0.2318859100341797
---------------------------------------------------------

In [None]:
# LabelEncoder is used below but is not part of the notebook's import cell (only StandardScaler
# is imported from sklearn.preprocessing there); import it here so the cell runs on a fresh kernel.
from sklearn.preprocessing import LabelEncoder

print("--------------------------------------------------------------------------------------")
print("------------------------------------ CENSUS INCOME  ----------------------------------")
print("--------------------------------------------------------------------------------------")
print("")


print("")
print("----------------------------------- AFTER Global Transformation ------------------------------")
print("----------------------------------------------------------------------------------------------")

# --- KDD (Census Income), globally transformed (GT) data ---------------------------------------
# Load the GT train/test splits and separate the 'income' target from the features.
print("KDD - Global Transformation")
KDD_train_GT = pd.read_csv("data/KDD_train_GT.csv")
KDD_test_GT = pd.read_csv("data/KDD_test_GT.csv")
X_train = KDD_train_GT.drop('income', axis=1)
y_train = KDD_train_GT['income']
X_test = KDD_test_GT.drop('income', axis=1)
y_test = KDD_test_GT['income']

# Encode the target as integers; the encoder is fitted on the training labels only and then
# applied to the test labels.
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

# Feature preparation via the notebook's helpers (defined in other cells).
X_train, X_test = get_dummies_all(X_train, X_test)
X_train_scaled, X_test_scaled = scale_data(X_train, X_test)
X_train_oversampled, y_train_oversampled = oversampling(X_train_scaled, y_train_encoded)

# Baseline: default KNN, 3-fold CV.
# NOTE(review): this run (and the tuning / test prediction below) uses the output of
# get_dummies_all, not the output of scale_data, whereas the oversampled run is built from
# X_train_scaled. KNN is distance-based, so confirm this asymmetry is intended.
# `rf` is a KNeighborsClassifier despite its name; the name is kept to leave global state as is.
print("KDD Global Transformation:")
rf = KNeighborsClassifier()
cross_validate_model(rf, X_train, y_train_encoded, cv=3, bivariate = True)

# Same default KNN on the oversampled training data.
# NOTE(review): oversampling happens before the CV split, so resampled rows may appear in both
# the training and validation folds; treat these scores as optimistic.
print("----------------------------------------------------------------------------------------------")
print("KDD Global Transformation - Oversampled:")
rf = KNeighborsClassifier()
cross_validate_model(rf, X_train_oversampled, y_train_oversampled, cv=3, bivariate = True)

# Disabled optimal-k experiment, kept for reference. It refers to X_train_GT_oversampled /
# y_train_GT_oversampled, which this cell does not define.
#print("----------------------------------------------------------------------------------------------")
#print("Finding optimal k:")
#optimal_k = find_optimal_k(1, 25, X_train_GT_oversampled, y_train_GT_oversampled, cv=3, bivariate = True, save_path=None)
#print(f"Optimal k={optimal_k}")
#k_knn = KNeighborsClassifier(n_neighbors=optimal_k)
#cross_validate_model(k_knn, X_train_GT_oversampled, y_train_GT_oversampled, cv=3, bivariate = True)

# Hyperparameter search with 3-fold CV on the non-oversampled training data.
print("----------------------------------------------------------------------------------------------")
print("KDD Global Transformation - Hyperparameter Tuning:")
best_model, best_params = hyperparameterTuning(1,15,X_train, y_train_encoded, cv=3, bivariate = True)

# Pass the model returned by the search to the test-set evaluation helper.
print("----------------------------------------------------------------------------------------------")
print("KDD Global Transformation - Predict on testset:")
predict_on_testset(best_model, X_test, y_test_encoded)


print("")
print("----------------------------------- AFTER Local Transformation ------------------------------")
print("----------------------------------------------------------------------------------------------")

# --- KDD (Census Income), locally transformed (LT) data ----------------------------------------
# Same pipeline as above, now reading the *_LT.csv files. All printed labels in this half were
# copy-pasted as "Global Transformation" and are corrected to "Local Transformation" so the
# output cannot be mistaken for the GT results.
# NOTE(review): the frames keep the *_GT variable names so that notebook-global state is
# unchanged; from here on they hold the LT data.
print("KDD - Local Transformation")
KDD_train_GT = pd.read_csv("data/KDD_train_LT.csv")
KDD_test_GT = pd.read_csv("data/KDD_test_LT.csv")
X_train = KDD_train_GT.drop('income', axis=1)
y_train = KDD_train_GT['income']
X_test = KDD_test_GT.drop('income', axis=1)
y_test = KDD_test_GT['income']

# A fresh encoder is fitted on the LT training labels.
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

X_train, X_test = get_dummies_all(X_train, X_test)
X_train_scaled, X_test_scaled = scale_data(X_train, X_test)
X_train_oversampled, y_train_oversampled = oversampling(X_train_scaled, y_train_encoded)

# Baseline: default KNN, 3-fold CV (same scaled/unscaled caveat as in the GT half).
print("KDD Local Transformation:")
rf = KNeighborsClassifier()
cross_validate_model(rf, X_train, y_train_encoded, cv=3, bivariate = True)

# Default KNN on the oversampled training data (same fold-leakage caveat as in the GT half).
print("----------------------------------------------------------------------------------------------")
print("KDD Local Transformation - Oversampled:")
rf = KNeighborsClassifier()
cross_validate_model(rf, X_train_oversampled, y_train_oversampled, cv=3, bivariate = True)

# Disabled optimal-k experiment, kept for reference (see the note in the GT half).
#print("----------------------------------------------------------------------------------------------")
#print("Finding optimal k:")
#optimal_k = find_optimal_k(1, 25, X_train_GT_oversampled, y_train_GT_oversampled, cv=3, bivariate = True, save_path=None)
#print(f"Optimal k={optimal_k}")
#k_knn = KNeighborsClassifier(n_neighbors=optimal_k)
#cross_validate_model(k_knn, X_train_GT_oversampled, y_train_GT_oversampled, cv=3, bivariate = True)

# Hyperparameter search with 3-fold CV on the non-oversampled LT training data.
print("----------------------------------------------------------------------------------------------")
print("KDD Local Transformation - Hyperparameter Tuning:")
best_model, best_params = hyperparameterTuning(1,15,X_train, y_train_encoded, cv=3, bivariate = True)

# Pass the model returned by the search to the test-set evaluation helper.
print("----------------------------------------------------------------------------------------------")
print("KDD Local Transformation - Predict on testset:")
predict_on_testset(best_model, X_test, y_test_encoded)