In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
import seaborn as sns

In [None]:
# Read the training features; the first CSV column is used as the index.
X_train = pd.read_csv('X_train.csv', index_col=0)
# Read the training labels and keep only the 'prediabetes_bin_y' column.
y_train = pd.read_csv('y_train.csv', index_col=0)['prediabetes_bin_y'].squeeze()

In [None]:
# Keep a stratified 33% sample of the training data for the hyperparameter
# search (test_size=0.67 is the portion that is thrown away).
# A fixed random_state makes the sample -- and therefore the grid-search
# outcome -- reproducible across kernel restarts.
X_train_33, _, y_train_33, _ = train_test_split(
    X_train,
    y_train,
    test_size=0.67,
    stratify=y_train,
    random_state=42,
)

In [None]:
# Drop the full training set; only the subsample is used by the grid search.
del X_train
del y_train

In [None]:
# Define pre-processing and hyperparameter options
scaling_options = [MinMaxScaler()]  # Scaling
# Each 'pca' option replaces the pipeline step itself, so it has to be either
# None (step skipped) or a transformer object -- a bare integer such as 5 is
# not a valid pipeline step.
pca_options = [None, PCA(n_components=5)]  # Dimensionality reduction options
penalty_options = ['l1', 'l2']  # Lasso (l1) or Ridge (l2) penalty
# C is the inverse regularization strength and must be strictly positive,
# so 0 is not a valid candidate.
C_options = [0.01, 0.1, 1, 10]

In [None]:
# Initialize the pipeline. The 'scaling' and 'pca' steps start as None
# placeholders; the grid search substitutes the candidate steps for them.
pipe = Pipeline(
    [
        ('scaling', None),
        ('pca', None),
        # The default 'lbfgs' solver does not support the 'l1' penalty, so any
        # l1 candidate in the search would fail to fit. 'liblinear' supports
        # both 'l1' and 'l2'.
        # NOTE(review): liblinear targets binary problems; the label column
        # name suggests a binary target -- confirm, or use 'saga' instead.
        ('logreg', LogisticRegression(solver='liblinear', max_iter=500)),
    ]
)

In [None]:
# Search space for the grid search: one grid whose keys are either a pipeline
# step name ('scaling', 'pca') or '<step>__<parameter>' for the classifier.
param_grid = [
    dict(
        scaling=scaling_options,
        pca=pca_options,
        logreg__penalty=penalty_options,
        logreg__C=C_options,
    )
]

In [None]:
# Build the grid search over the pipeline and the parameter grid,
# evaluating every candidate with 10-fold cross-validation.
grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=10)

In [None]:
# Run the hyperparameter search on the training subsample.
grid_search.fit(X_train_33, y_train_33)

In [None]:
# Winning hyperparameter combination found by the search:
best_params = grid_search.best_params_
# Pipeline refitted by the search with that combination:
best_model = grid_search.best_estimator_
print("Best Hyperparameters:\n", best_params)

In [None]:
# Release the search subsample and the extracted pipeline; none of them is
# referenced again below.
del X_train_33
del y_train_33
del best_model

In [None]:
# Reload the complete training set (it was deleted before the grid search).
X_train = pd.read_csv('X_train.csv', index_col=0)
# Labels: keep only the 'prediabetes_bin_y' column of the labels file.
y_train = pd.read_csv('y_train.csv', index_col=0)['prediabetes_bin_y'].squeeze()

In [None]:
## Fit the final logistic regression on the complete training set.
# NOTE(review): penalty and C are hard-coded below -- presumably copied from
# the grid-search output; confirm they still match the printed best params.
# 1. Fit a min-max scaler on the training features, then scale them
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
# 2. Configure the classifier (max_iter raised to 1000 iterations)
logreg = LogisticRegression(penalty='l2', C=10, max_iter=1000)
# 3. Train on the scaled training data
logreg.fit(X_train_scaled, y_train)

In [None]:
# Predict labels for the (scaled) training set with the fitted model:
lg_best_train = logreg.predict(X_train_scaled)
# Classification report on the training data (in-sample performance):
report_train = classification_report(y_true=y_train, y_pred=lg_best_train)
print(report_train)

In [None]:
# Release the training data and predictions; the fitted scaler, the model and
# the training report are kept for the test evaluation.
del X_train, y_train
del X_train_scaled, lg_best_train

In [None]:
# Read the test features; the first CSV column is used as the index.
X_test = pd.read_csv('X_test.csv', index_col=0)
# Read the test labels and keep only the 'prediabetes_bin_y' column.
y_test = pd.read_csv('y_test.csv', index_col=0)['prediabetes_bin_y'].squeeze()

In [None]:
# Scale the test features with the scaler that was fitted on the training data:
X_test_scaled = scaler.transform(X_test)
# Predict labels for the test set:
lg_best_test = logreg.predict(X_test_scaled)

# Build the classification report for the test predictions:
report_test = classification_report(y_true=y_test, y_pred=lg_best_test)

# Show the train and test reports one after the other:
print(
    "Full report for train dataset:\n",
    report_train,
    "----------\n",
)
print(
    "Full report for test dataset:\n",
    report_test,
    "----------\n",
)

In [None]:
# Preview only the first rows instead of dumping the whole test frame as cell output.
X_test.head()