In [43]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [44]:
balanced = pd.read_csv('/Users/apple/Desktop/Reprocessing/balanced.csv')
random = pd.read_csv('/Users/apple/Desktop/Reprocessing/random.csv')

print(balanced.head())
print(random.head())

   GenHlth  HighBP  Age  HighChol  DiffWalk  Income  HeartDiseaseorAttack  \
0        1       0    3         0         0       5                     0   
1        1       0    3         0         0       5                     0   
2        1       0    3         0         0       5                     0   
3        1       0    3         0         0       5                     0   
4        1       0    3         0         0       5                     0   

   Diabetes_012  BMI  PhysHlth  
0             0   26         0  
1             0   26         0  
2             0   23        15  
3             0   30         0  
4             0   19         0  
   Diabetes_012   BMI  PhysHlth  Membership
0             0  23.0       0.0           4
1             0  25.0       0.0           4
2             0  25.0       0.0          13
3             0  32.0       0.0          14
4             0  31.0      10.0          14


### Logistic Regression

In [45]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.preprocessing import StandardScaler
import numpy as np

In [50]:
def evaluate_model(data, target_name, folds=4):
    X = data.drop(columns=[target_name])
    y = data[target_name]
    
    kf = KFold(n_splits=folds, shuffle=True, random_state=42)
    model = LogisticRegression(solver='liblinear', random_state=42)
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    y_pred_prob = cross_val_predict(model, X_scaled, y, cv=kf, method='predict_proba')[:, 1]
    y_pred = (y_pred_prob >= 0.5).astype(int)
    
    auc = roc_auc_score(y, y_pred_prob)
    precision = precision_score(y, y_pred, average='weighted')
    recall = recall_score(y, y_pred, average='weighted')
    f1 = f1_score(y, y_pred, average='weighted')
    
    mcc = matthews_corrcoef(y, y_pred)

    
    return auc, precision, recall, f1, mcc

In [51]:
balanced_scores = evaluate_model(balanced, 'Diabetes_012')
print("Balanced dataset scores:")
print(f"AUC: {balanced_scores[0]:.4f}, Precision: {balanced_scores[1]:.4f}, Recall: {balanced_scores[2]:.4f}, F1: {balanced_scores[3]:.4f}, MCC: {balanced_scores[4]:.4f}")

random_scores = evaluate_model(random, 'Diabetes_012')
print("Random dataset scores:")
print(f"AUC: {random_scores[0]:.4f}, Precision: {random_scores[1]:.4f}, Recall: {random_scores[2]:.4f}, F1: {random_scores[3]:.4f}, MCC: {random_scores[4]:.4f}")

Balanced dataset scores:
AUC: 0.8279, Precision: 0.8515, Recall: 0.8794, F1: 0.8504, MCC: 0.2697
Random dataset scores:
AUC: 0.7794, Precision: 0.8234, Recall: 0.8654, F1: 0.8149, MCC: 0.1342


### Random Forest

In [52]:
from sklearn.ensemble import RandomForestClassifier

In [53]:
def evaluate_model_rf(data, target_name, folds=4):
    X = data.drop(columns=[target_name])
    y = data[target_name]
    
    kf = KFold(n_splits=folds, shuffle=True, random_state=42)
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    y_pred_prob = cross_val_predict(model, X_scaled, y, cv=kf, method='predict_proba')[:, 1]
    y_pred = (y_pred_prob >= 0.5).astype(int)
    
    auc = roc_auc_score(y, y_pred_prob)
    precision = precision_score(y, y_pred, average='weighted')
    recall = recall_score(y, y_pred, average='weighted')
    f1 = f1_score(y, y_pred, average='weighted')
    
    mcc = matthews_corrcoef(y, y_pred)

    
    return auc, precision, recall, f1, mcc

In [54]:
balanced_scores_rf = evaluate_model_rf(balanced, 'Diabetes_012')
print("Balanced dataset (Random Forest) scores:")
print(f"AUC: {balanced_scores_rf[0]:.4f}, Precision: {balanced_scores_rf[1]:.4f}, Recall: {balanced_scores_rf[2]:.4f}, F1: {balanced_scores_rf[3]:.4f}, MCC: {balanced_scores[4]:.4f}")

random_scores_rf = evaluate_model_rf(random, 'Diabetes_012')
print("Random dataset (Random Forest) scores:")
print(f"AUC: {random_scores_rf[0]:.4f}, Precision: {random_scores_rf[1]:.4f}, Recall: {random_scores_rf[2]:.4f}, F1: {random_scores_rf[3]:.4f}, MCC: {random_scores[4]:.4f}")

Balanced dataset (Random Forest) scores:
AUC: 0.7655, Precision: 0.8349, Recall: 0.8621, F1: 0.8443, MCC: 0.2697
Random dataset (Random Forest) scores:
AUC: 0.7189, Precision: 0.8034, Recall: 0.8498, F1: 0.8176, MCC: 0.1342


# Tuning

In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, make_scorer, matthews_corrcoef
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

### LR tuning

In [36]:
scorers = {
    'auc_score': make_scorer(roc_auc_score, needs_proba=True, multi_class='ovr'),
    'precision_score': make_scorer(precision_score, average='weighted'),
    'recall_score': make_scorer(recall_score, average='weighted'),
    'f1_score': make_scorer(f1_score, average='weighted'),
    'matthews_corrcoef': make_scorer(matthews_corrcoef)
}

param_grid = {
    'logisticregression__C': [0.01, 0.1, 1, 10, 100],
    'logisticregression__penalty': ['l1', 'l2']
}

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logisticregression', LogisticRegression(random_state=42, solver='saga', max_iter=10000))
])

cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

In [38]:
def perform_grid_search(df, target_column):
    X = df.drop(target_column, axis=1)
    y = df[target_column]

    mcc_scorer = make_scorer(matthews_corrcoef)
    grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring=scorers, refit='auc_score', return_train_score=False)
    grid_search.fit(X, y)

    print("Best parameters:", grid_search.best_params_)
    print("Best AUC:", grid_search.best_score_)

    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X)
    y_pred_proba = best_model.predict_proba(X)[:, 1]

    auc = roc_auc_score(y, y_pred_proba, average='weighted')
    precision = precision_score(y, y_pred, average='weighted')
    recall = recall_score(y, y_pred, average='weighted')
    f1 = f1_score(y, y_pred, average='weighted')
    
    mcc = matthews_corrcoef(y, y_pred)


    return auc, precision, recall, f1, mcc

metrics_balanced_tuned = perform_grid_search(balanced, 'Diabetes_012')
metrics_random_tuned = perform_grid_search(random, 'Diabetes_012')

print("Balanced Dataset Tuned Metrics:", metrics_balanced_tuned)
print("Random Dataset Tuned Metrics:", metrics_random_tuned)

Best parameters: {'logisticregression__C': 0.01, 'logisticregression__penalty': 'l2'}
Best AUC: 0.8279420373569755
Best parameters: {'logisticregression__C': 0.1, 'logisticregression__penalty': 'l2'}
Best AUC: 0.7796468594254607
Balanced Dataset Tuned Metrics: (0.8291271313152305, 0.8507827033760527, 0.8793125, 0.8477917383814448, 0.25726828392368173)
Random Dataset Tuned Metrics: (0.7796557747988291, 0.8258755788416278, 0.8658125, 0.8147142526763588, 0.1358186242032341)


### RF tuning

In [39]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

In [40]:
scorers = {
    'auc_score': make_scorer(roc_auc_score, needs_proba=True, multi_class='ovr'),
    'precision_score': make_scorer(precision_score, average='weighted'),
    'recall_score': make_scorer(recall_score, average='weighted'),
    'f1_score': make_scorer(f1_score, average='weighted'),
    'matthews_corrcoef': make_scorer(matthews_corrcoef)
}

param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

In [42]:
def perform_random_search(df, target_column):
    X = df.drop(target_column, axis=1)
    y = df[target_column]

    rf_classifier = RandomForestClassifier(random_state=42)

    mcc_scorer = make_scorer(matthews_corrcoef)
    random_search = RandomizedSearchCV(rf_classifier, param_distributions=param_dist, n_iter=10, cv=cv, scoring=scorers, refit='auc_score', random_state=42)
    random_search.fit(X, y)

    print("Best parameters:", random_search.best_params_)
    print("Best AUC:", random_search.best_score_)

    best_model = random_search.best_estimator_
    y_pred = best_model.predict(X)
    y_pred_proba = best_model.predict_proba(X)[:, 1]

    auc = roc_auc_score(y, y_pred_proba)
    precision = precision_score(y, y_pred, average='weighted')
    recall = recall_score(y, y_pred, average='weighted')
    f1 = f1_score(y, y_pred, average='weighted')
    mcc = matthews_corrcoef(y, y_pred)


    return auc, precision, recall, f1, mcc

metrics_balanced_tuned = perform_random_search(balanced, 'Diabetes_012')
metrics_random_tuned = perform_random_search(random, 'Diabetes_012')

print("Balanced Dataset Tuned Metrics:", metrics_balanced_tuned)
print("Random Dataset Tuned Metrics:", metrics_random_tuned)

Best parameters: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_depth': 10}
Best AUC: 0.8210573665917666
Best parameters: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_depth': 10}
Best AUC: 0.7685776847439822
Balanced Dataset Tuned Metrics: (0.8889529352406798, 0.8839376167119211, 0.895125, 0.870004441563413, 0.3923057655244977)
Random Dataset Tuned Metrics: (0.8277332567683628, 0.8538648046780979, 0.8725625, 0.828028659636676, 0.23136116116593491)
