In [16]:
import os
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score, matthews_corrcoef
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Directory holding the two sampled CSV files. The default is the location used when
# the notebook was written; set the DIABETES_DATA_DIR environment variable to run the
# notebook against a copy of the data stored somewhere else.
DATA_DIR = Path(os.environ.get('DIABETES_DATA_DIR', '/Users/apple/Desktop/Reprocessing'))

balanced_df = pd.read_csv(DATA_DIR / 'balanced.csv')
random_df = pd.read_csv(DATA_DIR / 'random.csv')

# Preview the first rows of each frame to check which columns were loaded.
print(balanced_df.head())
print(random_df.head())

   GenHlth  HighBP  Age  HighChol  DiffWalk  Income  HeartDiseaseorAttack  \
0        1       0    3         0         0       5                     0   
1        1       0    3         0         0       5                     0   
2        1       0    3         0         0       5                     0   
3        1       0    3         0         0       5                     0   
4        1       0    3         0         0       5                     0   

   Diabetes_012  BMI  PhysHlth  
0             0   26         0  
1             0   26         0  
2             0   23        15  
3             0   30         0  
4             0   19         0  
   Diabetes_012   BMI  PhysHlth  Membership
0             0  23.0       0.0           4
1             0  25.0       0.0           4
2             0  25.0       0.0          13
3             0  32.0       0.0          14
4             0  31.0      10.0          14


In [17]:
# Define a function to prepare data
def prepare_data(df, target_col='Diabetes_012'):
    """Split a frame into a scaled feature matrix and an integer target.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; the features are taken from a copy
        with the target column dropped.
    target_col : str, default 'Diabetes_012'
        Name of the label column. Every other column of ``df`` is used as a
        feature.

    Returns
    -------
    X_scaled
        The array returned by ``StandardScaler.fit_transform`` after missing
        feature values were replaced by their column mean.
    y : pandas.Series
        ``df[target_col]`` cast to ``int``; it keeps the index of ``df``.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of ``df``.

    Notes
    -----
    NOTE(review): the imputer and the scaler are fitted on every row of ``df``.
    When the result is later split into cross-validation folds, statistics of
    the held-out rows have already influenced the features. Fitting both steps
    inside each training fold (e.g. with a ``Pipeline``) would avoid that, but
    it requires changing how callers use this function.
    """
    features = df.drop(columns=[target_col])
    y = df[target_col].astype(int)

    # Mean imputation first, so the scaler never sees missing values.
    X_imputed = SimpleImputer(strategy='mean').fit_transform(features)
    X_scaled = StandardScaler().fit_transform(X_imputed)

    return X_scaled, y

# Build the scaled feature matrix and the integer target for each dataset.
X_balanced, y_balanced = prepare_data(df=balanced_df)
X_random, y_random = prepare_data(df=random_df)

In [18]:
import numpy as np

# Define a function to perform cross-validation and calculate metrics
def cross_validate_and_score(X, y, n_splits=4, shuffle=False, random_state=None):
    """Cross-validate a logistic regression and average six metrics over the folds.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Class labels, one per row of ``X``. A pandas Series with any index is
        accepted.
    n_splits : int, default 4
        Number of stratified folds.
    shuffle : bool, default False
        Forwarded to ``StratifiedKFold``. With the default, each fold takes
        consecutive rows of every class, so row order affects the folds.
    random_state : int or None, default None
        Forwarded to ``StratifiedKFold``; only meaningful with ``shuffle=True``.

    Returns
    -------
    dict
        Maps ``'accuracy'``, ``'precision'``, ``'recall'``, ``'f1'``,
        ``'balanced_accuracy'`` and ``'mcc'`` to the mean of that metric over
        all folds. Precision, recall and F1 are macro-averaged.
    """
    # The fold indices are positions. Indexing a pandas Series with an integer
    # array looks the values up as index *labels*, which only coincides with
    # positions for a default 0..n-1 index, so work on plain arrays instead.
    X_arr = np.asarray(X)
    y_arr = np.asarray(y)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

    # (result key, scoring function, extra keyword arguments)
    scorers = (
        ('accuracy', accuracy_score, {}),
        ('precision', precision_score, {'average': 'macro'}),
        ('recall', recall_score, {'average': 'macro'}),
        ('f1', f1_score, {'average': 'macro'}),
        ('balanced_accuracy', balanced_accuracy_score, {}),
        ('mcc', matthews_corrcoef, {}),
    )
    metrics = {name: [] for name, _, _ in scorers}

    for train_index, test_index in skf.split(X_arr, y_arr):
        # A fresh model per fold so nothing carries over between folds.
        model = LogisticRegression(max_iter=1000)
        model.fit(X_arr[train_index], y_arr[train_index])

        y_test = y_arr[test_index]
        y_pred = model.predict(X_arr[test_index])

        for name, scorer, kwargs in scorers:
            metrics[name].append(scorer(y_test, y_pred, **kwargs))

    # Average each metric across the folds.
    return {name: np.mean(values) for name, values in metrics.items()}

# Logistic-regression baseline, balanced sample: fold-averaged metrics.
metrics_balanced = cross_validate_and_score(X=X_balanced, y=y_balanced)
print("Metrics for Balanced Dataset:", metrics_balanced)

# Same evaluation on the random sample.
metrics_random = cross_validate_and_score(X=X_random, y=y_random)
print("Metrics for Random Dataset:", metrics_random)

Metrics for Balanced Dataset: {'accuracy': 0.7943125, 'precision': 0.788731520622511, 'recall': 0.5789974619396336, 'f1': 0.5208674233170156, 'balanced_accuracy': 0.5789974619396336, 'mcc': 0.19276737934817026}
Metrics for Random Dataset: {'accuracy': 0.865875, 'precision': 0.7037073666152903, 'recall': 0.5219156336144839, 'f1': 0.5097142895197418, 'balanced_accuracy': 0.5219156336144839, 'mcc': 0.13312532599948726}


In [19]:
from sklearn.model_selection import GridSearchCV

# Define a function to perform grid search for hyperparameter tuning
def tune_parameters(X, y, scoring='accuracy', cv=4):
    """Grid-search the regularisation of a liblinear logistic regression.

    The grid covers ten log-spaced values of ``C`` between 1e-4 and 1e4,
    combined with the ``'l1'`` and ``'l2'`` penalties (20 candidates).

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Class labels.
    scoring : str or callable, default 'accuracy'
        Forwarded to ``GridSearchCV``. Plain accuracy rewards always predicting
        the majority class when classes are imbalanced; pass e.g.
        ``'balanced_accuracy'`` or ``'f1_macro'`` to select on a metric that
        weights every class equally.
    cv : int or cross-validation splitter, default 4
        Forwarded to ``GridSearchCV``.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search (keys ``'C'`` and ``'penalty'``).
    """
    # liblinear is used because it supports both the l1 and the l2 penalty.
    model = LogisticRegression(max_iter=1000, solver='liblinear')
    param_grid = {
        'C': np.logspace(-4, 4, 10),
        'penalty': ['l1', 'l2'],
    }
    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring=scoring, verbose=1)
    grid_search.fit(X, y)
    return grid_search.best_params_

# Grid search on the balanced sample; report the winning C / penalty pair.
best_params_balanced = tune_parameters(X=X_balanced, y=y_balanced)
print("Best Parameters for Balanced Dataset:", best_params_balanced)

# Same search on the random sample.
best_params_random = tune_parameters(X=X_random, y=y_random)
print("Best Parameters for Random Dataset:", best_params_random)

Fitting 4 folds for each of 20 candidates, totalling 80 fits
Best Parameters for Balanced Dataset: {'C': 0.0001, 'penalty': 'l1'}
Fitting 4 folds for each of 20 candidates, totalling 80 fits
Best Parameters for Random Dataset: {'C': 0.046415888336127774, 'penalty': 'l2'}


In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score, matthews_corrcoef
from sklearn.model_selection import StratifiedKFold

# Define a function to perform cross-validation and calculate metrics
def cross_validate_and_score_rf(X, y, n_splits=4, shuffle=False, random_state=None):
    """Cross-validate a 100-tree random forest and average six metrics over the folds.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Class labels, one per row of ``X``. A pandas Series with any index is
        accepted.
    n_splits : int, default 4
        Number of stratified folds.
    shuffle : bool, default False
        Forwarded to ``StratifiedKFold``. With the default, each fold takes
        consecutive rows of every class, so row order affects the folds.
    random_state : int or None, default None
        Forwarded to ``StratifiedKFold`` only (the forest itself always uses
        seed 42); only meaningful with ``shuffle=True``.

    Returns
    -------
    dict
        Maps ``'accuracy'``, ``'precision'``, ``'recall'``, ``'f1'``,
        ``'balanced_accuracy'`` and ``'mcc'`` to the mean of that metric over
        all folds. Precision, recall and F1 are macro-averaged.
    """
    # The fold indices are positions. Indexing a pandas Series with an integer
    # array looks the values up as index *labels*, which only coincides with
    # positions for a default 0..n-1 index, so work on plain arrays instead.
    X_arr = np.asarray(X)
    y_arr = np.asarray(y)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

    # (result key, scoring function, extra keyword arguments)
    scorers = (
        ('accuracy', accuracy_score, {}),
        ('precision', precision_score, {'average': 'macro'}),
        ('recall', recall_score, {'average': 'macro'}),
        ('f1', f1_score, {'average': 'macro'}),
        ('balanced_accuracy', balanced_accuracy_score, {}),
        ('mcc', matthews_corrcoef, {}),
    )
    metrics = {name: [] for name, _, _ in scorers}

    for train_index, test_index in skf.split(X_arr, y_arr):
        # A fresh, identically seeded forest per fold.
        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(X_arr[train_index], y_arr[train_index])

        y_test = y_arr[test_index]
        y_pred = model.predict(X_arr[test_index])

        for name, scorer, kwargs in scorers:
            metrics[name].append(scorer(y_test, y_pred, **kwargs))

    # Average each metric across the folds.
    return {name: np.mean(values) for name, values in metrics.items()}

# Random-forest baseline, balanced sample: fold-averaged metrics.
metrics_balanced_rf = cross_validate_and_score_rf(X=X_balanced, y=y_balanced)
print("Metrics for Balanced Dataset (Random Forest):", metrics_balanced_rf)

# Same evaluation on the random sample.
metrics_random_rf = cross_validate_and_score_rf(X=X_random, y=y_random)
print("Metrics for Random Dataset (Random Forest):", metrics_random_rf)

Metrics for Balanced Dataset (Random Forest): {'accuracy': 0.5711250000000001, 'precision': 0.5879832557302749, 'recall': 0.44897158848962626, 'f1': 0.39053246480582593, 'balanced_accuracy': 0.44897158848962626, 'mcc': -0.046178381913856315}
Metrics for Random Dataset (Random Forest): {'accuracy': 0.8520625, 'precision': 0.6200663016307717, 'recall': 0.5450984939664723, 'f1': 0.5516664773262012, 'balanced_accuracy': 0.5450984939664723, 'mcc': 0.14685959697641282}


In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define a function to perform grid search for hyperparameter tuning
def tune_parameters_rf(X, y, scoring='accuracy', cv=4):
    """Grid-search tree count, depth and split size for a random forest.

    The grid combines three values each of ``n_estimators`` (100, 200, 300),
    ``max_depth`` (10, 20, unlimited) and ``min_samples_split`` (2, 5, 10),
    i.e. 27 candidates. The forest uses ``random_state=42``.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Class labels.
    scoring : str or callable, default 'accuracy'
        Forwarded to ``GridSearchCV``. Plain accuracy rewards always predicting
        the majority class when classes are imbalanced; pass e.g.
        ``'balanced_accuracy'`` or ``'f1_macro'`` to select on a metric that
        weights every class equally.
    cv : int or cross-validation splitter, default 4
        Forwarded to ``GridSearchCV``.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search (keys ``'n_estimators'``,
        ``'max_depth'`` and ``'min_samples_split'``).
    """
    model = RandomForestClassifier(random_state=42)
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, None],  # None lets trees grow without a depth limit
        'min_samples_split': [2, 5, 10],
    }
    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring=scoring, verbose=1)
    grid_search.fit(X, y)
    return grid_search.best_params_

# Random-forest grid search on the balanced sample; report the winning settings.
best_params_balanced_rf = tune_parameters_rf(X=X_balanced, y=y_balanced)
print("Best Parameters for Balanced Dataset (Random Forest):", best_params_balanced_rf)

# Same search on the random sample.
best_params_random_rf = tune_parameters_rf(X=X_random, y=y_random)
print("Best Parameters for Random Dataset (Random Forest):", best_params_random_rf)

Fitting 4 folds for each of 27 candidates, totalling 108 fits
Best Parameters for Balanced Dataset (Random Forest): {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 300}
Fitting 4 folds for each of 27 candidates, totalling 108 fits
Best Parameters for Random Dataset (Random Forest): {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 300}
