In [1]:
from pandas import DataFrame
from sklearn.preprocessing import StandardScaler


def drop_empty_items(data_frame: DataFrame) -> DataFrame:
    """The 0_data_exploration.ipynb consists of data for empty values for each column:
        {
            'Pregnancies': 14.5,
            'Glucose': 0.7,
            'BloodPressure': 4.6,
            'SkinThickness': 29.6,
            'Insulin': 48.7,
            'BMI': 1.4,
            'DiabetesPedigreeFunction': 0.0,
            'Age': 0.0
        }
    - We will be removing datapoints which has empty ``Glucose``, ``BloodPressure`` & ``BMI``.
    - ``Age`` and ``DiabetesPedigreeFunction`` have no null values, hence are omitted.
    - ``Pregnancies`` can be 0, so it's not a missing value problem.
    - ``Insulin`` & ``SkinThickness`` values will remove a large chunk of entries if removed, hence are not touched.
    """
    # zero_counts = {
    #     'Glucose': (data_frame['Glucose'] == 0).sum(),
    #     'BloodPressure': (data_frame['BloodPressure'] == 0).sum(),
    #     'BMI': (data_frame['BMI'] == 0).sum()
    # }
    # print("Number of entries with zero values:")
    # for column, count in zero_counts.items():
    #     print(f"{column} was 0 in {count} entries")

    # Drop rows where 'Glucose', 'BloodPressure', or 'BMI' are 0
    data_frame = data_frame.drop(['SkinThickness','Insulin'],axis=1)
    return data_frame[(data_frame['Glucose'] != 0) & (data_frame['BloodPressure'] != 0) & (data_frame['BMI'] != 0)]


def scale_features(X_train, X_test, features):
    """
    Scale specified features in the training and test datasets using StandardScaler.

    Parameters:
    - X_train (pd.DataFrame): Training dataset
    - X_test (pd.DataFrame): Test dataset
    - features (list): List of column names to scale

    Returns:
    - X_train_scaled (pd.DataFrame): Scaled training dataset
    - X_test_scaled (pd.DataFrame): Scaled test dataset
    """
    # Verify all features exist in the datasets
    missing_features = [f for f in features if f not in X_train.columns or f not in X_test.columns]
    if missing_features:
        raise ValueError(f"Features not found in dataset: {missing_features}")

    # Initialize scaler
    scaler = StandardScaler()

    # Copy datasets to avoid modifying originals
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()

    # Scale specified features
    X_train_scaled[features] = scaler.fit_transform(X_train[features])
    X_test_scaled[features] = scaler.transform(X_test[features])

    return X_train_scaled, X_test_scaled

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from imblearn.over_sampling import SMOTE

random_value = 0
# Load The Data
df = pd.read_csv('data/diabetes.csv')
# Random Forest with Preprocessing, SMOTE, and Grid Search
# Step 1: Remove rows with zero values in 'Glucose', 'BloodPressure', or 'BMI'
df = drop_empty_items(df)

# Define features
features = ['Pregnancies', 'Glucose', 'BloodPressure', 'BMI',
            'DiabetesPedigreeFunction', 'Age']

# Splitting Dataset into 80-20 split
X = df.drop('Outcome', axis=1)
y = df['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_value)

# Using Z-Scores:
# (Deviation from Research Paper)
# We will be using Scaling, after the split, to ensure no leakage occurs!
X_train, X_test = scale_features(X_train, X_test, features)

smote = SMOTE(random_state=random_value)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Define hyperparameter grid
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True, False]
}

# Initialize Random Forest and Grid Search
rf = RandomForestClassifier(random_state=random_value)
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    # cv=5,
    n_jobs=-1,
    verbose=1
)

# Fit Grid Search on SMOTE-balanced training data
grid_search.fit(X_train_smote, y_train_smote)

# Print best parameters and score
print("\nBest Hyperparameters:")
print(grid_search.best_params_)
print(f"Best Cross-Validation F1-Weighted Score: {grid_search.best_score_:.4f}")

# Train best model on full SMOTE training data
best_rf = grid_search.best_estimator_
best_rf.fit(X_train_smote, y_train_smote)

Fitting 5 folds for each of 32 candidates, totalling 160 fits

Best Hyperparameters:
{'bootstrap': False, 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best Cross-Validation F1-Weighted Score: 0.8413


In [8]:
grid_search.best_score_

0.8413210177762288

In [3]:

# Evaluate best model on test set
y_pred = best_rf.predict(X_test)

# Compute overall metrics
accuracy = accuracy_score(y_test, y_pred)
precision_overall, recall_overall, f1_overall, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')

# Compute per-class metrics and specificity
precision, recall, f1, support = precision_recall_fscore_support(y_test, y_pred, average=None)
cm = confusion_matrix(y_test, y_pred)

# Compute specificity for each class
specificity = []
negative_counts = []
for i in range(2):  # Binary classification (0: Non-Diabetic, 1: Diabetic)
    tn = cm.sum() - (cm[i, :].sum() + cm[:, i].sum() - cm[i, i])  # Total - (row i + col i - TP)
    fp = cm[:, i].sum() - cm[i, i]  # Col i - TP
    specificity.append(tn / (tn + fp) if (tn + fp) > 0 else 0.0)
    negative_counts.append(tn + fp)  # Number of negative instances for class i

# Compute overall specificity (weighted by negative instances)
total_negative = sum(negative_counts)
specificity_overall = sum(
    spec * count for spec, count in zip(specificity, negative_counts)) / total_negative if total_negative > 0 else 0.0

# Print evaluation metrics
print("\nBest Random Forest Model Performance (Test Set):")
print("\nOverall Metrics:")
print(f"{'Metric':<15} {'Value':<10}")
print(f"{'Accuracy':<15} {accuracy:<10.4f}")
print(f"{'Precision':<15} {precision_overall:<10.4f}")
print(f"{'Recall':<15} {recall_overall:<10.4f}")
print(f"{'F1-Score':<15} {f1_overall:<10.4f}")
print(f"{'Specificity':<15} {specificity_overall:<10.4f}")

print("\nConfusion Matrix:")
print(f"{'':<15} {'Predicted Non-Diabetic':<22} {'Predicted Diabetic':<22}")
print(f"{'Actual Non-Diabetic':<15} {cm[0, 0]:<22} {cm[0, 1]:<22}")
print(f"{'Actual Diabetic':<15} {cm[1, 0]:<22} {cm[1, 1]:<22}")


Class distribution before SMOTE:
Outcome
0    0.65285
1    0.34715
Name: proportion, dtype: float64

Class distribution after SMOTE:
Outcome
1    0.5
0    0.5
Name: proportion, dtype: float64
Train set shape after SMOTE: (756, 6)
Fitting 5 folds for each of 32 candidates, totalling 160 fits

Best Hyperparameters:
{'bootstrap': False, 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best Cross-Validation F1-Weighted Score: 0.8413

Best Random Forest Model Performance (Test Set):

Overall Metrics:
Metric          Value     
Accuracy        0.7586    
Precision       0.7734    
Recall          0.7586    
F1-Score        0.7631    
Specificity     0.7586    

Confusion Matrix:
                Predicted Non-Diabetic Predicted Diabetic    
Actual Non-Diabetic 75                     22                    
Actual Diabetic 13                     35                    
