Data & Package Load

In [3]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency

Training_Data_Path = "https://raw.githubusercontent.com/NumanESchulich/SchulichDataScience/main/Data%20Science%20I%20(MBAN%206110T)/Group%20Assignment/Datasets/Train%20Dataset%20(Clustered%20%2B%20Feature%20Engineering%20%2B%20SMOTEENN).csv"
Testing_Data_Path = "https://raw.githubusercontent.com/NumanESchulich/SchulichDataScience/main/Data%20Science%20I%20(MBAN%206110T)/Group%20Assignment/Datasets/Untouched%20Test%20Data.csv"

# Read the train data file
df = pd.read_csv(Training_Data_Path)
test_df = pd.read_csv(Testing_Data_Path)



Chi-Square Test

In [4]:
# Selecting relevant columns for the ChiSquare test
relevant_columns = df.columns.drop(['ID', 'Diabetes_binary'])

# Performing ChiSquare test for each column
chi2_results = {}
for column in relevant_columns:
    contingency_table = pd.crosstab(df[column], df['Diabetes_binary'])
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    chi2_results[column] = {'chi2': chi2, 'p-value': p}

# Sorting the results by chi2 value
chi2_results_sorted = pd.DataFrame(chi2_results).T.sort_values(by='chi2', ascending=False)

# Displaying the sorted results
chi2_results_sorted

Unnamed: 0,chi2,p-value
PhysicalCondition,92089.220561,1.0621629999999999e-122
LackOfDisease,80808.650041,0.0
GenHlth,77403.395543,8.301916e-232
Income,63666.865784,1.0
BMI,62450.613724,1.256976e-11
Age,60481.270467,2.672064e-08
NoHighBP,59367.616335,0.0
Lifestyle,50214.844518,1.0
NoHighChol,42311.306944,0.0
PhysHlth,32988.537211,1.0


Modelling

In [6]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# Define features and target for training data
X_train = df[['PhysicalCondition', 'LackOfDisease', 'Age', 'BMI']]
y_train = df['Diabetes_binary']

# Define features and target for testing data
X_test = test_df[['PhysicalCondition', 'LackOfDisease', 'Age', 'BMI']]
y_test = test_df['Diabetes_binary']

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the models
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

# Train and evaluate each model
for name, model in models.items():
    # Train the model
    model.fit(X_train_scaled, y_train)
    # Make predictions
    y_pred = model.predict(X_test_scaled)
    # Evaluate the model
    print(f"Model: {name}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print("\n" + "-"*60 + "\n")

Model: Logistic Regression
Accuracy: 0.5853831598864712
              precision    recall  f1-score   support

           0       0.96      0.54      0.69     43667
           1       0.23      0.87      0.37      7069

    accuracy                           0.59     50736
   macro avg       0.60      0.71      0.53     50736
weighted avg       0.86      0.59      0.65     50736


------------------------------------------------------------

Model: Random Forest
Accuracy: 0.8089128035320088
              precision    recall  f1-score   support

           0       0.92      0.85      0.89     43667
           1       0.37      0.52      0.43      7069

    accuracy                           0.81     50736
   macro avg       0.64      0.69      0.66     50736
weighted avg       0.84      0.81      0.82     50736


------------------------------------------------------------

Model: Gradient Boosting
Accuracy: 0.7790523494165879
              precision    recall  f1-score   support

     

Hyperparameter Tuning

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define the parameter grid for Grid Search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the Random Forest model
rf = RandomForestClassifier(class_weight='balanced')

# Initialize Grid Search
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit Grid Search
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters and best estimator
best_params = grid_search.best_params_
best_rf = grid_search.best_estimator_

# Make predictions with the best model
y_pred = best_rf.predict(X_test_scaled)

# Evaluate the model
print("Selected Model: Random Forest Classifier")
print(f"Best Parameters: {best_params}")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 108 candidates, totalling 324 fits
Selected Model: Random Forest Classifier
Best Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}
Accuracy: 0.8423998738568275
              precision    recall  f1-score   support

           0       0.90      0.92      0.91     43667
           1       0.42      0.36      0.39      7069

    accuracy                           0.84     50736
   macro avg       0.66      0.64      0.65     50736
weighted avg       0.83      0.84      0.84     50736



In [16]:
# Select the specific columns for training
X_train = df[['PhysicalCondition', 'LackOfDisease', 'Age', 'BMI', 'Income']]
y_train = df['Diabetes_binary']  

# Select the specific columns for testing
X_test = test_df[['PhysicalCondition', 'LackOfDisease', 'Age', 'BMI', 'Income']]
y_test = test_df['Diabetes_binary']  

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the parameter grid for Grid Search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the Random Forest model
rf = RandomForestClassifier(class_weight='balanced')

# Initialize Grid Search
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit Grid Search
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters and best estimator
best_params = grid_search.best_params_
best_rf = grid_search.best_estimator_

# Make predictions with the best model
y_pred = best_rf.predict(X_test_scaled)

# Evaluate the model
print("Selected Model: Random Forest Classifier")
print(f"Best Parameters: {best_params}")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 108 candidates, totalling 324 fits
Selected Model: Random Forest Classifier
Best Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
Accuracy: 0.8314411857458215
              precision    recall  f1-score   support

           0       0.90      0.90      0.90     43667
           1       0.40      0.41      0.40      7069

    accuracy                           0.83     50736
   macro avg       0.65      0.66      0.65     50736
weighted avg       0.83      0.83      0.83     50736



All Columns

In [17]:
# Select the specific columns for training
X_train = df[['CholCheck', 'BMI', 'PhysActivity', 'Fruits', 'Veggies', 'AnyHealthcare', 'NoDocbcCost', 
    'GenHlth', 'MentHlth', 'PhysHlth', 'Sex', 'Age', 'Education', 'Income', 
    'NoDiffWalk', 'NoHighBP', 'NoHighChol', 'NoHeartDiseaseorAttack', 'NoStroke', 
    'NoSmoker', 'NoHvyAlcoholConsump']]
y_train = df['Diabetes_binary']  

# Select the specific columns for testing
X_test = test_df[['CholCheck', 'BMI', 'PhysActivity', 'Fruits', 'Veggies', 'AnyHealthcare', 'NoDocbcCost', 
    'GenHlth', 'MentHlth', 'PhysHlth', 'Sex', 'Age', 'Education', 'Income', 
    'NoDiffWalk', 'NoHighBP', 'NoHighChol', 'NoHeartDiseaseorAttack', 'NoStroke', 
    'NoSmoker', 'NoHvyAlcoholConsump']]
y_test = test_df['Diabetes_binary']  

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the parameter grid for Grid Search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the Random Forest model
rf = RandomForestClassifier(class_weight='balanced')

# Initialize Grid Search
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit Grid Search
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters and best estimator
best_params = grid_search.best_params_
best_rf = grid_search.best_estimator_

# Make predictions with the best model
y_pred = best_rf.predict(X_test_scaled)

# Evaluate the model
print("Selected Model: Random Forest Classifier")
print(f"Best Parameters: {best_params}")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 108 candidates, totalling 324 fits
Selected Model: Random Forest Classifier
Best Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100}
Accuracy: 0.8184523809523809
              precision    recall  f1-score   support

           0       0.92      0.87      0.89     43667
           1       0.39      0.52      0.44      7069

    accuracy                           0.82     50736
   macro avg       0.65      0.69      0.67     50736
weighted avg       0.84      0.82      0.83     50736

