In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import os

# Read the data file
df = pd.read_csv("https://raw.githubusercontent.com/NumanESchulich/SchulichDataScience/main/Data%20Science%20I%20(MBAN%206110T)/Group%20Assignment/Datasets/Full%20Dataset%20(RAW).csv")

# Reversing some Binary Columns (Changing column names and reversing values for specific columns)
columns_to_reverse = {
    'DiffWalk': 'NoDiffWalk',
    'HighBP': 'NoHighBP',
    'HighChol': 'NoHighChol',
    'HeartDiseaseorAttack': 'NoHeartDiseaseorAttack',
    'Stroke': 'NoStroke',
    'Smoker': 'NoSmoker',
    'HvyAlcoholConsump': 'NoHvyAlcoholConsump'
}

for old_col, new_col in columns_to_reverse.items():
    df[new_col] = 1 - df[old_col]
    df.drop(columns=[old_col], inplace=True)

# Clustering adjustments

# GenHlth scaling
gen_hlth_scale = {1: 1, 2: 0.75, 3: 0.5, 4: 0.25, 5: 0}
df['GenHlth'] = df['GenHlth'].map(gen_hlth_scale)

# PhysHlth scaling
def phys_ment_hlth_scale(days):
    if 1 <= days <= 6:
        return 1
    elif 7 <= days <= 12:
        return 0.75
    elif 13 <= days <= 18:
        return 0.5
    elif 19 <= days <= 24:
        return 0.25
    elif 25 <= days <= 30:
        return 0
    return days

df['PhysHlth'] = df['PhysHlth'].apply(phys_ment_hlth_scale)

# MentHlth scaling
df['MentHlth'] = df['MentHlth'].apply(phys_ment_hlth_scale)

# BMI scaling
def bmi_scale(bmi):
    if bmi < 18.5:
        return 1
    elif 18.5 <= bmi <= 24.9:
        return 1
    elif 25 <= bmi <= 29.9:
        return 0.5
    elif 30 <= bmi <= 39.9:
        return 0.25
    elif bmi >= 40:
        return 0
    return bmi

df['BMI'] = df['BMI'].apply(bmi_scale)

# Adding the NotOverweight column
df['NotOverweight'] = df['BMI'].apply(lambda x: 1 if x < 25 else 0)

# Income scaling
income_scale = {1: 0, 2: 0, 3: 0.25, 4: 0.25, 5: 0.5, 6: 0.75, 7: 0.75, 8: 1}
df['Income'] = df['Income'].map(income_scale)

# Education scaling
education_scale = {1: 0, 2: 0, 3: 0.25, 4: 0.75, 5: 1, 6: 1}
df['Education'] = df['Education'].map(education_scale)

# Age scaling
age_scale = {1: 1, 2: 1, 3: 1, 4: 0.75, 5: 0.75, 6: 0.75, 7: 0.5, 8: 0.5, 9: 0.5, 10: 0.25, 11: 0.25, 12: 0.25, 13: 0}
df['Age'] = df['Age'].map(age_scale)

# Feature Engineered Columns
df['PhysicalCondition'] = (df['GenHlth'] + df['NoDiffWalk'] + df['PhysHlth'] + df['PhysActivity']) / 4
df['NoDisease'] = (df['NoHighBP'] + df['NoHighChol'] + df['NoHeartDiseaseorAttack'] + df['NoStroke']) / 4
df['Lifestyle'] = (df['NoSmoker'] + df['NoHvyAlcoholConsump'] + df['Veggies'] + df['Fruits']) / 4

# Splitting the dataset into features and target variable
X = df.drop(columns=['Diabetes_binary'])
y = df['Diabetes_binary']

# Split off 20% of the data for validation later on
X_temp, X_val, y_temp, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling the data
scaler = StandardScaler()
X_temp = scaler.fit_transform(X_temp)
X_val = scaler.transform(X_val)

# Applying SMOTE to the remaining 80% of the data to oversample the minority class significantly
smote = SMOTE(sampling_strategy=1.0, random_state=42)  # Adjust sampling_strategy to oversample more
X_resampled, y_resampled = smote.fit_resample(X_temp, y_temp)

# Splitting the resampled dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Training the Random Forest model
model = RandomForestClassifier(random_state=42, n_estimators=200)
model.fit(X_train, y_train)

# Predicting and evaluating on the test set
y_test_pred = model.predict(X_test)
print("Test Set Evaluation")
print(f"Accuracy: {accuracy_score(y_test, y_test_pred)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_test_pred)}")
print(f"Classification Report:\n{classification_report(y_test, y_test_pred)}")

# Predicting and evaluating on the untouched validation set
y_val_pred = model.predict(X_val)
print("\nValidation Set Evaluation")
print(f"Accuracy: {accuracy_score(y_val, y_val_pred)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_val, y_val_pred)}")
print(f"Classification Report:\n{classification_report(y_val, y_val_pred)}")

# Identifying rows in the validation set where the model incorrectly predicted the 1s (false negatives)
false_negatives = (y_val == 1) & (y_val_pred == 0)
false_negative_rows = X_val[false_negatives]

# Converting scaled data back to original scale for saving
false_negative_rows_original_scale = scaler.inverse_transform(false_negative_rows)

# Converting to DataFrame for saving
false_negative_rows_df = pd.DataFrame(false_negative_rows_original_scale, columns=df.drop(columns=['Diabetes_binary']).columns)

# Saving the false negative rows to a CSV file on the desktop
file_path = r"C:\Users\Numan\Desktop\false_negative_rows_random_classifier.csv"
false_negative_rows_df.to_csv(file_path, index=False)

Test Set Evaluation
Accuracy: 0.913528451559323
Confusion Matrix:
[[32777  2112]
 [ 3927 31022]]
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.94      0.92     34889
           1       0.94      0.89      0.91     34949

    accuracy                           0.91     69838
   macro avg       0.91      0.91      0.91     69838
weighted avg       0.91      0.91      0.91     69838


Validation Set Evaluation
Accuracy: 0.8433262377798801
Confusion Matrix:
[[40910  2829]
 [ 5120  1877]]
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.94      0.91     43739
           1       0.40      0.27      0.32      6997

    accuracy                           0.84     50736
   macro avg       0.64      0.60      0.62     50736
weighted avg       0.82      0.84      0.83     50736



Adjusted Feature Engineering & Selection

In [2]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import os

# Read the data file
df = pd.read_csv("https://raw.githubusercontent.com/NumanESchulich/SchulichDataScience/main/Data%20Science%20I%20(MBAN%206110T)/Group%20Assignment/Datasets/Full%20Dataset%20(RAW).csv")

# Reversing some Binary Columns (Changing column names and reversing values for specific columns)
columns_to_reverse = {
    'DiffWalk': 'NoDiffWalk',
    'HighBP': 'NoHighBP',
    'HighChol': 'NoHighChol',
    'HeartDiseaseorAttack': 'NoHeartDiseaseorAttack',
    'Stroke': 'NoStroke',
    'Smoker': 'NoSmoker',
    'HvyAlcoholConsump': 'NoHvyAlcoholConsump'
}

for old_col, new_col in columns_to_reverse.items():
    df[new_col] = 1 - df[old_col]
    df.drop(columns=[old_col], inplace=True)

# Clustering adjustments

# GenHlth scaling
gen_hlth_scale = {1: 1, 2: 0.75, 3: 0.5, 4: 0.25, 5: 0}
df['GenHlth'] = df['GenHlth'].map(gen_hlth_scale)

# PhysHlth scaling
def phys_ment_hlth_scale(days):
    if 1 <= days <= 6:
        return 1
    elif 7 <= days <= 12:
        return 0.75
    elif 13 <= days <= 18:
        return 0.5
    elif 19 <= days <= 24:
        return 0.25
    elif 25 <= days <= 30:
        return 0
    return days

df['PhysHlth'] = df['PhysHlth'].apply(phys_ment_hlth_scale)

# MentHlth scaling
df['MentHlth'] = df['MentHlth'].apply(phys_ment_hlth_scale)

# Adding the NotObese column
df['NotObese'] = df['BMI'].apply(lambda x: 1 if x < 30 else 0)

# Income scaling
income_scale = {1: 0, 2: 0, 3: 0.25, 4: 0.25, 5: 0.5, 6: 0.75, 7: 0.75, 8: 1}
df['Income'] = df['Income'].map(income_scale)

# Education scaling
education_scale = {1: 0, 2: 0, 3: 0.25, 4: 0.75, 5: 1, 6: 1}
df['Education'] = df['Education'].map(education_scale)

# Age scaling
age_scale = {1: 1, 2: 1, 3: 1, 4: 0.75, 5: 0.75, 6: 0.75, 7: 0.5, 8: 0.5, 9: 0.5, 10: 0.25, 11: 0, 12: 0, 13: 0}
df['Age'] = df['Age'].map(age_scale)

# Feature Engineered Columns
df['PhysicalCondition'] = (df['GenHlth'] + df['PhysHlth']) / 2
df['NoDisease'] = (df['NoHighBP'] + df['NoHighChol']) / 2
df['Lifestyle'] = (df['NoSmoker'] + df['Fruits']) / 2

# Splitting the dataset into features and target variable
X = df[['Age', 'MentHlth', 'NotObese', 'NoDocbcCost', 'PhysicalCondition', 'Lifestyle', 'NoDisease']]
y = df['Diabetes_binary']

# Split off 20% of the data for validation later on
X_temp, X_val, y_temp, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling the data
scaler = StandardScaler()
X_temp = scaler.fit_transform(X_temp)
X_val = scaler.transform(X_val)

# Applying SMOTE to the remaining 80% of the data to oversample the minority class significantly
smote = SMOTE(sampling_strategy=1.0, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_temp, y_temp)

# Splitting the resampled dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Training the Random Forest model
model = RandomForestClassifier(random_state=42, n_estimators=200)
model.fit(X_train, y_train)

# Predicting and evaluating on the test set
y_test_pred = model.predict(X_test)
print("Test Set Evaluation")
print(f"Accuracy: {accuracy_score(y_test, y_test_pred)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_test_pred)}")
print(f"Classification Report:\n{classification_report(y_test, y_test_pred)}")

# Predicting and evaluating on the untouched validation set
y_val_pred = model.predict(X_val)
print("\nValidation Set Evaluation")
print(f"Accuracy: {accuracy_score(y_val, y_val_pred)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_val, y_val_pred)}")
print(f"Classification Report:\n{classification_report(y_val, y_val_pred)}")

Test Set Evaluation
Accuracy: 0.7561069904636444
Confusion Matrix:
[[25111  9778]
 [ 7255 27694]]
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.72      0.75     34889
           1       0.74      0.79      0.76     34949

    accuracy                           0.76     69838
   macro avg       0.76      0.76      0.76     69838
weighted avg       0.76      0.76      0.76     69838


Validation Set Evaluation
Accuracy: 0.714423683380637
Confusion Matrix:
[[31236 12503]
 [ 1986  5011]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.71      0.81     43739
           1       0.29      0.72      0.41      6997

    accuracy                           0.71     50736
   macro avg       0.61      0.72      0.61     50736
weighted avg       0.85      0.71      0.76     50736



Further Adjustments

In [3]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Read the data file
df = pd.read_csv("https://raw.githubusercontent.com/NumanESchulich/SchulichDataScience/main/Data%20Science%20I%20(MBAN%206110T)/Group%20Assignment/Datasets/Full%20Dataset%20(RAW).csv")

# Reversing some Binary Columns (Changing column names and reversing values for specific columns)
columns_to_reverse = {
    'DiffWalk': 'NoDiffWalk',
    'HighBP': 'NoHighBP',
    'HighChol': 'NoHighChol',
    'HeartDiseaseorAttack': 'NoHeartDiseaseorAttack',
    'Stroke': 'NoStroke',
    'Smoker': 'NoSmoker',
    'HvyAlcoholConsump': 'NoHvyAlcoholConsump'
}

for old_col, new_col in columns_to_reverse.items():
    df[new_col] = 1 - df[old_col]
    df.drop(columns=[old_col], inplace=True)

# Scaling adjustments

# PhysHlth scaling
def phys_ment_hlth_scale(days):
    if 1 <= days <= 6:
        return 1
    elif 7 <= days <= 12:
        return 0.75
    elif 13 <= days <= 18:
        return 0.5
    elif 19 <= days <= 24:
        return 0.25
    elif 25 <= days <= 30:
        return 0
    return days

df['PhysHlth'] = df['PhysHlth'].apply(phys_ment_hlth_scale)

# MentHlth scaling
df['MentHlth'] = df['MentHlth'].apply(phys_ment_hlth_scale)

# Adding the NotObese column
df['NotObese'] = df['BMI'].apply(lambda x: 1 if x < 30 else 0)

# Age scaling
age_scale = {1: 1, 2: 1, 3: 1, 4: 0.75, 5: 0.75, 6: 0.75, 7: 0.5, 8: 0.5, 9: 0.5, 10: 0.25, 11: 0, 12: 0, 13: 0}
df['Age'] = df['Age'].map(age_scale)

# Feature Engineered Columns
df['PhysicalCondition'] = (df['GenHlth'] + df['PhysHlth']) / 2
df['NoDisease'] = (df['NoHighBP'] + df['NoHighChol']) / 2
df['Lifestyle'] = (df['NoSmoker'] + df['Fruits']) / 2

# Splitting the dataset into features and target variable
X = df[['Age', 'MentHlth', 'NotObese', 'NoDocbcCost', 'PhysicalCondition', 'NoDisease', 'PhysHlth', 'Lifestyle', 'GenHlth', 'Income', 'Fruits']]
y = df['Diabetes_binary']

# Split off 20% of the data for validation later on
X_temp, X_val, y_temp, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling the data
scaler = StandardScaler()
X_temp = scaler.fit_transform(X_temp)
X_val = scaler.transform(X_val)

# Applying SMOTE to the remaining 80% of the data to oversample the minority class significantly
smote = SMOTE(sampling_strategy=1.0, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_temp, y_temp)

# Splitting the resampled dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# List of models to train
models = [
    ("Logistic Regression", LogisticRegression(class_weight='balanced', max_iter=200, random_state=42)),
    ("Random Forest", RandomForestClassifier(class_weight={0: 1, 1: 2}, random_state=42, n_estimators=200, max_features='sqrt', max_depth=10, min_samples_split=5)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
    ("KNN", KNeighborsClassifier())
]

# Function to train and evaluate a model
def evaluate_model(model, X_train, y_train, X_val, y_val):
    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_val)
    
    report = classification_report(y_val, y_val_pred, output_dict=True)
    accuracy = accuracy_score(y_val, y_val_pred)
    
    return {
        "Model": model.__class__.__name__,
        "Accuracy": accuracy,
        "Class 0 Precision": report['0']['precision'],
        "Class 0 Recall": report['0']['recall'],
        "Class 0 F1-Score": report['0']['f1-score'],
        "Class 1 Precision": report['1']['precision'],
        "Class 1 Recall": report['1']['recall'],
        "Class 1 F1-Score": report['1']['f1-score']
    }

# Evaluate all models and collect the results
results = []
for name, model in models:
    result = evaluate_model(model, X_train, y_train, X_val, y_val)
    results.append(result)

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Create the accuracy table
accuracy_df = results_df[['Model', 'Accuracy']].copy()
accuracy_df.loc[:, 'Accuracy'] = accuracy_df['Accuracy'] * 100  # Convert to percentage

# Create the detailed metrics table
detailed_metrics_df = results_df.melt(id_vars=["Model"], var_name="Metric", value_name="Score")
detailed_metrics_df[['Class', 'Metric']] = detailed_metrics_df['Metric'].str.extract(r'Class (\d) (.+)')
detailed_metrics_df = detailed_metrics_df.pivot_table(index=['Model', 'Class'], columns='Metric', values='Score').reset_index()

# Display the results
print("Accuracy Table")
print(accuracy_df)

print("\nDetailed Metrics Table")
print(detailed_metrics_df)

Accuracy Table
                        Model   Accuracy
0          LogisticRegression  72.004100
1      RandomForestClassifier  60.842400
2  GradientBoostingClassifier  74.722091
3        KNeighborsClassifier  73.982971

Detailed Metrics Table
Metric                       Model Class  F1-Score  Precision    Recall
0       GradientBoostingClassifier     0  0.836522   0.945293  0.750200
1       GradientBoostingClassifier     1  0.442900   0.318148  0.728598
2             KNeighborsClassifier     0  0.834893   0.921705  0.763026
3             KNeighborsClassifier     1  0.386731   0.286501  0.594826
4               LogisticRegression     0  0.814045   0.952366  0.710807
5               LogisticRegression     1  0.433833   0.300813  0.777762
6           RandomForestClassifier     0  0.712535   0.970440  0.562930
7           RandomForestClassifier     1  0.386082   0.246294  0.892811
