Data Preperation, Feature Engineering, SMOTE-ENN --- Logistic Regression

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTEENN
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Read the data file
df = pd.read_csv("https://raw.githubusercontent.com/NumanESchulich/SchulichDataScience/main/Data%20Science%20I%20(MBAN%206110T)/Group%20Assignment/Datasets/Full%20Dataset%20(RAW).csv")

# Reversing some Binary Columns (Changing column names and reversing values for specific columns)
columns_to_reverse = {
    'DiffWalk': 'NoDiffWalk',
    'HighBP': 'NoHighBP',
    'HighChol': 'NoHighChol',
    'HeartDiseaseorAttack': 'NoHeartDiseaseorAttack',
    'Stroke': 'NoStroke',
    'Smoker': 'NoSmoker',
    'HvyAlcoholConsump': 'NoHvyAlcoholConsump'
}

for old_col, new_col in columns_to_reverse.items():
    df[new_col] = 1 - df[old_col]
    df.drop(columns=[old_col], inplace=True)

# Clustering adjustments

# GenHlth scaling
gen_hlth_scale = {1: 1, 2: 0.75, 3: 0.5, 4: 0.25, 5: 0}
df['GenHlth'] = df['GenHlth'].map(gen_hlth_scale)

# PhysHlth scaling
def phys_ment_hlth_scale(days):
    if 1 <= days <= 6:
        return 1
    elif 7 <= days <= 12:
        return 0.75
    elif 13 <= days <= 18:
        return 0.5
    elif 19 <= days <= 24:
        return 0.25
    elif 25 <= days <= 30:
        return 0
    return days

df['PhysHlth'] = df['PhysHlth'].apply(phys_ment_hlth_scale)

# MentHlth scaling
df['MentHlth'] = df['MentHlth'].apply(phys_ment_hlth_scale)

# BMI scaling
def bmi_scale(bmi):
    if bmi < 18.5:
        return 1
    elif 18.5 <= bmi <= 24.9:
        return 1
    elif 25 <= bmi <= 29.9:
        return 0.5
    elif 30 <= bmi <= 39.9:
        return 0.25
    elif bmi >= 40:
        return 0
    return bmi

df['BMI'] = df['BMI'].apply(bmi_scale)

# Adding the NotOverweight column
df['NotOverweight'] = df['BMI'].apply(lambda x: 1 if x < 25 else 0)

# Income scaling
income_scale = {1: 0, 2: 0, 3: 0.25, 4: 0.25, 5: 0.5, 6: 0.75, 7: 0.75, 8: 1}
df['Income'] = df['Income'].map(income_scale)

# Education scaling
education_scale = {1: 0, 2: 0, 3: 0.25, 4: 0.75, 5: 1, 6: 1}
df['Education'] = df['Education'].map(education_scale)

# Age scaling
age_scale = {1: 1, 2: 1, 3: 1, 4: 0.75, 5: 0.75, 6: 0.75, 7: 0.5, 8: 0.5, 9: 0.5, 10: 0.25, 11: 0.25, 12: 0.25, 13: 0}
df['Age'] = df['Age'].map(age_scale)

# Feature Engineered Columns
df['PhysicalCondition'] = (df['GenHlth'] + df['NoDiffWalk'] + df['PhysHlth'] + df['PhysActivity']) / 4
df['NoDisease'] = (df['NoHighBP'] + df['NoHighChol'] + df['NoHeartDiseaseorAttack'] + df['NoStroke']) / 4
df['Lifestyle'] = (df['NoSmoker'] + df['NoHvyAlcoholConsump'] + df['Veggies'] + df['Fruits']) / 4

# Splitting the dataset into features and target variable
X = df.drop(columns=['Diabetes_binary'])
y = df['Diabetes_binary']

# Applying SMOTE-ENN
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

# Combining resampled features and target into a single DataFrame
df_resampled = pd.concat([X_resampled, y_resampled], axis=1)

# Splitting the resampled dataset into training and validation sets
train_df, val_df = train_test_split(df_resampled, test_size=0.2, random_state=42)

# Training the Logistic Regression model
X_train = train_df.drop(columns=['Diabetes_binary'])
y_train = train_df['Diabetes_binary']
X_val = val_df.drop(columns=['Diabetes_binary'])
y_val = val_df['Diabetes_binary']

model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train, y_train)

# Predicting and evaluating on the training set
y_train_pred = model.predict(X_train)
print("Training Set Evaluation")
print(f"Accuracy: {accuracy_score(y_train, y_train_pred)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_train, y_train_pred)}")
print(f"Classification Report:\n{classification_report(y_train, y_train_pred)}")

# Predicting and evaluating on the validation set
y_val_pred = model.predict(X_val)
print("\nValidation Set Evaluation")
print(f"Accuracy: {accuracy_score(y_val, y_val_pred)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_val, y_val_pred)}")
print(f"Classification Report:\n{classification_report(y_val, y_val_pred)}")

Training Set Evaluation
Accuracy: 0.9100520150547639
Confusion Matrix:
[[ 40545   7626]
 [  7263 110095]]
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.84      0.84     48171
           1       0.94      0.94      0.94    117358

    accuracy                           0.91    165529
   macro avg       0.89      0.89      0.89    165529
weighted avg       0.91      0.91      0.91    165529


Validation Set Evaluation
Accuracy: 0.9104704830485948
Confusion Matrix:
[[10059  1878]
 [ 1827 27619]]
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.84      0.84     11937
           1       0.94      0.94      0.94     29446

    accuracy                           0.91     41383
   macro avg       0.89      0.89      0.89     41383
weighted avg       0.91      0.91      0.91     41383



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Adjustment with Features & SMOTE-ENN

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTEENN
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Read the data file
df = pd.read_csv("https://raw.githubusercontent.com/NumanESchulich/SchulichDataScience/main/Data%20Science%20I%20(MBAN%206110T)/Group%20Assignment/Datasets/Full%20Dataset%20(RAW).csv")

# Reversing some Binary Columns (Changing column names and reversing values for specific columns)
columns_to_reverse = {
    'DiffWalk': 'NoDiffWalk',
    'HighBP': 'NoHighBP',
    'HighChol': 'NoHighChol',
    'HeartDiseaseorAttack': 'NoHeartDiseaseorAttack',
    'Stroke': 'NoStroke',
    'Smoker': 'NoSmoker',
    'HvyAlcoholConsump': 'NoHvyAlcoholConsump'
}

for old_col, new_col in columns_to_reverse.items():
    df[new_col] = 1 - df[old_col]
    df.drop(columns=[old_col], inplace=True)

# Clustering adjustments

# GenHlth scaling
gen_hlth_scale = {1: 1, 2: 0.75, 3: 0.5, 4: 0.25, 5: 0}
df['GenHlth'] = df['GenHlth'].map(gen_hlth_scale)

# PhysHlth scaling
def phys_ment_hlth_scale(days):
    if 1 <= days <= 6:
        return 1
    elif 7 <= days <= 12:
        return 0.75
    elif 13 <= days <= 18:
        return 0.5
    elif 19 <= days <= 24:
        return 0.25
    elif 25 <= days <= 30:
        return 0
    return days

df['PhysHlth'] = df['PhysHlth'].apply(phys_ment_hlth_scale)

# MentHlth scaling
df['MentHlth'] = df['MentHlth'].apply(phys_ment_hlth_scale)

# BMI scaling
def bmi_scale(bmi):
    if bmi < 18.5:
        return 1
    elif 18.5 <= bmi <= 24.9:
        return 1
    elif 25 <= bmi <= 29.9:
        return 0.5
    elif 30 <= bmi <= 39.9:
        return 0.25
    elif bmi >= 40:
        return 0
    return bmi

df['BMI'] = df['BMI'].apply(bmi_scale)

# Adding the NotOverweight column
df['NotOverweight'] = df['BMI'].apply(lambda x: 1 if x < 25 else 0)

# Income scaling
income_scale = {1: 0, 2: 0, 3: 0.25, 4: 0.25, 5: 0.5, 6: 0.75, 7: 0.75, 8: 1}
df['Income'] = df['Income'].map(income_scale)

# Education scaling
education_scale = {1: 0, 2: 0, 3: 0.25, 4: 0.75, 5: 1, 6: 1}
df['Education'] = df['Education'].map(education_scale)

# Age scaling
age_scale = {1: 1, 2: 1, 3: 1, 4: 0.75, 5: 0.75, 6: 0.75, 7: 0.5, 8: 0.5, 9: 0.5, 10: 0.25, 11: 0.25, 12: 0.25, 13: 0}
df['Age'] = df['Age'].map(age_scale)

# Feature Engineered Columns
df['PhysicalCondition'] = (df['GenHlth'] + df['NoDiffWalk'] + df['PhysHlth'] + df['PhysActivity']) / 4
df['NoDisease'] = (df['NoHighBP'] + df['NoHighChol'] + df['NoHeartDiseaseorAttack'] + df['NoStroke']) / 4
df['Lifestyle'] = (df['NoSmoker'] + df['NoHvyAlcoholConsump'] + df['Veggies'] + df['Fruits']) / 4

# Splitting the dataset into features and target variable
X = df[['NotOverweight', 'PhysicalCondition', 'NoDisease', 'Lifestyle']]
y = df['Diabetes_binary']

# Split off 20% of the data for validation later on
X_temp, X_val, y_temp, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Applying SMOTE-ENN to the remaining 80% of the data
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_temp, y_temp)

# Splitting the resampled dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Training the Logistic Regression model
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train, y_train)

# Predicting and evaluating on the test set
y_test_pred = model.predict(X_test)
print("Test Set Evaluation")
print(f"Accuracy: {accuracy_score(y_test, y_test_pred)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_test_pred)}")
print(f"Classification Report:\n{classification_report(y_test, y_test_pred)}")

# Predicting and evaluating on the untouched validation set
y_val_pred = model.predict(X_val)
print("\nValidation Set Evaluation")
print(f"Accuracy: {accuracy_score(y_val, y_val_pred)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_val, y_val_pred)}")
print(f"Classification Report:\n{classification_report(y_val, y_val_pred)}")

Test Set Evaluation
Accuracy: 0.9762388677432468
Confusion Matrix:
[[25666   182]
 [  461   752]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     25848
           1       0.81      0.62      0.70      1213

    accuracy                           0.98     27061
   macro avg       0.89      0.81      0.84     27061
weighted avg       0.97      0.98      0.97     27061


Validation Set Evaluation
Accuracy: 0.8527475559760328
Confusion Matrix:
[[42252  1487]
 [ 5984  1013]]
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.97      0.92     43739
           1       0.41      0.14      0.21      6997

    accuracy                           0.85     50736
   macro avg       0.64      0.56      0.57     50736
weighted avg       0.81      0.85      0.82     50736



SMOTE instead of SMOTE-ENN - I

In [2]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Read the data file
df = pd.read_csv("https://raw.githubusercontent.com/NumanESchulich/SchulichDataScience/main/Data%20Science%20I%20(MBAN%206110T)/Group%20Assignment/Datasets/Full%20Dataset%20(RAW).csv")

# Reversing some Binary Columns (Changing column names and reversing values for specific columns)
columns_to_reverse = {
    'DiffWalk': 'NoDiffWalk',
    'HighBP': 'NoHighBP',
    'HighChol': 'NoHighChol',
    'HeartDiseaseorAttack': 'NoHeartDiseaseorAttack',
    'Stroke': 'NoStroke',
    'Smoker': 'NoSmoker',
    'HvyAlcoholConsump': 'NoHvyAlcoholConsump'
}

for old_col, new_col in columns_to_reverse.items():
    df[new_col] = 1 - df[old_col]
    df.drop(columns=[old_col], inplace=True)

# Clustering adjustments

# GenHlth scaling
gen_hlth_scale = {1: 1, 2: 0.75, 3: 0.5, 4: 0.25, 5: 0}
df['GenHlth'] = df['GenHlth'].map(gen_hlth_scale)

# PhysHlth scaling
def phys_ment_hlth_scale(days):
    if 1 <= days <= 6:
        return 1
    elif 7 <= days <= 12:
        return 0.75
    elif 13 <= days <= 18:
        return 0.5
    elif 19 <= days <= 24:
        return 0.25
    elif 25 <= days <= 30:
        return 0
    return days

df['PhysHlth'] = df['PhysHlth'].apply(phys_ment_hlth_scale)

# MentHlth scaling
df['MentHlth'] = df['MentHlth'].apply(phys_ment_hlth_scale)

# BMI scaling
def bmi_scale(bmi):
    if bmi < 18.5:
        return 1
    elif 18.5 <= bmi <= 24.9:
        return 1
    elif 25 <= bmi <= 29.9:
        return 0.5
    elif 30 <= bmi <= 39.9:
        return 0.25
    elif bmi >= 40:
        return 0
    return bmi

df['BMI'] = df['BMI'].apply(bmi_scale)

# Adding the NotOverweight column
df['NotOverweight'] = df['BMI'].apply(lambda x: 1 if x < 25 else 0)

# Income scaling
income_scale = {1: 0, 2: 0, 3: 0.25, 4: 0.25, 5: 0.5, 6: 0.75, 7: 0.75, 8: 1}
df['Income'] = df['Income'].map(income_scale)

# Education scaling
education_scale = {1: 0, 2: 0, 3: 0.25, 4: 0.75, 5: 1, 6: 1}
df['Education'] = df['Education'].map(education_scale)

# Age scaling
age_scale = {1: 1, 2: 1, 3: 1, 4: 0.75, 5: 0.75, 6: 0.75, 7: 0.5, 8: 0.5, 9: 0.5, 10: 0.25, 11: 0.25, 12: 0.25, 13: 0}
df['Age'] = df['Age'].map(age_scale)

# Feature Engineered Columns
df['PhysicalCondition'] = (df['GenHlth'] + df['NoDiffWalk'] + df['PhysHlth'] + df['PhysActivity']) / 4
df['NoDisease'] = (df['NoHighBP'] + df['NoHighChol'] + df['NoHeartDiseaseorAttack'] + df['NoStroke']) / 4
df['Lifestyle'] = (df['NoSmoker'] + df['NoHvyAlcoholConsump'] + df['Veggies'] + df['Fruits']) / 4

# Splitting the dataset into features and target variable
X = df[['NotOverweight', 'PhysicalCondition', 'NoDisease', 'Lifestyle']]
y = df['Diabetes_binary']

# Split off 20% of the data for validation later on
X_temp, X_val, y_temp, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Applying SMOTE to the remaining 80% of the data to double the minority class
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_temp, y_temp)

# Splitting the resampled dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Training the Logistic Regression model
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train, y_train)

# Predicting and evaluating on the test set
y_test_pred = model.predict(X_test)
print("Test Set Evaluation")
print(f"Accuracy: {accuracy_score(y_test, y_test_pred)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_test_pred)}")
print(f"Classification Report:\n{classification_report(y_test, y_test_pred)}")

# Predicting and evaluating on the untouched validation set
y_val_pred = model.predict(X_val)
print("\nValidation Set Evaluation")
print(f"Accuracy: {accuracy_score(y_val, y_val_pred)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_val, y_val_pred)}")
print(f"Classification Report:\n{classification_report(y_val, y_val_pred)}")

Test Set Evaluation
Accuracy: 0.7022967438930096
Confusion Matrix:
[[24677 10212]
 [10579 24370]]
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.71      0.70     34889
           1       0.70      0.70      0.70     34949

    accuracy                           0.70     69838
   macro avg       0.70      0.70      0.70     69838
weighted avg       0.70      0.70      0.70     69838


Validation Set Evaluation
Accuracy: 0.703307316304005
Confusion Matrix:
[[30707 13032]
 [ 2021  4976]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.70      0.80     43739
           1       0.28      0.71      0.40      6997

    accuracy                           0.70     50736
   macro avg       0.61      0.71      0.60     50736
weighted avg       0.85      0.70      0.75     50736



SMOTE II

In [10]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import os

# Read the data file
df = pd.read_csv("https://raw.githubusercontent.com/NumanESchulich/SchulichDataScience/main/Data%20Science%20I%20(MBAN%206110T)/Group%20Assignment/Datasets/Full%20Dataset%20(RAW).csv")

# Reversing some Binary Columns (Changing column names and reversing values for specific columns)
columns_to_reverse = {
    'DiffWalk': 'NoDiffWalk',
    'HighBP': 'NoHighBP',
    'HighChol': 'NoHighChol',
    'HeartDiseaseorAttack': 'NoHeartDiseaseorAttack',
    'Stroke': 'NoStroke',
    'Smoker': 'NoSmoker',
    'HvyAlcoholConsump': 'NoHvyAlcoholConsump'
}

for old_col, new_col in columns_to_reverse.items():
    df[new_col] = 1 - df[old_col]
    df.drop(columns=[old_col], inplace=True)

# Clustering adjustments

# GenHlth scaling
gen_hlth_scale = {1: 1, 2: 0.75, 3: 0.5, 4: 0.25, 5: 0}
df['GenHlth'] = df['GenHlth'].map(gen_hlth_scale)

# PhysHlth scaling
def phys_ment_hlth_scale(days):
    if 1 <= days <= 6:
        return 1
    elif 7 <= days <= 12:
        return 0.75
    elif 13 <= days <= 18:
        return 0.5
    elif 19 <= days <= 24:
        return 0.25
    elif 25 <= days <= 30:
        return 0
    return days

df['PhysHlth'] = df['PhysHlth'].apply(phys_ment_hlth_scale)

# MentHlth scaling
df['MentHlth'] = df['MentHlth'].apply(phys_ment_hlth_scale)

# BMI scaling
def bmi_scale(bmi):
    if bmi < 18.5:
        return 1
    elif 18.5 <= bmi <= 24.9:
        return 1
    elif 25 <= bmi <= 29.9:
        return 0.5
    elif 30 <= bmi <= 39.9:
        return 0.25
    elif bmi >= 40:
        return 0
    return bmi

df['BMI'] = df['BMI'].apply(bmi_scale)

# Adding the NotOverweight column
df['NotOverweight'] = df['BMI'].apply(lambda x: 1 if x < 25 else 0)

# Income scaling
income_scale = {1: 0, 2: 0, 3: 0.25, 4: 0.25, 5: 0.5, 6: 0.75, 7: 0.75, 8: 1}
df['Income'] = df['Income'].map(income_scale)

# Education scaling
education_scale = {1: 0, 2: 0, 3: 0.25, 4: 0.75, 5: 1, 6: 1}
df['Education'] = df['Education'].map(education_scale)

# Age scaling
age_scale = {1: 1, 2: 1, 3: 1, 4: 0.75, 5: 0.75, 6: 0.75, 7: 0.5, 8: 0.5, 9: 0.5, 10: 0.25, 11: 0.25, 12: 0.25, 13: 0}
df['Age'] = df['Age'].map(age_scale)

# Feature Engineered Columns
df['PhysicalCondition'] = (df['GenHlth'] + df['NoDiffWalk'] + df['PhysHlth'] + df['PhysActivity']) / 4
df['NoDisease'] = (df['NoHighBP'] + df['NoHighChol'] + df['NoHeartDiseaseorAttack'] + df['NoStroke']) / 4
df['Lifestyle'] = (df['NoSmoker'] + df['NoHvyAlcoholConsump'] + df['Veggies'] + df['Fruits']) / 4

# Splitting the dataset into features and target variable
X = df.drop(columns=['Diabetes_binary'])
y = df['Diabetes_binary']

# Split off 20% of the data for validation later on
X_temp, X_val, y_temp, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling the data
scaler = StandardScaler()
X_temp = scaler.fit_transform(X_temp)
X_val = scaler.transform(X_val)

# Applying SMOTE to the remaining 80% of the data to oversample the minority class significantly
smote = SMOTE(sampling_strategy=1.0, random_state=42)  # Adjust sampling_strategy to oversample more
X_resampled, y_resampled = smote.fit_resample(X_temp, y_temp)

# Splitting the resampled dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Training the Logistic Regression model
model = LogisticRegression(random_state=42, max_iter=2000)
model.fit(X_train, y_train)

# Predicting and evaluating on the test set
y_test_pred = model.predict(X_test)
print("Test Set Evaluation")
print(f"Accuracy: {accuracy_score(y_test, y_test_pred)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_test_pred)}")
print(f"Classification Report:\n{classification_report(y_test, y_test_pred)}")

# Predicting and evaluating on the untouched validation set
y_val_pred = model.predict(X_val)
print("\nValidation Set Evaluation")
print(f"Accuracy: {accuracy_score(y_val, y_val_pred)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_val, y_val_pred)}")
print(f"Classification Report:\n{classification_report(y_val, y_val_pred)}")

# Identifying rows in the validation set where the model incorrectly predicted the 1s (false negatives)
false_negatives = (y_val == 1) & (y_val_pred == 0)
false_negative_rows = X_val[false_negatives]

# Converting scaled data back to original scale for saving
false_negative_rows_original_scale = scaler.inverse_transform(false_negative_rows)

# Converting to DataFrame for saving
false_negative_rows_df = pd.DataFrame(false_negative_rows_original_scale, columns=df.drop(columns=['Diabetes_binary']).columns)

# Saving the false negative rows to a CSV file on the desktop
file_path = r"C:\Users\Numan\Desktop\false_negative_rows.csv"
false_negative_rows_df.to_csv(file_path, index=False)

Test Set Evaluation
Accuracy: 0.7529425241272659
Confusion Matrix:
[[25354  9535]
 [ 7719 27230]]
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.73      0.75     34889
           1       0.74      0.78      0.76     34949

    accuracy                           0.75     69838
   macro avg       0.75      0.75      0.75     69838
weighted avg       0.75      0.75      0.75     69838


Validation Set Evaluation
Accuracy: 0.7293243456322926
Confusion Matrix:
[[31594 12145]
 [ 1588  5409]]
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.72      0.82     43739
           1       0.31      0.77      0.44      6997

    accuracy                           0.73     50736
   macro avg       0.63      0.75      0.63     50736
weighted avg       0.86      0.73      0.77     50736

