In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import confusion_matrix, roc_curve, precision_recall_curve, auc, accuracy_score, classification_report, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

from IPython.display import clear_output
import collections
from PIL import Image

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:

diabetes_012_health_indicators_BRFSS2015 = 'https://raw.githubusercontent.com/Panta-Rhei-LZ/DS-9000-Project-Data/refs/heads/main/diabetes_012_health_indicators_BRFSS2015.csv'
diabetes_binary_health_indicators_BRFSS2015 = 'https://raw.githubusercontent.com/Panta-Rhei-LZ/DS-9000-Project-Data/refs/heads/main/diabetes_binary_health_indicators_BRFSS2015.csv'
diabetes_binary_5050split_health_indicators_BRFSS2015 = 'https://raw.githubusercontent.com/Panta-Rhei-LZ/DS-9000-Project-Data/refs/heads/main/diabetes_binary_5050split_health_indicators_BRFSS2015.csv'

diabetes_012_health_indicators_BRFSS2015 = pd.read_csv(diabetes_012_health_indicators_BRFSS2015)
diabetes_binary_health_indicators_BRFSS2015 = pd.read_csv(diabetes_binary_health_indicators_BRFSS2015)
diabetes_binary_5050split_health_indicators_BRFSS2015 = pd.read_csv(diabetes_binary_5050split_health_indicators_BRFSS2015)

## LR 5050

In [3]:
X = diabetes_binary_5050split_health_indicators_BRFSS2015.drop(['Diabetes_binary'], axis=1)
y = diabetes_binary_5050split_health_indicators_BRFSS2015.Diabetes_binary

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2,
    stratify=y,
    random_state=11
)

In [4]:
LR=LogisticRegression(max_iter=10000000000)
lr = LR.fit(X_train,y_train)
y_pred = lr.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.2f}")

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Classification report (includes precision, recall, and F1 score)
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)

Test Accuracy: 0.75
Confusion Matrix:
[[5157 1912]
 [1601 5469]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.76      0.73      0.75      7069
         1.0       0.74      0.77      0.76      7070

    accuracy                           0.75     14139
   macro avg       0.75      0.75      0.75     14139
weighted avg       0.75      0.75      0.75     14139



## LR 012(using resampling)

In [8]:
# import sklearn
# import imblearn
# print("scikit-learn version:", sklearn.__version__)
# print("imbalanced-learn version:", imblearn.__version__)

from imblearn.over_sampling import SMOTE

# Separate features and target
X = diabetes_012_health_indicators_BRFSS2015.drop(['Diabetes_012'], axis=1)
y = diabetes_012_health_indicators_BRFSS2015.Diabetes_012

# Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
	X, y, test_size=0.2, stratify=y, random_state=11
)

# Apply SMOTE to balance the training data
smote = SMOTE(random_state=11)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Check the class distribution after SMOTE
from collections import Counter
print("Class distribution after SMOTE:", Counter(y_train_smote))

# Train logistic regression with class_weight='balanced'
model_balanced = LogisticRegression(
	class_weight='balanced', 
	multi_class='multinomial', 
	solver='lbfgs', 
	max_iter=100000
)
model_balanced.fit(X_train_smote, y_train_smote)

# Predictions on the test set
y_pred_balanced = model_balanced.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_balanced)
print(f"Test Accuracy: {accuracy:.2f}")

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_balanced)
print("Confusion Matrix:")
print(conf_matrix)

# Classification report (precision, recall, F1-score)
class_report = classification_report(y_test, y_pred_balanced)
print("Classification Report:")
print(class_report)


Class distribution after SMOTE: Counter({0.0: 170962, 2.0: 170962, 1.0: 170962})




Test Accuracy: 0.64
Confusion Matrix:
[[28131  7516  7094]
 [  242   280   404]
 [ 1258  1699  4112]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.95      0.66      0.78     42741
         1.0       0.03      0.30      0.05       926
         2.0       0.35      0.58      0.44      7069

    accuracy                           0.64     50736
   macro avg       0.44      0.51      0.42     50736
weighted avg       0.85      0.64      0.72     50736



## LR 012(using weighted loss) (bad, discard)

In [6]:
X = diabetes_012_health_indicators_BRFSS2015.drop(['Diabetes_012'], axis=1)
y = diabetes_012_health_indicators_BRFSS2015.Diabetes_012
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
    X, y, test_size=0.2,
    stratify=y,
    random_state=11
)
model_balanced = LogisticRegression(class_weight='balanced', multi_class='multinomial', solver='lbfgs', max_iter=1000)
model_balanced.fit(X_train_1, y_train_1)



In [7]:
y_pred_balanced = lr.predict(X_test_1)

accuracy = accuracy_score(y_test_1, y_pred_balanced)
print(f"Test Accuracy: {accuracy:.2f}")

conf_matrix = confusion_matrix(y_test_1, y_pred_balanced)
print("Confusion Matrix:")
print(conf_matrix)

# Classification report (includes precision, recall, and F1 score)
class_report = classification_report(y_test_1, y_pred_balanced)
print("Classification Report:")
print(class_report)

Test Accuracy: 0.63
Confusion Matrix:
[[31384 11357     0]
 [  324   602     0]
 [ 1682  5387     0]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.94      0.73      0.82     42741
         1.0       0.03      0.65      0.07       926
         2.0       0.00      0.00      0.00      7069

    accuracy                           0.63     50736
   macro avg       0.32      0.46      0.30     50736
weighted avg       0.79      0.63      0.70     50736



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## LR binary

In [15]:
X = diabetes_binary_health_indicators_BRFSS2015.drop(['Diabetes_binary'], axis=1)
y = diabetes_binary_health_indicators_BRFSS2015.Diabetes_binary
class_distribution = y.value_counts()
print(class_distribution)
# Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
	X, y, test_size=0.2, stratify=y, random_state=11
)

#Apply SMOTE to balance the training data
smote = SMOTE(random_state=11)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Check the class distribution after SMOTE
from collections import Counter
print("Class distribution after SMOTE:", Counter(y_train_smote))

# Train logistic regression with class_weight='balanced'
model_balanced = LogisticRegression(
	class_weight='balanced',  
	solver='lbfgs', 
	max_iter=100000
)
model_balanced.fit(X_train_smote, y_train_smote)

# Predictions on the test set
y_pred_balanced = model_balanced.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_balanced)
print(f"Test Accuracy: {accuracy:.2f}")

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_balanced)
print("Confusion Matrix:")
print(conf_matrix)

# Classification report (precision, recall, F1-score)
class_report = classification_report(y_test, y_pred_balanced)
print("Classification Report:")
print(class_report)

Diabetes_binary
0.0    218334
1.0     35346
Name: count, dtype: int64
Class distribution after SMOTE: Counter({0.0: 174667, 1.0: 174667})
Test Accuracy: 0.73
Confusion Matrix:
[[31565 12102]
 [ 1753  5316]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.95      0.72      0.82     43667
         1.0       0.31      0.75      0.43      7069

    accuracy                           0.73     50736
   macro avg       0.63      0.74      0.63     50736
weighted avg       0.86      0.73      0.77     50736

