In [1]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTENC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# Single seed for every global random number generator used by the notebook.
SEED = 42

# Python's own RNG.
random.seed(SEED)
# numpy's legacy global RNG is separate from `random`; seed it too so that any
# numpy-based randomness that is not given an explicit random_state is
# reproducible as well.
np.random.seed(SEED)

In [3]:
# Location of the diabetes_binary_health_indicators_BRFSS2021.csv file.
# Set the DIABETES_CSV environment variable to point at your own copy instead
# of editing this cell; the fallback is the path this notebook was originally
# run with, so existing setups keep working unchanged.
DATA_PATH = os.environ.get(
    "DIABETES_CSV",
    "D:\\数据挖掘与商业分析\\group project\\diabetes\\diabetes_binary_health_indicators_BRFSS2021.csv",
)

df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,0,1.0,1,15.0,1.0,0.0,0.0,0,1,...,1,0.0,5.0,10.0,20.0,0.0,0,11,4.0,5.0
1,1.0,1,0.0,1,28.0,0.0,0.0,1.0,0,1,...,1,0.0,2.0,0.0,0.0,0.0,0,11,4.0,3.0
2,1.0,1,1.0,1,33.0,0.0,0.0,0.0,1,1,...,1,0.0,2.0,10.0,0.0,0.0,0,9,4.0,7.0
3,1.0,0,1.0,1,29.0,0.0,1.0,1.0,1,1,...,1,0.0,5.0,0.0,30.0,1.0,1,12,3.0,4.0
4,0.0,0,0.0,1,24.0,1.0,0.0,0.0,0,0,...,1,0.0,3.0,0.0,0.0,1.0,1,13,5.0,6.0


In [4]:
# List the column labels of the loaded frame.
df.columns

Index(['Diabetes_binary', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')

In [5]:
# Column groups used throughout the notebook.
# lab_col: the target plus every column that is NOT standardised below.
# num_col: the columns that are standardised.
lab_col = [
    'Diabetes_binary', 'HighBP', 'HighChol', 'CholCheck', 'Smoker',
    'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
    'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk', 'Sex',
    'Age', 'Education', 'Income',
]
num_col = ['BMI', 'GenHlth', 'MentHlth', 'PhysHlth']

# NOTE(review): the scaler is fitted on the complete data set, i.e. before the
# train/test split performed later in the notebook, so test rows contribute to
# the mean/std. Consider fitting on the training split only.
scaler = StandardScaler()

# Carry df's index over to the scaled frame. The frame is later concatenated
# with df[lab_col] along axis=1, which aligns on the index; without index=df.index
# the rows would only line up when df happens to have a default RangeIndex.
df_scaled = pd.DataFrame(
    scaler.fit_transform(df[num_col]),
    columns=num_col,
    index=df.index,
)
df_scaled.describe()

Unnamed: 0,BMI,GenHlth,MentHlth,PhysHlth
count,236378.0,236378.0,236378.0,236378.0
mean,-5.0500120000000004e-17,-2.6452450000000003e-17,9.859548e-18,9.619071e-19
std,1.000002,1.000002,1.000002,1.000002
min,-2.587526,-1.438803,-0.4992982,-0.4549293
25%,-0.7560359,-0.4671098,-0.4992982,-0.4549293
50%,-0.1455392,-0.4671098,-0.4992982,-0.4549293
75%,0.4649576,0.5045835,0.007898326,-0.2123842
max,10.69078,2.44797,3.304676,3.183247


In [6]:
# Combine the standardised numeric columns with the untouched label columns.
# Selecting df_scaled[num_col] instead of the whole of df_scaled makes this cell
# safe to re-run: it no longer appends a second copy of df[lab_col] each time
# (as long as lab_col still holds the original column names present in df).
df_scaled = pd.concat([df_scaled[num_col], df[lab_col]], axis=1)

# One-hot encode Education into a dense array; drop="first" leaves out the
# column of the first category.
labeler = OneHotEncoder(sparse_output=False, drop="first")
df_labeled = labeler.fit_transform(df[['Education']])

In [7]:
# Sanity check: (rows, columns) of the combined frame.
df_scaled.shape

(236378, 22)

In [8]:
# Replace the raw Education column with its one-hot columns.
education_cols = labeler.get_feature_names_out()

# Wrap the encoded values in a frame that shares df's index (the encoder was
# fitted on df[['Education']]), so the axis=1 concat below aligns row for row.
# np.asarray lets the cell be re-run when df_labeled is already a DataFrame.
df_labeled = pd.DataFrame(np.asarray(df_labeled), columns=education_cols, index=df.index)

# errors="ignore" plus dropping any existing dummy columns keeps the cell
# idempotent: a second run neither raises KeyError on the already-removed
# 'Education' column nor appends duplicate dummy columns.
df_scaled = pd.concat(
    [df_scaled.drop(columns=['Education', *education_cols], errors="ignore"), df_labeled],
    axis=1,
)

In [9]:
# Column labels after swapping Education for its one-hot columns.
df_scaled.columns

Index(['BMI', 'GenHlth', 'MentHlth', 'PhysHlth', 'Diabetes_binary', 'HighBP',
       'HighChol', 'CholCheck', 'Smoker', 'Stroke', 'HeartDiseaseorAttack',
       'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump',
       'AnyHealthcare', 'NoDocbcCost', 'DiffWalk', 'Sex', 'Age', 'Income',
       'Education_2.0', 'Education_3.0', 'Education_4.0', 'Education_5.0',
       'Education_6.0'],
      dtype='object')

In [10]:
# Turn lab_col into the list of categorical predictor columns: leave out the
# target and the raw Education column, then append Education's one-hot columns.
# The list is rebuilt instead of being mutated with remove()/extend(), so
# re-running this cell yields the same list rather than raising ValueError
# (names already removed) or appending the dummy names a second time.
education_dummies = ['Education_2.0', 'Education_3.0', 'Education_4.0', 'Education_5.0', 'Education_6.0']
excluded = {"Diabetes_binary", "Education", *education_dummies}
lab_col = [col for col in lab_col if col not in excluded] + education_dummies
lab_col

['HighBP',
 'HighChol',
 'CholCheck',
 'Smoker',
 'Stroke',
 'HeartDiseaseorAttack',
 'PhysActivity',
 'Fruits',
 'Veggies',
 'HvyAlcoholConsump',
 'AnyHealthcare',
 'NoDocbcCost',
 'DiffWalk',
 'Sex',
 'Age',
 'Income',
 'Education_2.0',
 'Education_3.0',
 'Education_4.0',
 'Education_5.0',
 'Education_6.0']

In [11]:
# Share of each target class in the full data set.
target = df_scaled['Diabetes_binary']
share_of_0 = (target == 0).mean()
share_of_1 = (target == 1).mean()
print("proportion of 0:{:.3f}; propotion of 1:{:.3f}".format(share_of_0, share_of_1))

proportion of 0:0.858; propotion of 1:0.142


In [12]:
# Separate the target from the predictors.
y = df_scaled["Diabetes_binary"]
X = df_scaled.drop(columns=["Diabetes_binary"])

# Hold out 30% of the rows for testing; stratify=y keeps the class ratio the
# same in both parts, and random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=1,
)

In [13]:
# Show the categorical column list again before it is handed to SMOTENC below.
lab_col

['HighBP',
 'HighChol',
 'CholCheck',
 'Smoker',
 'Stroke',
 'HeartDiseaseorAttack',
 'PhysActivity',
 'Fruits',
 'Veggies',
 'HvyAlcoholConsump',
 'AnyHealthcare',
 'NoDocbcCost',
 'DiffWalk',
 'Sex',
 'Age',
 'Income',
 'Education_2.0',
 'Education_3.0',
 'Education_4.0',
 'Education_5.0',
 'Education_6.0']

In [14]:
# Confirm that both splits carry the same class ratio.
train_share_0, train_share_1 = (y_train == 0).mean(), (y_train == 1).mean()
test_share_0, test_share_1 = (y_test == 0).mean(), (y_test == 1).mean()

print("train:proportion of 0:{:.3f}; propotion of 1:{:.3f}".format(train_share_0, train_share_1))
print("test:proportion of 0:{:.3f}; propotion of 1:{:.3f}".format(test_share_0, test_share_1))

train:proportion of 0:0.858; propotion of 1:0.142
test:proportion of 0:0.858; propotion of 1:0.142


In [15]:
# Oversample the training split only; the test split is left untouched.
# lab_col names the columns SMOTENC treats as categorical.
# NOTE(review): the Education one-hot columns are passed as separate categorical
# features, so synthetic rows may presumably end up with dummy combinations that
# no real Education level produces - worth verifying.
smotenc = SMOTENC(categorical_features=lab_col, random_state=1)
bal_X_train, bal_y_train = smotenc.fit_resample(X_train, y_train)

resampled_share_0 = (bal_y_train == 0).mean()
resampled_share_1 = (bal_y_train == 1).mean()
print("resample:proportion of 0:{:.3f}; propotion of 1:{:.3f}".format(resampled_share_0, resampled_share_1))

resample:proportion of 0:0.500; propotion of 1:0.500


In [16]:
# Baseline: L1-penalised logistic regression fitted on the original
# (not resampled) training split.
naive_logis = LogisticRegression(
    solver='liblinear',
    penalty="l1",
    random_state=1,
)
naive_logis.fit(X_train, y_train)

In [17]:
print("Naive logistic regression score for test set:")

# Hard class predictions feed the threshold-based metrics.
naive_y_hat_test_class = naive_logis.predict(X_test)
naive_class_metrics = (
    ("Precision: {:.3f}", precision_score),
    ("Recall: {:.3f}", recall_score),
    ("F1 score: {:.3f}", f1_score),
    ("Accuracy score: {:.3f}", accuracy_score),
)
for template, metric in naive_class_metrics:
    print(template.format(metric(y_test, naive_y_hat_test_class)))

# AUC is computed from the second column (index 1) of predict_proba.
naive_y_hat_test = naive_logis.predict_proba(X_test)
naive_auc = roc_auc_score(y_test, naive_y_hat_test[:, 1])
print("AUC score: {:.3f}".format(naive_auc))

# Sensitivity is not printed separately: it is the same quantity as recall.

Naive logistic regression score for test set:
Precision: 0.549
Recall: 0.151
F1 score: 0.237
Accuracy score: 0.862
AUC score: 0.814


In [18]:
# Same model configuration as the baseline, fitted on the SMOTE-NC balanced
# training data instead.
smote_logis = LogisticRegression(
    solver='liblinear',
    penalty="l1",
    random_state=1,
)
smote_logis.fit(bal_X_train, bal_y_train)

In [19]:
print("Logistic regression with SMOTE-NC resampling score for test set:")

# Hard class predictions on the untouched test split feed the threshold-based metrics.
smote_y_hat_test_class = smote_logis.predict(X_test)
smote_class_metrics = (
    ("Precision: {:.3f}", precision_score),
    ("Recall: {:.3f}", recall_score),
    ("F1 score: {:.3f}", f1_score),
    ("Accuracy score: {:.3f}", accuracy_score),
)
for template, metric in smote_class_metrics:
    print(template.format(metric(y_test, smote_y_hat_test_class)))

# AUC is computed from the second column (index 1) of predict_proba.
smote_y_hat_test = smote_logis.predict_proba(X_test)
smote_auc = roc_auc_score(y_test, smote_y_hat_test[:, 1])
print("AUC score: {:.3f}".format(smote_auc))

# Sensitivity is not printed separately: it is the same quantity as recall.

Logistic regression with SMOTE-NC resampling score for test set:
Precision: 0.305
Recall: 0.725
F1 score: 0.429
Accuracy score: 0.726
AUC score: 0.800


In [20]:
# Print every coefficient of the SMOTE-NC model whose magnitude is at least 0.1,
# next to the name of the column it belongs to. Only row 0 of coef_ is read.
smote_coef = smote_logis.coef_
for feature, weight in zip(X.columns, smote_coef[0]):
    if abs(weight) >= 0.1:
        print(feature + " {:.4f}".format(weight))

BMI 0.4515
GenHlth 0.6528
HighBP 0.7461
HighChol 0.5577
CholCheck 2.8060
Stroke -0.6003
PhysActivity -0.1271
Veggies 0.1514
HvyAlcoholConsump -2.1826
AnyHealthcare 0.8205
NoDocbcCost -0.7167
Sex 0.3047
Age 0.2105
Education_2.0 -3.2503
Education_3.0 -3.1954
Education_4.0 -3.1909
Education_5.0 -3.1345
Education_6.0 -3.2997
