In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import statsmodels.formula.api as sm

In [2]:
diabetes_df = pd.read_csv('../indicators_bmi_filtered.csv')
diabetes_df.head()

Unnamed: 0,DIABETE4,_RACE,TOLDHI3,BPHIGH6,_BMI5,SMOKE100,_RFBING5,EDUCA,GENHLTH,_AGEG5YR,EXERANY2,FRUIT2,VEGETAB2,_INCOMG1,MEDCOST1,_SEX
0,No,White,Yes,No,14.54,Yes,Yes,High School Grad,Poor,Age 70 to 74,No,Yes,Yes,"$25,000 to < $35,000",No,Female
1,Yes,Black,No,Yes,28.29,No,Yes,High School Grad,Very Good,Age 70 to 74,No,Yes,Yes,"$15,000 to < $25,000",No,Female
2,Yes,White,Yes,Yes,33.47,No,No,High School Grad,Very Good,Age 60 to 64,Yes,Yes,Yes,"$50,000 to < $100,000",No,Female
3,Yes,Multiracial,Yes,No,28.73,No,Yes,Some High School,Poor,Age 75 to 79,Yes,Yes,Yes,"$15,000 to < $25,000",No,Male
4,No,White,No,No,24.37,Yes,Yes,Some College,Good,Age 80 or older,No,Yes,Yes,"$35,000 to < $50,000",No,Male


In [3]:
diabetes_df.dtypes

DIABETE4     object
_RACE        object
TOLDHI3      object
BPHIGH6      object
_BMI5       float64
SMOKE100     object
_RFBING5     object
EDUCA        object
GENHLTH      object
_AGEG5YR     object
EXERANY2     object
FRUIT2       object
VEGETAB2     object
_INCOMG1     object
MEDCOST1     object
_SEX         object
dtype: object

In [4]:
# Generate our categorical variable list
indicators_cat = diabetes_df.dtypes[diabetes_df.dtypes == "object"].index.tolist()

In [5]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False, drop='first')

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(diabetes_df[indicators_cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names_out(indicators_cat)
encode_df.head()

Unnamed: 0,DIABETE4_Yes,_RACE_American Indian or Alaskan Native,_RACE_Asian,_RACE_Black,_RACE_Hispanic,_RACE_Multiracial,_RACE_Other,_RACE_White,TOLDHI3_Yes,BPHIGH6_Yes,...,FRUIT2_Yes,VEGETAB2_Yes,"_INCOMG1_$100,000 to < $200,000","_INCOMG1_$15,000 to < $25,000","_INCOMG1_$25,000 to < $35,000","_INCOMG1_$35,000 to < $50,000","_INCOMG1_$50,000 to < $100,000","_INCOMG1_Less than $15,000",MEDCOST1_Yes,_SEX_Male
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [6]:
# Merge one-hot encoded features and drop the originals
diabetes_df = diabetes_df.merge(encode_df,left_index=True, right_index=True)
diabetes_df = diabetes_df.drop(indicators_cat,1)
diabetes_df.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,_BMI5,DIABETE4_Yes,_RACE_American Indian or Alaskan Native,_RACE_Asian,_RACE_Black,_RACE_Hispanic,_RACE_Multiracial,_RACE_Other,_RACE_White,TOLDHI3_Yes,...,FRUIT2_Yes,VEGETAB2_Yes,"_INCOMG1_$100,000 to < $200,000","_INCOMG1_$15,000 to < $25,000","_INCOMG1_$25,000 to < $35,000","_INCOMG1_$35,000 to < $50,000","_INCOMG1_$50,000 to < $100,000","_INCOMG1_Less than $15,000",MEDCOST1_Yes,_SEX_Male
0,14.54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,28.29,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,33.47,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,28.73,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,24.37,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [7]:
# Define features and targets
y = diabetes_df['DIABETE4_Yes']
X = diabetes_df.drop(columns=['DIABETE4_Yes'])

In [8]:
# Split into training and testing 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 78, stratify=y)

In [9]:
# Create Standard Scaler
scaler = StandardScaler()

# Fit Data
X_scaler = scaler.fit(X_train)

# Scale Data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [10]:
# Create and logistic regression model 
classifier = LogisticRegression(random_state=78)
classifier

LogisticRegression(random_state=78)

In [11]:
# Fit and train model 
clf = classifier.fit(X_train_scaled, y_train)

In [12]:
predictions = classifier.predict(X_test_scaled)

In [13]:
# Calculate accuracy score 
accuracy_score(y_test, predictions)

0.8560155516383585

In [14]:
# Generate confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create df from confusion matrix
cm_df = pd.DataFrame(cm, index=['Actual 0', 'Acutal 1'], columns = ['Predicted 0', 'Predicted 1'])
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,50911,833
Acutal 1,7907,1050


In [15]:
# classification report
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

         0.0       0.87      0.98      0.92     51744
         1.0       0.56      0.12      0.19      8957

    accuracy                           0.86     60701
   macro avg       0.71      0.55      0.56     60701
weighted avg       0.82      0.86      0.81     60701



In [16]:
print(clf.coef_)

[[ 0.35785479  0.00832996  0.00488655 -0.05190905 -0.05643791 -0.05826885
  -0.03941287 -0.23657067  0.24594298  0.34047011 -0.01585973  0.19318261
   0.00898529  0.00886965  0.01018781  0.03923332  0.01410704  0.49344946
   0.50277548  0.31679133  0.24835047  0.07553151  0.14418824  0.21928217
   0.30953284  0.35735352  0.43963908  0.51438426  0.55265036  0.59840995
   0.61245678  0.51442988  0.46802458 -0.07635641  0.01013799 -0.00299069
   0.06008032  0.10688381  0.11240425  0.09912829  0.10110447  0.0891575
  -0.00572384  0.07855911]]


In [17]:
coef = pd.DataFrame(zip(X_train, np.transpose(clf.coef_.tolist()[0])), columns=['features', 'coef'])
coef

Unnamed: 0,features,coef
0,_BMI5,0.357855
1,_RACE_American Indian or Alaskan Native,0.00833
2,_RACE_Asian,0.004887
3,_RACE_Black,-0.051909
4,_RACE_Hispanic,-0.056438
5,_RACE_Multiracial,-0.058269
6,_RACE_Other,-0.039413
7,_RACE_White,-0.236571
8,TOLDHI3_Yes,0.245943
9,BPHIGH6_Yes,0.34047


In [18]:
coef.sort_values('coef')

Unnamed: 0,features,coef
7,_RACE_White,-0.236571
33,EXERANY2_Yes,-0.076356
5,_RACE_Multiracial,-0.058269
4,_RACE_Hispanic,-0.056438
3,_RACE_Black,-0.051909
6,_RACE_Other,-0.039413
10,SMOKE100_Yes,-0.01586
42,MEDCOST1_Yes,-0.005724
35,VEGETAB2_Yes,-0.002991
2,_RACE_Asian,0.004887


In [19]:
import statsmodels.api as sm
logit_model=sm.Logit(y_train, X_train)
result=logit_model.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.351847
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:           DIABETE4_Yes   No. Observations:               182101
Model:                          Logit   Df Residuals:                   182057
Method:                           MLE   Df Model:                           43
Date:                Tue, 25 Apr 2023   Pseudo R-squ.:                  0.1591
Time:                        15:14:01   Log-Likelihood:                -64072.
converged:                       True   LL-Null:                       -76198.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                              coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------------
_BMI5                                       0.0471

In [20]:
LRresult = (result.summary().tables[1])
results_df = pd.DataFrame(LRresult)
results_df.columns=results_df.iloc[0]
results_df = results_df.iloc[1:, :]

In [21]:
results_df.columns=['feature', 'coef', 'std_err', 'z', 'Pz', '[0.025', '0.975]']

In [22]:
results_df.to_csv('../Resources/result_summary_z.csv')