In [23]:
import scipy.stats as stats
import statsmodels.api as sm
import numpy as np
import pandas as pd

df = pd.read_csv("diabetes.csv")

In [24]:
df.head()

Unnamed: 0.1,Unnamed: 0,Treatment,HemoglobinA1C.Baseline,HemoglobinA1C.Followup,Female,Age,SBP,BMI
0,1,False,4.9,7.1,False,43,123,25.43
1,2,True,5.1,5.4,False,47,128,26.28
2,3,True,5.1,4.3,True,55,120,28.21
3,4,True,5.2,5.1,True,54,132,25.98
4,5,True,5.2,7.1,True,40,111,26.22


<h1>Simple regression</h1>

### Logistic regression for HemoglobinA1C.Baseline 

In [25]:
df.rename(columns = {'HemoglobinA1C.Baseline':'Baseline', 'HemoglobinA1C.Followup':'Followup'}, inplace = True)

In [26]:
df.head()

Unnamed: 0.1,Unnamed: 0,Treatment,Baseline,Followup,Female,Age,SBP,BMI
0,1,False,4.9,7.1,False,43,123,25.43
1,2,True,5.1,5.4,False,47,128,26.28
2,3,True,5.1,4.3,True,55,120,28.21
3,4,True,5.2,5.1,True,54,132,25.98
4,5,True,5.2,7.1,True,40,111,26.22


In [27]:
model = sm.GLM.from_formula("Treatment ~ Baseline", family = sm.families.Binomial(), data=df)
res = model.fit()
res.summary()

0,1,2,3
Dep. Variable:,"['Treatment[False]', 'Treatment[True]']",No. Observations:,520.0
Model:,GLM,Df Residuals:,518.0
Model Family:,Binomial,Df Model:,1.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-334.95
Date:,"Mon, 07 Nov 2022",Deviance:,669.91
Time:,12:59:43,Pearson chi2:,518.0
No. Iterations:,4,Pseudo R-squ. (CS):,0.089
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-2.8424,0.428,-6.646,0.000,-3.681,-2.004
Baseline,0.2965,0.046,6.443,0.000,0.206,0.387


In [28]:
res.conf_int()

Unnamed: 0,0,1
Intercept,-3.680709,-2.004149
Baseline,0.206332,0.386763


### We can observe that HemoglobinA1C.Baseline is a good predictor for Treatment since p value is less than 0.05. The confidence intervals also indicate the same.

### Logistic regression for HemoglobinA1C.Followup

In [29]:
model = sm.GLM.from_formula("Treatment ~ Followup", family = sm.families.Binomial(), data=df)
res = model.fit()
res.summary()

0,1,2,3
Dep. Variable:,"['Treatment[False]', 'Treatment[True]']",No. Observations:,520.0
Model:,GLM,Df Residuals:,518.0
Model Family:,Binomial,Df Model:,1.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-335.13
Date:,"Mon, 07 Nov 2022",Deviance:,670.26
Time:,13:00:40,Pearson chi2:,520.0
No. Iterations:,4,Pseudo R-squ. (CS):,0.08839
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-2.5114,0.377,-6.655,0.000,-3.251,-1.772
Followup,0.2872,0.044,6.486,0.000,0.200,0.374


In [30]:
res.conf_int()

Unnamed: 0,0,1
Intercept,-3.251085,-1.771791
Followup,0.200394,0.373951


### We can observe that HemoglobinA1C.Followup is a good predictor for Treatment since p value is less than 0.05. The confidance intervals also indicate the same.

### Logistic regression for Age

In [31]:
model = sm.GLM.from_formula("Treatment ~ Age", family = sm.families.Binomial(), data=df)
res = model.fit()
res.summary()

0,1,2,3
Dep. Variable:,"['Treatment[False]', 'Treatment[True]']",No. Observations:,520.0
Model:,GLM,Df Residuals:,518.0
Model Family:,Binomial,Df Model:,1.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-354.87
Date:,"Mon, 07 Nov 2022",Deviance:,709.74
Time:,13:00:48,Pearson chi2:,520.0
No. Iterations:,4,Pseudo R-squ. (CS):,0.01648
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.1196,0.441,2.540,0.011,0.256,1.984
Age,-0.0235,0.008,-2.909,0.004,-0.039,-0.008


In [32]:
res.conf_int()

Unnamed: 0,0,1
Intercept,0.25572,1.983543
Age,-0.039281,-0.007654


### We can observe that Age is a good predictor for Treatment since p value is less than 0.05. The confidance intervals also indicate the same.

### Logistic regression for Female

In [33]:
model = sm.GLM.from_formula("Treatment ~ Female", family = sm.families.Binomial(), data=df)
res = model.fit()
res.summary()

0,1,2,3
Dep. Variable:,"['Treatment[False]', 'Treatment[True]']",No. Observations:,520.0
Model:,GLM,Df Residuals:,518.0
Model Family:,Binomial,Df Model:,1.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-359.11
Date:,"Mon, 07 Nov 2022",Deviance:,718.23
Time:,13:00:54,Pearson chi2:,520.0
No. Iterations:,4,Pseudo R-squ. (CS):,0.0002899
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.1737,0.126,-1.379,0.168,-0.421,0.073
Female[T.True],0.0683,0.176,0.388,0.698,-0.276,0.413


In [34]:
res.conf_int()

Unnamed: 0,0,1
Intercept,-0.420549,0.073222
Female[T.True],-0.276486,0.413092


### We can observe that Female is not a good predictor for Treatment since p value is greater than 0.05. The confidance intervals also indicate the same.

### Logistic regression for SBP

In [35]:
model = sm.GLM.from_formula("Treatment ~ SBP", family = sm.families.Binomial(), data=df)
res = model.fit()
res.summary()

0,1,2,3
Dep. Variable:,"['Treatment[False]', 'Treatment[True]']",No. Observations:,520.0
Model:,GLM,Df Residuals:,518.0
Model Family:,Binomial,Df Model:,1.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-359.03
Date:,"Mon, 07 Nov 2022",Deviance:,718.06
Time:,13:01:00,Pearson chi2:,520.0
No. Iterations:,4,Pseudo R-squ. (CS):,0.000607
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.3724,0.914,0.407,0.684,-1.419,2.164
SBP,-0.0039,0.007,-0.562,0.574,-0.017,0.010


In [36]:
res.conf_int()

Unnamed: 0,0,1
Intercept,-1.419408,2.164236
SBP,-0.01746,0.009683


### We can observe that SBP is not a good predictor for Treatment since p value is greater than 0.05. The confidance intervals also indicate the same.

### Logistic regression for BMI

In [37]:
model = sm.GLM.from_formula("Treatment ~ BMI", family = sm.families.Binomial(), data=df)
res = model.fit()
res.summary()

0,1,2,3
Dep. Variable:,"['Treatment[False]', 'Treatment[True]']",No. Observations:,520.0
Model:,GLM,Df Residuals:,518.0
Model Family:,Binomial,Df Model:,1.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-358.79
Date:,"Mon, 07 Nov 2022",Deviance:,717.58
Time:,13:01:05,Pearson chi2:,520.0
No. Iterations:,4,Pseudo R-squ. (CS):,0.001535
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.9822,0.949,-1.035,0.301,-2.842,0.878
BMI,0.0264,0.030,0.893,0.372,-0.032,0.084


In [38]:
res.conf_int()

Unnamed: 0,0,1
Intercept,-2.842237,0.877862
BMI,-0.031552,0.084353


### We can observe that BMI is not a good predictor for Treatment since p value is greater than 0.05. The confidance intervals also indicate the same.

<h1>Multiple regression</h1>

In [39]:
model = sm.GLM.from_formula("Treatment ~ Baseline + Followup + Age + Female + BMI + SBP", family = sm.families.Binomial(), data=df)
res = model.fit()
res.summary()

0,1,2,3
Dep. Variable:,"['Treatment[False]', 'Treatment[True]']",No. Observations:,520.0
Model:,GLM,Df Residuals:,513.0
Model Family:,Binomial,Df Model:,6.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-323.59
Date:,"Mon, 07 Nov 2022",Deviance:,647.18
Time:,13:01:11,Pearson chi2:,518.0
No. Iterations:,4,Pseudo R-squ. (CS):,0.128
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.2271,1.168,-0.194,0.846,-2.517,2.063
Female[T.True],0.0152,0.189,0.080,0.936,-0.356,0.386
Baseline,0.2204,0.067,3.287,0.001,0.089,0.352
Followup,0.1711,0.062,2.780,0.005,0.050,0.292
Age,-0.0294,0.010,-3.019,0.003,-0.048,-0.010
BMI,-0.0613,0.040,-1.549,0.121,-0.139,0.016
SBP,0.0015,0.010,0.159,0.874,-0.018,0.021


In [40]:
res.aic

661.1755752588326

### Here we can see that Female, BMI and SBP are not good predictors for Treatment. Let's try removing them.

In [41]:
model = sm.GLM.from_formula("Treatment ~ Baseline + Followup + Age", family = sm.families.Binomial(), data=df)
res = model.fit()
res.aic

658.0988824441788

In [42]:
model = sm.GLM.from_formula("Treatment ~ Baseline  + Age", family = sm.families.Binomial(), data=df)
res = model.fit()
res.aic

664.1701139583399

In [43]:
model = sm.GLM.from_formula("Treatment ~  Followup + Age", family = sm.families.Binomial(), data=df)
res = model.fit()
res.aic

665.045014398811

In [44]:
model = sm.GLM.from_formula("Treatment ~ Baseline + Followup", family = sm.families.Binomial(), data=df)
res = model.fit()
res.aic

668.1292383066989

### We can observe that we get the best result (least aic) when we use Baseline, Followup and Age.