<center>
    <h1>Logistic Regression using Statsmodels</h1>
</center>

In [5]:
# importing all the necessary libraries
import os

import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

## Exploring the dataset

In [6]:
# reading the diabetes data (https://www.kaggle.com/uciml/pima-indians-diabetes-database)
# The CSV location is machine-specific, so it can be overridden by setting the
# DIABETES_CSV environment variable; the original local path remains the default.
DATA_PATH = os.environ.get(
    "DIABETES_CSV",
    r"E:/DataScience_and_DataAnalyst_Notes/Machine Learning/Logistic regression/dataset/diabetes.csv",
)
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [7]:
# column names, non-null counts and dtypes of the loaded frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [8]:
# (number of rows, number of columns) of the full dataset
data.shape

(768, 9)

In [9]:
# Separate the predictors from the target by column name rather than by position,
# so a reordered or extended CSV cannot silently change which column is predicted.
TARGET_COLUMN = "Outcome"
X = data.drop(columns=[TARGET_COLUMN])  # every column except the target
y = data[TARGET_COLUMN]                 # binary outcome labels

In [10]:
# preview the first rows of the feature matrix
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [11]:
# statsmodels does not fit an intercept on its own, so prepend a "const" column
# to the independent variables (left untouched if one is already present)
X = sm.add_constant(X, prepend=True, has_constant="skip")

In [12]:
# check that the "const" column is now part of the feature matrix
X.head()

Unnamed: 0,const,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,1.0,6,148,72,35,0,33.6,0.627,50
1,1.0,1,85,66,29,0,26.6,0.351,31
2,1.0,8,183,64,0,0,23.3,0.672,32
3,1.0,1,89,66,23,94,28.1,0.167,21
4,1.0,0,137,40,35,168,43.1,2.288,33


In [13]:
# preview the first target labels
y.head()

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

## Splitting the dataset into Train and Validation Sets

In [14]:
# hold out 20% of the rows for validation; random_state pins the shuffle so the
# split is reproducible
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=24,
)
print(f"Train Data: {X_train.shape}, {y_train.shape}")
print(f"Validation Data: {X_val.shape}, {y_val.shape}")

Train Data: (614, 9), (614,)
Validation Data: (154, 9), (154,)


## Fitting the Logistic Regression Model

In [15]:
# set up the Logit model: y_train is the response, X_train the design matrix
logistic_model = sm.Logit(endog=y_train, exog=X_train)
# estimate the coefficients (prints the optimizer's convergence message)
trained_logistic_model = logistic_model.fit()
# coefficient table and goodness-of-fit statistics
fit_summary = trained_logistic_model.summary()
print(fit_summary)

Optimization terminated successfully.
         Current function value: 0.451849
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                Outcome   No. Observations:                  614
Model:                          Logit   Df Residuals:                      605
Method:                           MLE   Df Model:                            8
Date:                Fri, 18 Mar 2022   Pseudo R-squ.:                  0.2989
Time:                        22:36:22   Log-Likelihood:                -277.44
converged:                       True   LL-Null:                       -395.71
Covariance Type:            nonrobust   LLR p-value:                 1.222e-46
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                       -9.0557      0.839    -10.792      0.000     -10.700

In [16]:
# predicted probability of the positive class for every validation row
predicted_probabilities = trained_logistic_model.predict(exog=X_val)
print(predicted_probabilities)

313    0.211413
99     0.431012
550    0.095223
334    0.035878
211    0.572800
         ...   
244    0.477227
398    0.024601
684    0.064661
730    0.213325
241    0.098669
Length: 154, dtype: float64


In [17]:
# a row is labelled 1 only when its predicted probability is strictly above the cut-off
threshold = 0.5
predictions = predicted_probabilities.gt(threshold).astype(int)
print(predictions)

313    0
99     0
550    0
334    0
211    1
      ..
244    0
398    0
684    0
730    0
241    0
Length: 154, dtype: int32


In [18]:
# share of validation rows whose predicted label matches the true label
val_accuracy = accuracy_score(y_val, predictions)
print(f"Accuracy of the model: {val_accuracy}")

# confusion matrix: rows are the true classes, columns the predicted classes
val_confusion = confusion_matrix(y_val, predictions)
print(val_confusion)

Accuracy of the model: 0.7402597402597403
[[87 11]
 [29 27]]


## Splitting the dataset after removing non-significant independent variables

In [19]:
# drop the predictors treated as non-significant, then redo the 80/20 split
# with the same random_state
X_reduced = X.drop(columns=["SkinThickness", "Age"])
X_train, X_val, y_train, y_val = train_test_split(
    X_reduced,
    y,
    test_size=0.2,
    random_state=24,
)
print(f"Train Data: {X_train.shape}, {y_train.shape}")
print(f"Validation Data: {X_val.shape}, {y_val.shape}")

Train Data: (614, 7), (614,)
Validation Data: (154, 7), (154,)


## Retraining the Logistic Regression Model

In [20]:
# refit the Logit model on the current training split
logistic_model = sm.Logit(endog=y_train, exog=X_train)
trained_logistic_model = logistic_model.fit()
# coefficient table and goodness-of-fit statistics for the refitted model
refit_summary = trained_logistic_model.summary()
print(refit_summary)

Optimization terminated successfully.
         Current function value: 0.452997
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                Outcome   No. Observations:                  614
Model:                          Logit   Df Residuals:                      607
Method:                           MLE   Df Model:                            6
Date:                Fri, 18 Mar 2022   Pseudo R-squ.:                  0.2971
Time:                        22:41:19   Log-Likelihood:                -278.14
converged:                       True   LL-Null:                       -395.71
Covariance Type:            nonrobust   LLR p-value:                 6.142e-48
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                       -8.8557      0.816    -10.848      0.000     -10.456

In [21]:
# predicted positive-class probabilities on the current validation split
predicted_probabilities = trained_logistic_model.predict(exog=X_val)
print(predicted_probabilities)

313    0.221546
99     0.419947
550    0.106654
334    0.038358
211    0.597579
         ...   
244    0.486091
398    0.026397
684    0.048681
730    0.215368
241    0.109946
Length: 154, dtype: float64


In [22]:
# convert probabilities to 0/1 labels: strictly above the cut-off means class 1
threshold = 0.5
above_threshold = predicted_probabilities > threshold
predictions = above_threshold.astype(int)
print(predictions)

313    0
99     0
550    0
334    0
211    1
      ..
244    0
398    0
684    0
730    0
241    0
Length: 154, dtype: int32


In [23]:
# validation accuracy of the current model
val_accuracy = accuracy_score(y_val, predictions)
print(f"Accuracy of the model: {val_accuracy}")

# confusion matrix: rows are the true classes, columns the predicted classes
val_confusion = confusion_matrix(y_val, predictions)
print(val_confusion)

Accuracy of the model: 0.7402597402597403
[[88 10]
 [30 26]]
