Step 1- Import required libraries


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
%matplotlib inline

#to remove the warnings
import warnings 
warnings.filterwarnings('ignore')

Step 2- Load, visualize and explore the dataset

In [16]:
df = pd.read_csv("framingham.csv")
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4238 entries, 0 to 4237
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             4238 non-null   int64  
 1   age              4238 non-null   int64  
 2   education        4133 non-null   float64
 3   currentSmoker    4238 non-null   int64  
 4   cigsPerDay       4209 non-null   float64
 5   BPMeds           4185 non-null   float64
 6   prevalentStroke  4238 non-null   int64  
 7   prevalentHyp     4238 non-null   int64  
 8   diabetes         4238 non-null   int64  
 9   totChol          4188 non-null   float64
 10  sysBP            4238 non-null   float64
 11  diaBP            4238 non-null   float64
 12  BMI              4219 non-null   float64
 13  heartRate        4237 non-null   float64
 14  glucose          3850 non-null   float64
 15  TenYearCHD       4238 non-null   int64  
dtypes: float64(9), int64(7)
memory usage: 529.9 KB


In [17]:
df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [18]:
#Check for null entries
print("Number of null values in the data set are - ",df.isnull().values.any().sum())
df.dropna(inplace=True)
df.isnull().values.any().sum()

Number of null values in the data set are -  1


0

Step 3- Deal with the outliers

In [19]:
#Function to find the upper and lower limits to identify and remover outliers
def interQuartile(x):
    percentile25= x.quantile(0.25)
    percentile75=x.quantile(0.75)
    iqr=percentile75-percentile25
    upperLimit= percentile75+1.5*iqr
    lowerLimit= percentile25-1.5*iqr
    return upperLimit, lowerLimit
"""
To find the upper and lower limit any column and 
check if any values are beyond these limits
"""
upper,lower= interQuartile(df['education'])
print("Lower and upper limit calculated are -", upper, lower)

Lower and upper limit calculated are - 6.0 -2.0


In [20]:
print("Number of entries below the lower limit are ", (df['education'] < lower).sum())
print("Number of entries above the upper limit are ", (df['education'] > upper).sum())

Number of entries below the lower limit are  0
Number of entries above the upper limit are  0


Step 4: Define dependent and independent variables and then split the data into a training set and testing set.

In [21]:
#Define the independent and dependent variables
y= df['TenYearCHD'] #dependent variable is Decision
x= df.drop(['TenYearCHD'], axis=1)
# splitting the data
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size= 0.2)

Step 5- Fit a logistic regression model using sklearn

In [22]:
#Implementing Logistic Regression using sklearn
modelLogistic = LogisticRegression()
modelLogistic.fit(x_train,y_train)

LogisticRegression()

In [23]:
#print the regression coefficients

print("The intercept b0= ", modelLogistic.intercept_)

print("The coefficient b1= ", modelLogistic.coef_)

The intercept b0=  [-0.21446237]
The coefficient b1=  [[ 0.22324494  0.03882206 -0.36236627 -0.12753798  0.02165258  0.08302468
   0.04033628  0.49324112  0.05017262 -0.00165614  0.02204022 -0.02350282
  -0.07266465 -0.03130718  0.00393215]]


Step 6- Apply the model on the test data and make a prediction

In [24]:
#Make prediction for the test data
y_pred= modelLogistic.predict(x_test)

Step 8- Evaluate the model using a confusion matrix to obtain an accuracy rate.

In [25]:
#Creating confusion matrix
ConfusionMatrix = confusion_matrix(y_test, y_pred)
print(ConfusionMatrix)

[[607   5]
 [112   8]]


In [27]:
#Accuracy from confusion matrix
TP= ConfusionMatrix[1,1] #True positive
TN= ConfusionMatrix[0,0] #True negative
Total=len(y_test)
print("Accuracy from confusion matrix is ", (TN+TP)/Total)

Accuracy from confusion matrix is  0.8401639344262295


Step 9: Obtain the regression coefficients using the statsmodel package

In [28]:
#Using statsmodels package to obtian the model
import statsmodels.api as sm
x_train = sm.add_constant(x_train)
logit_model=sm.Logit(y_train,x_train)
result=logit_model.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.368568
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:             TenYearCHD   No. Observations:                 2924
Model:                          Logit   Df Residuals:                     2908
Method:                           MLE   Df Model:                           15
Date:                Wed, 15 Jun 2022   Pseudo R-squ.:                  0.1261
Time:                        11:13:23   Log-Likelihood:                -1077.7
converged:                       True   LL-Null:                       -1233.2
Covariance Type:            nonrobust   LLR p-value:                 2.798e-57
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              -8.8789      0.815    -10.893      0.000     -10.476      -7.281
male          