In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# dataset was downloaded from Kaggle
df = pd.read_csv('/content/framingham.csv')

In [3]:
df.info()
df.index
df.columns
df.head()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4238 entries, 0 to 4237
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             4238 non-null   int64  
 1   age              4238 non-null   int64  
 2   education        4133 non-null   float64
 3   currentSmoker    4238 non-null   int64  
 4   cigsPerDay       4209 non-null   float64
 5   BPMeds           4185 non-null   float64
 6   prevalentStroke  4238 non-null   int64  
 7   prevalentHyp     4238 non-null   int64  
 8   diabetes         4238 non-null   int64  
 9   totChol          4188 non-null   float64
 10  sysBP            4238 non-null   float64
 11  diaBP            4238 non-null   float64
 12  BMI              4219 non-null   float64
 13  heartRate        4237 non-null   float64
 14  glucose          3850 non-null   float64
 15  TenYearCHD       4238 non-null   int64  
dtypes: float64(9), int64(7)
memory usage: 529.9 KB


Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
count,4238.0,4238.0,4133.0,4238.0,4209.0,4185.0,4238.0,4238.0,4238.0,4188.0,4238.0,4238.0,4219.0,4237.0,3850.0,4238.0
mean,0.429212,49.584946,1.97895,0.494101,9.003089,0.02963,0.005899,0.310524,0.02572,236.721585,132.352407,82.893464,25.802008,75.878924,81.966753,0.151958
std,0.495022,8.57216,1.019791,0.500024,11.920094,0.169584,0.076587,0.462763,0.158316,44.590334,22.038097,11.91085,4.080111,12.026596,23.959998,0.359023
min,0.0,32.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,107.0,83.5,48.0,15.54,44.0,40.0,0.0
25%,0.0,42.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,206.0,117.0,75.0,23.07,68.0,71.0,0.0
50%,0.0,49.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,234.0,128.0,82.0,25.4,75.0,78.0,0.0
75%,1.0,56.0,3.0,1.0,20.0,0.0,0.0,1.0,0.0,263.0,144.0,89.875,28.04,83.0,87.0,0.0
max,1.0,70.0,4.0,1.0,70.0,1.0,1.0,1.0,1.0,696.0,295.0,142.5,56.8,143.0,394.0,1.0


In [4]:
#checking and taking care of missing values
df.isnull().sum()

male                 0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

In [5]:
#fill in missing values of glucose with the means by age 
df['glucose'] = df['glucose'].fillna(df.groupby('age')['glucose'].transform('mean'))

In [6]:
df['glucose'].isnull().sum()

0

In [7]:
#fill in missing values of BMI with the means by age and gender
df[df['BMI'].isnull() == True]
df['BMI'][df['BMI'].isnull() == True]
df['BMI'].groupby([df['age'], df['male']]).mean()
df['BMI'] = df['BMI'].fillna(df.groupby([df['age'], df['male']])['BMI'].transform('mean'))
df['BMI'].isnull().sum()

0

In [8]:
#drop the column of education
df = df.drop('education', axis=1)

In [9]:
#check on the columns of cigsPerDay. Since the data points that have missing values of cigsPerDay all have currentSmoker = 1, so fill in missing values of cigsPerDay with the mean of cigsPerDay
df['currentSmoker'][df['cigsPerDay'].isnull() == True]
df['cigsPerDay'] = df['cigsPerDay'].fillna(df['cigsPerDay'].mean())

In [10]:
#for BPMeds and heart rate, can't really predict whether the patients are on it or not, so drop the rows that contain missing BPMeds and heartRate
df = df.dropna(subset=['BPMeds', 'heartRate'])

In [11]:
###fit a linear regression model to predict the values of totChol and fill in the missing values of totChol in the dataset with the predicted values###
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
traindf = df[df['totChol'].isnull()==False]
y = traindf['totChol']
traindf_new = traindf.drop('totChol', axis=1)
lr.fit(traindf_new, y)
testdf = df[df['totChol'].isnull()==True]
testdf_new = testdf.drop('totChol', axis=1)
pred = lr.predict(testdf_new)

In [12]:
testdf_new['totChol'] = pred
df['totChol'] = df['totChol'].fillna(testdf_new.totChol)

In [13]:
df.isnull().sum()

male               0
age                0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64

In [19]:
#train a logistic model
y_log = df['TenYearCHD']
X_log = df.drop('TenYearCHD', axis=1)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_log, y_log, test_size=0.3, random_state=101)
from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression(solver='lbfgs', max_iter=5000)
log_model.fit(X_train, y_train)
predictions = log_model.predict(X_test)

In [20]:
#evaluate the model
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.85      0.99      0.92      1061
           1       0.59      0.05      0.09       195

    accuracy                           0.85      1256
   macro avg       0.72      0.52      0.51      1256
weighted avg       0.81      0.85      0.79      1256



In [21]:
print(confusion_matrix(y_test, predictions))

[[1054    7]
 [ 185   10]]


In [22]:
print(accuracy_score(y_test, predictions))

0.8471337579617835
