In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import statsmodels.api as sm
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [3]:
pd.read_csv("E:/PRATHAM/CASE STUDY DATABASE/framingham_heart_disease.csv")

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.10,85.0,85.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4233,1,50,1.0,1,1.0,0.0,0,1,0,313.0,179.0,92.0,25.97,66.0,86.0,1
4234,1,51,3.0,1,43.0,0.0,0,0,0,207.0,126.5,80.0,19.71,65.0,68.0,0
4235,0,48,2.0,1,20.0,,0,0,0,248.0,131.0,72.0,22.00,84.0,86.0,0
4236,0,44,1.0,1,15.0,0.0,0,0,0,210.0,126.5,87.0,19.16,86.0,,0


In [4]:
missing_values = ["n/a","NA","N/A","na","==","."]
data = pd.read_csv("E:/PRATHAM/CASE STUDY DATABASE/framingham_heart_disease.csv",na_values = missing_values)

In [5]:
data.isnull().sum()

male                 0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4238 entries, 0 to 4237
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             4238 non-null   int64  
 1   age              4238 non-null   int64  
 2   education        4133 non-null   float64
 3   currentSmoker    4238 non-null   int64  
 4   cigsPerDay       4209 non-null   float64
 5   BPMeds           4185 non-null   float64
 6   prevalentStroke  4238 non-null   int64  
 7   prevalentHyp     4238 non-null   int64  
 8   diabetes         4238 non-null   int64  
 9   totChol          4188 non-null   float64
 10  sysBP            4238 non-null   float64
 11  diaBP            4238 non-null   float64
 12  BMI              4219 non-null   float64
 13  heartRate        4237 non-null   float64
 14  glucose          3850 non-null   float64
 15  TenYearCHD       4238 non-null   int64  
dtypes: float64(9), int64(7)
memory usage: 529.9 KB


In [7]:
def missing_values_treatment(df):
    for col in df.columns:
        print("Treating the ",col)
        if (((df[col].dtypes)=="float64") | ((df[col].dtypes)=="int64")):
            col_mean = df[col].mean()
            df[col] = df[col].fillna(col_mean)
        else:
            col_mode = df[col].mode()
            df[col] = df[col].fillna(col_mode)
    return df
data = missing_values_treatment(data)

Treating the  male
Treating the  age
Treating the  education
Treating the  currentSmoker
Treating the  cigsPerDay
Treating the  BPMeds
Treating the  prevalentStroke
Treating the  prevalentHyp
Treating the  diabetes
Treating the  totChol
Treating the  sysBP
Treating the  diaBP
Treating the  BMI
Treating the  heartRate
Treating the  glucose
Treating the  TenYearCHD


In [8]:
# CODE FOR REMOVING OUTLIERS FROM MULTIPLE COLUMNS
# def is used for defining that we are creating function
def cap_data(df):
    for col in df.columns:
        print("capping the ",col)
        if (((df[col].dtype)=="float64") | ((df[col].dtype)=="int64")):
            percentiles = df[col].quantile([0.01,0.99]).values
            df.loc[df[col] <= percentiles[0],col] = percentiles[0]
            df.loc[df[col] >= percentiles[1],col] = percentiles[1]
        else:
            df[col] = df[col]
    return df
data = cap_data(data)

capping the  male
capping the  age
capping the  education
capping the  currentSmoker
capping the  cigsPerDay
capping the  BPMeds
capping the  prevalentStroke
capping the  prevalentHyp
capping the  diabetes
capping the  totChol
capping the  sysBP
capping the  diaBP
capping the  BMI
capping the  heartRate
capping the  glucose
capping the  TenYearCHD


In [10]:
y = data["TenYearCHD"]
x = data.drop("TenYearCHD",axis = 1)

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)
x_train

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose
3252,1,40,4.0,1,30.0,0.0,0,0,0,205.0,131.0,81.0,23.74,66.0,87.0
3946,0,57,2.0,0,0.0,0.0,0,1,0,250.0,152.5,92.5,32.31,75.0,94.0
1261,0,47,1.0,0,0.0,0.0,0,0,0,230.0,123.0,71.0,26.98,83.0,73.0
2536,1,41,2.0,1,30.0,0.0,0,0,0,228.0,113.0,82.5,25.67,67.0,70.0
4089,0,64,1.0,0,0.0,0.0,0,1,0,232.0,149.5,84.0,20.49,68.0,96.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3444,0,36,1.0,1,5.0,0.0,0,1,0,222.0,147.0,94.0,26.79,76.0,71.0
466,0,57,3.0,1,15.0,0.0,0,0,0,250.0,125.0,74.0,21.08,80.0,72.0
3092,0,60,2.0,0,0.0,0.0,0,1,0,298.0,133.0,89.0,25.09,83.0,81.0
3772,1,39,2.0,1,10.0,0.0,0,0,0,215.0,102.0,64.5,24.50,68.0,62.0


In [12]:
# creating an object of linear regression class
LR = LogisticRegression()
# fitting the training data
# fit is model creation function
LR.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [13]:
LR.intercept_

array([-0.11385291])

In [14]:
LR.intercept_

array([-0.11385291])

In [15]:
y_pred_train = LR.predict(x_train)
print("y_pred_train =",y_pred_train)

y_pred_train = [0 0 0 ... 0 0 0]


In [16]:
acc_train = metrics.accuracy_score(y_train,y_pred_train)
print("ACCURACY  = ",acc_train)
acc_train = metrics.average_precision_score(y_train,y_pred_train)
print("PRECISION = ",acc_train)
acc_train = metrics.recall_score(y_train,y_pred_train)
print("RECALL    = ",acc_train)
acc_train = metrics.roc_auc_score(y_train,y_pred_train)
print("ROC_AUC   = ",acc_train)

ACCURACY  =  0.8474926253687316
PRECISION =  0.16696371475132538
RECALL    =  0.03461538461538462
ROC_AUC   =  0.5146944518895739


In [17]:
y_pred_test = LR.predict(x_test)
print("y_pred_test =",y_pred_test)

y_pred_test = [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 

In [18]:
acc = metrics.accuracy_score(y_test,y_pred_test)
print("ACCURACY  = ",acc)
acc = metrics.average_precision_score(y_test,y_pred_test)
print("PRECISION = ",acc)
acc = metrics.recall_score(y_test,y_pred_test)
print("RECALL    = ",acc)
acc = metrics.roc_auc_score(y_test,y_pred_test)
print("ROC_AUC   = ",acc)

ACCURACY  =  0.8537735849056604
PRECISION =  0.15193244065733413
RECALL    =  0.016129032258064516
ROC_AUC   =  0.506683300659419
