# IMPORTING LIBRARIES

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# CONNECTING TO DRIVE

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# IMPORTING THE DATA

In [3]:
df = pd.read_csv("/content/drive/MyDrive/framingham.csv")
df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [4]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4240 entries, 0 to 4239
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             4240 non-null   int64  
 1   age              4240 non-null   int64  
 2   education        4135 non-null   float64
 3   currentSmoker    4240 non-null   int64  
 4   cigsPerDay       4211 non-null   float64
 5   BPMeds           4187 non-null   float64
 6   prevalentStroke  4240 non-null   int64  
 7   prevalentHyp     4240 non-null   int64  
 8   diabetes         4240 non-null   int64  
 9   totChol          4190 non-null   float64
 10  sysBP            4240 non-null   float64
 11  diaBP            4240 non-null   float64
 12  BMI              4221 non-null   float64
 13  heartRate        4239 non-null   float64
 14  glucose          3852 non-null   float64
 15  TenYearCHD       4240 non-null   int64  
dtypes: float64(9), int64(7)
memory usage: 530.1 KB
None


# DATA PREPROCESSING

**NULL VALUES TREATMENT**

In [5]:
# Check null values
df.isnull().sum()

male                 0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

**REPLACING NULL VALUES WITH MEAN VALUES**

In [6]:
m1=np.mean(df['education']) 
m2=np.mean(df['cigsPerDay']) 
m3=np.mean(df['BPMeds']) 
m4=np.mean(df['totChol'])
m5=np.mean(df['heartRate'])
m6=np.mean(df['glucose'])
m7=np.mean(df['BMI'])
df_new=df.copy()
df['education'].fillna(value=m1,inplace=True) 
df['cigsPerDay'].fillna(value=m2,inplace=True)
df['BPMeds'].fillna(value=m3,inplace=True)
df['totChol'].fillna(value=m4,inplace=True)
df['heartRate'].fillna(value=m5,inplace=True)
df['glucose'].fillna(value=m6,inplace=True)
df['BMI'].fillna(value=m7,inplace=True)

In [7]:
#Check null values
df.isnull().sum()

male               0
age                0
education          0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64

In [8]:
outliers = []
def detect_outliers_zscore(df):
    thres = 3
    mean = np.mean(df)
    std = np.std(df)
    for i in df:
        z_score = (i-mean)/std
        if (np.abs(z_score) > thres):
           outliers.append(i)
    return outliers

**NORMALISATION**

In [9]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
num_vars = ['age', 'education','cigsPerDay','totChol','sysBP','diaBP','BMI','heartRate','glucose']
df[num_vars] = scaler.fit_transform(df[num_vars])
df

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,0.184211,1.000000,0,0.000000,0.000000,0,0,0,0.149406,0.106383,0.232804,0.277024,0.363636,0.104520,0
1,0,0.368421,0.333333,0,0.000000,0.000000,0,0,0,0.242784,0.177305,0.349206,0.319680,0.515152,0.101695,0
2,1,0.421053,0.000000,1,0.285714,0.000000,0,0,0,0.234295,0.208038,0.338624,0.237518,0.313131,0.084746,0
3,0,0.763158,0.666667,1,0.428571,0.000000,0,1,0,0.200340,0.314421,0.497354,0.316045,0.212121,0.177966,1
4,0,0.368421,0.666667,1,0.328571,0.000000,0,0,0,0.302207,0.219858,0.380952,0.183228,0.414141,0.127119,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4235,0,0.421053,0.333333,1,0.285714,0.029615,0,0,0,0.239389,0.224586,0.253968,0.156568,0.404040,0.129944,0
4236,0,0.315789,0.000000,1,0.214286,0.000000,0,0,0,0.174873,0.203310,0.412698,0.087736,0.424242,0.118541,0
4237,0,0.526316,0.333333,0,0.000000,0.000000,0,0,0,0.275042,0.236407,0.370370,0.143723,0.363636,0.189266,0
4238,1,0.210526,0.666667,0,0.000000,0.000000,0,1,0,0.132428,0.271868,0.529101,0.243820,0.232323,0.090395,0


# SPLITTING THE DATA

In [10]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, train_size = 0.7, test_size = 0.3, random_state = 20)
y_train = df_train.pop('TenYearCHD')
x_train = df_train
y_test = df_test.pop('TenYearCHD')
x_test = df_test
x_train = x_train.T
y_train = y_train.T
x_test = x_test.T
y_test = y_test.T

# TRAINING THE MODEL

**LOGISTIC REGRESSION**

In [11]:
accuracies = {}
accuracies1 = {}
lr = LogisticRegression()
lr.fit(x_train.T,y_train.T)
acc = lr.score(x_train.T,y_train.T)*100
acc1 = lr.score(x_test.T,y_test.T)*100
accuracies['Logistic Regression'] = acc
accuracies1['Logistic Regression'] = acc1
print("Training Accuracy {:.2f}%".format(acc))
print("Testing Accuracy {:.2f}%".format(acc1))

Training Accuracy 85.78%
Testing Accuracy 84.59%


**ERROR METRICS CALCULATION**

In [12]:
import math as mt
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
prediction=lr.predict(x_test.T)
rmse = mt.sqrt(mean_squared_error(y_test.T, prediction)) 
print('Root mean square error', rmse) 
mse = (mean_squared_error(y_test, prediction)) 
print('Mean square error', mse) 
mae=mean_absolute_error(y_test, prediction)
print('Mean absolute error', mae)

Root mean square error 0.392540507864431
Mean square error 0.1540880503144654
Mean absolute error 0.1540880503144654


**KNN**

In [13]:
from sklearn.neighbors import KNeighborsClassifier
knn3 = KNeighborsClassifier(n_neighbors = 3)  
knn3.fit(x_train.T, y_train.T)
prediction = knn3.predict(x_test.T)
print("{} NN Training Score: {:.2f}%".format(3, knn3.score(x_train.T, y_train.T)*100))
print("{} NN Testing Score: {:.2f}%".format(3, knn3.score(x_test.T, y_test.T)*100))

3 NN Training Score: 88.58%
3 NN Testing Score: 80.97%


**ERROR METRICS CALCULATION**

In [14]:
rmse = mt.sqrt(mean_squared_error(y_test.T, prediction)) 
print('Root mean square error', rmse) 
mse = (mean_squared_error(y_test, prediction)) 
print('Mean square error', mse) 
mae=mean_absolute_error(y_test, prediction)
print('Mean absolute error', mae)

Root mean square error 0.4361783721449793
Mean square error 0.19025157232704404
Mean absolute error 0.19025157232704404


**DECISION TREE**

In [15]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(max_depth=5)
dtc.fit(x_train.T, y_train.T)
acc = dtc.score(x_train.T, y_train.T)*100
acc1 = dtc.score(x_test.T, y_test.T)*100
accuracies['Decision Tree'] = acc
accuracies1['Decision Tree'] = acc1
print("Decision Tree Training Accuracy {:.2f}%".format(acc))
print("Decision Tree Testing Accuracy {:.2f}%".format(acc1))

Decision Tree Training Accuracy 86.89%
Decision Tree Testing Accuracy 82.63%


**ERROR METRICS CALCULATION**

In [16]:
prediction=dtc.predict(x_test.T)
rmse = mt.sqrt(mean_squared_error(y_test.T, prediction)) 
print('Root mean square error', rmse) 
mse = (mean_squared_error(y_test, prediction)) 
print('Mean square error', mse) 
mae=mean_absolute_error(y_test, prediction)
print('Mean absolute error', mae)

Root mean square error 0.41682386971571084
Mean square error 0.17374213836477986
Mean absolute error 0.17374213836477986


**RANDOM FOREST**

In [17]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 1000,max_depth=5, random_state = 1)
rf.fit(x_train.T, y_train.T)
acc = rf.score(x_train.T,y_train.T)*100
acc1 = rf.score(x_test.T,y_test.T)*100
accuracies['Random Forest'] = acc
accuracies1['Random Forest'] = acc1
print("Random Forest Algorithm Training Accuracy Score : {:.2f}%".format(acc))
print("Random Forest Algorithm Testing Accuracy Score : {:.2f}%".format(acc1))

Random Forest Algorithm Training Accuracy Score : 85.82%
Random Forest Algorithm Testing Accuracy Score : 83.88%


**ERROR METRICS CALCULATION**

In [18]:
prediction=rf.predict(x_test.T)
rmse = mt.sqrt(mean_squared_error(y_test.T, prediction)) 
print('Root mean square error', rmse) 
mse = (mean_squared_error(y_test, prediction)) 
print('Mean square error', mse) 
mae=mean_absolute_error(y_test, prediction)
print('Mean absolute error', mae)

Root mean square error 0.4014517679778962
Mean square error 0.1611635220125786
Mean absolute error 0.1611635220125786
