In [2]:
import pandas as pd

# Load the raw cardiovascular dataset and preview the first rows.
csv_path = 'cardio_dataset.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,SEX,AGEIR,TC,HDL,SMOKE_,BPMED,DIAB_noyes,RISK
0,female,48.0,236.0,66,no smoking,taking,no,1.1
1,male,44.0,260.0,51,no smoking,taking,yes,7.0
2,male,44.0,187.0,49,smoking,taking,no,7.0
3,female,42.0,216.0,57,smoking,taking,no,0.4
4,female,56.0,156.0,42,no smoking,taking,no,2.2


In [3]:
# Number of missing values in each column.
data.isna().sum()

SEX           5
AGEIR         5
TC            6
HDL           0
SMOKE_        3
BPMED         1
DIAB_noyes    0
RISK          0
dtype: int64

In [4]:
# Impute missing values: the most frequent label for the categorical columns,
# the mean rounded to an integer for the numeric columns.
fill_values = {}
for column in ('SEX', 'SMOKE_', 'BPMED'):
    fill_values[column] = data[column].value_counts().idxmax()
for column in ('TC', 'AGEIR'):
    fill_values[column] = data[column].mean().round().astype(int)

# Each fill value depends only on its own column, so one call covers all of them.
data.fillna(fill_values, inplace=True)

In [5]:
# Confirm that imputation left no missing values, then preview the frame.
print(data.isna().sum())
data.head()

SEX           0
AGEIR         0
TC            0
HDL           0
SMOKE_        0
BPMED         0
DIAB_noyes    0
RISK          0
dtype: int64


Unnamed: 0,SEX,AGEIR,TC,HDL,SMOKE_,BPMED,DIAB_noyes,RISK
0,female,48.0,236.0,66,no smoking,taking,no,1.1
1,male,44.0,260.0,51,no smoking,taking,yes,7.0
2,male,44.0,187.0,49,smoking,taking,no,7.0
3,female,42.0,216.0,57,smoking,taking,no,0.4
4,female,56.0,156.0,42,no smoking,taking,no,2.2


In [6]:
# For every categorical column, list the rows whose label is not one of the
# expected values; an empty frame means the column is clean.
allowed_labels = {
    'SEX': ['female', 'male'],
    'SMOKE_': ['no smoking', 'smoking'],
    'BPMED': ['taking', 'not taking'],
    'DIAB_noyes': ['yes', 'no'],
}
invalid_rows = [
    data.loc[~data[column].isin(labels)]
    for column, labels in allowed_labels.items()
]
# A blank line separates the report of each column.
print(*invalid_rows, sep='\n\n')

    SEX  AGEIR     TC  HDL      SMOKE_   BPMED DIAB_noyes  RISK
27  yes   57.0  256.0   33  no smoking  taking         no  25.3

Empty DataFrame
Columns: [SEX, AGEIR, TC, HDL, SMOKE_, BPMED, DIAB_noyes, RISK]
Index: []

Empty DataFrame
Columns: [SEX, AGEIR, TC, HDL, SMOKE_, BPMED, DIAB_noyes, RISK]
Index: []

Empty DataFrame
Columns: [SEX, AGEIR, TC, HDL, SMOKE_, BPMED, DIAB_noyes, RISK]
Index: []


In [7]:
# Remove rows whose SEX label is not one of the two valid values. Filtering by
# value, rather than dropping a hard-coded index label, keeps this cell safe to
# re-run (dropping an already-removed label raises KeyError) and correct even
# if the row order of the input file changes.
data = data[data['SEX'].isin(['female', 'male'])]

In [8]:
# Handling outliers: keep only the rows lying strictly within `factor`
# standard deviations of the column mean.
factor = 4

ageir = data['AGEIR']
spread = ageir.std() * factor
lower_lim = ageir.mean() - spread
upper_lim = ageir.mean() + spread

data = data.loc[ageir.gt(lower_lim) & ageir.lt(upper_lim)]

In [9]:
# Same mean +/- factor * std rule, applied to TC on the already-filtered frame.
tc = data['TC']
spread = tc.std() * factor
lower_lim = tc.mean() - spread
upper_lim = tc.mean() + spread

data = data.loc[tc.gt(lower_lim) & tc.lt(upper_lim)]

In [10]:
# Same mean +/- factor * std rule, applied to HDL on the already-filtered frame.
hdl = data['HDL']
spread = hdl.std() * factor
lower_lim = hdl.mean() - spread
upper_lim = hdl.mean() + spread

data = data.loc[hdl.gt(lower_lim) & hdl.lt(upper_lim)]

In [11]:
# Same mean +/- factor * std rule, applied to the RISK column.
risk = data['RISK']
spread = risk.std() * factor
lower_lim = risk.mean() - spread
upper_lim = risk.mean() + spread

data = data.loc[risk.gt(lower_lim) & risk.lt(upper_lim)]

In [12]:
# Categorical encoding, step 1: convert the label columns to the category dtype.

for column in ('SEX', 'SMOKE_', 'BPMED', 'DIAB_noyes'):
    data[column] = data[column].astype('category')
print(data.dtypes)

SEX           category
AGEIR          float64
TC             float64
HDL              int64
SMOKE_        category
BPMED         category
DIAB_noyes    category
RISK           float64
dtype: object


In [13]:
# Categorical encoding, step 2: replace every label with its integer category code.
for column in ('SEX', 'SMOKE_', 'BPMED', 'DIAB_noyes'):
    data[column] = data[column].cat.codes
data.head()

Unnamed: 0,SEX,AGEIR,TC,HDL,SMOKE_,BPMED,DIAB_noyes,RISK
0,0,48.0,236.0,66,0,1,0,1.1
1,1,44.0,260.0,51,0,1,1,7.0
2,1,44.0,187.0,49,1,1,0,7.0
3,0,42.0,216.0,57,1,1,0,0.4
4,0,56.0,156.0,42,0,1,0,2.2


In [16]:
# Convert the frame to a NumPy array.
dataset = data.to_numpy()

# The first seven columns are the features; the eighth column (RISK) is the target.
data, target = dataset[:, :7], dataset[:, 7]

In [21]:
# Quantile transform: map the features and the target onto a normal distribution.
from sklearn.preprocessing import QuantileTransformer

# Both transformers share the same settings but are fitted independently.
qntl_params = {'output_distribution': 'normal', 'random_state': 0}

model_qntl_data = QuantileTransformer(**qntl_params)
data_scaled = model_qntl_data.fit_transform(data)

# The 1-D target is passed as a single-column 2-D array.
model_qntl_target = QuantileTransformer(**qntl_params)
target_scaled = model_qntl_target.fit_transform(target[:, None])

In [24]:
# Polynomial feature expansion of the scaled features (degree 3, no bias column).
from sklearn.preprocessing import PolynomialFeatures

model_poly = PolynomialFeatures(include_bias=False, degree=3)
data_high = model_poly.fit_transform(X=data_scaled)

In [28]:
# Training

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Hold out 10% of the samples for evaluation. The split is seeded so that the
# fitted model and the reported score are reproducible across kernel restarts;
# without a seed every run shuffles the data differently.
train_data, test_data, train_target, test_target = train_test_split(
    data_high, target_scaled, test_size=0.1, random_state=0
)

# Ordinary least squares on the polynomial features.
model = LinearRegression()
model.fit(train_data, train_target)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [29]:
from sklearn.metrics import r2_score

# Score the model on the held-out split using the coefficient of determination.
predicted_target = model.predict(test_data)
r2 = r2_score(y_true=test_target, y_pred=predicted_target)

print('Score', r2)

Score 0.9126956594534529


In [30]:
import joblib as jb

# Persist the fitted regression model and both fitted quantile transformers
# to disk with joblib.
# NOTE(review): the regression was trained on the output of `model_poly`, so
# inference presumably needs that transformer too — confirm it is saved as well.
jb.dump(model,'heart_risk_regression.sav')
jb.dump(model_qntl_data,'model_qntl_data.sav')
jb.dump(model_qntl_target, 'model_qntl_target.sav')
jb.dump(

['heart_risk_regression.sav']