In [1]:
#Import all the required libraries.
import numpy as np
import pandas as pd
import sklearn
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split

import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Let pandas render up to 9999 rows before it falls back to a truncated view.
pd.set_option('display.max_rows', 9999)

In [2]:
# Load the raw dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="cardio_train.csv")

In [3]:
# Preview the first five rows of the raw data.
data.head(n=5)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [4]:
# Drop the 'id' column: it is a row identifier, not a predictive feature.
# DataFrame.drop returns a new frame, so `data` itself is left unchanged.
df = data.drop(columns=['id'])

In [5]:
# Retrieve the column names as a plain Python list.
col_list = list(df.columns)

In [6]:
# Convert age from days to whole years (365 days per year, rounded).
# The day counts are read from the untouched `data` frame rather than from
# df['age'] itself, so re-running this cell does not divide by 365 a second
# time. Assignment aligns on the index, so only rows still present in df
# receive a value.
df['age'] = (data['age'] / 365).round().astype('int64')

In [7]:
# Gather every row that has at least one exact copy (all copies are kept
# because keep=False) and order them descending across all columns so that
# identical rows end up next to each other.
duplicated = df[df.duplicated(keep=False)].sort_values(
    by=col_list,
    ascending=False,
)

duplicated.head(5)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
59745,65,2,174,70.0,120,80,1,1,0,0,1,1
67250,65,2,174,70.0,120,80,1,1,0,0,1,1
48386,65,1,168,64.0,120,80,1,1,0,0,0,1
69356,65,1,168,64.0,120,80,1,1,0,0,0,1
38511,65,1,163,63.0,120,80,1,1,0,0,0,1


In [8]:
# Remove exact duplicate rows in place (the first occurrence of each is kept),
# then report how many duplicated rows are left.
df.drop_duplicates(keep='first', inplace=True)

n_remaining_dupes = df.duplicated().sum()
print("There is {} duplicated values in data frame".format(n_remaining_dupes))

There is 0 duplicated values in data frame


In [9]:
# Discard rows whose ap_lo reading is greater than their ap_hi reading.
inverted_pressure = df['ap_lo'] > df['ap_hi']
df = df.drop(index=df.index[inverted_pressure])

In [10]:
# For ap_hi and then ap_lo, drop the rows that fall outside the
# 2.5th-97.5th percentile band of that column. The ap_lo bounds are computed
# after the ap_hi outliers have already been removed.
for pressure_col in ('ap_hi', 'ap_lo'):
    upper_bound = df[pressure_col].quantile(0.975)
    lower_bound = df[pressure_col].quantile(0.025)
    out_of_band = (df[pressure_col] > upper_bound) | (df[pressure_col] < lower_bound)
    df.drop(index=df.index[out_of_band], inplace=True)

In [11]:
# Divide the data into model inputs (every column except the last)
# and the target label (the last column).
features, label = df.iloc[:, :-1], df.iloc[:, -1]

In [12]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.2,
    random_state=9,
)

# Fit a support-vector classifier with scikit-learn's default settings.
# NOTE(review): the features are passed unscaled; scikit-learn recommends
# scaling inputs for SVMs, and the train/test gap below may be related —
# consider a StandardScaler pipeline and re-evaluate.
svc_model = SVC()
svc_model.fit(x_train, y_train)

# Mean accuracy on the held-out rows first, then on the training rows.
test_accuracy = svc_model.score(x_test, y_test)
print("Test score: {}".format(test_accuracy))
train_accuracy = svc_model.score(x_train, y_train)
print("Train score: {}".format(train_accuracy))

Test score: 0.6934359220966578
Train score: 0.8186153692014828


In [13]:
# Predict the label for every row of the held-out test set.
y_pred = svc_model.predict(X=x_test)

In [14]:
# Compute accuracy: the fraction of test rows whose predicted label matches
# the true label.
from sklearn.metrics import accuracy_score

score = accuracy_score(y_true=y_test, y_pred=y_pred)

In [15]:
# Display the accuracy stored in `score` by the accuracy_score call.
score

0.6934359220966578

In [16]:
import pickle

In [20]:
# Persist the fitted classifier to a pickle file.
# The context manager guarantees the file handle is flushed and closed once
# the dump finishes (or fails), instead of leaving an open handle behind.
with open('cvd-prediction-svc-model.pkl', 'wb') as model_file:
    pickle.dump(svc_model, model_file)

In [21]:
# Preview the feature columns; manual predict() inputs are given in this
# column order.
features.head(n=5)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,50,2,168,62.0,110,80,1,1,0,0,1
1,55,1,156,85.0,140,90,3,1,0,0,1
2,52,1,165,64.0,130,70,3,1,0,0,0
3,48,2,169,82.0,150,100,1,1,0,0,1
4,48,1,156,56.0,100,60,1,1,0,0,0


In [22]:
# Preview the first five target labels.
label.head(n=5)

0    0
1    1
2    1
3    1
4    0
Name: cardio, dtype: int64

In [23]:
# Predict a single hand-written sample. Values follow the feature column
# order: age, gender, height, weight, ap_hi, ap_lo, cholesterol, gluc,
# smoke, alco, active.
# NOTE(review): gluc is 0 here while the rows displayed earlier all show
# gluc = 1 — confirm 0 is a valid encoding for this column.
svc_model.predict([
    [35, 1, 168, 62, 110, 70, 1, 0, 1, 0, 1],
])

array([1], dtype=int64)

In [26]:
# Predict a second hand-written sample and keep the result for the check
# below. Values follow the feature column order: age, gender, height, weight,
# ap_hi, ap_lo, cholesterol, gluc, smoke, alco, active.
pred = svc_model.predict([
    [62, 2, 162, 56, 120, 70, 1, 1, 1, 0, 1],
])

In [27]:
# predict() returns an array with one label per input row. Read the first
# (only) element explicitly instead of relying on the truthiness of the
# array, which raises ValueError as soon as more than one row is predicted.
# Label 1 prints "yes", anything else prints "no".
if pred[0] == 1:
    print("yes")
else:
    print("no")

no
