## Importing necessary libraries

In [41]:
import numpy as np 
import pandas as pd 
from sklearn.metrics import mean_absolute_error

## Importing the Kaggle-hosted body measurement dataset

In [42]:
# Location of the training CSV (an absolute local path; adjust per machine).
TRAIN_CSV_PATH = 'D:/VIT Hack Proj/1520792014405.csv'

# Load the training measurements and preview the first five rows.
df = pd.read_csv(TRAIN_CSV_PATH)
df.head()

Unnamed: 0,weight,height,diastolic_blood_pressure,systolic_blood_pressure,heart_pulse,temperature,pulse_wave_velocity,oxygen_saturation
0,107.21,175,84.96,138.41,154,37.52,7.69,88
1,112.0,186,88.71,121.23,85,37.76,6.28,97
2,93.72,207,89.89,118.24,107,35.76,6.78,81
3,55.75,189,86.05,116.02,141,37.44,9.8,82
4,107.27,199,87.89,117.1,86,37.3,9.85,95


In [43]:
# Inspect the dtype of each column as loaded from the CSV.
df.dtypes

weight                      float64
height                        int64
diastolic_blood_pressure    float64
systolic_blood_pressure     float64
heart_pulse                   int64
temperature                 float64
pulse_wave_velocity         float64
oxygen_saturation             int64
dtype: object

## Converting all the int values to float ones

In [44]:
# Cast the three integer-typed columns to float so that every column in the
# frame shares the float64 dtype.
for int_col in ('height', 'heart_pulse', 'oxygen_saturation'):
    df[int_col] = df[int_col].astype(float)

In [45]:
# Re-inspect the dtypes after the int -> float conversion.
df.dtypes

weight                      float64
height                      float64
diastolic_blood_pressure    float64
systolic_blood_pressure     float64
heart_pulse                 float64
temperature                 float64
pulse_wave_velocity         float64
oxygen_saturation           float64
dtype: object

## Splitting dataset into features and labels
We will be using pulse wave velocity as a label while all others will be used as features.

In [46]:
# Features are every column except the label (pulse_wave_velocity).
# The label is dropped by name rather than by picking positions 0-5 and 7,
# so the selection stays correct even if the CSV's column order changes.
# DataFrame.drop returns a new frame; `df` itself is left untouched.
x_train = df.drop(columns=['pulse_wave_velocity'])
x_train

Unnamed: 0,weight,height,diastolic_blood_pressure,systolic_blood_pressure,heart_pulse,temperature,oxygen_saturation
0,107.21,175.0,84.96,138.41,154.0,37.52,88.0
1,112.00,186.0,88.71,121.23,85.0,37.76,97.0
2,93.72,207.0,89.89,118.24,107.0,35.76,81.0
3,55.75,189.0,86.05,116.02,141.0,37.44,82.0
4,107.27,199.0,87.89,117.10,86.0,37.30,95.0
...,...,...,...,...,...,...,...
19884,107.23,205.0,80.51,121.41,106.0,37.95,84.0
19885,51.05,175.0,89.76,120.21,57.0,36.32,89.0
19886,54.91,189.0,91.37,126.26,153.0,37.19,98.0
19887,110.59,161.0,88.36,128.80,75.0,36.77,76.0


In [47]:
# Label: pulse wave velocity. Selected by column name instead of by
# position 6, so the right column is used regardless of column order.
y_train = df['pulse_wave_velocity']
y_train

0        7.69
1        6.28
2        6.78
3        9.80
4        9.85
         ... 
19884    7.75
19885    9.10
19886    8.72
19887    7.80
19888    9.68
Name: pulse_wave_velocity, Length: 19889, dtype: float64

## Support Vector Regression
We will be using SVR with a radial basis function (RBF) kernel.

In [48]:
from sklearn.svm import SVR

# Three SVR estimators, one per kernel, all with C=100.
# Only `svr_rbf` is fitted in the cells that follow.
# NOTE(review): the features are passed to the estimator unscaled; kernel
# SVMs are usually sensitive to feature scale -- confirm this is intended.
svr_rbf = SVR(
    kernel='rbf',
    C=100,
    gamma=0.1,
    epsilon=0.1,
)
svr_lin = SVR(
    kernel='linear',
    C=100,
    gamma='auto',
)
svr_poly = SVR(
    kernel='poly',
    C=100,
    gamma='auto',
    degree=3,
    epsilon=0.1,
    coef0=1,
)


In [49]:
# Fit the RBF-kernel regressor on the training features and PWV labels.
svr_rbf.fit(x_train,y_train)

SVR(C=100, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

## Importing different test dataset
Importing a separate test dataset and applying the same pre-processing as for the training data.

In [50]:
# Location of the test CSV (an absolute local path; adjust per machine).
TEST_CSV_PATH = 'D:/VIT Hack Proj/3560848988588.csv'

# Load the test measurements and preview the first five rows.
df2 = pd.read_csv(TEST_CSV_PATH)
df2.head()

Unnamed: 0,weight,height,diastolic_blood_pressure,systolic_blood_pressure,heart_pulse,temperature,pulse_wave_velocity,oxygen_saturation
0,125.84,164,83.71,122.31,151,37.79,8.1,93
1,87.25,169,81.7,128.17,167,36.14,7.14,95
2,62.0,202,88.51,115.18,74,36.6,9.23,92
3,63.42,170,81.2,119.82,111,37.54,8.76,77
4,127.82,202,86.59,137.51,174,35.6,9.67,80


In [51]:
# Apply the same int -> float cast to the test frame as was applied to the
# training frame, column by column.
for int_col in ('height', 'heart_pulse', 'oxygen_saturation'):
    df2[int_col] = df2[int_col].astype(float)

In [52]:
# Test features: every column except the label (pulse_wave_velocity).
# The label is dropped by name rather than by picking positions 0-5 and 7,
# so the selection stays correct even if the CSV's column order changes.
# DataFrame.drop returns a new frame; `df2` itself is left untouched.
x_test = df2.drop(columns=['pulse_wave_velocity'])
x_test

Unnamed: 0,weight,height,diastolic_blood_pressure,systolic_blood_pressure,heart_pulse,temperature,oxygen_saturation
0,125.84,164.0,83.71,122.31,151.0,37.79,93.0
1,87.25,169.0,81.70,128.17,167.0,36.14,95.0
2,62.00,202.0,88.51,115.18,74.0,36.60,92.0
3,63.42,170.0,81.20,119.82,111.0,37.54,77.0
4,127.82,202.0,86.59,137.51,174.0,35.60,80.0
...,...,...,...,...,...,...,...
14367,57.31,188.0,90.16,126.35,144.0,37.78,99.0
14368,78.70,187.0,93.25,129.04,123.0,37.87,78.0
14369,103.95,189.0,85.79,135.74,170.0,36.94,89.0
14370,126.82,204.0,91.71,119.88,59.0,35.25,94.0


In [53]:
# Test label: pulse wave velocity. Selected by column name instead of by
# position 6, so the right column is used regardless of column order.
y_test = df2['pulse_wave_velocity']
y_test

0        8.10
1        7.14
2        9.23
3        8.76
4        9.67
         ... 
14367    8.47
14368    9.72
14369    8.86
14370    8.32
14371    8.95
Name: pulse_wave_velocity, Length: 14372, dtype: float64

## Error calculation using Mean Absolute Error as a metric

In [54]:
from sklearn.metrics import mean_absolute_error


In [55]:
# Predict PWV for the test features, then score the predictions against the
# true test labels using mean absolute error.
y_pred = svr_rbf.predict(X=x_test)
mean_absolute_error(y_true=y_test, y_pred=y_pred)

0.9953853026095617

In [58]:
# Summary statistics of the test-set label, used to put the MAE in context.
df2.loc[:, 'pulse_wave_velocity'].describe()

count    14372.000000
mean         8.006094
std          1.149034
min          6.000000
25%          7.030000
50%          8.000000
75%          8.990000
max         10.000000
Name: pulse_wave_velocity, dtype: float64

## Error inference
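The PWV values range between 6 and 10 with a standard deviation of about 1.15, so a mean absolute error of about 1.0 is roughly the size of the spread of the target itself. The error should therefore be compared against a trivial baseline (for example, always predicting the mean PWV of the training set) before the model is judged accurate; an MAE this close to the standard deviation suggests the SVR adds little predictive power over such a baseline.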