In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

data = pd.read_csv(r"D:\MachineLearning\Projects\Kaggle\Ventilator_Pressure_Prediction_Kaggle\Data\ventilator-pressure-prediction\train.csv")


In [2]:
breath_ids = data['breath_id'].unique()

test_size = int(breath_ids.size*0.1)
test_breath_ids = np.empty([test_size], dtype=int)

random.seed(4)
for i in range(test_size):
    test_breath_ids[i] = random.choice(breath_ids)
    
data_test = data[data['breath_id'].isin(test_breath_ids)]
data_train_full = data[~data['breath_id'].isin(test_breath_ids)]
data_train = data_train_full.loc[:, data_train_full.columns != 'pressure']
data_train_target = data_train_full.loc[:, data_train_full.columns == 'pressure']

In [40]:
data_test.to_csv(r"D:\MachineLearning\Projects\Kaggle\Ventilator_Pressure_Prediction_Kaggle\Output\data_testSplit.csv", index=False)

In [41]:
data_train.to_csv(r"D:\MachineLearning\Projects\Kaggle\Ventilator_Pressure_Prediction_Kaggle\Output\data_trainSplit.csv", index=False)

In [3]:
## Data Engineering
## add additonal feature to catagorize input based on time step catagory.
## seems like there are two types of time step delta/increment for a given breath. One has average time step delta around 0.0340
## the other has time step delta/increment around 0.03215125

#Add time index and remove time_step, id and breath_id columns
data_train['time_index'] = (data_train_full['id']-1).mod(80)
data_train.drop(columns=['id'], inplace=True)

time_step_mean = data_train.groupby('breath_id')[['time_step']].mean().reset_index()
time_step_mean.rename(columns={'time_step':'time_step_catagory'}, inplace=True)
time_step_catagory_df = pd.merge(data_train, time_step_mean, on=['breath_id'])
time_step_catagory_df.loc[time_step_catagory_df['time_step_catagory'] >=1.3 , 'time_step_catagory'] = 1
time_step_catagory_df.loc[(time_step_catagory_df['time_step_catagory'] < 1.3) & (time_step_catagory_df['time_step_catagory'] > 1), 'time_step_catagory'] = 0

In [48]:
##Data Engineering: 
## add additional 80 features. Use time series as features. Because lung's pressure at any given time is affected by a series of u_in.
u_in_duplicates = data_train['u_in'].to_numpy()
tiled = np.tile(u_in_duplicates, (80,1)).transpose()
numberOfBreathIds = data_train_full['breath_id'].nunique()
x_reshaped = np.reshape(tiled, (numberOfBreathIds, 80, 80))
y = x_reshaped.transpose((0, 2, 1))
triangular = np.tril(y, 0)
row_count, column_count = data_train.shape
updated_data = np.reshape(triangular, (row_count,80))

numberOfAddedFeatures = 80
columnsNames = list(range(0, numberOfAddedFeatures))
new_feature_df = pd.DataFrame(updated_data, columns =columnsNames)
added_features_data = pd.concat([time_step_catagory_df, new_feature_df], axis=1)

#drop breath_id and time_step columns
print(added_features_data['breath_id'].nunique())
added_features_data.drop(columns=['breath_id'], inplace=True)
added_features_data.drop(columns=['time_step'], inplace=True)
added_features_data.drop(columns=['u_in'], inplace=True)

7168


In [None]:
from sklearn.ensemble import RandomForestRegressor 
forest = RandomForestRegressor(max_depth=60, n_estimators = 200, random_state=0, oob_score = True, bootstrap = True, n_jobs = -1) 
forest.fit(added_features_data, data_train_target) 
print(forest.oob_score_)

prediction = forest.predict(added_features_data)
from sklearn.metrics import r2_score
print(r2_score(data_train_target, prediction))

from sklearn.metrics import mean_squared_error
print(mean_squared_error(data_train_target, prediction))

test = pd.DataFrame({'target':data_train_target['pressure'], 'prediction':prediction})
test2 = pd.concat([added_features_data, test], axis=1)
test2.to_csv(r"D:\MachineLearning\Projects\Kaggle\Ventilator_Pressure_Prediction_Kaggle\Output\time_index_catagory_with_u_in_deep60_estimator200_splitTest.csv", index=False)

  forest.fit(added_features_data, data_train_target)


In [45]:
data_train_full = data_test
data_train = data_train_full.loc[:, data_train_full.columns != 'pressure']
data_train_target = data_train_full.loc[:, data_train_full.columns == 'pressure']

In [20]:
data_train_full = data[data['breath_id'].isin(test_breath_ids)]
data_train = data_test_full.loc[:, data_train_full.columns != 'pressure']
data_train_target = data_test_full.loc[:, data_train_full.columns == 'pressure']

In [46]:
data_train.shape

(573440, 7)

In [49]:
added_features_data.shape

(573440, 85)

In [23]:
prediction_test = forest.predict(added_features_data)
from sklearn.metrics import r2_score
print(r2_score(data_train_target, prediction_test))

from sklearn.metrics import mean_squared_error
print(mean_squared_error(data_train_target, prediction_test))

0.9909504375155586
0.5983936255464611


In [27]:
test_30_50_test_split = pd.DataFrame({'target':data_train_target['pressure'], 'prediction':prediction_test})
test_30_50_full = pd.concat([added_features_data, test_30_50_test_split], axis=1)
test_30_50_full.to_csv(r"D:\MachineLearning\Projects\Kaggle\Ventilator_Pressure_Prediction_Kaggle\Output\time_index_catagory_with_u_in_deep30_estimator50_splitTest.csv", index=False)

In [28]:
forest.feature_importances_

array([3.68166619e-02, 3.56075502e-02, 2.94250145e-04, 4.92637581e-01,
       1.21799057e-02, 5.33965049e-03, 3.06030385e-02, 4.39592756e-02,
       7.75041121e-03, 1.44422924e-02, 4.61444103e-03, 1.95417366e-01,
       4.98077643e-03, 6.68112919e-03, 7.70650426e-03, 3.61911890e-03,
       1.33601660e-02, 2.47621151e-02, 2.99134368e-03, 1.77412927e-02,
       2.03351368e-03, 4.33041394e-03, 3.85653074e-03, 4.10405040e-03,
       2.87075794e-03, 3.93904712e-03, 2.44332344e-03, 1.79959481e-03,
       1.06699457e-03, 9.20762304e-04, 7.93514739e-04, 6.54248410e-04,
       4.07776009e-04, 3.15011404e-04, 1.58516347e-03, 2.37795170e-03,
       1.79227850e-03, 6.64891329e-05, 1.29119111e-04, 1.15236351e-04,
       1.08235756e-04, 8.75992659e-06, 4.35599777e-05, 1.57212091e-05,
       9.87038838e-05, 1.19130060e-04, 1.80161378e-05, 1.23707677e-03,
       1.20426702e-05, 1.47584315e-05, 3.11161030e-05, 1.83143800e-05,
       2.62697547e-05, 2.73357537e-05, 2.59658385e-05, 8.32703139e-05,
      

In [None]:
from joblib import dump, load
dump(forest, r"D:\MachineLearning\Projects\Kaggle\Ventilator_Pressure_Prediction_Kaggle\Output\RandomForest_depth30_estimators50.joblib") 

In [None]:
import pickle
filename = 'RandomForest_depth30_estimators50.sav'
pickle.dump(forest, open(filename, 'wb'))

In [88]:
forest

RandomForestRegressor(max_depth=60, n_estimators=200, n_jobs=-1, oob_score=True,
                      random_state=0)

In [89]:
forest.feature_importances_

array([3.69497465e-02, 3.56200681e-02, 2.73663633e-04, 4.92359398e-01,
       1.23539927e-02, 5.36623168e-03, 3.05590462e-02, 4.38209331e-02,
       7.84945727e-03, 1.43454598e-02, 4.75003917e-03, 1.95355053e-01,
       4.98978084e-03, 6.65324299e-03, 8.06363165e-03, 3.36372635e-03,
       1.36023446e-02, 2.41957443e-02, 3.29371301e-03, 1.73080622e-02,
       2.11479863e-03, 4.42450382e-03, 3.96587289e-03, 4.38818643e-03,
       2.84001564e-03, 3.87591874e-03, 2.36669086e-03, 1.79761013e-03,
       1.13296537e-03, 9.36859936e-04, 7.82612796e-04, 6.67011314e-04,
       4.20866556e-04, 3.20974499e-04, 1.59921280e-03, 2.26571556e-03,
       1.77701803e-03, 6.29270733e-05, 1.42022576e-04, 1.02671081e-04,
       1.09407628e-04, 8.33960336e-06, 4.68025964e-05, 1.75012302e-05,
       9.79535858e-05, 1.13822283e-04, 1.79653034e-05, 1.23748665e-03,
       1.08006262e-05, 1.56639012e-05, 3.13662376e-05, 2.00337373e-05,
       2.59610687e-05, 2.79336098e-05, 4.29622078e-05, 8.22944694e-05,
      

In [9]:
from sklearn.metrics import mean_squared_error
print(mean_squared_error(data_train_target, prediction))

0.03946101137033272


In [50]:
added_features_data.shape

(573440, 85)

In [51]:
prediction = forest.predict(added_features_data)
from sklearn.metrics import r2_score
print(r2_score(data_train_target, prediction))

from sklearn.metrics import mean_squared_error
print(mean_squared_error(data_train_target, prediction))


0.991275325761369
0.5769107024060122


In [76]:
test = pd.DataFrame({'target':data_train_target['pressure'], 'prediction':prediction})
test = test.reset_index()
test2 = pd.concat([added_features_data, test], axis=1)
test2.to_csv(r"D:\MachineLearning\Projects\Kaggle\Ventilator_Pressure_Prediction_Kaggle\Output\time_index_catagory_with_u_in_deep60_estimator200_splitTest.csv", index=False)

In [53]:
####test preparation and submission:
## Data Engineering
## add additonal feature to catagorize input based on time step catagory.
## seems like there are two types of time step delta/increment for a given breath. One has average time step delta around 0.0340
## the other has time step delta/increment around 0.03215125

#Add time index and remove time_step, id and breath_id columns
data_test = pd.read_csv(r"D:\MachineLearning\Projects\Kaggle\Ventilator_Pressure_Prediction_Kaggle\Data\ventilator-pressure-prediction\test.csv")
data_test['time_index'] = (data_test['id']-1).mod(80)
data_test.drop(columns=['id'], inplace=True)

time_step_mean_test = data_test.groupby('breath_id')[['time_step']].mean().reset_index()
time_step_mean_test.rename(columns={'time_step':'time_step_catagory'}, inplace=True)
time_step_catagory_df_test = pd.merge(data_test, time_step_mean_test, on=['breath_id'])
time_step_catagory_df_test.loc[time_step_catagory_df_test['time_step_catagory'] >=1.3 , 'time_step_catagory'] = 1
time_step_catagory_df_test.loc[(time_step_catagory_df_test['time_step_catagory'] < 1.3) & (time_step_catagory_df_test['time_step_catagory'] > 1), 'time_step_catagory'] = 0

##Data Engineering: 
## add additional 80 features. Use time series as features. Because lung's pressure at any given time is affected by a series of u_in.
u_in_duplicates_test = data_test['u_in'].to_numpy()
tiled_test = np.tile(u_in_duplicates_test, (80,1)).transpose()
numberOfBreathIds = data_test['breath_id'].nunique()
x_reshaped_test = np.reshape(tiled_test, (numberOfBreathIds, 80, 80))
y = x_reshaped_test.transpose((0, 2, 1))
triangular_test = np.tril(y, 0)
row_count_test, column_count_test = data_test.shape
updated_data_test = np.reshape(triangular_test, (row_count_test,80))

numberOfAddedFeatures = 80
columnsNames = list(range(0, numberOfAddedFeatures))
new_feature_df_test = pd.DataFrame(updated_data_test, columns =columnsNames)
added_features_data_test = pd.concat([time_step_catagory_df_test, new_feature_df_test], axis=1)

#drop breath_id and time_step columns
print(added_features_data_test['breath_id'].nunique())
added_features_data_test.drop(columns=['breath_id'], inplace=True)
added_features_data_test.drop(columns=['time_step'], inplace=True)
added_features_data_test.drop(columns=['u_in'], inplace=True)

50300


In [55]:
####time index & catagory & u_in feature

prediction_test= forest.predict(added_features_data_test)

test_40_60_submission1 = pd.DataFrame({'prediction': prediction_test})
test_40_60_full_submission1 = pd.concat([added_features_data_test, test_40_60_submission1], axis=1)

kaggle_submision_2 = pd.DataFrame({'id': data_test['id'] , 'pressure': prediction_test})
kaggle_submision_2.to_csv(r"D:\MachineLearning\Projects\Kaggle\Ventilator_Pressure_Prediction_Kaggle\Output\kaggle_submission2.csv", index=False)

KeyError: 'id'

In [57]:
data_test = pd.read_csv(r"D:\MachineLearning\Projects\Kaggle\Ventilator_Pressure_Prediction_Kaggle\Data\ventilator-pressure-prediction\test.csv")

In [60]:
kaggle_submision_2 = pd.DataFrame({'id': data_test['id'] , 'pressure': prediction_test})
kaggle_submision_2.to_csv(r"D:\MachineLearning\Projects\Kaggle\Ventilator_Pressure_Prediction_Kaggle\Output\kaggle_submission2.csv", index=False)

In [72]:
test2

Unnamed: 0,R,C,u_out,time_index,time_step_catagory,0,1,2,3,4,...,72,73,74,75,76,77,78,79,target,prediction
0,50.0,20.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
1,50.0,20.0,0.0,1.0,0.0,0.0,27.490064,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
2,50.0,20.0,0.0,2.0,0.0,0.0,27.490064,2.355393,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
3,50.0,20.0,0.0,3.0,0.0,0.0,27.490064,2.355393,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
4,50.0,20.0,0.0,4.0,0.0,0.0,27.490064,2.355393,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035915,,,,,,,,,,,...,,,,,,,,,7.313837,6.926120
6035916,,,,,,,,,,,...,,,,,,,,,7.102930,6.833322
6035917,,,,,,,,,,,...,,,,,,,,,6.540513,6.867770
6035918,,,,,,,,,,,...,,,,,,,,,7.032628,6.833673


In [63]:
added_features_data.shape

(573440, 85)

In [64]:
added_features_data

Unnamed: 0,R,C,u_out,time_index,time_step_catagory,0,1,2,3,4,...,70,71,72,73,74,75,76,77,78,79
0,50,20,0,0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
1,50,20,0,1,0.0,0.000000,27.490064,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
2,50,20,0,2,0.0,0.000000,27.490064,2.355393,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
3,50,20,0,3,0.0,0.000000,27.490064,2.355393,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
4,50,20,0,4,0.0,0.000000,27.490064,2.355393,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
573435,50,50,1,75,0.0,15.564236,23.588836,20.298696,12.326433,10.807906,...,4.877733,4.895692,4.911043,4.924128,4.93528,4.944776,0.000000,0.000000,0.000000,0.000000
573436,50,50,1,76,0.0,15.564236,23.588836,20.298696,12.326433,10.807906,...,4.877733,4.895692,4.911043,4.924128,4.93528,4.944776,4.952874,0.000000,0.000000,0.000000
573437,50,50,1,77,0.0,15.564236,23.588836,20.298696,12.326433,10.807906,...,4.877733,4.895692,4.911043,4.924128,4.93528,4.944776,4.952874,4.959785,0.000000,0.000000
573438,50,50,1,78,0.0,15.564236,23.588836,20.298696,12.326433,10.807906,...,4.877733,4.895692,4.911043,4.924128,4.93528,4.944776,4.952874,4.959785,4.965696,0.000000


In [67]:
test.reset_index()

Unnamed: 0,index,target,prediction
0,720,6.681117,6.274592
1,721,6.118700,5.942945
2,722,9.001088,9.025342
3,723,14.695562,15.245676
4,724,9.704110,10.558984
...,...,...,...
573435,6035915,7.313837,6.926120
573436,6035916,7.102930,6.833322
573437,6035917,6.540513,6.867770
573438,6035918,7.032628,6.833673


In [75]:
test2

Unnamed: 0,R,C,u_out,time_index,time_step_catagory,0,1,2,3,4,...,73,74,75,76,77,78,79,index,target,prediction
0,50,20,0,0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,720,6.681117,6.274592
1,50,20,0,1,0.0,0.000000,27.490064,0.000000,0.000000,0.000000,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,721,6.118700,5.942945
2,50,20,0,2,0.0,0.000000,27.490064,2.355393,0.000000,0.000000,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,722,9.001088,9.025342
3,50,20,0,3,0.0,0.000000,27.490064,2.355393,0.000000,0.000000,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,723,14.695562,15.245676
4,50,20,0,4,0.0,0.000000,27.490064,2.355393,0.000000,0.000000,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,724,9.704110,10.558984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
573435,50,50,1,75,0.0,15.564236,23.588836,20.298696,12.326433,10.807906,...,4.924128,4.93528,4.944776,0.000000,0.000000,0.000000,0.000000,6035915,7.313837,6.926120
573436,50,50,1,76,0.0,15.564236,23.588836,20.298696,12.326433,10.807906,...,4.924128,4.93528,4.944776,4.952874,0.000000,0.000000,0.000000,6035916,7.102930,6.833322
573437,50,50,1,77,0.0,15.564236,23.588836,20.298696,12.326433,10.807906,...,4.924128,4.93528,4.944776,4.952874,4.959785,0.000000,0.000000,6035917,6.540513,6.867770
573438,50,50,1,78,0.0,15.564236,23.588836,20.298696,12.326433,10.807906,...,4.924128,4.93528,4.944776,4.952874,4.959785,4.965696,0.000000,6035918,7.032628,6.833673


In [77]:
prediction.shape

(573440,)

In [92]:
data.to_csv(r"D:\MachineLearning\Projects\Kaggle\Ventilator_Pressure_Prediction_Kaggle\Output\breath_id_full.csv", index=False)

In [91]:
data.shape

(6036000, 8)

In [87]:
data_test[data_test['breath_id']==125745]

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure
6035840,6035841,125745,50,50,0.000000,15.564236,0,6.962326
6035841,6035842,125745,50,50,0.031680,23.588836,0,8.790182
6035842,6035843,125745,50,50,0.063351,20.298696,0,12.445893
6035843,6035844,125745,50,50,0.095110,12.326433,0,21.303964
6035844,6035845,125745,50,50,0.126876,10.807906,0,22.991215
...,...,...,...,...,...,...,...,...
6035915,6035916,125745,50,50,2.381708,4.944776,1,7.313837
6035916,6035917,125745,50,50,2.413423,4.952874,1,7.102930
6035917,6035918,125745,50,50,2.445142,4.959785,1,6.540513
6035918,6035919,125745,50,50,2.476939,4.965696,1,7.032628


In [4]:
time_step_catagory_df

Unnamed: 0,breath_id,R,C,time_step,u_in,u_out,time_index,time_step_catagory
0,1,20,50,0.000000,0.083334,0,0,1.0
1,1,20,50,0.033652,18.383041,0,1,1.0
2,1,20,50,0.067514,22.509278,0,2,1.0
3,1,20,50,0.101542,22.808822,0,3,1.0
4,1,20,50,0.135756,25.355850,0,4,1.0
...,...,...,...,...,...,...,...,...
5462555,125749,50,10,2.504603,1.489714,1,75,1.0
5462556,125749,50,10,2.537961,1.488497,1,76,1.0
5462557,125749,50,10,2.571408,1.558978,1,77,1.0
5462558,125749,50,10,2.604744,1.272663,1,78,1.0
