In [5]:
##custom transformer to clean and transform data
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

##Time Step Transformer
## add time step binary category
## add additonal feature to catagorize input based on time step catagory.
## seems like there are two types of time step delta/increment for a given breath. One has average time step delta around 0.0340
## the other has time step delta/increment around 0.03215125. And seems like lung pressure changes a lot even with small time increments.
class TimeStepTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.time_step_catagory_boundary = 1.3
        return self
    def transform(self, data, y=None):
        data['time_index'] = (data['id']-1).mod(80)
        time_step_mean = data.groupby('breath_id')[['time_step']].mean().reset_index()
        time_step_mean.rename(columns={'time_step':'time_step_catagory'}, inplace=True)
        time_step_catagory_df = pd.merge(data, time_step_mean, on=['breath_id'])
        time_step_catagory_df.loc[time_step_catagory_df['time_step_catagory'] >=self.time_step_catagory_boundary , 'time_step_catagory'] = 1
        time_step_catagory_df.loc[(time_step_catagory_df['time_step_catagory'] < self.time_step_catagory_boundary) & (time_step_catagory_df['time_step_catagory'] > 1), 'time_step_catagory'] = 0
        return time_step_catagory_df
    
##Time Series Transformer
## add additional 80 features. Use time series as features. Because lung's pressure at any given time is affected by a series of u_in.
class TimeSeriesTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y= None):
        self.numberOfAddedFeatures = 80
        self.columnsNames = list(range(0, self.numberOfAddedFeatures))
        return self
    def transform(self,data, y = None):
        u_in_duplicates = data['u_in'].to_numpy()
        tiled = np.tile(u_in_duplicates, (80,1)).transpose()
        numberOfBreathIds = data['breath_id'].nunique()
        x_reshaped = np.reshape(tiled, (numberOfBreathIds, 80, 80))
        y = x_reshaped.transpose((0, 2, 1))
        triangular = np.tril(y, 0)
        row_count, column_count = data.shape
        updated_data = np.reshape(triangular, (row_count,80))
        new_feature_df = pd.DataFrame(updated_data, columns =self.columnsNames)
        added_features_data = pd.concat([data, new_feature_df], axis=1)
        return added_features_data

    
##RemoveColumnsTransformer
#drop breath_id and time_step columns
class RemoveColumnsTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y= None):
        return self
    def transform(self,data, y = None):
        data.drop(columns=['breath_id'], inplace=True)
        data.drop(columns=['time_step'], inplace=True)
        data.drop(columns=['u_in'], inplace=True)
        data.drop(columns=['id'], inplace=True)
        return data

    
##Building Transformer Pipeline
transformers_pipeline = Pipeline([
    ('time_step_trans', TimeStepTransformer()),
    ('time_serires_trans', TimeSeriesTransformer()),
    ('column_remover', RemoveColumnsTransformer()),
])

In [6]:
#break up test and training data and target
class Utility():
    @staticmethod
    def split(data):
        breath_ids = data['breath_id'].unique()
        test_size = int(breath_ids.size*0.1)
        test_breath_ids = np.empty([test_size], dtype=int)
        random.seed(4)
        
        for i in range(test_size):
            test_breath_ids[i] = random.choice(breath_ids)
            
        data_train_full = data[~data['breath_id'].isin(test_breath_ids)]
        data_train = data_train_full.loc[:, data_train_full.columns != 'pressure']
        data_train_target = data_train_full.loc[:, data_train_full.columns == 'pressure']
        data_test_full = data[data['breath_id'].isin(test_breath_ids)]
        data_test = data_test_full.loc[:, data_test_full.columns != 'pressure']
        data_test_target = data_test_full.loc[:, data_test_full.columns == 'pressure']
        return data_train_full,data_train,data_train_target,data_test_full,data_test,data_test_target;


In [7]:
###Execution Script
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

data = pd.read_csv(r"D:\MachineLearning\Projects\Kaggle\Ventilator_Pressure_Prediction_Kaggle\Data\ventilator-pressure-prediction\train.csv")
data_train_full,data_train,data_train_target,data_test_full,data_test,data_test_target = Utility.split(data)

#data engineering
transformed_data = transformers_pipeline.fit_transform(data_train)

#Train model and Predict
#RandomForest
forest = RandomForestRegressor(max_depth=15, n_estimators = 30, random_state=0, oob_score = True, bootstrap = True, n_jobs = -1) 
forest.fit(transformed_data, data_train_target) 
prediction = forest.predict(transformed_data)
print(forest.oob_score_)
print(r2_score(data_train_target, prediction))
print(mean_squared_error(data_train_target, prediction, squared=False))

#test set (for cross validation)
transformed_test_data = transformers_pipeline.fit_transform(data_test)
test_prediction = forest.predict(transformed_test_data)
print(r2_score(data_test_target, test_prediction))
print(mean_squared_error(data_test_target, test_prediction, squared=False))

  forest.fit(transformed_data, data_train_target)
  warn("Some inputs do not have OOB scores. "


0.9743214196058928
0.9760751061430322
1.2540243371782975
0.9717777833435969
1.366077217390916


In [3]:
####test preparation and submission:
data_final_test = pd.read_csv(r"D:\MachineLearning\Projects\Kaggle\Ventilator_Pressure_Prediction_Kaggle\Data\ventilator-pressure-prediction\test.csv")
transformed_final_test_data = transformers_pipeline.fit_transform(data_final_test)

#Make Prediction using training data
test_prediction = forest.predict(transformed_test_data)

test_60_200 = pd.DataFrame({'prediction': prediction_test})
test_60_200_with_features = pd.concat([transformed_test_data, test_60_200], axis=1)

kaggle_submision_2 = pd.DataFrame({'id': transformed_test_data['id'] , 'pressure': test_prediction})
kaggle_submision_2.to_csv(r"D:\MachineLearning\Projects\Kaggle\Ventilator_Pressure_Prediction_Kaggle\Output\kaggle_submission2.csv", index=False)



In [4]:
###XGBoost Model
import xgboost

xgb_reg = xgboost.XGBRegressor( learning_rate = 0.6, n_estimators = 200)
xgb_reg.fit(transformed_data, data_train_target, eval_set=[(transformed_test_data, data_test_target)], early_stopping_rounds=5)
xgb_pred = xgb_reg.predict(transformed_data)
xgb_test_prediction = xgb_reg.predict(transformed_test_data)

print(r2_score(data_train_target, xgb_pred))
print(mean_squared_error(data_train_target, xgb_pred, squared=False))
print(r2_score(data_test_target, xgb_test_prediction))
print(mean_squared_error(data_test_target, xgb_test_prediction, squared=False))

[0]	validation_0-rmse:6.25666
[1]	validation_0-rmse:3.70470
[2]	validation_0-rmse:2.81768
[3]	validation_0-rmse:2.56716
[4]	validation_0-rmse:2.45062
[5]	validation_0-rmse:2.38523
[6]	validation_0-rmse:2.29937
[7]	validation_0-rmse:2.23472
[8]	validation_0-rmse:2.18923
[9]	validation_0-rmse:2.16947
[10]	validation_0-rmse:2.14030
[11]	validation_0-rmse:2.10343
[12]	validation_0-rmse:2.08164
[13]	validation_0-rmse:2.06675
[14]	validation_0-rmse:2.03378
[15]	validation_0-rmse:2.01299
[16]	validation_0-rmse:1.97467
[17]	validation_0-rmse:1.95592
[18]	validation_0-rmse:1.94049
[19]	validation_0-rmse:1.92167
[20]	validation_0-rmse:1.91071
[21]	validation_0-rmse:1.89652
[22]	validation_0-rmse:1.87761
[23]	validation_0-rmse:1.86177
[24]	validation_0-rmse:1.84913
[25]	validation_0-rmse:1.83858
[26]	validation_0-rmse:1.82524
[27]	validation_0-rmse:1.81840
[28]	validation_0-rmse:1.80973
[29]	validation_0-rmse:1.79057
[30]	validation_0-rmse:1.77729
[31]	validation_0-rmse:1.76657
[32]	validation_0-