In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor

In [2]:
def plot_create(x, y):
    """Draw ``y`` against ``x`` as a solid line via ``plt.plot``.

    The legend label is read from ``y.name``, so ``y`` must expose a
    ``name`` attribute (a pandas Series does).
    """
    legend_label = y.name
    plt.plot(x, y, '-', label=legend_label)

def process_visualisation_with_preds(df, df_preds, breath_id):
    """Plot the recorded pressure of one breath together with model predictions.

    Parameters
    ----------
    df : DataFrame with 'breath_id', 'time_step' and 'pressure' columns.
    df_preds : named Series of predictions, one value per row of the selected
        breath (its ``name`` becomes the legend label).
    breath_id : value of 'breath_id' selecting the rows to plot.

    Shows the figure with ``plt.show()``; returns nothing.
    """
    # Filter once: the mask is evaluated over the whole frame, so repeating it
    # for every plotted column is wasted work.
    breath = df[df['breath_id'] == breath_id]
    time_steps = breath['time_step']

    plt.figure(figsize=(14, 6))
    plt.title('Breath Id - {}'.format(breath_id))
    plot_create(time_steps, breath['pressure'])
    plot_create(time_steps, df_preds)
    plt.grid()
    plt.legend()
    plt.xlabel('Time step')
    plt.ylabel('Value')
    plt.show()

def add_features(df):
    """Return a new frame with per-breath ``u_in`` features added.

    Rows are grouped by 'breath_id'. The following columns are appended, in
    this order:

    * ``u_in_cumsum`` - running sum of ``u_in`` within the breath
    * ``u_in_lag_1/2/3`` - ``u_in`` shifted by 1, 2, 3 rows within the breath
    * ``u_in_rolling_mean_3/5`` - mean of the previous 3 / 5 ``u_in`` values
      of the same breath (current row excluded)
    * ``u_in_begin/end/min/max/median`` - per-breath aggregates broadcast to
      every row of the breath

    Remaining NaNs (e.g. the first rows of each breath for lags and rolling
    means) are replaced with 0, then 'breath_id', 'u_in' and 'u_out' are
    dropped.

    Parameters
    ----------
    df : DataFrame containing at least 'breath_id', 'u_in' and 'u_out'.
        It is not modified.

    Returns
    -------
    DataFrame with the original index, the remaining input columns and the
    feature columns listed above.
    """
    u_in_by_breath = df.groupby('breath_id')['u_in']

    features = {'u_in_cumsum': u_in_by_breath.cumsum()}

    for lag in (1, 2, 3):
        features['u_in_lag_{}'.format(lag)] = u_in_by_breath.shift(lag)

    # Shift *and* roll inside each group so a window can never mix values from
    # two breaths, even when the rows of a breath are not contiguous.
    for window in (3, 5):
        features['u_in_rolling_mean_{}'.format(window)] = u_in_by_breath.transform(
            lambda breath_u_in, w=window: breath_u_in.shift().rolling(w).mean()
        )

    per_breath_stats = (
        ('begin', 'first'),
        ('end', 'last'),
        ('min', 'min'),
        ('max', 'max'),
        ('median', 'median'),
    )
    for suffix, aggregate in per_breath_stats:
        features['u_in_{}'.format(suffix)] = u_in_by_breath.transform(aggregate)

    # assign() builds a new frame, so the caller's DataFrame keeps its columns.
    enriched = df.assign(**features).fillna(0)
    return enriched.drop(['breath_id', 'u_in', 'u_out'], axis=1)

def train_and_score(model, X_train, X_valid, y_train, y_valid):
    """Fit ``model`` on the training split and return its validation MAE.

    ``model`` is fitted in place via ``model.fit``, so the object passed in can
    be used for further predictions afterwards. The returned value is
    ``mean_absolute_error(y_valid, model.predict(X_valid))``.
    """
    model.fit(X_train, y_train)
    valid_predictions = model.predict(X_valid)
    validation_mae = mean_absolute_error(y_valid, valid_predictions)
    return validation_mae

In [3]:
# Load the training data and discard the 'id' column, which no later cell reads.
df_train = pd.read_csv('train.csv').drop(columns='id')

In [4]:
# Number of leading rows of df_train used for this experiment.
N = 150000

subset = df_train[:N]
X = add_features(subset.drop('pressure', axis=1))
y = subset['pressure']

# Split by breath rather than by row. add_features() attaches per-breath
# aggregates (u_in_begin, u_in_end, u_in_min, ...) to every row, so a row-level
# split would put rows of the same breath on both sides and make the
# validation MAE optimistic.
breath_ids = subset['breath_id']
train_breaths, valid_breaths = train_test_split(
    breath_ids.unique(), test_size=0.2, random_state=555
)
is_valid_row = breath_ids.isin(valid_breaths)

X_train, X_valid = X[~is_valid_row], X[is_valid_row]
y_train, y_valid = y[~is_valid_row], y[is_valid_row]

# Every breath must land entirely on one side of the split.
assert len(X_train) + len(X_valid) == len(X)
assert set(breath_ids[~is_valid_row]).isdisjoint(valid_breaths)

In [5]:
# Seed every stochastic estimator so that Restart & Run All reproduces the same
# scores. LinearRegression takes no random_state, so it is left as is.
MODEL_SEED = 555  # same value as the seed used for the train/validation split

linear_model = LinearRegression()
tree_model = DecisionTreeRegressor(random_state=MODEL_SEED)
rf_model = RandomForestRegressor(random_state=MODEL_SEED)
et_model = ExtraTreesRegressor(random_state=MODEL_SEED)

In [None]:
# Fit each model on the training split and tabulate its validation MAE.
# The dict keeps insertion order, so models are trained in the order listed.
models_by_label = {
    'Linear': linear_model,
    'Decision Tree': tree_model,
    'Random Forest': rf_model,
    'Extra Trees': et_model,
}

mae_scores = [
    train_and_score(estimator, X_train, X_valid, y_train, y_valid)
    for estimator in models_by_label.values()
]

display(pd.DataFrame({'Result MAE': mae_scores}, index=list(models_by_label)))

In [None]:
# Breath whose recorded pressure is compared with each model's predictions.
bid = 16600

# Build the same feature columns the models were trained on: the old index
# (kept as 'index' by reset_index) and the target are dropped after
# add_features has run.
breath_rows = df_train.loc[df_train['breath_id'] == bid].reset_index()
X_df_vis = add_features(breath_rows).drop(columns=['index', 'pressure'])

headed_models = [
    ('Pressure predictions by Linear Model:', linear_model),
    ('Pressure predictions by Tree Model:', tree_model),
    ('Pressure predictions by Random Forest Model:', rf_model),
    ('Pressure predictions by Extra Trees Model:', et_model),
]

for heading, fitted_model in headed_models:
    print(heading)
    predicted = pd.Series(fitted_model.predict(X_df_vis), name='predictions')
    process_visualisation_with_preds(df_train, predicted, bid)