In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Importing data

In [2]:
# Read the training CSV into a DataFrame and preview its first rows.
train_path = 'data/train.csv'
df = pd.read_csv(train_path)
df.head()

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure
0,1,1,20,50,0.0,0.083334,0,5.837492
1,2,1,20,50,0.033652,18.383041,0,5.907794
2,3,1,20,50,0.067514,22.509278,0,7.876254
3,4,1,20,50,0.101542,22.808822,0,11.742872
4,5,1,20,50,0.135756,25.35585,0,12.234987


## Data overview

In [3]:
# Summarize the loaded frame: index range, column names, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6036000 entries, 0 to 6035999
Data columns (total 8 columns):
 #   Column     Dtype  
---  ------     -----  
 0   id         int64  
 1   breath_id  int64  
 2   R          int64  
 3   C          int64  
 4   time_step  float64
 5   u_in       float64
 6   u_out      int64  
 7   pressure   float64
dtypes: float64(3), int64(5)
memory usage: 368.4 MB


In [4]:
# Frame dimensions as a (rows, columns) tuple.
len(df.index), len(df.columns)

(6036000, 8)

## Missing values

In [5]:
# Count missing entries per column (isna is the canonical spelling of isnull).
df.isna().sum()

id           0
breath_id    0
R            0
C            0
time_step    0
u_in         0
u_out        0
pressure     0
dtype: int64

## Feature selection and modelling

In [6]:
# Drop the id, breath_id and R columns from the working frame.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it no longer raises KeyError because the
# columns were already removed by a previous run.
# NOTE(review): R is removed from the feature set while C is kept -- confirm
# this is intentional.
df = df.drop(columns=['id', 'breath_id', 'R'], errors='ignore')
df.head()

Unnamed: 0,C,time_step,u_in,u_out,pressure
0,50,0.0,0.083334,0,5.837492
1,50,0.033652,18.383041,0,5.907794
2,50,0.067514,22.509278,0,7.876254
3,50,0.101542,22.808822,0,11.742872
4,50,0.135756,25.35585,0,12.234987


In [7]:
# Separate the target from the features and hold out 20% of the rows for
# evaluation; the fixed seed makes the split reproducible.
# NOTE(review): train_test_split shuffles individual rows by default. The data
# has a time_step column, so confirm that a row-level random split is intended.
from sklearn.model_selection import train_test_split

target_col = 'pressure'
X = df.drop(columns=target_col)
y = df[target_col]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train.shape, X_test.shape

((4828800, 4), (1207200, 4))

In [8]:
# define the candidate regression models to compare
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso

from sklearn.metrics import mean_squared_error

# Shared seed for every estimator that uses randomness, so repeated runs give
# the same scores (same value as the seed used for the train/test split).
RANDOM_STATE = 42

# Candidate regressors keyed by the display name used when reporting scores.
# n_jobs=-1 lets the estimators that support it use all available cores.
# NOTE(review): kernel SVR fit time grows more than quadratically with the
# number of samples; with ~4.8M training rows it is unlikely to finish in
# reasonable time. Consider LinearSVR or fitting it on a subsample.
models = {
    'Random Forest': RandomForestRegressor(n_jobs=-1, random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingRegressor(random_state=RANDOM_STATE),
    'Ada Boost': AdaBoostRegressor(random_state=RANDOM_STATE),
    'Decision Tree': DecisionTreeRegressor(random_state=RANDOM_STATE),
    'Support Vector Regressor': SVR(),
    'K-Neighbors Regressor': KNeighborsRegressor(n_jobs=-1),
    'Linear Regression': LinearRegression(),
    'Lasso Regression': Lasso()
}


In [None]:
# Fit every candidate on the training split and score it on the held-out split.
# Scores are kept in `rmse_scores` (model name -> test RMSE) as well as printed,
# so the comparison can be ranked, plotted or reused without re-training.
rmse_scores = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores[name] = rmse
    print(f'{name} RMSE: {rmse:.4f}')

# Ranked summary, best (lowest RMSE) first.
pd.Series(rmse_scores, name='RMSE').sort_values()

In [None]:
# TODO: create a model using a neural network
