In [None]:
# import necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# suppressing warnings
import warnings
warnings.filterwarnings('ignore')

Columns

id - globally-unique time step identifier across an entire file

breath_id - globally-unique identifier for breaths; all time steps that belong to the same breath share one breath_id

R - lung attribute indicating how restricted the airway is (in cmH2O/L/S). Physically, this is the change in pressure per change in flow (air volume per time). Intuitively, one can imagine blowing up a balloon through a straw. We can change R by changing the diameter of the straw, with higher R being harder to blow.

C - lung attribute indicating how compliant the lung is (in mL/cmH2O). Physically, this is the change in volume per change in pressure. Intuitively, one can imagine the same balloon example. We can change C by changing the thickness of the balloon’s latex, with higher C having thinner latex and easier to blow.

time_step - the actual time stamp.

u_in - the control input for the inspiratory solenoid valve. Ranges from 0 to 100.

u_out - the control input for the expiratory solenoid valve. Either 0 or 1.

pressure - the airway pressure measured in the respiratory circuit, measured in cmH2O.

In [None]:
# Load the training set, the test set and the sample-submission template
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/ventilator-pressure-prediction/train.csv",
        "../input/ventilator-pressure-prediction/test.csv",
        "../input/ventilator-pressure-prediction/sample_submission.csv",
    )
)

In [None]:
# check dataset information
train.info()

In [None]:
# check sample data
train.head()

In [None]:
train.tail()

In [None]:
train['breath_id'].value_counts().sort_values()

In [None]:
test.head()

In [None]:
# Feature frame: every training column except the row identifier ('id')
# and the target ('pressure').
X = train.drop(columns=['pressure', 'id'])

In [None]:
X.head()

In [None]:
# Remove the row identifier from the test features.
# Rebinding `test` (instead of dropping in place) together with errors='ignore'
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# because 'id' has already been removed.
test = test.drop(columns='id', errors='ignore')

In [None]:
test.head()

In [None]:
# Target: the airway pressure, kept as a single-column DataFrame
y = train['pressure'].to_frame()
y.head()


In [None]:
# Report (rows, columns) of both frames as they currently stand: `train` still
# contains 'id' and 'pressure', whereas 'id' was already dropped from `test`
# in an earlier cell, so the column counts are not directly comparable.
print("Train shape: ", train.shape, "\nTest shape: ", test.shape)

In [None]:
# Checking if there are missing values in the datasets
train.isna().sum().sum(), test.isna().sum().sum()

In [None]:
#Getting summary statistics
train.describe().T

In [None]:
test.describe().T

In [None]:
# Names of the feature columns stored with the generic `object` dtype
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
categorical_cols

In [None]:
# Names of the feature columns stored as 64-bit integers or floats
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical_cols

In [None]:
# Visualize the distribution of the output variable (pressure).
# sns.histplot replaces the deprecated sns.distplot (its deprecation warning is
# hidden by the global warnings filter). bins=50 mirrors the upper limit that
# distplot applied to its automatically chosen bin count.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(x=y['pressure'], bins=50, ax=ax)
ax.set_title("Distribution of pressure");

In [None]:
# One histogram per numerical feature, each on its own figure.
# sns.histplot replaces the deprecated sns.distplot; bins=50 mirrors the upper
# limit that distplot applied to its automatically chosen bin count.
for col in numerical_cols:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.histplot(x=X[col], bins=50, ax=ax)
    ax.set_title(f"Distribution of {col}")

In [None]:
# Density of the pressure readings taken while the expiratory valve is
# closed (u_out == 0); the open-valve counterpart is plotted separately.
plt.title("Pressure when Expiratory valve closed")
sns.kdeplot(train.loc[train["u_out"] == 0, "pressure"]);


In [None]:
# Density of the pressure readings taken while the expiratory valve is
# open (u_out == 1).
plt.title("Pressure when Expiratory valve opened")
sns.kdeplot(train.loc[train["u_out"] == 1, "pressure"]);

In [None]:
sns.scatterplot(x='u_in',y='pressure',hue='u_out',data=train);

In [None]:
# Remove every feature whose standard deviation is exactly zero, i.e. columns
# that hold a single constant value.
feature_std = X.std()
Zero_std_cols = feature_std[feature_std == 0].index
X = X.drop(columns=Zero_std_cols)

In [None]:
X.head()

In [None]:
# Rows per breath: the sizes of the breath_id groups are tallied with
# value_counts(), whose first key is the most frequent group size. The printed
# number is therefore the modal count of time steps per breath; on its own it
# does not prove that every breath has exactly that many rows.
print("Time-steps count for each breath in train set: ",train.groupby("breath_id").size().value_counts().keys()[0])
print("Time-steps count for each breath in test set: ",test.groupby("breath_id").size().value_counts().keys()[0])

In [None]:
# Re-derive the 64-bit numerical column names from the current feature frame
# (it may have lost columns since the first selection).
updated_numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
updated_numerical_cols

In [None]:
# Rescale every numerical feature to the [0, 1] range. The per-column minimum
# and maximum are learned from the training features only and then re-used
# unchanged for the test features.
from sklearn import preprocessing
data_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
data_scaler.fit(X[updated_numerical_cols])
X[updated_numerical_cols] = data_scaler.transform(X[updated_numerical_cols])
test[updated_numerical_cols] = data_scaler.transform(test[updated_numerical_cols])

In [None]:
X.head()

In [None]:
from sklearn.model_selection import GroupShuffleSplit, train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# Hold out 30% of the *breaths* (not 30% of the individual rows) for validation.
# A plain random row split places time steps of one breath on both sides of the
# split, so the validation score is measured on breaths the model has already
# seen and comes out too optimistic. Grouping by the untouched breath_id column
# of `train` (row-aligned with X and y) keeps each breath entirely on one side.
breath_splitter = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
train_rows, valid_rows = next(breath_splitter.split(X, y, groups=train['breath_id']))

# split() yields positional row indices, hence .iloc
X_train, X_test = X.iloc[train_rows], X.iloc[valid_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[valid_rows]

# NOTE(review): tree_method='gpu_hist' presumably requires a GPU-enabled
# runtime — confirm the notebook accelerator setting before running.
model = XGBRegressor(n_estimators=1000, learning_rate=0.05, tree_method='gpu_hist')
model.fit(X_train, y_train)

In [None]:
# Score of the fitted regressor on the held-out split (for scikit-learn style
# regressors `score` is the coefficient of determination, R^2).
model.score(X_test, y_test)

In [None]:
# Predict on exactly the columns the model was trained on, in the same order.
# `test` never had the zero-variance columns removed the way `X` did, so passing
# the whole frame would hand the model features it has not seen whenever such a
# column exists.
preds = model.predict(test[X.columns])

In [None]:
pred_pressure = pd.DataFrame({'pressure': preds[:]})

In [None]:
pred_pressure

In [None]:
sub['pressure'] = pred_pressure['pressure']

In [None]:
sub.head()

In [None]:
 # Save the submission frame as CSV; index=False leaves the row index out of the file
 sub.to_csv("submission3.csv", index=False)