In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb

In [2]:
# Raw input frames; per the displayed shapes/head, `train` has the extra `pressure` column.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
# Sanity check: (rows, columns) of the raw training frame.
train.shape

(6036000, 8)

In [4]:
# Sanity check: (rows, columns) of the raw test frame.
test.shape

(4024000, 7)

In [5]:
# Peek at the first rows to confirm the column layout before feature engineering.
train.head()

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure
0,1,1,20,50,0.0,0.083334,0,5.837492
1,2,1,20,50,0.033652,18.383041,0,5.907794
2,3,1,20,50,0.067514,22.509278,0,7.876254
3,4,1,20,50,0.101542,22.808822,0,11.742872
4,5,1,20,50,0.135756,25.35585,0,12.234987


In [6]:
def add_features(df):
    """Return a new frame with engineered per-row features; the input is not modified.

    Expects the columns ``breath_id``, ``time_step``, ``u_in``, ``u_out``,
    ``R`` and ``C``. Any other columns (e.g. ``id``, ``pressure``) are passed
    through, subject to the ``fillna(0)`` and one-hot encoding steps below.

    Features added (all grouped by ``breath_id`` where a group is involved):

    * ``area``: running sum of ``time_step * u_in``.
    * ``u_in_cumsum``: running sum of ``u_in``.
    * ``{u_in,u_out}_lag{k}`` / ``{u_in,u_out}_lag_back{k}`` for k = 1..4:
      the value k rows earlier / later in the same breath (0 where no such
      row exists, because of the ``fillna(0)``).
    * ``breath_id__u_in__max``, ``breath_id__u_out__max``: per-breath maxima.
    * ``{u_in,u_out}_diff{k}`` for k = 1..4: current value minus its lag-k value.
    * ``breath_id__u_in__diffmax`` / ``breath_id__u_in__diffmean``:
      per-breath max / mean of ``u_in`` minus the current ``u_in``.
    * ``cross`` = ``u_in * u_out``; ``cross2`` = ``time_step * u_out``.
    * ``R``, ``C`` and their combination ``R__C`` are converted to strings and
      one-hot encoded by ``pd.get_dummies`` (which replaces those columns).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows; row order within each ``breath_id`` defines what "earlier"
        and "later" mean for the lag features.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original (NaN-filled) columns plus the
        features above, with string columns one-hot encoded.
    """
    # Work on a copy: the original added columns directly to the caller's
    # frame, so re-running the cell operated on an already-modified input.
    df = df.copy()
    breath_key = df['breath_id']

    # Running sums within each breath.
    df['area'] = (df['time_step'] * df['u_in']).groupby(breath_key).cumsum()
    df['u_in_cumsum'] = df['u_in'].groupby(breath_key).cumsum()

    # Past (lag) and future (lag_back) values of both signals, 1..4 rows away.
    # Column creation order matches the original so downstream feature order
    # is unchanged.
    for step in range(1, 5):
        for label, offset in (('lag', step), ('lag_back', -step)):
            for signal in ('u_in', 'u_out'):
                df[f'{signal}_{label}{step}'] = (
                    df[signal].groupby(breath_key).shift(offset)
                )

    # Shifts leave NaN at breath boundaries; fill them (and any other NaN in
    # the frame) with 0 before computing differences.
    df = df.fillna(0)
    breath_key = df['breath_id']  # re-read: fillna produced a new frame

    u_in_by_breath = df['u_in'].groupby(breath_key)
    u_in_max = u_in_by_breath.transform('max')
    df['breath_id__u_in__max'] = u_in_max
    df['breath_id__u_out__max'] = df['u_out'].groupby(breath_key).transform('max')

    for step in (1, 2):
        df[f'u_in_diff{step}'] = df['u_in'] - df[f'u_in_lag{step}']
        df[f'u_out_diff{step}'] = df['u_out'] - df[f'u_out_lag{step}']

    # Computed once (the original evaluated these two lines twice).
    df['breath_id__u_in__diffmax'] = u_in_max - df['u_in']
    df['breath_id__u_in__diffmean'] = u_in_by_breath.transform('mean') - df['u_in']

    for step in (3, 4):
        df[f'u_in_diff{step}'] = df['u_in'] - df[f'u_in_lag{step}']
        df[f'u_out_diff{step}'] = df['u_out'] - df[f'u_out_lag{step}']

    df['cross'] = df['u_in'] * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']

    # Treat R, C and their combination as categoricals for one-hot encoding.
    df['R'] = df['R'].astype(str)
    df['C'] = df['C'].astype(str)
    df['R__C'] = df['R'] + '__' + df['C']
    return pd.get_dummies(df)

# Run both splits through the same feature pipeline.
train = add_features(train)
test = add_features(test)

# Pull out the target, then drop identifier columns that are not features.
targets = train['pressure']
train = train.drop(columns=['pressure', 'id', 'breath_id'])
test = test.drop(columns=['id', 'breath_id'])

# One-hot encoding ran separately on each split, so the dummy columns can
# differ in set or order. Align test to train's columns (absent dummies -> 0)
# so both models see identical feature layouts. No-op when they already match.
test = test.reindex(columns=train.columns, fill_value=0)

# RS = RobustScaler()
# train = RS.fit_transform(train)
# test = RS.transform(test)

# train = train.reshape(-1, 80, train.shape[-1])
# test = test.reshape(-1, 80, train.shape[-1])

In [7]:
# Existing submission file; its predicted pressures serve as pseudo-labels for the test rows.
median_sub = pd.read_csv('submission.csv')
pseudo_l = median_sub.pressure

In [8]:
# Iterative pseudo-label refinement.
#
# Each round:
#   1. fit a model on the test features against the current pseudo-labels,
#   2. predict the training rows and take the residual to the true pressures,
#   3. fit a second model on the training features against that residual,
#   4. predict the residual for the test rows and move the pseudo-labels a
#      fraction `lr` of the way along it.
#
# The original loop rebuilt `my_y` from the untouched submission on every
# pass and never fed it back, so all 10 iterations repeated the same two fits
# and produced the same result. The refined labels are now carried forward.

# Step size applied to the predicted residual correction on each round.
lr = 0.2
p_m = 0
new_y = 0

# Booster settings are identical for both stages and do not change between
# rounds, so they are built once outside the loop.
params_loss_1 = {
    'n_estimators': 100000,
    'learning_rate': 0.01,
    'device': 'gpu',
}
params_loss_2 = dict(params_loss_1)


def _make_mae_regressor(params):
    """Build an LGBMRegressor with MAE objective and metric from the given booster params."""
    return lgb.LGBMRegressor(
        **params,
        objective='mae',
        metric='mae',
        n_jobs=-1,
        verbose=0,
    )


# Start from the loaded submission. Copy so neither `pseudo_l` nor
# `median_sub` is modified, which keeps this cell re-runnable.
my_y = pseudo_l.values.copy()

for w in range(10):
    # Stage 1: learn the current pseudo-labels on test, score the train rows.
    lgbm_reg_1 = _make_mae_regressor(params_loss_1)
    lgbm_reg_1.fit(test, my_y)
    p_m = lgbm_reg_1.predict(train)

    # Residual between the true training pressures and stage 1's predictions.
    new_y = targets.values - p_m

    # Stage 2: learn that residual on train, predict it for the test rows.
    lgbm_reg_2 = _make_mae_regressor(params_loss_2)
    lgbm_reg_2.fit(train, new_y)
    p_a = lgbm_reg_2.predict(test)

    # Damped update; `my_y` always holds the labels from the last finished
    # round, so an interrupted run still leaves a usable value.
    my_y = my_y + (p_a * lr)

KeyboardInterrupt: 

In [9]:
from sklearn.metrics import mean_absolute_error

# MAE of the stage-1 predictions for the training rows against the true pressures.
stage1_train_mae = mean_absolute_error(targets, p_m)
print(stage1_train_mae)

0.4589119971350398


In [None]:
# Inspect the adjusted test predictions produced by the training loop.
my_y

In [None]:
# Mean absolute difference between the loaded submission and the adjusted predictions.
shift_mae = mean_absolute_error(median_sub['pressure'].values, my_y)
print(shift_mae)

In [None]:
# Reload the raw training file (the in-memory `train` no longer has `pressure`).
train_1 = pd.read_csv('train.csv')

# Sorted distinct target values seen in training.
all_pressure = np.sort(train_1['pressure'].unique())
print('The first 25 unique pressures...')

# Smallest and largest observed pressure, as plain Python floats.
PRESSURE_MIN, PRESSURE_MAX = all_pressure[0].item(), all_pressure[-1].item()
all_pressure[:25]

In [None]:
# Gap between the two smallest observed pressures, taken as the grid step.
PRESSURE_STEP = np.diff(all_pressure[:2])[0].item()

# Show the first 25 consecutive gaps to check the spacing is uniform.
all_pressure[1:26] - all_pressure[:25]

In [None]:
# Post-process the adjusted predictions and write the submission file.
#
# The result goes into a separate frame instead of overwriting
# `median_sub['pressure']`: earlier cells read that column as the baseline,
# so mutating it made re-running them compound the adjustment.

# Snap each prediction to the nearest value on the pressure grid observed in
# the training data, then clip to the observed range.
snapped_pressure = (
    np.round((np.asarray(my_y) - PRESSURE_MIN) / PRESSURE_STEP) * PRESSURE_STEP
    + PRESSURE_MIN
)
submission = median_sub.assign(
    pressure=np.clip(snapped_pressure, PRESSURE_MIN, PRESSURE_MAX)
)
submission.to_csv('submission5.csv', index=False)