In [15]:
import pandas as pd
import numpy as np
import lightgbm as lgb

In [16]:
# Read the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [17]:
def add_features(df):
    """Return a new frame with per-breath engineered features and dummy columns.

    The input must contain the columns ``breath_id``, ``time_step``, ``u_in``,
    ``u_out``, ``R`` and ``C``. The caller's frame is left untouched; all
    work happens on a copy.

    Features added, in column order:
      * ``area``: cumulative sum of ``time_step * u_in`` within each breath_id.
      * ``u_in_cumsum``: cumulative sum of ``u_in`` within each breath_id.
      * ``u_in_lag{k}`` / ``u_out_lag{k}`` / ``u_in_lag_back{k}`` /
        ``u_out_lag_back{k}`` for k = 1..4: values shifted by +k / -k rows
        within each breath_id.
      * ``breath_id__u_in__max`` / ``breath_id__u_out__max``: per-breath maxima.
      * ``u_in_diff{k}`` / ``u_out_diff{k}`` for k = 1..4: current value minus
        the (zero-filled) lag-k value.
      * ``breath_id__u_in__diffmax`` / ``breath_id__u_in__diffmean``: per-breath
        max / mean of ``u_in`` minus the current ``u_in``.
      * ``cross`` = ``u_in * u_out`` and ``cross2`` = ``time_step * u_out``.
      * ``R``, ``C`` and their combination ``R__C`` are cast to strings and
        one-hot encoded by ``pd.get_dummies``.

    Args:
        df: Input DataFrame.

    Returns:
        A new DataFrame: the non-string columns followed by the dummy columns.
    """
    # Work on a copy so feature columns are not added to the caller's frame.
    df = df.copy()
    breath = df['breath_id']

    df['area'] = (df['time_step'] * df['u_in']).groupby(breath).cumsum()
    df['u_in_cumsum'] = df['u_in'].groupby(breath).cumsum()

    # Group once and reuse for every shift; positive k looks back, negative
    # k looks ahead, never crossing a breath_id boundary.
    u_in_by_breath = df['u_in'].groupby(breath)
    u_out_by_breath = df['u_out'].groupby(breath)
    for k in range(1, 5):
        df[f'u_in_lag{k}'] = u_in_by_breath.shift(k)
        df[f'u_out_lag{k}'] = u_out_by_breath.shift(k)
        df[f'u_in_lag_back{k}'] = u_in_by_breath.shift(-k)
        df[f'u_out_lag_back{k}'] = u_out_by_breath.shift(-k)

    # Shifts leave NaN at the breath edges; every NaN in the frame becomes 0.
    df = df.fillna(0)

    # Re-group on the filled frame so the aggregates see the filled values.
    breath = df['breath_id']
    u_in_by_breath = df['u_in'].groupby(breath)
    u_in_max = u_in_by_breath.transform('max')
    u_in_mean = u_in_by_breath.transform('mean')

    df['breath_id__u_in__max'] = u_in_max
    df['breath_id__u_out__max'] = df['u_out'].groupby(breath).transform('max')

    for k in (1, 2):
        df[f'u_in_diff{k}'] = df['u_in'] - df[f'u_in_lag{k}']
        df[f'u_out_diff{k}'] = df['u_out'] - df[f'u_out_lag{k}']

    # Computed once (the original assigned these two columns twice).
    df['breath_id__u_in__diffmax'] = u_in_max - df['u_in']
    df['breath_id__u_in__diffmean'] = u_in_mean - df['u_in']

    for k in (3, 4):
        df[f'u_in_diff{k}'] = df['u_in'] - df[f'u_in_lag{k}']
        df[f'u_out_diff{k}'] = df['u_out'] - df[f'u_out_lag{k}']

    df['cross'] = df['u_in'] * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']

    # String-typed columns are the ones pd.get_dummies one-hot encodes.
    df['R'] = df['R'].astype(str)
    df['C'] = df['C'].astype(str)
    df['R__C'] = df['R'] + '__' + df['C']
    return pd.get_dummies(df)

# Run the same feature pipeline over both tables, one after the other.
train = train.pipe(add_features)
test = test.pipe(add_features)

# RS = RobustScaler()
# train = RS.fit_transform(train)
# test = RS.transform(test)

# train = train.reshape(-1, 80, train.shape[-1])
# test = test.reshape(-1, 80, train.shape[-1])

In [18]:
# Pull the label column out of the training frame, then remove the
# identifier columns from both feature tables.
targets = train.pop('pressure')
train.drop(columns=['id', 'breath_id'], inplace=True)
test = test.drop(columns=['id', 'breath_id'])

In [19]:
# The 'pressure' column of an existing submission file; it is the default
# `target` argument of `objective`.
median_sub = pd.read_csv('submission.csv')
pseudo_l = median_sub.pressure

In [20]:
import lightgbm as lgb
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import warnings
warnings.filterwarnings("ignore")

# Scalar settings/accumulators; not referenced by the code visible in this
# part of the notebook (NOTE(review): confirm whether later cells use them).
lr = 0.2
p_m = new_y = my_y = 0


def objective(trial, data=test, target=pseudo_l):
    """Optuna objective: fit a LightGBM regressor and return its hold-out MAE.

    Args:
        trial: Optuna trial object used to sample the hyper-parameters.
        data: Feature table; defaults to the module-level ``test`` frame
            (bound when this function is defined).
        target: Labels aligned row-for-row with ``data``; defaults to the
            module-level ``pseudo_l`` series.

    Returns:
        Mean absolute error of the fitted model on the held-out 30% of rows.
    """
    # shuffle=False keeps the original row order: the first 70% of rows are
    # used for fitting and the last 30% for validation / early stopping.
    train_x, valid_x, train_y, valid_y = train_test_split(
        data, target, test_size=0.3, shuffle=False, random_state=2021
    )

    # The suggest_* calls are kept in their original order so the sampler
    # sees the same parameter sequence as before.
    param = {
        'n_estimators': 20000,
        'num_leaves': trial.suggest_int('num_leaves', 2, 500),
        'max_depth': trial.suggest_categorical('max_depth', [-1, 20, 30, 50, 80, 100, 200, 300]),
        'reg_alpha': trial.suggest_float('reg_alpha', 1E-5, 100),
        'reg_lambda': trial.suggest_float('reg_lambda', 1E-5, 100),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 1E-3, 1.0),
        'cat_smooth': trial.suggest_float('cat_smooth', 1.0, 100),
        'subsample': trial.suggest_categorical('subsample', [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1.0]),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        # suggest_float(..., log=True) replaces the deprecated
        # suggest_loguniform; the sampled distribution is the same.
        'min_child_weight': trial.suggest_float('min_child_weight', 1E-3, 10, log=True),
        'subsample_freq': trial.suggest_int('subsample_freq', 0, 20),
        'metric': 'mae',
        'random_state': 2021,
        'learning_rate': trial.suggest_float('learning_rate', 1E-3, 1.0, log=True),
    }

    model = lgb.LGBMRegressor(**param)
    # NOTE(review): `early_stopping_rounds` and `verbose` are fit() keyword
    # arguments in the LightGBM 3.x API; newer releases expect callbacks
    # instead - confirm against the installed version before upgrading.
    model.fit(
        train_x,
        train_y,
        eval_set=[(valid_x, valid_y)],
        early_stopping_rounds=50,
        verbose=False,
    )

    preds = model.predict(valid_x)
    return mean_absolute_error(valid_y, preds)

# Search for the hyper-parameters that minimise the MAE returned by
# `objective`. The sampler is seeded with the same value the split and the
# model already use, so the search no longer depends on an unseeded
# random source.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=2021),
)
study.optimize(objective, n_trials=100)
print('Best trial:', study.best_params)

[32m[I 2021-10-19 10:43:20,591][0m A new study created in memory with name: no-name-e804db87-e1e3-4aa8-a0cc-4f852eb1704b[0m
[32m[I 2021-10-19 10:57:36,325][0m Trial 0 finished with value: 2.436180271369652 and parameters: {'num_leaves': 368, 'max_depth': 80, 'reg_alpha': 9.651243752095864, 'reg_lambda': 50.40682402023616, 'colsample_bytree': 0.017032246489922333, 'cat_smooth': 41.208119517489735, 'subsample': 0.8, 'min_child_samples': 244, 'min_child_weight': 1.0346898790184724, 'subsample_freq': 17, 'learning_rate': 0.0023846504495965244}. Best is trial 0 with value: 2.436180271369652.[0m
[32m[I 2021-10-19 10:58:04,492][0m Trial 1 finished with value: 0.5722647364576676 and parameters: {'num_leaves': 442, 'max_depth': -1, 'reg_alpha': 14.213741007609562, 'reg_lambda': 18.02100450807621, 'colsample_bytree': 0.927270205972183, 'cat_smooth': 72.9979488021712, 'subsample': 0.2, 'min_child_samples': 102, 'min_child_weight': 0.0505329914509953, 'subsample_freq': 18, 'learning_rate':

[32m[I 2021-10-20 02:28:09,512][0m Trial 17 finished with value: 0.383842702093722 and parameters: {'num_leaves': 145, 'max_depth': 200, 'reg_alpha': 52.37048064148203, 'reg_lambda': 61.240592650977966, 'colsample_bytree': 0.986278556965961, 'cat_smooth': 12.854835047608432, 'subsample': 0.5, 'min_child_samples': 217, 'min_child_weight': 0.015249383869502094, 'subsample_freq': 20, 'learning_rate': 0.05561811443639728}. Best is trial 13 with value: 0.37085485445399297.[0m
[32m[I 2021-10-20 03:10:25,956][0m Trial 18 finished with value: 0.38903979657493376 and parameters: {'num_leaves': 387, 'max_depth': -1, 'reg_alpha': 23.373295902350492, 'reg_lambda': 3.218354541330207, 'colsample_bytree': 0.594518819045296, 'cat_smooth': 48.90218345241772, 'subsample': 0.3, 'min_child_samples': 275, 'min_child_weight': 0.005636627323621311, 'subsample_freq': 16, 'learning_rate': 0.14857272748028655}. Best is trial 13 with value: 0.37085485445399297.[0m
[32m[I 2021-10-20 03:50:12,833][0m Trial

KeyboardInterrupt: 

In [None]:
# Re-read the raw training table and collect its distinct pressure values
# in ascending order.
train_1 = pd.read_csv('train.csv')
all_pressure = np.sort(train_1['pressure'].unique())

# Smallest and largest observed pressure as plain Python scalars.
PRESSURE_MIN, PRESSURE_MAX = (all_pressure[pos].item() for pos in (0, -1))

print('The first 25 unique pressures...')
all_pressure[:25]