In [115]:
import numpy as np
import pandas as pd

In [116]:
base_path='../input/month-dacon11/'

train=pd.read_csv(base_path+'train_features.csv')
test=pd.read_csv(base_path+'test_features.csv')
train_labels=pd.read_csv(base_path+'train_labels.csv')
submission=pd.read_csv(base_path+'sample_submission.csv')

In [117]:
features=['acc_x','acc_y','acc_z','gy_x','gy_y','gy_z']

In [118]:
train=train.groupby('id',as_index=False)[features].mean()

In [119]:
data=pd.merge(train,train_labels,on='id')
data=data.drop(['id','label_desc'],axis=1)

In [122]:
target=data['label']
train=data.drop('label',axis=1)

In [123]:
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier

def objective(trial,data=data,target=target):
    
    X_tr,X_val,y_tr,y_val=train_test_split(data, target, test_size=0.2,random_state=71)
    param = {
        'objective':'multiclass',
        'metric': 'multi_logloss', 
        'random_state': 71,
        'n_estimators': 200,
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.6,0.7,0.8,0.9,1.0]),
        'subsample': trial.suggest_categorical('subsample',[0.6,0.7,0.8,0.9,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate',[0.002,0.007,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_categorical('max_depth', [15,25,100]),
        'num_leaves' : trial.suggest_int('num_leaves',8, 20),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 31)
    }
    model = LGBMClassifier(**param)  
    
    model.fit(X_tr,y_tr,eval_set=[(X_val,y_val)],early_stopping_rounds=25,verbose=False)
    
    preds = model.predict_proba(X_val)
    
    score = log_loss(y_val, preds)
    
    return score

In [124]:
import optuna

study=optuna.create_study(direction='minimize')
study.optimize(objective,n_trials=10)
print('Number of finished trials:',len(study.trials))
print('Best trial:',study.best_trial.params)

[32m[I 2021-02-13 08:45:08,747][0m A new study created in memory with name: no-name-34396f88-8435-4c4b-aa63-a6a3041a95d9[0m
[32m[I 2021-02-13 08:45:11,440][0m Trial 0 finished with value: 0.04000116428112165 and parameters: {'reg_alpha': 0.023153567824848047, 'reg_lambda': 1.4530496683118532, 'colsample_bytree': 0.8, 'subsample': 0.9, 'learning_rate': 0.02, 'max_depth': 15, 'num_leaves': 11, 'min_child_samples': 25}. Best is trial 0 with value: 0.04000116428112165.[0m
[32m[I 2021-02-13 08:45:14,495][0m Trial 1 finished with value: 0.06872968318169972 and parameters: {'reg_alpha': 0.07748156535424186, 'reg_lambda': 1.7933762574078174, 'colsample_bytree': 0.7, 'subsample': 0.8, 'learning_rate': 0.02, 'max_depth': 100, 'num_leaves': 19, 'min_child_samples': 12}. Best is trial 0 with value: 0.04000116428112165.[0m
[32m[I 2021-02-13 08:45:16,620][0m Trial 2 finished with value: 0.6185651547714472 and parameters: {'reg_alpha': 8.535049631356403, 'reg_lambda': 4.671036422183311, 'c

Number of finished trials: 10
Best trial: {'reg_alpha': 0.0011261903395910833, 'reg_lambda': 0.003237819112801843, 'colsample_bytree': 1.0, 'subsample': 0.6, 'learning_rate': 0.02, 'max_depth': 15, 'num_leaves': 15, 'min_child_samples': 20}


In [125]:
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_learning_rate,params_max_depth,params_min_child_samples,params_num_leaves,params_reg_alpha,params_reg_lambda,params_subsample,state
0,0,0.040001,2021-02-13 08:45:08.749702,2021-02-13 08:45:11.439347,0 days 00:00:02.689645,0.8,0.02,15,25,11,0.023154,1.45305,0.9,COMPLETE
1,1,0.06873,2021-02-13 08:45:11.441046,2021-02-13 08:45:14.494565,0 days 00:00:03.053519,0.7,0.02,100,12,19,0.077482,1.793376,0.8,COMPLETE
2,2,0.618565,2021-02-13 08:45:14.496642,2021-02-13 08:45:16.619998,0 days 00:00:02.123356,0.8,0.02,15,5,20,8.53505,4.671036,0.7,COMPLETE
3,3,0.006294,2021-02-13 08:45:16.621923,2021-02-13 08:45:19.185372,0 days 00:00:02.563449,1.0,0.02,15,20,15,0.001126,0.003238,0.6,COMPLETE
4,4,0.07505,2021-02-13 08:45:19.187367,2021-02-13 08:45:22.329605,0 days 00:00:03.142238,0.9,0.01,100,12,19,0.009147,0.036419,0.9,COMPLETE
5,5,0.062239,2021-02-13 08:45:22.333603,2021-02-13 08:45:24.951818,0 days 00:00:02.618215,1.0,0.01,100,19,12,0.036686,0.002652,0.8,COMPLETE
6,6,0.66458,2021-02-13 08:45:24.955594,2021-02-13 08:45:26.892102,0 days 00:00:01.936508,0.8,0.01,15,26,16,9.64213,0.344838,1.0,COMPLETE
7,7,0.14851,2021-02-13 08:45:26.896233,2021-02-13 08:45:29.473152,0 days 00:00:02.576919,1.0,0.007,25,30,8,0.011651,0.026996,0.6,COMPLETE
8,8,0.031579,2021-02-13 08:45:29.475311,2021-02-13 08:45:32.663520,0 days 00:00:03.188209,0.7,0.017,15,11,20,0.069404,0.009364,0.8,COMPLETE
9,9,0.008079,2021-02-13 08:45:32.665616,2021-02-13 08:45:35.510480,0 days 00:00:02.844864,1.0,0.02,25,9,8,0.002511,0.001065,0.9,COMPLETE


In [126]:
params=study.best_params   
params['random_state']=71
params['n_estimators']=200 
params['objective']='multiclass'
params['metric']='multi_logloss'
params

{'reg_alpha': 0.0011261903395910833,
 'reg_lambda': 0.003237819112801843,
 'colsample_bytree': 1.0,
 'subsample': 0.6,
 'learning_rate': 0.02,
 'max_depth': 15,
 'num_leaves': 15,
 'min_child_samples': 20,
 'random_state': 71,
 'n_estimators': 200,
 'objective': 'multiclass',
 'metric': 'multi_logloss'}

In [128]:
test=test.groupby('id')[features].mean()

KeyError: 'id'

In [None]:
from sklearn.model_selection import KFold

kf=KFold(n_splits=4,random_state=71,shuffle=True)
scores=[]
models=[]
preds=np.zeros((test.shape[0],61))
n=0
for train_idx,valid_idx in kf.split(data[cols],data['label']):
    X_tr,X_val=data[features].iloc[train_idx],data[features].iloc[valid_idx]
    y_tr,y_val=data['label'].iloc[train_idx],data['label'].iloc[valid_idx]
    model = LGBMClassifier(**params)
    model.fit(X_tr,y_tr,eval_set=[(X_val,y_val)],early_stopping_rounds=25,verbose=False)
    preds+=model.predict_proba(test[features])/4
    try:
        scores.append(log_loss(y_val, model.predict_proba(X_val)))
    except:
        scores.append('no')
    print(n+1,scores[n])
    n+=1

In [None]:
submission.iloc[:,1:]=preds
display(submission)

In [None]:
# submission.to_csv()