In [25]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import log_loss
from math import sqrt

import lightgbm as lgb
from sklearn.metrics import log_loss, make_scorer
from sklearn.model_selection import KFold, StratifiedKFold

from hyperopt import fmin, hp, tpe, space_eval

import gc
import time

In [26]:
train = pd.read_csv('../dataset/train.csv', index_col = 0)
test = pd.read_csv('../dataset/test.csv', index_col = 0)
sample_submission = pd.read_csv('../dataset/sample_submission.csv', index_col = 0)

In [27]:
def SpectralClass(x):
    if x < -0.3:
        return "O"
    
    elif (x >= -0.3) & (x < 0):
        return "B"
    
    elif (x >= 0) & (x < 0.33):
        return "A"
    
    elif (x >= 0.33) & (x < 0.6):
        return "F"
    
    elif (x >= 0.6) & (x < 0.81):
        return "G"
    
    elif (x >= 0.81) & (x < 1.4):
        return "K"
    
    else:
        return "M"

In [28]:
def ugriz(df):
    mag = ['psfMag', 'fiberMag', 'PetroMag', 'model']
    colors = list('ugriz')
    
    u = list(df.columns[df.columns.str.endswith('_u')])
    g = list(df.columns[df.columns.str.endswith('_g')])
    r = list(df.columns[df.columns.str.endswith('_r')])
    i = list(df.columns[df.columns.str.endswith('_i')])
    z = list(df.columns[df.columns.str.endswith('_z')])
    
    #u_g
    for idx, cols in enumerate(mag):
        df[cols+'_u_g'] = df[u[idx]] - df[g[idx]]
     
    #g_r
    for idx, cols in enumerate(mag):
        df[cols+'_g_r'] = df[g[idx]] - df[r[idx]]
        
    #r_i
    for idx, cols in enumerate(mag):
        df[cols+'_r_i'] = df[r[idx]] - df[i[idx]]
        
    #g_i
    for idx, cols in enumerate(mag):
        df[cols+'_g_i'] = df[g[idx]] - df[i[idx]] 
        
    #i_z
    for idx, cols in enumerate(mag):
        df[cols+'_i_z'] = df[i[idx]] - df[z[idx]]
    
    #model-[psf,fiber,petro]
    for color in colors:
        df['model_psf_'+ color] = df[locals()[color][3]] - df[locals()[color][0]]
        df['model_fiber_'+ color] = df[locals()[color][3]] - df[locals()[color][1]]
        df['model_petro_'+ color] = df[locals()[color][3]] - df[locals()[color][2]]
        
    #B_V
    for idx, cols in enumerate(mag):
        df[cols+'_b_v'] = 0.98 * (df[g[idx]] - df[r[idx]]) + 0.22
        df['star_spectrum_'+cols] = df[cols+'_b_v'].apply(lambda x: SpectralClass(x))
        
    return df

In [29]:
bins= [0, 100, 200, 300, 400, 500, 600, 640,1000]
labels = list('01234567')

train['fiberID2'] = pd.cut(train['fiberID'], bins= bins, labels = labels)
test['fiberID2'] = pd.cut(test['fiberID'], bins= bins, labels = labels)

In [30]:
train = ugriz(train)
test = ugriz(test)

In [31]:
star = list(train.columns[train.columns.str.startswith('star_')])
star.append('fiberID2')

train= pd.get_dummies(data=train, columns=star)
test = pd.get_dummies(data=test, columns=star)

In [32]:
column_number = {}

for i, column in enumerate(sample_submission.columns):
    column_number[column] = i

def to_number(x, dic):
    return dic[x]

train['type_num'] = train['type'].apply(lambda x: to_number(x, column_number))

In [33]:
train_x = train.drop(columns = ['type', 'type_num'], axis = 1)
train_y = train['type_num']

test_x = test

In [34]:
space = {'objective' : 'multiclass',
         'num_class' : 19,
              'metric' : 'multi_logloss',
              'boosting' : 'gbdt',
    'max_depth':  hp.choice('max_depth', list(range(6, 10, 2))),
    'reg_alpha' : hp.uniform('reg_alpha', 0.01, 0.4),
    'reg_lambda': hp.uniform('reg_lambda', 0.01, 0.4),
    'learning_rate' : hp.uniform('learning_rate', 0.01, 0.2),
    'colsample_bytree' : hp.uniform('colsample_bytree', 0.3, 0.9),
    'gamma' : hp.uniform('gamma', 0.01, .7),
    'num_leaves' : hp.choice('num_leaves', list(range(20,150,10))),
    'min_child_samples': hp.choice('min_child_samples', list(range(100,150,10))),
    'subsample': hp.choice('subsample', [.2, .4, .5, .6, .7, .8, .9]),
    'feature_fraction' : hp.uniform('feature_fraction', .4, .8),
    'bagging_fraction' : hp.uniform('bagging_fraction', .5, .9)
}

In [35]:
def evaluate_metric(params):
    time1 = time.time()
    print("******************** New Run *********************")
    print('Params : {}'.format(params))
    FOLDS = 5
    count = 1
    
    skf = StratifiedKFold(n_splits = FOLDS, shuffle = True, random_state = 42)
    y_preds = np.zeros(test.shape[0])
    score_mean = 0
    
    for tr_idx, val_idx in skf.split(train_x, train_y):
        X_tr, X_vl = train_x.iloc[tr_idx, :], train_x.iloc[val_idx, :]
        y_tr, y_vl = train_y.iloc[tr_idx], train_y.iloc[val_idx]
        
        lgbtrain = lgb.Dataset(X_tr, label = y_tr)
        lgbval = lgb.Dataset(X_vl, label = y_vl)
        
        lgb_clf = lgb.train(params, lgbtrain, 1250, valid_sets = [lgbval],
                           categorical_feature = 'auto',
                           verbose_eval = 0,
                           early_stopping_rounds = 50)
        
        lgb_pred = lgb_clf.predict(X_vl, num_iteration = lgb_clf.best_iteration)
        
        score = make_scorer(log_loss)(lgb_clf, X_vl, y_vl)
        
        score_mean += score
        print('{} CV - Score: {}'.format(count, round(score,4)))
        
        count += 1
    
    time2 = time.time() - time1
    print('Total Time Run : {}'.format(round(time2/60, 2)))
    gc.collect()
    print('Mean Log Loss : {}'.format(score_mean/FOLDS))
    del X_tr, X_vl, y_tr, y_vl, lgb_clf, score
    
    return (score_mean/FOLDS)

In [36]:
best = fmin(fn = evaluate_metric,
            space = space,
            algo = tpe.suggest,
            max_evals = 10)

best_params = space_eval(space,best)
print("Best HyperParameters :", best_params)

******************** New Run *********************                                                                     
Params : {'bagging_fraction': 0.8334069299174205, 'boosting': 'gbdt', 'colsample_bytree': 0.38706237735915333, 'feature_fraction': 0.5634365760080988, 'gamma': 0.17100309272383174, 'learning_rate': 0.14894922800420424, 'max_depth': 6, 'metric': 'multi_logloss', 'min_child_samples': 100, 'num_class': 19, 'num_leaves': 60, 'objective': 'multiclass', 'reg_alpha': 0.27266785459822895, 'reg_lambda': 0.28139842943564547, 'subsample': 0.2}
1 CV - Score: 0.3476                                                                                                   
2 CV - Score: 0.345                                                                                                    
3 CV - Score: 0.3406                                                                                                   
4 CV - Score: 0.3455                                                               

Total Time Run : 17.78                                                                                                 
Mean Log Loss : 0.33280076216151944                                                                                    
******************** New Run *********************                                                                     
Params : {'bagging_fraction': 0.7685021664746272, 'boosting': 'gbdt', 'colsample_bytree': 0.4076740604321649, 'feature_fraction': 0.5233875929446218, 'gamma': 0.6762976155764132, 'learning_rate': 0.1961669953024764, 'max_depth': 6, 'metric': 'multi_logloss', 'min_child_samples': 110, 'num_class': 19, 'num_leaves': 130, 'objective': 'multiclass', 'reg_alpha': 0.09064793962410278, 'reg_lambda': 0.29655503181165643, 'subsample': 0.2}
1 CV - Score: 0.3571                                                                                                   
2 CV - Score: 0.3485                                                                 

In [None]:
import time
time = time.gmtime(time.time()) 
year = time[0]
month = time[1]
day = time[2]

joblib.dump(lgb_model, '../model/LGB_OPT_{}{}{}_3432_20020229.pkl'.format(year, month, day))

In [None]:
pred = lgb_model.predict(test_x, num_iteration = lgb_model.best_iteration)

In [None]:
submission = pd.DataFrame(data = pred, columns = sample_submission.columns, index = sample_submission.index)

In [None]:
submission.to_csv('../submission/lgb_opt_3432_0227.csv')