In [6]:
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

import lightgbm as lgb
from lightgbm import LGBMClassifier

from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING
from sklearn.metrics import log_loss, make_scorer
from functools import partial

import gc
import time
import warnings
warnings.filterwarnings('ignore')

In [7]:
train = pd.read_csv('../dataset/pr_train.csv', index_col = 0)
test = pd.read_csv('../dataset/pr_test.csv', index_col = 0)
sample_submission = pd.read_csv('../dataset/sample_submission.csv', index_col = 0)

In [8]:
column_number = {}

for i, column in enumerate(sample_submission.columns):
    column_number[column] = i

def to_number(x, dic):
    return dic[x]

train['type_num'] = train['type'].apply(lambda x: to_number(x, column_number))

In [9]:
train_x = train.drop(columns = ['type', 'type_num'], axis = 1)
train_y = train['type_num']

test_x = test

In [10]:
train_x.drop(columns = 'fiberID', inplace = True)
test_x.drop(columns = 'fiberID', inplace = True)

In [11]:
#train_x.drop(columns = ['psf_cluster', 'fiber_cluster', 'model_cluster'], inplace = True)
#test_x.drop(columns = ['psf_cluster', 'fiber_cluster', 'model_cluster'], inplace = True)

# Modeling

In [12]:
space = {'objective' : 'multiclass',
         'num_class' : 19,
              'metric' : 'multi_logloss',
              'boosting' : 'gbdt',
    'max_depth':  hp.choice('max_depth', list(range(6, 18, 2))),
    'reg_alpha' : hp.uniform('reg_alpha', 0.01, 0.4),
    'reg_lambda': hp.uniform('reg_lambda', 0.01, 0.4),
    'learning_rate' : hp.uniform('learning_rate', 0.01, 0.2),
    'colsample_bytree' : hp.uniform('colsample_bytree', 0.3, 0.9),
    'gamma' : hp.uniform('gamma', 0.01, .7),
    'num_leaves' : hp.choice('num_leaves', list(range(20,150,10))),
    'min_child_samples': hp.choice('min_child_samples', list(range(100,150,10))),
    'subsample': hp.choice('subsample', [.2, .4, .5, .6, .7, .8, .9]),
    'feature_fraction' : hp.uniform('feature_fraction', .4, .8),
    'bagging_fraction' : hp.uniform('bagging_fraction', .4, .9)
}

In [15]:
def evaluate_metric(params):
    time1 = time.time()
    print('Params : {}'.format(params))
    FOLDS = 5
    count = 1
    
    skf = StratifiedKFold(n_splits = FOLDS, shuffle = True, random_state = 42)
    y_preds = np.zeros(test.shape[0])
    score_mean = 0
    
    for tr_idx, val_idx in skf.split(train_x, train_y):
        X_tr, X_vl = train_x.iloc[tr_idx, :], train_x.iloc[val_idx, :]
        y_tr, y_vl = train_y.iloc[tr_idx], train_y.iloc[val_idx]
        
        lgbtrain = lgb.Dataset(X_tr, label = y_tr)
        lgbval = lgb.Dataset(X_vl, label = y_vl)
        
        lgb_clf = lgb.train(params, lgbtrain, 500, valid_sets = [lgbval],
                           categorical_feature = ['psf_cluster', 'fiber_cluster', 'model_cluster','over_641'],
                           verbose_eval = 0,
                           early_stopping_rounds = 50)
        
        lgb_pred = lgb_clf.predict(X_vl, num_iteration = lgb_clf.best_iteration)
        
        score = make_scorer(log_loss)(lgb_clf, X_vl, y_vl)
        
        score_mean += score
        print('{} CV - Score: {}'.format(count, round(score,4)))
        
        count += 1
    
    time2 = time.time() - time1
    print('Total Time Run : {}'.format(round(time2/60, 2)))
    gc.collect()
    print('Mean Log Loss : {}'.format(score_mean/FOLDS))
    del X_tr, X_vl, y_tr, y_vl, lgb_clf, score
    
    return (score_mean/FOLDS)

In [None]:
best = fmin(fn = evaluate_metric,
            space = space,
            algo = tpe.suggest,
            max_evals = 5)

best_params = space_eval(space,best)

Params : {'bagging_fraction': 0.8324350597993668, 'boosting': 'gbdt', 'colsample_bytree': 0.37635994430923303, 'feature_fraction': 0.5679244786233073, 'gamma': 0.32602301165938, 'learning_rate': 0.07161758538716705, 'max_depth': 10, 'metric': 'multi_logloss', 'min_child_samples': 130, 'num_class': 19, 'num_leaves': 70, 'objective': 'multiclass', 'reg_alpha': 0.39788638377761015, 'reg_lambda': 0.27947809747847263, 'subsample': 0.5}
  0%|                                                                            | 0/5 [00:00<?, ?trial/s, best loss=?]

In [None]:
submission = pd.DataFrame(data = pred, columns = sample_submission.columns, index = sample_submission.index)

In [None]:
qso_idx = test_x.loc[(test_x['fiberID2'] == '7') | (test_x['fiberID2'] == '8') | (test_x['fiberID2'] == '9')]

submission.loc[qso_idx.index] = np.nan
submission.loc[qso_idx.index, 'QSO'] = 1

submission.loc[qso_idx.index, submission.drop(columns = 'QSO').columns] = 0

In [None]:
submission.to_csv('../submission/abc1927.csv')

In [None]:
submission.head()