In [6]:
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

import lightgbm as lgb
from lightgbm import LGBMClassifier

from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING
from sklearn.metrics import log_loss, make_scorer
from functools import partial

import gc
import time
import warnings
warnings.filterwarnings('ignore')

In [26]:
train = pd.read_csv('../dataset/pr_train.csv', index_col = 0)
test = pd.read_csv('../dataset/pr_test.csv', index_col = 0)
sample_submission = pd.read_csv('../dataset/sample_submission.csv', index_col = 0)

In [27]:
column_number = {}

for i, column in enumerate(sample_submission.columns):
    column_number[column] = i

def to_number(x, dic):
    return dic[x]

train['type_num'] = train['type'].apply(lambda x: to_number(x, column_number))

In [28]:
bins= [0, 100, 200, 300, 400, 500, 600, 700, 800, 900 ,1000]
labels = ['0', '1','2','3','4','5','6','7','8','9']

train['fiberID2'] = pd.cut(train['fiberID'], bins= bins, labels = labels)
test['fiberID2'] = pd.cut(test['fiberID'], bins= bins, labels = labels)

In [29]:
train_x = train.drop(columns = ['type', 'type_num'], axis = 1)
train_y = train['type_num']

test_x = test

In [30]:
train_x.drop(columns = 'fiberID', inplace = True)
test_x.drop(columns = 'fiberID', inplace = True)

In [31]:
#train_x.drop(columns = ['psf_cluster', 'fiber_cluster', 'model_cluster'], inplace = True)
#test_x.drop(columns = ['psf_cluster', 'fiber_cluster', 'model_cluster'], inplace = True)

# Modeling

In [32]:
space = {'objective' : 'multiclass',
         'num_class' : 19,
              'metric' : 'multi_logloss',
              'boosting' : 'gbdt',
    'max_depth':  hp.choice('max_depth', list(range(6, 18, 2))),
    'reg_alpha' : hp.uniform('reg_alpha', 0.01, 0.4),
    'reg_lambda': hp.uniform('reg_lambda', 0.01, 0.4),
    'learning_rate' : hp.uniform('learning_rate', 0.01, 0.2),
    'colsample_bytree' : hp.uniform('colsample_bytree', 0.3, 0.9),
    'gamma' : hp.uniform('gamma', 0.01, .7),
    'num_leaves' : hp.choice('num_leaves', list(range(20,150,10))),
    'min_child_samples': hp.choice('min_child_samples', list(range(100,150,10))),
    'subsample': hp.choice('subsample', [.2, .4, .5, .6, .7, .8, .9]),
    'feature_fraction' : hp.uniform('feature_fraction', .4, .8),
    'bagging_fraction' : hp.uniform('bagging_fraction', .4, .9)
}

In [33]:
def evaluate_metric(params):
    time1 = time.time()
    print('Params : {}'.format(params))
    FOLDS = 5
    count = 1
    
    skf = StratifiedKFold(n_splits = FOLDS, shuffle = True, random_state = 42)
    y_preds = np.zeros(test.shape[0])
    score_mean = 0
    
    for tr_idx, val_idx in skf.split(train_x, train_y):
        X_tr, X_vl = train_x.iloc[tr_idx, :], train_x.iloc[val_idx, :]
        y_tr, y_vl = train_y.iloc[tr_idx], train_y.iloc[val_idx]
        
        lgbtrain = lgb.Dataset(X_tr, label = y_tr)
        lgbval = lgb.Dataset(X_vl, label = y_vl)
        
        lgb_clf = lgb.train(params, lgbtrain, 500, valid_sets = [lgbval],
                           categorical_feature = ['psf_cluster', 'fiber_cluster', 'model_cluster','over_641', 'fiberID2'],
                           verbose_eval = 0,
                           early_stopping_rounds = 50)
        
        lgb_pred = lgb_clf.predict(X_vl, num_iteration = lgb_clf.best_iteration)
        
        score = make_scorer(log_loss)(lgb_clf, X_vl, y_vl)
        
        score_mean += score
        print('{} CV - Score: {}'.format(count, round(score,4)))
        
        count += 1
    
    time2 = time.time() - time1
    print('Total Time Run : {}'.format(round(time2/60, 2)))
    gc.collect()
    print('Mean Log Loss : {}'.format(score_mean/FOLDS))
    del X_tr, X_vl, y_tr, y_vl, lgb_clf, score
    
    return (score_mean/FOLDS)

In [34]:
best = fmin(fn = evaluate_metric,
            space = space,
            algo = tpe.suggest,
            max_evals = 5)

best_params = space_eval(space,best)

Params : {'bagging_fraction': 0.7111621120888049, 'boosting': 'gbdt', 'colsample_bytree': 0.433032891376141, 'feature_fraction': 0.4236693589707333, 'gamma': 0.07955829063920518, 'learning_rate': 0.04171682525949007, 'max_depth': 12, 'metric': 'multi_logloss', 'min_child_samples': 100, 'num_class': 19, 'num_leaves': 20, 'objective': 'multiclass', 'reg_alpha': 0.27487894180161454, 'reg_lambda': 0.07327339237163104, 'subsample': 0.7}
1 CV - Score: 0.3778                                                                                                   
2 CV - Score: 0.3828                                                                                                   
3 CV - Score: 0.3821                                                                                                   
4 CV - Score: 0.3782                                                                                                   
5 CV - Score: 0.3758                                                                

In [35]:
params = {'bagging_fraction': 0.7111621120888049, 'boosting': 'gbdt', 'colsample_bytree': 0.433032891376141, 'feature_fraction': 0.4236693589707333, 'gamma': 0.07955829063920518, 'learning_rate': 0.04171682525949007, 'max_depth': 12, 'metric': 'multi_logloss', 'min_child_samples': 100, 'num_class': 19, 'num_leaves': 20, 'objective': 'multiclass', 'reg_alpha': 0.27487894180161454, 'reg_lambda': 0.07327339237163104, 'subsample': 0.7}

In [45]:
lgb_train = lgb.Dataset(train_x, train_y)

In [47]:
lgb_model = lgb.train(params,
                      lgb_train,
                      categorical_feature = ['psf_cluster','fiber_cluster','model_cluster','fiberID2'],
                      verbose_eval = 1)

In [48]:
pred = lgb_model.predict(test_x)

In [49]:
submission = pd.DataFrame(data = pred, columns = sample_submission.columns, index = sample_submission.index)

In [50]:
qso_idx = test_x.loc[(test_x['fiberID2'] == '7') | (test_x['fiberID2'] == '8') | (test_x['fiberID2'] == '9')]

submission.loc[qso_idx.index] = np.nan
submission.loc[qso_idx.index, 'QSO'] = 1

submission.loc[qso_idx.index, submission.drop(columns = 'QSO').columns] = 0

KeyError: "None of [Int64Index([   13,    30,    35,    37,    38,    42,    89,    97,   103,\n              107,\n            ...\n             9792,  9803,  9841,  9855,  9870,  9884,  9897,  9901,  9986,\n            10006],\n           dtype='int64', length=615)] are in the [index]"

In [51]:
submission.to_csv('../submission/abc1927.csv')

In [52]:
submission.head()

Unnamed: 0_level_0,STAR_WHITE_DWARF,STAR_CATY_VAR,STAR_BROWN_DWARF,SERENDIPITY_RED,REDDEN_STD,STAR_BHB,GALAXY,SERENDIPITY_DISTANT,QSO,SKY,STAR_RED_DWARF,ROSAT_D,STAR_PN,SERENDIPITY_FIRST,STAR_CARBON,SPECTROPHOTO_STD,STAR_SUB_DWARF,SERENDIPITY_MANUAL,SERENDIPITY_BLUE
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
199991,0.000685,0.001861,0.000222,0.038584,0.004001,0.003649,0.011494,0.001636,0.018917,5e-06,0.898518,0.006083,5e-06,0.002735,0.001033,0.003721,0.00043,0.000146,0.006274
199992,0.002575,0.005287,0.000106,0.001206,0.011367,0.010367,0.036796,0.00786,0.407241,7e-06,0.010767,0.264264,2.4e-05,0.208333,0.002677,0.010571,0.000936,0.000276,0.019338
199993,0.000753,0.001097,1.8e-05,0.000194,0.00232,0.002654,0.965604,0.000883,0.011169,1e-06,0.002036,0.004346,3e-06,0.001297,0.001846,0.002062,0.000201,2e-05,0.003496
199994,0.001029,0.002685,4.7e-05,0.000494,0.005664,0.005162,0.013756,0.011635,0.131326,4e-06,0.00519,0.015524,2.3e-05,0.022244,0.001334,0.005267,0.000478,0.001661,0.776476
199995,0.000441,0.001187,4.7e-05,0.000909,0.002551,0.002327,0.007092,0.001047,0.013638,2e-06,0.954388,0.00693,4e-06,0.002094,0.000699,0.002373,0.000211,5.7e-05,0.004005


In [53]:
submission.shape

(10009, 19)

In [54]:
submission.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10009 entries, 199991 to 209999
Data columns (total 19 columns):
STAR_WHITE_DWARF       10009 non-null float64
STAR_CATY_VAR          10009 non-null float64
STAR_BROWN_DWARF       10009 non-null float64
SERENDIPITY_RED        10009 non-null float64
REDDEN_STD             10009 non-null float64
STAR_BHB               10009 non-null float64
GALAXY                 10009 non-null float64
SERENDIPITY_DISTANT    10009 non-null float64
QSO                    10009 non-null float64
SKY                    10009 non-null float64
STAR_RED_DWARF         10009 non-null float64
ROSAT_D                10009 non-null float64
STAR_PN                10009 non-null float64
SERENDIPITY_FIRST      10009 non-null float64
STAR_CARBON            10009 non-null float64
SPECTROPHOTO_STD       10009 non-null float64
STAR_SUB_DWARF         10009 non-null float64
SERENDIPITY_MANUAL     10009 non-null float64
SERENDIPITY_BLUE       10009 non-null float64
dtypes: flo