In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import itertools
from utils import *


In [2]:
dataset_df = pd.read_csv(r"C:\Users\ghibl\ICR\data\input\all_features.csv")
dataset_df.drop("Id",axis = 1, inplace = True)

In [3]:
X_df = dataset_df.drop("Class",axis =1)
y_df = dataset_df["Class"].copy()


In [4]:
y_df.unique()

array([1, 0], dtype=int64)

In [5]:
dtrain = lgb.Dataset(X_df.values, label = y_df.values)
y_true = dtrain.get_label()
y_pred = np.array([0.5]*len(y_true))
balanced_logloss(y_pred,y_true)


0.6931471805599452

In [17]:
fs = X_df.columns
scores= []
evals_result = {}
skf = StratifiedKFold(n_splits = 4, shuffle = True, random_state = 42)

for tr_idx, va_idx in skf.split(X_df, y_df):
    X_train, X_valid = X_df.iloc[tr_idx], X_df.iloc[va_idx]
    y_train, y_valid = y_df.iloc[tr_idx], y_df.iloc[va_idx]
    
    dtrain = lgb.Dataset(X_train.values, label = y_train.values)
    dvalid = lgb.Dataset(X_valid.values, label = y_valid.values)

    evals_result = {}
    params = {'metric':'custom',
              'objective':'binary',
              'verbosity':-1,
              'random_state' : 42, 
              'learning_rate': 0.1,
              'early_stopping_round':20
             }

    callbacks = [lgb.early_stopping(20, verbose=0), lgb.log_evaluation(period=10)]
    model = lgb.train(params,
                      dtrain, 
                      num_boost_round = 500,
                      valid_sets = [dtrain,dvalid],
                      valid_names = ['train','valid'],
                      feval = original_balanced_logloss_metric,
                      callbacks = callbacks
                     )
    score = model.best_score['valid']['balanced_logloss']
    scores.append(score)
    print(f'balanced_logloss:{score:.4f}')
score_cv = np.mean(scores)
print(score_cv)

[10]	train's balanced_logloss: 0.595606	valid's balanced_logloss: 0.633591
[20]	train's balanced_logloss: 0.550565	valid's balanced_logloss: 0.605593
[30]	train's balanced_logloss: 0.526917	valid's balanced_logloss: 0.594
[40]	train's balanced_logloss: 0.515557	valid's balanced_logloss: 0.586914
[50]	train's balanced_logloss: 0.509549	valid's balanced_logloss: 0.583369
[60]	train's balanced_logloss: 0.506392	valid's balanced_logloss: 0.57604
[70]	train's balanced_logloss: 0.504781	valid's balanced_logloss: 0.574572
[80]	train's balanced_logloss: 0.504002	valid's balanced_logloss: 0.5771
balanced_logloss:0.5744
[10]	train's balanced_logloss: 0.596816	valid's balanced_logloss: 0.62299
[20]	train's balanced_logloss: 0.549859	valid's balanced_logloss: 0.599894
[30]	train's balanced_logloss: 0.527858	valid's balanced_logloss: 0.584723
[40]	train's balanced_logloss: 0.515998	valid's balanced_logloss: 0.578939
[50]	train's balanced_logloss: 0.510135	valid's balanced_logloss: 0.575248
[60]	tra

In [20]:
fs = X_df.columns
scores= []
evals_result = {}
skf = StratifiedKFold(n_splits = 4, shuffle = True, random_state = 42)

for tr_idx, va_idx in skf.split(X_df, y_df):
    X_train, X_valid = X_df.iloc[tr_idx], X_df.iloc[va_idx]
    y_train, y_valid = y_df.iloc[tr_idx], y_df.iloc[va_idx]
    
    dtrain = lgb.Dataset(X_train.values, label = y_train.values)
    dvalid = lgb.Dataset(X_valid.values, label = y_valid.values)

    evals_result = {}
    params = {'metric':'custom',
              'objective':'custom',
              'verbosity':-1,
              'random_state' : 42, 
              'learning_rate': 0.1,
              'early_stopping_round':20
             }

    callbacks = [lgb.early_stopping(20, verbose=0), lgb.log_evaluation(period=10),lgb.record_evaluation(eval_result)]
    model = lgb.train(params,
                      dtrain, 
                      num_boost_round = 500,
                      valid_sets = [dtrain,dvalid],
                      valid_names = ['train','valid'],
                      feval = original_balanced_logloss_metric,
                      fobj = original_binary_logloss_objective,
                      callbacks = callbacks
                     )
    score = model.best_score['valid']['balanced_logloss']
    scores.append(score)
    print(f'balanced_logloss:{score:.4f}')
score_cv = np.mean(scores)
print(score_cv)

NameError: name 'eval_result' is not defined

# 0.38349947449517713

In [None]:
fig = make_subplots(rows = 2,cols = 2,
                    subplot_titles=("fold1", "fold2", "fold3", "fold4"))
tuples = [(1,1),(1,2),(2,1),(2,2)]
for i in range(len(evals_results)):
    train_log = evals_results[i]['train']['logloss'] 
    eval_log = evals_results[i]['eval']['logloss'] 
    x = np.arange(1,1+len(train_log))
    fig.add_trace(go.Scatter(x = x, y = train_log,mode = 'lines',name = 'train'),
                 row = tuples[i][0],col = tuples[i][1])
    fig.add_trace(go.Scatter(x = x, y = eval_log,mode = 'lines',name = 'eval'),
                 row = tuples[i][0],col = tuples[i][1])
fig.update_layout(
    title = "fold1, fold2, fold3, fold4"
)

In [None]:
best_score = 9999.0
candidates = np.random.RandomState(71).permutation(X_df.columns)
selected = set([])

print('start simple selection')
for feature in candidates:
    fs = list(selected) + [feature]
    score = evaluate(fs,X_df,y_df)
    
    if score < best_score:
        selected.add(feature)
        best_score = score
        print(f'selected:{feature}')
        print(f'score:{score}')

print(f'selected features:{selected}')

In [None]:
import numpy as np

x = np.array([0, 1, 1, 3, 2, 1, 7])
counts = np.bincount(x)
np.sum(counts)