In [3]:
import optuna
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from optuna.samplers import TPESampler
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

In [4]:
train = pd.read_csv("train.csv", index_col='id')
test = pd.read_csv("test.csv", index_col='id')

sample_submission = pd.read_csv('sample_solution.csv')

In [5]:
train.head()

Unnamed: 0_level_0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f110,f111,f112,f113,f114,f115,f116,f117,f118,claim
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.10859,0.004314,-37.566,0.017364,0.28915,-10.251,135.12,168900.0,399240000000000.0,86.489,...,-12.228,1.7482,1.9096,-7.1157,4378.8,1.2096,861340000000000.0,140.1,1.0177,1
1,0.1009,0.29961,11822.0,0.2765,0.4597,-0.83733,1721.9,119810.0,3874100000000000.0,9953.6,...,-56.758,4.1684,0.34808,4.142,913.23,1.2464,7575100000000000.0,1861.0,0.28359,0
2,0.17803,-0.00698,907.27,0.27214,0.45948,0.17327,2298.0,360650.0,12245000000000.0,15827.0,...,-5.7688,1.2042,0.2629,8.1312,45119.0,1.1764,321810000000000.0,3838.2,0.4069,1
3,0.15236,0.007259,780.1,0.025179,0.51947,7.4914,112.51,259490.0,77814000000000.0,-36.837,...,-34.858,2.0694,0.79631,-16.336,4952.4,1.1784,4533000000000.0,4889.1,0.51486,1
4,0.11623,0.5029,-109.15,0.29791,0.3449,-0.40932,2538.9,65332.0,1907200000000000.0,144.12,...,-13.641,1.5298,1.1464,-0.43124,3856.5,1.483,-8991300000000.0,,0.23049,1


In [6]:
features = [x for x in train.columns if x.startswith('f')]

In [7]:
train['n_missing'] = train[features].isna().sum(axis=1)
train['abs_sum'] = train[features].abs().sum(axis=1)
train['std'] = train[features].std(axis=1)
train['avg'] = train[features].mean(axis=1)
train['max'] = train[features].max(axis=1)
train['min'] = train[features].min(axis=1)

test['n_missing'] = test[features].isna().sum(axis=1)
test['abs_sum'] = test[features].abs().sum(axis=1)
test['std'] = test[features].std(axis=1)
test['avg'] = test[features].mean(axis=1)
test['max'] = test[features].min(axis=1)
test['min'] = test[features].min(axis=1)

In [8]:
%%time
train = train.apply(lambda x: x.fillna(x.median()),axis=0)
test = test.apply(lambda x: x.fillna(x.median()), axis=0)

Wall time: 3.59 s


In [9]:
X, X_val, y, y_val = train_test_split(train.drop('claim', axis=1), train.claim, test_size=0.2, random_state=42)

In [10]:
def create_model(trial):
    max_depth = trial.suggest_int("max_depth", 2, 6)
    n_estimators = trial.suggest_int("n_estimators", 1, 150)
    learning_rate = trial.suggest_uniform('learning_rate', 0.0000001, 1)
    num_leaves = trial.suggest_int("num_leaves", 2, 3000)
    min_child_samples = trial.suggest_int('min_child_samples', 3, 200)
    model = LGBMClassifier(
        learning_rate=learning_rate, 
        n_estimators=n_estimators, 
        max_depth=max_depth, 
        num_leaves=num_leaves, 
        min_child_samples=min_child_samples,
        random_state=42
    )
    return model

class Optimizer:
    def __init__(self, metric, trials=15):
        self.metric = metric
        self.trials = trials
        self.sampler = TPESampler(seed=42)
        
    def objective(self, trial):
        model = create_model(trial)
        model.fit(X, y)
        preds = model.predict(X_val)
        if self.metric == 'acc':
            return accuracy_score(y_val, preds)
        else:
            return roc_auc_score(y_val, preds)
            
    def optimize(self):
        study = optuna.create_study(direction="maximize", sampler=self.sampler)
        study.optimize(self.objective, n_trials=self.trials)
        return study.best_params
    
optimizer = Optimizer('f1')
lgb_f1_params = optimizer.optimize()
lgb_f1_params['random_state'] = 42
lgb_f1 = LGBMClassifier(
    **lgb_f1_params
)
lgb_f1.fit(X, y)
preds = lgb_f1.predict(X_val)

print('Optimized on ROC AUC SCORE')
print('Optimized LightGBM accuracy: ', accuracy_score(y_val, preds))
print('Optimized LightGBM roc-auc-score: ', roc_auc_score(y_val, preds, average='macro'))

[32m[I 2021-11-06 10:05:16,503][0m A new study created in memory with name: no-name-c6a657ce-5b4b-47cd-9655-ed680128fda9[0m
[32m[I 2021-11-06 10:05:38,202][0m Trial 0 finished with value: 0.7719294509456572 and parameters: {'max_depth': 5, 'n_estimators': 93, 'learning_rate': 0.1834348715226848, 'num_leaves': 1097, 'min_child_samples': 191}. Best is trial 0 with value: 0.7719294509456572.[0m
[32m[I 2021-11-06 10:06:06,916][0m Trial 1 finished with value: 0.7694283604195409 and parameters: {'max_depth': 6, 'n_estimators': 103, 'learning_rate': 0.4458328082703159, 'num_leaves': 1240, 'min_child_samples': 77}. Best is trial 0 with value: 0.7719294509456572.[0m
[32m[I 2021-11-06 10:06:20,740][0m Trial 2 finished with value: 0.7718885401519257 and parameters: {'max_depth': 4, 'n_estimators': 88, 'learning_rate': 0.33370867776816077, 'num_leaves': 2921, 'min_child_samples': 154}. Best is trial 0 with value: 0.7719294509456572.[0m
[32m[I 2021-11-06 10:06:42,586][0m Trial 3 finis

Optimized on ROC AUC SCORE
Optimized LightGBM accuracy:  0.7714788291297812
Optimized LightGBM roc-auc-score:  0.7719294509456572


In [11]:
def create_model(trial):
    max_depth = trial.suggest_int("max_depth", 2, 6)
    n_estimators = trial.suggest_int("n_estimators", 1, 150)
    learning_rate = trial.suggest_uniform('learning_rate', 0.0000001, 1)
    gamma = trial.suggest_uniform('gamma', 0.0000001, 1)
    subsample = trial.suggest_uniform('subsample', 0.0001, 1.0)
    model = XGBClassifier(
        learning_rate=learning_rate, 
        n_estimators=n_estimators, 
        max_depth=max_depth, 
        gamma=gamma, 
        subsample=subsample,
        random_state=42
    )
    return model

optimizer = Optimizer('f1')
xgb_f1_params = optimizer.optimize()
xgb_f1_params['random_state'] = 42
xgb_f1 = XGBClassifier(
    **xgb_f1_params
)
xgb_f1.fit(X, y)
preds = xgb_f1.predict(X_val)

print('Optimized on ROC AUC score')
print('Optimized XGBoost accuracy: ', accuracy_score(y_val, preds))
print('Optimized XGBoost roc-auc-score: ', roc_auc_score(y_val, preds))

[32m[I 2021-11-06 10:10:19,137][0m A new study created in memory with name: no-name-b6b04ecd-44db-488d-8eb3-6f83fda35380[0m




[32m[I 2021-11-06 10:17:21,308][0m Trial 0 finished with value: 0.7717304860821509 and parameters: {'max_depth': 5, 'n_estimators': 93, 'learning_rate': 0.1834348715226848, 'gamma': 0.7796910223036693, 'subsample': 0.5968904729306923}. Best is trial 0 with value: 0.7717304860821509.[0m




[32m[I 2021-11-06 10:19:46,069][0m Trial 1 finished with value: 0.7698574441590116 and parameters: {'max_depth': 3, 'n_estimators': 75, 'learning_rate': 0.459248946040978, 'gamma': 0.33370867776816077, 'subsample': 0.1429525312401486}. Best is trial 0 with value: 0.7717304860821509.[0m




[32m[I 2021-11-06 10:27:25,906][0m Trial 2 finished with value: 0.7714557076563047 and parameters: {'max_depth': 4, 'n_estimators': 150, 'learning_rate': 0.05641167338594236, 'gamma': 0.7219988000669475, 'subsample': 0.9385588537448486}. Best is trial 0 with value: 0.7717304860821509.[0m




[32m[I 2021-11-06 10:27:57,995][0m Trial 3 finished with value: 0.7469841658363024 and parameters: {'max_depth': 3, 'n_estimators': 21, 'learning_rate': 0.6174815478795656, 'gamma': 0.6116531993229648, 'subsample': 0.007165598589195434}. Best is trial 0 with value: 0.7717304860821509.[0m




[32m[I 2021-11-06 10:28:41,476][0m Trial 4 finished with value: 0.7698712994139658 and parameters: {'max_depth': 2, 'n_estimators': 49, 'learning_rate': 0.524774707780923, 'gamma': 0.3998610317291583, 'subsample': 0.04676099664729407}. Best is trial 0 with value: 0.7717304860821509.[0m




[32m[I 2021-11-06 10:29:17,829][0m Trial 5 finished with value: 0.7701756931605228 and parameters: {'max_depth': 5, 'n_estimators': 15, 'learning_rate': 0.45607003861003753, 'gamma': 0.7851759828754175, 'subsample': 0.1997538147801439}. Best is trial 0 with value: 0.7717304860821509.[0m




[32m[I 2021-11-06 10:32:36,558][0m Trial 6 finished with value: 0.7710986213187567 and parameters: {'max_depth': 5, 'n_estimators': 64, 'learning_rate': 0.46676294657169065, 'gamma': 0.8599404207422798, 'subsample': 0.6803395078339209}. Best is trial 0 with value: 0.7717304860821509.[0m




[32m[I 2021-11-06 10:33:02,403][0m Trial 7 finished with value: 0.7712686447952423 and parameters: {'max_depth': 2, 'n_estimators': 18, 'learning_rate': 0.9488855423647795, 'gamma': 0.9656320365113561, 'subsample': 0.8084165083816495}. Best is trial 0 with value: 0.7717304860821509.[0m




[32m[I 2021-11-06 10:34:50,846][0m Trial 8 finished with value: 0.7712749649942096 and parameters: {'max_depth': 2, 'n_estimators': 90, 'learning_rate': 0.09767220423917247, 'gamma': 0.6842330580888543, 'subsample': 0.4402084784902273}. Best is trial 0 with value: 0.7717304860821509.[0m




[32m[I 2021-11-06 10:35:15,210][0m Trial 9 finished with value: 0.7712537713925683 and parameters: {'max_depth': 5, 'n_estimators': 8, 'learning_rate': 0.034388617676366286, 'gamma': 0.9093204111467419, 'subsample': 0.2588541036018569}. Best is trial 0 with value: 0.7717304860821509.[0m




[32m[I 2021-11-06 10:42:05,426][0m Trial 10 finished with value: 0.7709185150466654 and parameters: {'max_depth': 6, 'n_estimators': 115, 'learning_rate': 0.23581743523142756, 'gamma': 0.01796197381863207, 'subsample': 0.5595159899747266}. Best is trial 0 with value: 0.7717304860821509.[0m




[32m[I 2021-11-06 10:48:27,237][0m Trial 11 finished with value: 0.7720915317256372 and parameters: {'max_depth': 4, 'n_estimators': 146, 'learning_rate': 0.22393217989069103, 'gamma': 0.552650010393103, 'subsample': 0.9998603421226725}. Best is trial 11 with value: 0.7720915317256372.[0m




[32m[I 2021-11-06 10:54:59,437][0m Trial 12 finished with value: 0.7722962288398156 and parameters: {'max_depth': 4, 'n_estimators': 150, 'learning_rate': 0.23829480505076261, 'gamma': 0.5184304014395756, 'subsample': 0.9984415919487171}. Best is trial 12 with value: 0.7722962288398156.[0m




[32m[I 2021-11-06 11:01:27,965][0m Trial 13 finished with value: 0.771775569292644 and parameters: {'max_depth': 4, 'n_estimators': 141, 'learning_rate': 0.2974661654293682, 'gamma': 0.20684985219959368, 'subsample': 0.9941029199791809}. Best is trial 12 with value: 0.7722962288398156.[0m




[32m[I 2021-11-06 11:06:12,917][0m Trial 14 finished with value: 0.7718602511750011 and parameters: {'max_depth': 3, 'n_estimators': 129, 'learning_rate': 0.34197007075037944, 'gamma': 0.525253128348287, 'subsample': 0.8561473653787335}. Best is trial 12 with value: 0.7722962288398156.[0m


Optimized on ROC AUC score
Optimized XGBoost accuracy:  0.7718598630365793
Optimized XGBoost roc-auc-score:  0.7722962288398156


In [12]:
%%time
def create_model(trial):
    max_depth = trial.suggest_int("max_depth", 2, 6)
    n_estimators = trial.suggest_int("n_estimators", 2, 150)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
    model = RandomForestClassifier(
        min_samples_leaf=min_samples_leaf, 
        n_estimators=n_estimators, 
        max_depth=max_depth, 
        random_state=42
    )
    return model

optimizer = Optimizer('roc_auc')
rf_f1_params = optimizer.optimize()
rf_f1_params['random_state'] = 42
rf_f1 = RandomForestClassifier(
    **rf_f1_params
)
rf_f1.fit(X, y)
preds = rf_f1.predict(X_val)

print('Optimized on ROC AUC score')
print('Optimized Random Forest: ', accuracy_score(y_val, preds))
print('Optimized Random Forest roc-auc-score: ', roc_auc_score(y_val, preds))
# ~ 1 hour for 15 rounds of hyperparameter tuning 

[32m[I 2021-11-06 11:16:48,863][0m A new study created in memory with name: no-name-9c080d86-b90e-43d6-b425-0c1fcbfb55f3[0m
[32m[I 2021-11-06 11:28:12,941][0m Trial 0 finished with value: 0.7712645649026889 and parameters: {'max_depth': 5, 'n_estimators': 94, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.7712645649026889.[0m
[32m[I 2021-11-06 11:29:59,856][0m Trial 1 finished with value: 0.7712644862189688 and parameters: {'max_depth': 6, 'n_estimators': 22, 'min_samples_leaf': 7}. Best is trial 0 with value: 0.7712645649026889.[0m
[32m[I 2021-11-06 11:33:17,238][0m Trial 2 finished with value: 0.7712645649026889 and parameters: {'max_depth': 3, 'n_estimators': 76, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.7712645649026889.[0m
[32m[I 2021-11-06 11:41:40,971][0m Trial 3 finished with value: 0.7712645649026889 and parameters: {'max_depth': 6, 'n_estimators': 101, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.7712645649026889.[0m
[32m[I 2021-11-

Optimized on ROC AUC score
Optimized Random Forest:  0.7708107148822448
Optimized Random Forest roc-auc-score:  0.7712852470601507
Wall time: 56min 20s


In [10]:
from collections import Counter
from sklearn.tree import DecisionTreeClassifier
np.random.seed(42)
def bootstrap_sample(X, y):
    n_samples = X.shape[0]
    idxs = np.random.choice(n_samples, n_samples, replace=True)
    return X[idxs], y[idxs]


def most_common_label(y):
    counter = Counter(y)
    most_common = counter.most_common(1)[0][0]
    return most_common
class RandomForest:
    def __init__(self, n_trees=100):
        self.n_trees = n_trees

        self.trees = []

    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_trees):
            tree = DecisionTreeClassifier(
                criterion='gini', max_depth=None, min_samples_split=2, 
                min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, 
                min_impurity_decrease=0.0, min_impurity_split=None,  
            )
            X_samp, y_samp = bootstrap_sample(X, y)
            tree.fit(X_samp, y_samp)
            self.trees.append(tree)

    def predict(self, X):
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        tree_preds = np.swapaxes(tree_preds, 0, 1)
        y_pred = [most_common_label(tree_pred) for tree_pred in tree_preds]
        return np.array(y_pred)

my_rf_cls = RandomForest()
my_rf_cls.fit(X.values, y.values)
# ~ takes 60 minutes with fixed hyperparameters

In [11]:
preds = my_rf_cls.predict(X_val)
print('Custom Random Forest ROC AUC SCORE: ', roc_auc_score(y_val, preds))

Custom Random Forest ROC AUC SCORE:  0.7707578276333077


In [13]:
%%time
models = [
    ('lgbm', LGBMClassifier(**lgb_f1_params)),
    ('rf', RandomForestClassifier(**rf_f1_params)),
    ('xgboost', XGBClassifier(**xgb_f1_params))
]

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
oof_pred_tmp = dict()
test_pred_tmp = dict()
scores_tmp = dict()

for fold, (idx_train, idx_valid) in enumerate(kf.split(X, y)):
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]
    
    for name, model in models:
        if name not in scores_tmp:
            oof_pred_tmp[name] = list()
            oof_pred_tmp['y_valid'] = list()
            test_pred_tmp[name] = list()
            scores_tmp[name] = list()
     
        if name != 'rf':
            model.fit(
                X_train, y_train,
                eval_set=[(X_valid,y_valid)],
                verbose=0
            )
        else:
            model.fit(
                X_train, y_train,
            )
        
        pred_valid = model.predict_proba(X_valid)[:,1]
        score = roc_auc_score(y_valid, pred_valid)
        
        scores_tmp[name].append(score)
        oof_pred_tmp[name].extend(pred_valid)
        
        print(f"Fold: {fold + 1} Model: {name} Score: {score}")
        print('--'*20)
        
        y_hat = model.predict_proba(test)[:,1]
        test_pred_tmp[name].append(y_hat)
    
    oof_pred_tmp['y_valid'].extend(y_valid)
        
for name, model in models:
    print(f"Overall Validation Score | {name}: {np.mean(scores_tmp[name])}")
    print('='*20)

In [17]:
base_test_predictions = pd.DataFrame(
    {name: np.mean(np.column_stack(test_pred_tmp[name]), axis=1) 
    for name in test_pred_tmp.keys()}
)

base_test_predictions['simple_avg'] = base_test_predictions.mean(axis=1)
simple_blend_submission = sample_submission.copy()
simple_blend_submission['claim'] = base_test_predictions['simple_avg']
simple_blend_submission.to_csv('./simple_blend_submission.csv', index=False)

![alt text](subm.png "Title")