In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb

In [2]:
import os
os.chdir('/content/drive/MyDrive/sasha_babuin/data_santander')

In [3]:
train_df = pd.read_csv('data/train 2.csv')
test_df = pd.read_csv('data/test.csv')
X = train_df.drop(['ID_code', 'target'], axis = 1)
y = train_df['target']
test_df.drop(['ID_code'], axis = 1, inplace = True)
scaler = MinMaxScaler()
scaler.fit(X)
scaler.fit(test_df)
X = pd.DataFrame(scaler.transform(X), columns = X.columns)
test_df = pd.DataFrame(scaler.transform(test_df), columns = test_df.columns)

In [4]:
param_initial = {
    'bagging_freq': 5,
    'bagging_fraction': 0.5, ## 
    'boost_from_average': False, ## 
    'boost': 'gbdt', ## 
    'feature_fraction': 0.1, ##
    'learning_rate': 0.01, 
    'max_depth': 5,  
    'metric':'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 16,
    'tree_learner': 'serial',
    'objective': 'binary'
}

In [5]:
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 102)

res_val = np.zeros(len(train_df))
predictions = np.zeros(len(test_df))
results = pd.DataFrame()
features = X.columns.to_list()

for fold_, (train_idx, val_idx) in enumerate(skf.split(X.values, y.values)):
    print("Fold {}".format(fold_))
    trn_data = lgb.Dataset(X.iloc[train_idx], label=y.iloc[train_idx])
    val_data = lgb.Dataset(X.iloc[val_idx], label=y.iloc[val_idx])

    lgbt = lgb.train(param_initial, trn_data, num_boost_round = 500_000,
                    valid_sets = [trn_data, val_data], early_stopping_rounds = 3000, 
                     verbose_eval=1000)
    predictions_fold = lgbt.predict(X.iloc[val_idx])

    res_val[val_idx] = predictions_fold

    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = features
    fold_importance_df["importance"] = lgbt.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([results, fold_importance_df], axis=0)

    predictions += lgbt.predict(test_df[features], num_iteration=lgbt.best_iteration)


predictions_df = pd.DataFrame({'ID_code' : test_df['ID_code'].values})
predictions_df['predictions'] = predictions




Fold 0
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.892997	valid_1's auc: 0.864831
[2000]	training's auc: 0.916541	valid_1's auc: 0.882353
[3000]	training's auc: 0.929253	valid_1's auc: 0.89013
[4000]	training's auc: 0.937398	valid_1's auc: 0.894231
[5000]	training's auc: 0.943616	valid_1's auc: 0.896157
[6000]	training's auc: 0.948817	valid_1's auc: 0.89713
[7000]	training's auc: 0.953572	valid_1's auc: 0.897459
[8000]	training's auc: 0.957841	valid_1's auc: 0.897496
[9000]	training's auc: 0.961867	valid_1's auc: 0.897533
[10000]	training's auc: 0.965636	valid_1's auc: 0.897325
[11000]	training's auc: 0.969205	valid_1's auc: 0.897325
[12000]	training's auc: 0.972525	valid_1's auc: 0.897014
Early stopping, best iteration is:
[9050]	training's auc: 0.962065	valid_1's auc: 0.897571
Fold 1
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.892248	valid_1's auc: 0.867219
[2000]	training's auc: 0.916121	valid_

KeyError: ignored

In [None]:
import joblib
# save model
joblib.dump(gcv.best_estimator_, 'lgb.pkl')
# load model

In [9]:
predictions_df = pd.DataFrame(test_df)
predictions_df['predictions'] = predictions

In [10]:
predictions_df['predictions'] = predictions_df['predictions'] / 5

In [12]:
predictions_df = predictions_df['predictions']

In [13]:
predictions_df

0         0.086863
1         0.207560
2         0.187246
3         0.197280
4         0.041279
            ...   
199995    0.033802
199996    0.007421
199997    0.004043
199998    0.086295
199999    0.059993
Name: predictions, Length: 200000, dtype: float64

In [None]:
roc_auc_score(y, res_val)

0.8994517978181329

In [14]:
predictions_df.to_csv('test_gb.csv')

In [15]:
predictions_vsl = pd.DataFrame({'ID_code' : train_df['ID_code'].values})

In [20]:
predictions_vsl['prediction'] = res_val
predictions_vsl

Unnamed: 0,ID_code,prediction
0,train_0,0.009666
1,train_1,0.350433
2,train_2,0.007575
3,train_3,0.233441
4,train_4,0.084524
...,...,...
199995,train_199995,0.083923
199996,train_199996,0.044822
199997,train_199997,0.046748
199998,train_199998,0.047308


In [21]:
predictions_vsl.to_csv('train_gb.csv')

In [22]:
predictions_vsl

Unnamed: 0,ID_code,prediction
0,train_0,0.009666
1,train_1,0.350433
2,train_2,0.007575
3,train_3,0.233441
4,train_4,0.084524
...,...,...
199995,train_199995,0.083923
199996,train_199996,0.044822
199997,train_199997,0.046748
199998,train_199998,0.047308
