# Module

In [1]:
import numpy as np
import pandas as pd
import warnings
import gc
from tqdm import tqdm_notebook as tqdm
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.metrics import roc_auc_score
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. chained-assignment or
# deprecation notices) that the cells below may trigger — consider narrowing the filter.
warnings.filterwarnings("ignore")
# Make sure the cyclic garbage collector is on (it is enabled by default in CPython).
gc.enable()

In [2]:
# Widen the notebook display limits so wide/long frames are shown in full.
# Fully-qualified option names are used on purpose: the short patterns
# 'max_rows' / 'max_columns' match more than one option on newer pandas
# (display.* and styler.render.*), which makes set_option raise OptionError.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_columns', 500)

# Load Data

In [3]:
# Read the raw train and test tables from ./data. The trailing expression
# displays both shapes as the cell output.
train_raw = pd.read_csv('./data/train.csv')
test_raw = pd.read_csv('./data/test.csv')
train_raw.shape, test_raw.shape

((200000, 202), (200000, 201))

In [4]:
# Work on copies so the frames loaded from disk (train_raw / test_raw) stay untouched.
train = train_raw.copy()
test = test_raw.copy()

In [5]:
# Every column from the third position onward is treated as a feature column.
# NOTE(review): assumes the first two columns are 'ID_code' and 'target' — confirm against train.csv.
col_list = train.columns[2:]

In [6]:
# Split the labelled rows by class: target == 0 and target == 1.
labels = train['target']
train_0 = train[labels == 0]
train_1 = train[labels == 1]

In [7]:
# Load two saved index arrays; they are used below to select rows of `test` by position.
# NOTE(review): judging by the file names these are the public / private leaderboard
# rows — confirm how the .npy files were produced.
pb_idx = np.load('./data_temp/public_LB.npy')
pv_idx = np.load('./data_temp/private_LB.npy')

In [8]:
# Select the two row subsets of `test` by position and order each by its original index.
test_pb = test.iloc[pb_idx].sort_index().copy()
test_pv = test.iloc[pv_idx].sort_index().copy()

# Stack the two subsets, keeping their original index labels.
# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0.
test_real = pd.concat([test_pb, test_pv])

In [9]:
# Stack the labelled rows on top of the selected test rows. Index labels are kept
# as-is (not reset), so labels from the two frames may repeat in `data`.
# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0.
data = pd.concat([train, test_real])

# Extract Unique Values Across All Data

In [10]:
# Holder frames for the per-column 0/1 flags computed in the next cell.
# Explicit copies: columns are assigned onto these frames later, and assigning to a
# frame derived by selection from `data` triggers pandas' chained-assignment warning.
unique_df = data[['ID_code']].copy()
con_df = data[['ID_code']].copy()

In [11]:
# For every feature column, flag each row by whether its value occurs exactly once
# in `data` (unique_df -> 1) or more than once (con_df -> 1).
for col in tqdm(col_list):
    # Count occurrences once per column; the original computed value_counts() twice.
    is_singleton = data[col].value_counts() == 1
    # bool * 1 turns the mask into 0/1 integers before it is used as a lookup table.
    unique_df[col] = data[col].map((is_singleton * 1).to_dict())
    con_df[col] = data[col].map(((~is_singleton) * 1).to_dict())

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [12]:
# Derive two features per column: the value where it is flagged in unique_df
# (0 elsewhere) and the value where it is flagged in con_df (0 elsewhere),
# both rounded to 4 decimals.
for col in tqdm(col_list):
    raw_values = data[col]
    singleton_part = raw_values * unique_df[col]
    repeated_part = raw_values * con_df[col]
    data[col + '_unique'] = np.around(singleton_part, 4)
    data[col + '_con'] = np.around(repeated_part, 4)

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [13]:
# Rows with a target value form the training set; rows without one form the test set.
has_label = data['target'].notna()
train = data[has_label]
test = data[~has_label]

In [14]:
# Label column of the training rows; indexed positionally (.iloc / .values) in the CV loop below.
target = train['target']

In [15]:
# LightGBM training parameters passed to lgb.train in the cross-validation cell.
# 'num_threads' appeared twice in the original literal (same value); a repeated key in
# a dict literal silently overrides the earlier one, so it is listed only once here.
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.335,
    'boost_from_average': False,
    'boost': 'gbdt',
    'feature_fraction_seed': 47,
    'feature_fraction': 0.041,
    'learning_rate': 0.01,
    'max_depth': -1,  # no explicit depth limit; tree size is bounded by num_leaves
    'metric': 'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary',
    'verbosity': -1,
}

In [16]:
# 5-fold stratified cross-validation with LightGBM.
# Produces:
#   oof_lgb            - out-of-fold prediction for every training row
#   predictions_lgb    - test prediction averaged over the folds
#   feature_importance - per-fold feature importances stacked in one frame
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof_lgb = np.zeros(len(train))
predictions_lgb = np.zeros(len(test))

train_columns = [c for c in train.columns if c not in ['ID_code', 'target']]

# Upper bound on boosting rounds; early stopping normally ends training sooner.
num_round = 30000

# Per-fold importance frames are collected here and concatenated once after the loop
# instead of re-concatenating a growing frame on every fold.
fold_importances = []

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, target.values)):
    print("fold n°{}".format(fold_))
    # The validation slice is built once and reused for the Dataset and for prediction.
    val_features = train.iloc[val_idx][train_columns]
    trn_data = lgb.Dataset(train.iloc[trn_idx][train_columns], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(val_features, label=target.iloc[val_idx])

    # NOTE(review): verbose_eval / early_stopping_rounds are keyword arguments of older
    # LightGBM releases; newer releases expect callbacks instead — confirm the installed version.
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=400, early_stopping_rounds = 200)
    oof_lgb[val_idx] = clf.predict(val_features, num_iteration=clf.best_iteration)
    # Each fold contributes an equal share to the averaged test prediction.
    predictions_lgb += clf.predict(test[train_columns], num_iteration=clf.best_iteration) / folds.n_splits

    fold_importance = pd.DataFrame()
    fold_importance["Feature"] = train_columns
    fold_importance["importance"] = clf.feature_importance()
    fold_importance["fold"] = fold_ + 1
    fold_importances.append(fold_importance)

    # AUC of this fold's validation rows only.
    print("CV score: {:<8.5f}".format(roc_auc_score(target.values[val_idx], oof_lgb[val_idx])))

feature_importance = pd.concat(fold_importances, axis=0)

# AUC over all out-of-fold predictions.
print("CV score: {:<8.5f}".format(roc_auc_score(target.values, oof_lgb)))

fold n°0
Training until validation scores don't improve for 200 rounds.
[400]	training's auc: 0.896151	valid_1's auc: 0.882588
[800]	training's auc: 0.908063	valid_1's auc: 0.89331
[1200]	training's auc: 0.91579	valid_1's auc: 0.899737
[1600]	training's auc: 0.921465	valid_1's auc: 0.903533
[2000]	training's auc: 0.926619	valid_1's auc: 0.907027
[2400]	training's auc: 0.930723	valid_1's auc: 0.909608
[2800]	training's auc: 0.934314	valid_1's auc: 0.911416
[3200]	training's auc: 0.937326	valid_1's auc: 0.912916
[3600]	training's auc: 0.939967	valid_1's auc: 0.914021
[4000]	training's auc: 0.942283	valid_1's auc: 0.915044
[4400]	training's auc: 0.944488	valid_1's auc: 0.915898
[4800]	training's auc: 0.946486	valid_1's auc: 0.916558
[5200]	training's auc: 0.948352	valid_1's auc: 0.916926
[5600]	training's auc: 0.95016	valid_1's auc: 0.917256
[6000]	training's auc: 0.95188	valid_1's auc: 0.91753
[6400]	training's auc: 0.953501	valid_1's auc: 0.917727
[6800]	training's auc: 0.955111	valid_1

In [35]:
# Collect ID, true label and out-of-fold prediction per training row.
# Explicit copy: a column is added and rows are overwritten later, and doing that on a
# frame derived by selection from `train` triggers pandas' chained-assignment warning.
bbiggu = train[['ID_code', 'target']].copy()
bbiggu['pred'] = oof_lgb

In [36]:
# Load a previously saved prediction table and attach the true labels to it.
# The labels are aligned on index labels, not on row position.
# NOTE(review): assumes the CSV rows are in the same order as `train` — confirm.
reference_preds = pd.read_csv('./data_temp/new_bbiggu.csv')
temp = reference_preds.assign(target=train.target)

In [37]:
# AUC restricted to rows scored below 0.1: this run first, the reference table second.
low_current = bbiggu[bbiggu.pred < 0.1]
low_reference = temp[temp.pred < 0.1]
roc_auc_score(low_current.target, low_current.pred), roc_auc_score(low_reference.target, low_reference.pred)

(0.7751441984398802, 0.7707841507048583)

In [23]:
# AUC restricted to rows scored at or above 0.1: this run first, the reference table second.
high_current = bbiggu[bbiggu.pred >= 0.1]
high_reference = temp[temp.pred >= 0.1]
roc_auc_score(high_current.target, high_current.pred), roc_auc_score(high_reference.target, high_reference.pred)

(0.8167443836875399, 0.8045344524437575)

In [24]:
# Number of positives scored below 0.1: this run first, the reference table second.
missed_current = (bbiggu.pred < 0.1) & (bbiggu.target == 1)
missed_reference = (temp.pred < 0.1) & (temp.target == 1)
bbiggu[missed_current].shape[0], temp[missed_reference].shape[0]

(3146, 3495)

In [25]:
# Number of negatives scored at or above 0.1: this run first, the reference table second.
# Uses >= so the cut-off matches the other cells, which split rows into
# pred < 0.1 and pred >= 0.1; the original strict > dropped rows scored exactly 0.1.
bbiggu[(bbiggu.pred >= 0.1) & (bbiggu.target == 0)].shape[0], temp[(temp.pred >= 0.1) & (temp.target == 0)].shape[0]

(29914, 29518)

In [31]:
# Set pred to 1 for every ID_code belonging to a positive row scored below 0.1.
# NOTE(review): this uses the true label to rewrite predictions — confirm it is intended.
missed_positive_ids = bbiggu[(bbiggu.pred < 0.1) & (bbiggu.target == 1)].ID_code
bbiggu.loc[bbiggu.ID_code.isin(missed_positive_ids), 'pred'] = 1

In [32]:
# Set pred to 0 for every ID_code belonging to a negative row scored at or above 0.1.
# NOTE(review): this uses the true label to rewrite predictions — confirm it is intended.
false_alarm_ids = bbiggu[(bbiggu.pred >= 0.1) & (bbiggu.target == 0)].ID_code
bbiggu.loc[bbiggu.ID_code.isin(false_alarm_ids), 'pred'] = 0

In [34]:
# AUC of bbiggu.pred against the true labels. When run after the two overwrite cells
# above, rows on the wrong side of the 0.1 cut-off have already been replaced using
# the label, so the score is not an out-of-fold performance estimate.
roc_auc_score(bbiggu.target, bbiggu.pred)

1.0

In [38]:
# Persist ID_code, target and pred without the index column.
# NOTE(review): whether `pred` holds the raw out-of-fold scores or the overwritten values
# depends on which cells ran before this one; the execution counts are not sequential — confirm.
bbiggu.to_csv('./data_temp/final_bbiggu.csv', index=False)

# Submission

In [36]:
# Build the submission: predicted rows come from `test`, every other ID keeps the
# value found in the sample submission file.
test['target'] = predictions_lgb
sub = pd.read_csv('./data/sample_submission.csv')
# Rows of the sample submission whose ID_code was not scored by the model.
unchange = sub[~sub.ID_code.isin(test.ID_code)]
# pd.concat replaces DataFrame.append (deprecated, removed in pandas 2.0).
# sort_index orders the combined rows by their index labels.
# NOTE(review): assumes the index labels of `test` are row positions in the sample submission — confirm.
sub = pd.concat([test[['ID_code', 'target']], unchange]).sort_index()

In [37]:
# Write the submission file without the index column.
sub.to_csv('./data/sub_feature_unique_con_val.csv', index=False)