In [1]:
import warnings
warnings.simplefilter('ignore')

import os
import re
import gc
import random

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
from tqdm.auto import tqdm
tqdm.pandas()

from sklearn.model_selection import KFold, train_test_split

from pandarallel import pandarallel
pandarallel.initialize(nb_workers=4, progress_bar=True)

import lightgbm as lgb

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
train = pd.read_pickle('train.pickle')
valid = pd.read_pickle('valid.pickle')

display(train)
display(valid)

Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,1,17458663,0,1,14,21,19,15,330,0.042296,0.063444,0.057402,0.045317
1,0,15663294,0,68,4,2,0,0,337,0.011834,0.005917,0.000000,0.000000
2,0,16335410,0,286,4,5,1,0,456,0.008753,0.010941,0.002188,0.000000
3,0,69378275,0,292,7,9,1,0,2125,0.003293,0.004233,0.000470,0.000000
4,0,30386872,0,344,8,3,0,0,1226,0.006520,0.002445,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
329984,0,68572651,68891,173,2,1,0,0,72,0.027397,0.013699,0.000000,0.000000
329985,0,26757624,68891,430,5,1,0,0,1817,0.002750,0.000550,0.000000,0.000000
329986,0,61403575,68891,498,1,0,0,0,170,0.005848,0.000000,0.000000,0.000000
329987,0,53779907,68891,550,3,1,0,0,261,0.011450,0.003817,0.000000,0.000000


Unnamed: 0,target,candidate,id,recall_rank,words_overlap_count,2gram_overlap_count,3gram_overlap_count,4gram_overlap_count,text_token_length,words_overlap_ratio,2gram_overlap_ratio,3gram_overlap_ratio,4gram_overlap_ratio
0,0,70321679,0,501,2,3,0,0,1936,0.001033,0.001549,0.000000,0.000000
1,0,12860453,0,340,0,0,0,0,166,0.000000,0.000000,0.000000,0.000000
2,0,5419330,0,339,1,2,0,0,312,0.003195,0.006390,0.000000,0.000000
3,0,4575604,0,338,1,0,0,0,74,0.013333,0.000000,0.000000,0.000000
4,0,57989671,0,337,0,2,0,0,203,0.000000,0.009804,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2526995,0,8490279,2526,661,2,0,0,0,544,0.003670,0.000000,0.000000,0.000000
2526996,0,321606,2526,660,0,0,0,0,1190,0.000000,0.000000,0.000000,0.000000
2526997,0,70545608,2526,659,0,0,0,0,21,0.000000,0.000000,0.000000,0.000000
2526998,0,59592,2526,672,3,1,0,0,2263,0.001325,0.000442,0.000000,0.000000


In [6]:
feature_names = [
    'recall_rank', 'words_overlap_count',
    '2gram_overlap_count', '3gram_overlap_count', '4gram_overlap_count',
    'text_token_length', 'words_overlap_ratio', '2gram_overlap_ratio',
    '3gram_overlap_ratio', '4gram_overlap_ratio'
]

ycol = 'target'

X_train, X_valid, Y_train, Y_valid = train_test_split(train[feature_names],
                                                      train[ycol],
                                                      test_size=0.15,
                                                      random_state=1983)

model = lgb.LGBMClassifier(objective='binary',
                           boosting_type='gbdt',
                           tree_learner='serial',
                           num_leaves=32,
                           max_depth=6,
                           learning_rate=0.1,
                           n_estimators=10000,
                           subsample=0.8,
                           feature_fraction=0.6,
                           reg_alpha=0.1,
                           reg_lambda=0.2,
                           random_state=1983,
                           is_unbalance=True,
                           metric='auc')

lgb_model = model.fit(X_train,
                      Y_train,
                      eval_names=['train', 'valid'],
                      eval_set=[(X_train, Y_train), (X_valid, Y_valid)],
                      verbose=10,
                      eval_metric='auc',
                      early_stopping_rounds=50)

[10]	train's auc: 0.999219	valid's auc: 0.999322
[20]	train's auc: 0.999336	valid's auc: 0.9994
[30]	train's auc: 0.999462	valid's auc: 0.999515
[40]	train's auc: 0.999578	valid's auc: 0.999583
[50]	train's auc: 0.999663	valid's auc: 0.999632
[60]	train's auc: 0.999717	valid's auc: 0.999634
[70]	train's auc: 0.999783	valid's auc: 0.999665
[80]	train's auc: 0.999813	valid's auc: 0.999674
[90]	train's auc: 0.999832	valid's auc: 0.99968
[100]	train's auc: 0.999843	valid's auc: 0.999658
[110]	train's auc: 0.999855	valid's auc: 0.999647
[120]	train's auc: 0.999863	valid's auc: 0.999647
[130]	train's auc: 0.999869	valid's auc: 0.999644


In [7]:
df_importance = pd.DataFrame({
    'column': feature_names,
    'importance': lgb_model.feature_importances_,
})
df_importance.sort_values('importance', ascending=False)

Unnamed: 0,column,importance
5,text_token_length,492
0,recall_rank,466
6,words_overlap_ratio,393
7,2gram_overlap_ratio,304
1,words_overlap_count,270
8,3gram_overlap_ratio,230
2,2gram_overlap_count,177
3,3gram_overlap_count,136
9,4gram_overlap_ratio,95
4,4gram_overlap_count,67


In [9]:
%%time

pred_val = lgb_model.predict_proba(valid[feature_names])

CPU times: user 16.8 s, sys: 145 ms, total: 17 s
Wall time: 1.17 s


In [10]:
df_metric = valid[['id', 'target']].copy()
df_metric['pred_proba'] = pred_val[:, 1]
df_metric = df_metric.sort_values(['id', 'pred_proba'], ascending=[True, False])
df_metric = df_metric.groupby('id').apply(lambda g: g.head(30)).reset_index(drop=True)
df_metric = df_metric.groupby('id')['target'].agg(list).to_frame().reset_index()
df_metric['target'] = df_metric['target'].apply(sum)
print('hitrate:', df_metric[df_metric['target'] > 0].shape[0] / len(df_metric))

hitrate: 0.998021369212505


In [8]:
import joblib
joblib.dump(lgb_model, 'lightgbm_model.pkl')

['lightgbm_model.pkl']

In [10]:
print(val_ids)

[47816, 49283, 47432, 47856, 49271, 47575, 47034, 48170, 48407, 47285, 47779, 47646, 48398, 47469, 47185, 47810, 47495, 47434, 48981, 47341, 48610, 47397, 48154, 47289, 47465, 47110, 47363, 48180, 48914, 48763, 48983, 48911, 47579, 47271, 48390, 48876, 48965, 47912, 48976, 47828, 49138, 48505, 47069, 48582, 47277, 48804, 47281, 47720, 48887, 48318, 47539, 47409, 48773, 48383, 48860, 47492, 47877, 48087, 47536, 47157, 48004, 48183, 47770, 48923, 48387, 48401, 47410, 47562, 47551, 48482, 48526, 48668, 47762, 48729, 49073, 48808, 47292, 47528, 47840, 47512, 48121, 48470, 48954, 47532, 47070, 48653, 48511, 48691, 47933, 47695, 48011, 48883, 47977, 47571, 48994, 48920, 48478, 48704, 48875, 47035, 47852, 47365, 47297, 48639, 48307, 48602, 47255, 49101, 47986, 47827, 49089, 48379, 47426, 48440, 48212, 47944, 48181, 48257, 48898, 49057, 47592, 49188, 48172, 49075, 48461, 49086, 48563, 48785, 48562, 48258, 48549, 47294, 48960, 48500, 47417, 49240, 47775, 47418, 48546, 48748, 48354, 48675, 49158