In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, fbeta_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import warnings
from datetime import datetime
import lightgbm as lgb

warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('data/data_a.csv')
data.replace('\\N', np.NaN, inplace=True)

In [3]:
f_features = ['gprs_fee', 'overrun_flux_fee', 'out_actvcall_dur', 'actvcall_fee',
       'out_activcall_fee', 'monfix_fee', 'gift_acct_amt', 'call_cnt',
       'up_flux', 'down_flux', 'p2psms_up_cnt',
       'p2psms_cmnct_fee', 'p2psms_pkg_fee']
data[f_features] = data[f_features].astype('float')

In [4]:
cat_cols = ['if_family', 'if_group', 'sms_inpkg_ind']
data.sort_values('month', inplace=True)
data.drop_duplicates('phone', inplace=True)
data.reset_index(drop=True, inplace=True)
data[cat_cols] = data[cat_cols].astype('category')

In [5]:
X_train = data[data['month'] == 202001]
X_test = data[data['month'] == 202003]
train_label = pd.read_csv('data/train_label.csv')
X_train = pd.merge(train_label, X_train, on='phone', how='left')
test_label = pd.read_csv('data/to_pred_a.csv')
X_test = pd.merge(test_label, X_test, on='phone', how='left')
y = X_train['label']
drop_cols = ['month', 'label', 'phone']
X_train.drop(drop_cols, axis=1, inplace=True)
X_test.drop(drop_cols, axis=1, inplace=True)
features = X_train.columns

In [6]:
features

Index(['if_family', 'if_group', 'chrg_cnt', 'chrg_amt', 'gprs_fee',
       'overrun_flux_fee', 'out_actvcall_dur', 'actvcall_fee',
       'out_activcall_fee', 'monfix_fee', 'gift_acct_amt', 'call_cnt',
       'up_flux', 'down_flux', 'sms_inpkg_ind', 'p2psms_up_cnt',
       'p2psms_cmnct_fee', 'p2psms_pkg_fee'],
      dtype='object')

In [7]:
KF = StratifiedKFold(n_splits=5, random_state=2020, shuffle=True)
params = {
          'objective':'binary',
          'metric':'binary_error', 
          'learning_rate':0.05, 
          'subsample':0.8, 
          'subsample_freq':3, 
          'colsample_btree':0.8,
          'num_iterations': 10000, 
          'silent':True
}

oof_lgb = np.zeros(len(X_train))
predictions_lgb = np.zeros((len(X_test)))

# 五折交叉验证
for fold_, (trn_idx, val_idx) in enumerate(KF.split(X_train.values, y.values)):
    print("fold n°{}".format(fold_))
    print('trn_idx:',trn_idx)
    print('val_idx:',val_idx)
    trn_data = lgb.Dataset(X_train.iloc[trn_idx][features],label=y.iloc[trn_idx])    
    val_data = lgb.Dataset(X_train.iloc[val_idx][features],label=y.iloc[val_idx])
    num_round = 10000
    clf = lgb.train(
        params,
        trn_data,
        num_round,
        valid_sets = [trn_data, val_data],
        verbose_eval=500,
        early_stopping_rounds=200,  
        categorical_feature=cat_cols,    
    )       
    oof_lgb[val_idx] = clf.predict(X_train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    predictions_lgb[:] += clf.predict(X_test[features], num_iteration=clf.best_iteration) 
print("AUC score: {}".format(roc_auc_score(y, oof_lgb)))
print("F1 score: {}".format(f1_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))
print("Precision score: {}".format(precision_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))
print("Recall score: {}".format(recall_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))

fold n°0
trn_idx: [     0      2      3 ... 433410 433411 433412]
val_idx: [     1     13     15 ... 433402 433408 433409]
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[207]	training's binary_error: 0.00200444	valid_1's binary_error: 0.00203039
fold n°1
trn_idx: [     0      1      2 ... 433410 433411 433412]
val_idx: [     6     16     27 ... 433393 433406 433407]
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[120]	training's binary_error: 0.00196983	valid_1's binary_error: 0.00220343
fold n°2
trn_idx: [     0      1      3 ... 433409 433410 433412]
val_idx: [     2      4      5 ... 433396 433404 433411]
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[160]	training's binary_error: 0.00201886	valid_1's binary_error: 0.00197271
fold n°3
trn_idx: [     1      2      4 ... 433409 433410 433411]
val_idx: [     0      3     10 ... 433394 4

In [42]:
# AUC score: 0.9994364324594397
# F1 score: 0.9300286898310489
# 2809  
# 0.943803
test_label['label'] = [1 if i >=2.5 else 0 for i in predictions_lgb]
test_label.to_csv('submit.csv', index=False)

In [8]:
def find_best_t(y_pred, y_true):
    best_score = f1_score(y_true, np.where(y_pred >= 0.5, 1, 0))
    t = 0.5
    for i in tqdm(range(300, 700)):
        current_score = f1_score(y_true, np.where(y_pred >= i / 1000, 1, 0))
        if current_score > best_score:
            best_score = current_score
            t = i / 1000
            print(f'best score: {best_score}, best t: {t}')
    return t

t = 0.5
t = find_best_t(oof_lgb, y)

100%|████████████████████████████████████████| 400/400 [00:41<00:00,  9.66it/s]


In [38]:
sum(test_label['label'])

2809