In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss,explained_variance_score
warnings.filterwarnings('ignore')
import random
from sklearn.model_selection import train_test_split

In [2]:
data_train = pd.read_csv('train.csv')
data_test = pd.read_csv('test.csv')

In [3]:
numerical_fea = list(data_train.select_dtypes(exclude=['object']).columns)
print(numerical_fea)
category_fea = list(filter(lambda x: x not in numerical_fea,list(data_train.columns)))
print(category_fea)
label = 'label'
numerical_fea.remove(label)

['sex', 'address', 'birthDT', 'apply_amount', 'apply_dateDT', 'credit_actual_dt', 'credit_mat_dt', 'credit_lmt', 'credit_term', 'to_credit_lmt', 'to_credit_term', 'draw_amt', 'in_pay_amt', 'loan_used_rate', 'out_pay_amt', 'to_term', 'to_overdue_term', 'max_overdue_loan_bal', 'to_contr', 'typeCT', 'debt', 'label']
['cust_id']


In [4]:
# 部分类别特征
cate_features = ['cust_id','address']
for f in cate_features:
    print(f, '类型数：', data_train[f].nunique())

cust_id 类型数： 8000
address 类型数： 1880


In [5]:
# 类型数在2之上，又不是高维稀疏的,且纯分类特征
# for data in [data_train, data_test]:
#     data = pd.get_dummies(data, columns=['address'], drop_first=True)

In [6]:
def find_outliers_by_3segama(data,fea):
    data_std = np.std(data[fea])
    data_mean = np.mean(data[fea])
    outliers_cut_off = data_std * 3
    lower_rule = data_mean - outliers_cut_off
    upper_rule = data_mean + outliers_cut_off
    data[fea+'_outliers'] = data[fea].apply(lambda x:str('异常值') if x > upper_rule or x < lower_rule else '正常值')
    return data

In [7]:
data_train = data_train.copy()
data_test = data_test.copy()
for fea in numerical_fea:
    data_train = find_outliers_by_3segama(data_train,fea)
    print(data_train[fea+'_outliers'].value_counts())
    print(data_train.groupby(fea+'_outliers')['label'].sum())
    print('*'*10)

正常值    8000
Name: sex_outliers, dtype: int64
sex_outliers
正常值    800
Name: label, dtype: int64
**********
正常值    8000
Name: address_outliers, dtype: int64
address_outliers
正常值    800
Name: label, dtype: int64
**********
正常值    7919
异常值      81
Name: birthDT_outliers, dtype: int64
birthDT_outliers
异常值      7
正常值    793
Name: label, dtype: int64
**********
正常值    7872
异常值     128
Name: apply_amount_outliers, dtype: int64
apply_amount_outliers
异常值      5
正常值    795
Name: label, dtype: int64
**********
正常值    7998
异常值       2
Name: apply_dateDT_outliers, dtype: int64
apply_dateDT_outliers
异常值      0
正常值    800
Name: label, dtype: int64
**********
正常值    8000
Name: credit_actual_dt_outliers, dtype: int64
credit_actual_dt_outliers
正常值    800
Name: label, dtype: int64
**********
正常值    8000
Name: credit_mat_dt_outliers, dtype: int64
credit_mat_dt_outliers
正常值    800
Name: label, dtype: int64
**********
正常值    7921
异常值      79
Name: credit_lmt_outliers, dtype: int64
credit_lmt_outliers
异常值    

In [8]:
data_train['apply_amount_bin1'] = np.floor_divide(data_train['apply_amount'], 1000)
data_test['apply_amount_bin1'] = np.floor_divide(data_test['apply_amount'], 1000)

In [9]:
## 通过对数函数映射到指数宽度分箱
data_train['apply_amount_bin2'] = np.floor(np.log10(data_train['apply_amount']))
data_test['apply_amount_bin2'] = np.floor(np.log10(data_test['apply_amount']))

In [10]:
data_train['apply_amount_bin3'] = pd.qcut(data_train['apply_amount'], 10, labels=False)
data_test['apply_amount_bin3'] = pd.qcut(data_test['apply_amount'], 10, labels=False)

In [11]:
# for col in ['credit_lmt', 'credit_term']: 
#     temp_dict = data_train.groupby([col])['label'].agg(['mean']).reset_index().rename(columns={'mean': col + '_target_mean'})
#     temp_dict.index = temp_dict[col].values
#     temp_dict = temp_dict[col + '_target_mean'].to_dict()
    
#     data_test[col + '_target_mean'] = data_test[col].map(temp_dict)
#     data_train[col + '_target_mean'] = data_train[col].map(temp_dict)


In [12]:
data_train['address']=data_train['address'].astype('int')
data_test['address']=data_test['address'].astype('int')


In [13]:
features = [f for f in data_train.columns if f not in ['cust_id','label'] and '_outliers' not in f]
x_train = data_train[features]
x_test = data_test[features]
y_train = data_train['label']

In [14]:
def get_f1 (preds,dtrain):
    label=dtrain.get_label()
    preds=np.argmax(preds.reshape(len(label),-1), axis=1)
    f1=f1_score(label,preds,average='weighted')
    return 'f1-score',float(f1),True
def cv_model(clf, train_x, train_y, test_x, clf_name):
    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])
    test = np.zeros(test_x.shape[0])

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2020,
                'nthread': 28,
                'n_jobs':24,
                'silent': True,
                'verbose': -1,
            }

            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200,feval=get_f1,early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
            
            # print(list(sorted(zip(features, model.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:20])
        
            
        train[valid_index] = val_pred
        #test = test_pred / kf.n_splits
        test = test_pred
    return train, test

In [15]:
def lgb_model(x_train, y_train, x_test):
    lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb")
    return lgb_train, lgb_test

In [16]:

lgb_train, lgb_test = lgb_model(x_train, y_train, x_test)

************************************ 1 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.993393	training's f1-score: 0.851954	valid_1's auc: 0.736885	valid_1's f1-score: 0.855344
Early stopping, best iteration is:
[1]	training's auc: 0.762302	training's f1-score: 0.851954	valid_1's auc: 0.720297	valid_1's f1-score: 0.855344
************************************ 2 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.993292	training's f1-score: 0.851728	valid_1's auc: 0.743878	valid_1's f1-score: 0.856248
Early stopping, best iteration is:
[1]	training's auc: 0.767853	training's f1-score: 0.851728	valid_1's auc: 0.710651	valid_1's f1-score: 0.856248
************************************ 3 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.992699	training's f1-score: 0.852858	val

In [17]:

label_result = pd.read_csv('label.csv')


# lgb_test_copy = lgb.lgb_test.copy()

for j in range(1,300):
    lgb_test_copy = np.zeros((2000,))
    for i in range(lgb_test.shape[0]):
        if lgb_test[i] > 0.001*j:
            lgb_test_copy[i] = 1
        else:
            lgb_test_copy[i] = 0
        # if i<500 :
        #     print(i," ",lgb_test[i]," ",lgb_test_copy[i])
    s = f1_score(label_result['label'].values, lgb_test_copy)
    print(s)


0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
0.18181818181818182
