In [None]:
# Baseline LightGBM hyperparameters, unpacked into lgb.LGBMClassifier by lgb_model.
# NOTE(review): LightGBM documents that bagging (`subsample`) only takes effect when
# a bagging frequency (`subsample_freq`) > 0 is set -- confirm this is intended.
params = dict(
    num_leaves=491,
    colsample_bytree=0.75,
    subsample=0.75,
    min_data_in_leaf=106,
    objective='binary',
    max_depth=-1,
    learning_rate=0.01,
    boosting_type="gbdt",
    bagging_seed=11,
    metric='auc',
    verbosity=-1,
    reg_alpha=0.45,
    reg_lambda=0.65,
    random_state=47,
    n_jobs=-1,
    n_estimators=10000,
)

In [None]:
# Used to inspect feature importance across the folds of a cross-validation run.
def get_imp(clfs,imp_type='gain',feature_names=None):
    """Collect per-fold LightGBM feature importances and their cross-fold average.

    Parameters
    ----------
    clfs : sequence
        Fitted models, one per fold. Objects exposing ``predict_proba`` are
        treated as scikit-learn wrappers and their ``booster_`` is used;
        anything else is treated as a raw booster exposing
        ``feature_importance``.
    imp_type : str
        Importance type forwarded to ``feature_importance`` (e.g. 'gain').
    feature_names : list of str, optional
        Names for the rows. When omitted they are taken from the first
        booster's ``feature_name()``.

    Returns
    -------
    pandas.DataFrame
        Columns: 'feature', one column per fold ('0', '1', ...), and
        'average' -- the mean taken in log1p space and mapped back with
        expm1, so a feature with zero importance in every fold averages 0.
    """
    boosters = [clf.booster_ if hasattr(clf, 'predict_proba') else clf for clf in clfs]
    if feature_names is None:
        feature_names = boosters[0].feature_name() if boosters else []

    fold_cols = [str(i) for i in range(len(boosters))]
    feature_importances = pd.DataFrame({'feature': list(feature_names)})
    for fold_col, booster in zip(fold_cols, boosters):
        feature_importances[fold_col] = booster.feature_importance(imp_type)

    # expm1 is the exact inverse of log1p; plain exp would inflate every average by 1.
    feature_importances['average'] = np.expm1(np.log1p(feature_importances[fold_cols]).mean(axis=1))
    return feature_importances

In [None]:
def lgb_model(params=None,cv=KFold(5),X=None,y=None,early_stopping_rounds=200,cols=None,categorical_feature=None,group_feature=None):
    """Train one LGBMClassifier per CV fold and report fold, mean and OOF AUC.

    Parameters
    ----------
    params : dict
        Keyword arguments for ``lgb.LGBMClassifier``.
    cv : splitter
        Object with a ``split(X, y[, groups])`` method (default ``KFold(5)``).
    X : pandas.DataFrame
        Feature frame; rows are selected positionally with ``iloc``.
    y : pandas.Series
        Binary target aligned positionally with ``X``.
    early_stopping_rounds : int
        Forwarded to ``clf.fit``.
    cols : list of str, optional
        Feature columns to train on; defaults to every column of ``X``.
        The list is copied and never modified.
    categorical_feature : list, optional
        Forwarded to ``clf.fit`` when truthy.
    group_feature : str, optional
        Column of ``X`` passed as ``groups`` to the splitter; it is excluded
        from the training features.

    Returns
    -------
    list
        The fitted classifiers, one per fold, in fold order.
    """
    folds = cv
    # Copy so that removing the group column never mutates the caller's list.
    columns = list(X.columns) if cols is None else list(cols)
    if group_feature:
        if group_feature in columns:
            columns.remove(group_feature)
        splits = folds.split(X[columns], y, groups=X[group_feature])
    else:
        splits = folds.split(X[columns], y)

    # Rows never placed in a validation fold keep a prediction of 0.
    y_oof = np.zeros(X.shape[0])
    fold_scores = []

    # Only forward categorical_feature when the caller supplied one.
    fit_kwargs = {}
    if categorical_feature:
        fit_kwargs['categorical_feature'] = categorical_feature

    clfs = []
    for fold_n, (train_index, valid_index) in enumerate(splits):
        X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        clf = lgb.LGBMClassifier(**(params or {}))
        # NOTE(review): the `verbose` / `early_stopping_rounds` fit arguments belong to
        # the pre-4.0 LightGBM API; switch to callbacks if the library is upgraded.
        clf.fit(X_train, y_train,
                eval_set=[(X_train, y_train), (X_valid, y_valid)],
                eval_metric='auc', verbose=100,
                early_stopping_rounds=early_stopping_rounds,
                **fit_kwargs)

        y_pred_valid = clf.predict_proba(X_valid)[:, 1]
        y_oof[valid_index] = y_pred_valid
        fold_auc = roc_auc_score(y_valid, y_pred_valid)
        print(f"Fold {fold_n + 1} | AUC: {fold_auc}")
        fold_scores.append(fold_auc)

        del X_train, X_valid, y_train, y_valid
        gc.collect()
        clfs.append(clf)

    # Average over the folds actually run instead of assuming exactly five.
    print(f"\nMean AUC = {np.mean(fold_scores)}")
    print(f"Out of folds AUC = {roc_auc_score(y, y_oof)}")
    return clfs

In [None]:
# Baseline 5-fold run on the selected columns with the initial hyperparameters.
clfs = lgb_model(
    params=params,
    cv=KFold(5),
    X=X,
    y=y,
    early_stopping_rounds=200,
    cols=cols,
    categorical_feature=cat,
    group_feature=None,
)

In [None]:
# Be especially wary of a single overwhelmingly strong feature: importances are best
# when balanced. If one feature dominates and the generalization error is large, that
# feature very likely suffers from distribution shift.
feature_imporatnces = get_imp(clfs, imp_type='gain', feature_names=cols)
_top50_gain = feature_imporatnces.sort_values(by='average', ascending=False).head(50)
plt.figure(figsize=(16, 16))
sns.barplot(data=_top50_gain, x='average', y='feature');
plt.title('50 TOP feature importance over {} folds average gain'.format(5));

In [None]:
# Try adjusting the hyperparameters first; if that brings no improvement, the
# features themselves need work.
# NOTE(review): with max_depth=5 a tree has at most 2**5 = 32 leaves, so the
# num_leaves value set earlier can no longer be reached -- confirm this is intended.
params.update({
    'max_depth': 5,
    'subsample': 0.42,
    'colsample_bytree': 0.38,
    'reg_lambda': 3,
})

In [None]:
# Re-run cross-validation with the adjusted hyperparameters on all columns of X.
# lgb_model has no X_test parameter and requires the target, so pass y and omit X_test.
clfs=lgb_model(params=params,cv=KFold(5),X=X,y=y,early_stopping_rounds=200,categorical_feature=cat)

In [None]:
# Adversarial validation setup: build one frame holding train and test rows and a
# label saying which set each row came from.
lgb_params = dict(
    boosting='gbdt',
    application='binary',
    metric='auc',
    learning_rate=0.1,
    num_leaves=32,
    max_depth=8,
    subsample=0.7,
    colsample_bytree=0.7,
    n_estimators=1000,
)

# Tag each frame with its origin (the flag is dropped from X / X_test in the next cell).
X['is_train'] = 1
X_test['is_train'] = 0

# Stack train on top of test, cast the categorical columns, and renumber rows 0..n-1.
df = pd.concat([X, X_test])
df[cat] = df[cat].astype('category')
df.index = list(range(len(df)))

# NOTE(review): `y` is rebound here to the is_train labels, replacing the target that
# earlier cells passed as `y`; later cells that need the real target must restore it.
y = df.pop('is_train')


In [None]:
# Remove the temporary origin flag from the original frames.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
X.drop(columns='is_train', inplace=True, errors='ignore')
X_test.drop(columns='is_train', inplace=True, errors='ignore')

In [None]:
# Train the train-vs-test classifier. random_state only has an effect together with
# shuffle=True (recent scikit-learn raises ValueError otherwise), so enable shuffling.
Clfs=lgb_model(params=lgb_params ,cv=StratifiedKFold(n_splits = 5, shuffle = True, random_state = 123),X=df,y=y,early_stopping_rounds=100, \
               cols=cols,categorical_feature=cat,group_feature=None)

In [None]:
# See which features drive the distribution difference between train and test.
# The models above were trained on `cols`, so label the importances with that same
# list (same length and order) rather than with every column of df.
feature_importances=get_imp(Clfs,'gain',cols)
plt.figure(figsize=(16,16))
sns.barplot(data=feature_importances.sort_values(by='average', ascending=False).head(50), x='average', y='feature');
plt.title('50 TOP feature importance over {} folds average'.format(5));

In [None]:
# Drop the overly dominant TransactionDT and check how the remaining features behave.
cols=[c for c in df.columns if c != 'TransactionDT']
# random_state only has an effect together with shuffle=True (recent scikit-learn
# raises ValueError otherwise), so enable shuffling.
Clfs=lgb_model(params=lgb_params,cv=StratifiedKFold(n_splits = 5, shuffle = True, random_state = 123),X=df,y=y,early_stopping_rounds=100, \
               cols=cols,categorical_feature=cat,group_feature=None)

In [None]:
# Importance of the remaining features in the latest train-vs-test classifier.
feature_importances = get_imp(Clfs, 'gain', cols)
_top50_avg = feature_importances.sort_values(by='average', ascending=False).head(50)
plt.figure(figsize=(16, 16))
sns.barplot(data=_top50_avg, x='average', y='feature');
plt.title('50 TOP feature importance over {} folds average'.format(5));


In [None]:
# Single-feature check: for each feature, fit a small model on that feature alone
# with 5-fold CV and record train AUC, validation AUC and the gap between them.
cols = ['card1','card2']  # features to validate

# Guard against stale notebook state: the adversarial-validation cell rebinds `y` to
# the train/test flag, which is longer than X and would otherwise be sliced silently.
assert len(X) == len(y), "X and y differ in length; restore the real target in `y` before running this cell"

result = []  # one [flag, train_auc, valid_auc, auc_delta] row per (feature, fold)
for flag, col in enumerate(cols):
    # Per the original author's note, LightGBM raises an error when trained on a
    # single feature, so a constant 'nouse' column is paired with it. Being constant
    # it offers no split, and building it with assign() leaves X itself untouched.
    X_single = X[[col]].assign(nouse=0)

    for train_index, valid_index in KFold(5).split(X_single, y):
        X_train, X_valid = X_single.iloc[train_index], X_single.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        clf = lgb.LGBMClassifier(n_estimators=500, objective='binary',
                num_leaves=491, learning_rate=0.02,n_jobs=-1,max_depth=-1)
        clf.fit(X_train, y_train, eval_metric='auc',
                eval_set=[(X_train,y_train),(X_valid,y_valid)],verbose=-1)

        # best_score_ is the public accessor for the recorded evaluation scores.
        auc_train = np.round(clf.best_score_['training']['auc'],4)
        auc_val = np.round(clf.best_score_['valid_1']['auc'],4)
        result.append([flag,auc_train,auc_val,auc_train-auc_val])

        del X_train, X_valid, y_train, y_valid,clf
        gc.collect()
    del X_single

result=pd.DataFrame(result,columns=['flag','train_auc','valid_auc','auc_delta'])
# Average the folds per feature; rows come out ordered by flag, i.e. in `cols` order.
results = result.groupby('flag')[['train_auc','valid_auc','auc_delta']].mean()
results['feature']=cols
# Forward slashes work on every platform and avoid invalid backslash escapes.
results.to_csv('./output/kfold_kris_feature_distribution_test.csv',index=False)

In [None]:
# Compare against the earlier feature-importance chart.
_top50_train_auc = results.sort_values(by='train_auc', ascending=False).head(50)
plt.figure(figsize=(16, 16))
sns.barplot(data=_top50_train_auc, x='train_auc', y='feature');
plt.title('50 TOP feature importance over {} folds average train_auc'.format(5));

In [None]:
# Compare against the earlier feature-importance chart and the train-AUC chart.
_top50_valid_auc = results.sort_values(by='valid_auc', ascending=False).head(50)
plt.figure(figsize=(16, 16))
sns.barplot(data=_top50_valid_auc, x='valid_auc', y='feature');
plt.title('50 TOP feature importance over {} folds average valid_auc'.format(5));

In [None]:
# Chart of the train-minus-validation AUC gap per feature.
_top50_auc_delta = results.sort_values(by='auc_delta', ascending=False).head(50)
plt.figure(figsize=(16, 16))
sns.barplot(data=_top50_auc_delta, x='auc_delta', y='feature');
plt.title('50 TOP feature importance over {} folds average delta_auc'.format(5));

In [None]:
# Features whose validation AUC is at or below 0.5 (the comparison is <=) are treated
# as harmful and listed for removal.
_is_harmful = results['valid_auc'] <= 0.5
to_drops = results.loc[_is_harmful, 'feature'].tolist()