In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import warnings
import missingno as msno
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from scipy import sparse
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, auc, roc_auc_score
import xgboost as xgb
from xgboost import XGBClassifier
import seaborn as sns
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [2]:
warnings.filterwarnings("ignore")

In [3]:
train=pd.read_csv(r'C:\Users\Lenovo\Desktop\kesai\train_set.csv',na_values=[-1, "unknown"])
test =pd.read_csv(r'C:\Users\Lenovo\Desktop\kesai\test_set.csv',na_values=[-1, "unknown"])

In [4]:
#把列分为类别列，bin列和连续值列
bincol = ["default", "housing", "loan"]
catcol = ["marital", "education", "contact", "poutcome", "job"]
othercol = ['age', 'balance', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous']

In [5]:
def rename_col(st):
    if st in bincol:
        return st+"_bin"
    elif st in catcol:
        return st + "_cat"
    else:
        return st

train.columns = train.columns.map(lambda x:rename_col(x))
test.columns = test.columns.map(lambda x:rename_col(x))
bincol = list(map(lambda x: x+"_bin", bincol))
catcol = list(map(lambda x: x+"_cat", catcol))

In [6]:
misssing_col = train.isnull().any()[train.isnull().any()]
col_missing = train.isnull().any()[train.isnull().any()].index
miss_cat = [x for x in col_missing if x in catcol] + ["pdays"]
train[miss_cat] = train[miss_cat].replace(np.nan, -1)
test[miss_cat] = test[miss_cat].replace(np.nan, -1)

In [7]:
len_train = len(train)
len_test = len(test)

In [8]:
miss_nocat = [x for x in col_missing if x not in miss_cat]
train['pdays_miss'] = np.zeros(len_train)
train['pdays_miss'] [train.pdays.isnull()] = 1
test['pdays_miss'] = np.zeros(len_test)
test['pdays_miss'] [test.pdays.isnull()] = 1
train[miss_nocat] = train[miss_nocat].replace(np.nan,train[miss_nocat].median())
test[miss_nocat] = test[miss_nocat].replace(np.nan, train[miss_nocat].median())

In [9]:
poutcome_mapping = {-1:-1,"other":0,"success":1,"failure":-2}
train["poutcome_cat"] = train["poutcome_cat"].map(poutcome_mapping)
train["poutcome_pdays"] = train["poutcome_cat"] * train["pdays"]
test["poutcome_cat"] = test["poutcome_cat"].map(poutcome_mapping)
test["poutcome_pdays"] = test["poutcome_cat"] * test["pdays"]
for i in bincol:
    bin_mapping = {"yes":1, "no":0}
    train[i] = train[i].map(bin_mapping)
    test[i] = test[i].map(bin_mapping)

In [10]:
print(train.isnull().any()[train.isnull().any()].index)
test.isnull().any()[test.isnull().any()].index

Index([], dtype='object')


Index([], dtype='object')

In [11]:
data = pd.concat([train,test])
feature=data.columns.tolist()
data.isnull().any()[data.isnull().any()].index


Index(['y'], dtype='object')

In [12]:
feature.remove('ID')
feature.remove('y')
sparse_feature= ['contact_cat','default_bin','education_cat','housing_bin','job_cat','loan_bin','marital_cat','month','poutcome_cat']
dense_feature=list(set(feature)-set(sparse_feature))

In [13]:
def get_new_columns(name,aggs):
    l=[]
    for k in aggs.keys():
        for agg in aggs[k]:
            if str(type(agg))=="<class 'function'>":
                l.append(name + '_' + k + '_' + 'other')
            else:
                l.append(name + '_' + k + '_' + agg)
    return l


In [14]:
for d in tqdm(sparse_feature):
    aggs={}
    for s in sparse_feature:
        aggs[s]=['count','nunique']
    for den in dense_feature:
        aggs[den]=['mean','max','min','std']
    aggs.pop(d)
    temp=data.groupby(d).agg(aggs).reset_index()
    temp.columns=[d]+get_new_columns(d,aggs)
    data=pd.merge(data,temp,on=d,how='left')

  0%|                                                                                            | 0/9 [00:00<?, ?it/s] 11%|█████████▎                                                                          | 1/9 [00:00<00:01,  4.87it/s] 22%|██████████████████▋                                                                 | 2/9 [00:00<00:01,  4.14it/s] 33%|████████████████████████████                                                        | 3/9 [00:01<00:01,  3.01it/s] 44%|█████████████████████████████████████▎                                              | 4/9 [00:01<00:02,  2.04it/s] 56%|██████████████████████████████████████████████▋                                     | 5/9 [00:03<00:02,  1.40it/s] 67%|████████████████████████████████████████████████████████                            | 6/9 [00:04<00:03,  1.03s/it] 78%|█████████████████████████████████████████████████████████████████▎                  | 7/9 [00:07<00:02,  1.41s/it] 89%|██████████████████████████████████

In [15]:
data.isnull().any()[data.isnull().any()].index

Index(['y'], dtype='object')

In [16]:
for s in catcol:
    data=pd.concat([data,pd.get_dummies(data[s],prefix=s+'_')],axis=1)
    data.drop(s,axis=1,inplace=True)
# 月份编码
data=pd.concat([data,pd.get_dummies(data["month"],prefix="month"+'_')],axis=1)
data.drop("month",axis=1,inplace=True)

In [17]:
df_train=data[data['y'].notnull()]
df_test=data[data['y'].isnull()]

target=df_train['y']
df_train_columns=df_train.columns.tolist()
df_train_columns.remove('ID')
df_train_columns.remove('y')

In [18]:
df_train.head()
feature=df_train.columns.tolist()
feature.remove("ID")
feature.remove('y')

In [19]:

X_train, X_test, y_train, y_test = train_test_split(df_train[feature], df_train.y, train_size = 0.25, random_state = 33)

In [None]:
no_features = ["y", "ID"]
features = df_train[[x for x in df_train.columns if x not in no_features]].columns[:]
len(features)

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import model_selection

def get_top_n_features(train_data_X, train_data_Y, top_n_features):

    # random forest
    rf_est = RandomForestClassifier(random_state=0)
    rf_param_grid = {'n_estimators': [500], 'min_samples_split': [2, 3], 'max_depth': [20]}
    rf_grid = model_selection.GridSearchCV(rf_est, rf_param_grid, n_jobs=25, cv=10, verbose=1)
    rf_grid.fit(train_data_X, train_data_Y)
    print('Top N Features Best RF Params:' + str(rf_grid.best_params_))
    print('Top N Features Best RF Score:' + str(rf_grid.best_score_))
    print('Top N Features RF Train Score:' + str(rf_grid.score(train_data_X, train_data_Y)))
    feature_imp_sorted_rf = pd.DataFrame({'feature': list(train_data_X),
                                          'importance': rf_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
    features_top_n_rf = feature_imp_sorted_rf.head(top_n_features)['feature']
    print('Sample 10 Features from RF Classifier')
    print(str(features_top_n_rf[:10]))

    # AdaBoost
    ada_est =AdaBoostClassifier(random_state=0)
    ada_param_grid = {'n_estimators': [500], 'learning_rate': [0.01, 0.1]}
    ada_grid = model_selection.GridSearchCV(ada_est, ada_param_grid, n_jobs=25, cv=10, verbose=1)
    ada_grid.fit(train_data_X, train_data_Y)
    print('Top N Features Best Ada Params:' + str(ada_grid.best_params_))
    print('Top N Features Best Ada Score:' + str(ada_grid.best_score_))
    print('Top N Features Ada Train Score:' + str(ada_grid.score(train_data_X, train_data_Y)))
    feature_imp_sorted_ada = pd.DataFrame({'feature': list(train_data_X),
                                           'importance': ada_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
    features_top_n_ada = feature_imp_sorted_ada.head(top_n_features)['feature']
    print('Sample 10 Feature from Ada Classifier:')
    print(str(features_top_n_ada[:10]))

    # ExtraTree
    et_est = ExtraTreesClassifier(random_state=0)
    et_param_grid = {'n_estimators': [500], 'min_samples_split': [3, 4], 'max_depth': [20]}
    et_grid = model_selection.GridSearchCV(et_est, et_param_grid, n_jobs=25, cv=10, verbose=1)
    et_grid.fit(train_data_X, train_data_Y)
    print('Top N Features Best ET Params:' + str(et_grid.best_params_))
    print('Top N Features Best ET Score:' + str(et_grid.best_score_))
    print('Top N Features ET Train Score:' + str(et_grid.score(train_data_X, train_data_Y)))
    feature_imp_sorted_et = pd.DataFrame({'feature': list(train_data_X),
                                          'importance': et_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
    features_top_n_et = feature_imp_sorted_et.head(top_n_features)['feature']
    print('Sample 10 Features from ET Classifier:')
    print(str(features_top_n_et[:10]))
    
    # GradientBoosting
    gb_est =GradientBoostingClassifier(random_state=0)
    gb_param_grid = {'n_estimators': [500], 'learning_rate': [0.01, 0.1], 'max_depth': [20]}
    gb_grid = model_selection.GridSearchCV(gb_est, gb_param_grid, n_jobs=25, cv=10, verbose=1)
    gb_grid.fit(train_data_X, train_data_Y)
    print('Top N Features Best GB Params:' + str(gb_grid.best_params_))
    print('Top N Features Best GB Score:' + str(gb_grid.best_score_))
    print('Top N Features GB Train Score:' + str(gb_grid.score(train_data_X, train_data_Y)))
    feature_imp_sorted_gb = pd.DataFrame({'feature': list(train_data_X),
                                           'importance': gb_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
    features_top_n_gb = feature_imp_sorted_gb.head(top_n_features)['feature']
    print('Sample 10 Feature from GB Classifier:')
    print(str(features_top_n_gb[:10]))
    
    # DecisionTree
    dt_est = DecisionTreeClassifier(random_state=0)
    dt_param_grid = {'min_samples_split': [2, 4], 'max_depth': [20]}
    dt_grid = model_selection.GridSearchCV(dt_est, dt_param_grid, n_jobs=25, cv=10, verbose=1)
    dt_grid.fit(train_data_X, train_data_Y)
    print('Top N Features Best DT Params:' + str(dt_grid.best_params_))
    print('Top N Features Best DT Score:' + str(dt_grid.best_score_))
    print('Top N Features DT Train Score:' + str(dt_grid.score(train_data_X, train_data_Y)))
    feature_imp_sorted_dt = pd.DataFrame({'feature': list(train_data_X),
                                          'importance': dt_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
    features_top_n_dt = feature_imp_sorted_dt.head(top_n_features)['feature']
    print('Sample 10 Features from DT Classifier:')
    print(str(features_top_n_dt[:10]))
    
    # merge the three models
    features_top_n = pd.concat([features_top_n_rf, features_top_n_ada, features_top_n_et, features_top_n_gb, features_top_n_dt], 
                               ignore_index=True).drop_duplicates()
    
    features_importance = pd.concat([feature_imp_sorted_rf, feature_imp_sorted_ada, feature_imp_sorted_et, 
                                   feature_imp_sorted_gb, feature_imp_sorted_dt],ignore_index=True)
    
    return features_top_n , features_importance

feature_to_pick = 50
feature_top_n, feature_importance = get_top_n_features(df_train[features], df_train.y, feature_to_pick)

Fitting 10 folds for each of 2 candidates, totalling 20 fits
Top N Features Best RF Params:{'max_depth': 20, 'min_samples_split': 3, 'n_estimators': 500}
Top N Features Best RF Score:0.89362878698108
Top N Features RF Train Score:0.9729430817237429
Sample 10 Features from RF Classifier
5                              duration
1                               balance
3                                   day
0                                   age
2                              campaign
10                       poutcome_pdays
8                                 pdays
432    poutcome_cat_education_cat_count
11                             previous
436          poutcome_cat_job_cat_count
Name: feature, dtype: object
Fitting 10 folds for each of 2 candidates, totalling 20 fits
Top N Features Best Ada Params:{'learning_rate': 0.1, 'n_estimators': 500}
Top N Features Best Ada Score:0.9027925899593159
Top N Features Ada Train Score:0.904175060236205
Sample 10 Feature from Ada Classifier:
5          

[Parallel(n_jobs=25)]: Using backend LokyBackend with 25 concurrent workers.
[Parallel(n_jobs=25)]: Done  13 out of  20 | elapsed:  4.2min remaining:  2.2min
[Parallel(n_jobs=25)]: Done  20 out of  20 | elapsed:  4.2min finished
[Parallel(n_jobs=25)]: Using backend LokyBackend with 25 concurrent workers.
[Parallel(n_jobs=25)]: Done  13 out of  20 | elapsed:  9.6min remaining:  5.1min
[Parallel(n_jobs=25)]: Done  20 out of  20 | elapsed:  9.7min finished
[Parallel(n_jobs=25)]: Using backend LokyBackend with 25 concurrent workers.
[Parallel(n_jobs=25)]: Done  13 out of  20 | elapsed:  7.0min remaining:  3.8min
[Parallel(n_jobs=25)]: Done  20 out of  20 | elapsed:  7.2min finished
[Parallel(n_jobs=25)]: Using backend LokyBackend with 25 concurrent workers.
