In [85]:
from __future__ import division
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
import time

In [86]:
train = pd.read_csv('../data/input/train.csv')
entbase = pd.read_csv('../data/input/1entbase.csv')
alter = pd.read_csv('../data/input/2alter.csv')
branch = pd.read_csv('../data/input/3branch.csv')
invest = pd.read_csv('../data/input/4invest.csv')
right = pd.read_csv('../data/input/5right.csv')
project = pd.read_csv('../data/input/6project.csv')
lawsuit = pd.read_csv('../data/input/7lawsuit.csv')
breakfaith = pd.read_csv('../data/input/8breakfaith.csv')
recruit = pd.read_csv('../data/input/9recruit.csv')
qualification = pd.read_csv('../data/input/10qualification.csv', encoding='gbk')
test = pd.read_csv('../data/input/evaluation_public.csv')

In [87]:
def translate_date(date):
    year = int(date[:4])
    month = int(date[-2:])
    return (year - 2010) * 12 + month

In [192]:
def get_entbase_feature(df):
    df = df.copy()
    
    mydf = df.fillna(value={'ZCZB': 0, 'MPNUM': 0, 'INUM': 0, 'ENUM': 0, 'FINZB': 0, 'FSTINUM': 0, 'TZINUM': 0})  # 未处理 HY；ZCZB 为 0 表示缺失或错误
    
    zczb_gb_prov = mydf.groupby('PROV')['ZCZB'].agg([sum, min, max, np.mean, np.ptp]).reset_index()
    tmp = pd.merge(mydf, zczb_gb_prov, how='left', on='PROV')
    mydf['ent_zczb/sum_gb_prov'] = mydf['ZCZB'] / tmp['sum']
    mydf['ent_zczb-min_gb_prov'] = mydf['ZCZB'] - tmp['min']
    mydf['ent_zczb-max_gb_prov'] = mydf['ZCZB'] - tmp['max']
    mydf['ent_zczb-mean_gb_prov'] = mydf['ZCZB'] - tmp['mean']
    
    # bad
    zczb_gb_rgyear = mydf.groupby('RGYEAR')['ZCZB'].agg([sum, min, max, np.mean, np.ptp]).reset_index()
    tmp = pd.merge(mydf, zczb_gb_rgyear, how='left', on='RGYEAR')
    mydf['ent_zczb/sum_gb_rgyear'] = mydf['ZCZB'] / tmp['sum']
    mydf['ent_zczb-min_gb_rgyear'] = mydf['ZCZB'] - tmp['min']
    mydf['ent_zczb-max_gb_rgyear'] = mydf['ZCZB'] - tmp['max']
    mydf['ent_zczb-mean_gb_rgyear'] = mydf['ZCZB'] - tmp['mean']
    
    zczb_gb_hy = mydf.groupby('HY')['ZCZB'].agg([sum, min, max, np.mean, np.ptp]).reset_index()
    tmp = pd.merge(mydf, zczb_gb_hy, how='left', on='HY')
    mydf['ent_zczb/sum_gb_hy'] = mydf['ZCZB'] / tmp['sum']
    mydf['ent_zczb-min_gb_hy'] = mydf['ZCZB'] - tmp['min']
    mydf['ent_zczb-max_gb_hy'] = mydf['ZCZB'] - tmp['max']
    mydf['ent_zczb-mean_gb_hy'] = mydf['ZCZB'] - tmp['mean']
    
    zczb_gb_etype = mydf.groupby('ETYPE')['ZCZB'].agg([sum, min, max, np.mean, np.ptp]).reset_index()
    tmp = pd.merge(mydf, zczb_gb_etype, how='left', on='ETYPE')
    mydf['ent_zczb/sum_gb_etype'] = mydf['ZCZB'] / tmp['sum']
    mydf['ent_zczb-min_gb_etype'] = mydf['ZCZB'] - tmp['min']
    mydf['ent_zczb-max_gb_etype'] = mydf['ZCZB'] - tmp['max']
    mydf['ent_zczb-mean_gb_etype'] = mydf['ZCZB'] - tmp['mean']
    
    ##### bad
    mpnum_gb_prov = mydf.groupby('PROV')['MPNUM'].agg([sum, min, max, np.median, np.ptp]).reset_index()
    tmp = pd.merge(mydf, mpnum_gb_prov, how='left', on='PROV')
    mydf['ent_mpnum/sum_gb_prov'] = mydf['MPNUM'] / tmp['sum']
    mydf['ent_mpnum-min_gb_prov'] = mydf['MPNUM'] - tmp['min']
    mydf['ent_mpnum-max_gb_prov'] = mydf['MPNUM'] - tmp['max']
    mydf['ent_mpnum-median_gb_prov'] = mydf['MPNUM'] - tmp['median']
    
    mpnum_gb_rgyear = mydf.groupby('RGYEAR')['MPNUM'].agg([sum, min, max, np.median, np.ptp]).reset_index()
    tmp = pd.merge(mydf, mpnum_gb_rgyear, how='left', on='RGYEAR')
    mydf['ent_mpnum/sum_gb_rgyear'] = mydf['MPNUM'] / tmp['sum']
    mydf['ent_mpnum-min_gb_rgyear'] = mydf['MPNUM'] - tmp['min']
    mydf['ent_mpnum-max_gb_rgyear'] = mydf['MPNUM'] - tmp['max']
    mydf['ent_mpnum-median_gb_rgyear'] = mydf['MPNUM'] - tmp['median']

    mpnum_gb_hy = mydf.groupby('HY')['MPNUM'].agg([sum, min, max, np.median, np.ptp]).reset_index()
    tmp = pd.merge(mydf, mpnum_gb_hy, how='left', on='HY')
    mydf['ent_mpnum/sum_gb_hy'] = mydf['MPNUM'] / tmp['sum']
    mydf['ent_mpnum-min_gb_hy'] = mydf['MPNUM'] - tmp['min']
    mydf['ent_mpnum-max_gb_hy'] = mydf['MPNUM'] - tmp['max']
    mydf['ent_mpnum-median_gb_hy'] = mydf['MPNUM'] - tmp['median']

    mpnum_gb_etype = mydf.groupby('ETYPE')['MPNUM'].agg([sum, min, max, np.median, np.ptp]).reset_index()
    tmp = pd.merge(mydf, mpnum_gb_etype, how='left', on='ETYPE')
    mydf['ent_mpnum/sum_gb_etype'] = mydf['MPNUM'] / tmp['sum']
    mydf['ent_mpnum-min_gb_etype'] = mydf['MPNUM'] - tmp['min']
    mydf['ent_mpnum-max_gb_etype'] = mydf['MPNUM'] - tmp['max']
    mydf['ent_mpnum-median_gb_etype'] = mydf['MPNUM'] - tmp['median']
    #####
    
    #####
    inum_gb_prov = mydf.groupby('PROV')['INUM'].agg([sum, min, max, np.median, np.ptp]).reset_index()
    tmp = pd.merge(mydf, inum_gb_prov, how='left', on='PROV')
    mydf['ent_inum/sum_gb_prov'] = mydf['INUM'] / tmp['sum']
    mydf['ent_inum-min_gb_prov'] = mydf['INUM'] - tmp['min']
    mydf['ent_inum-max_gb_prov'] = mydf['INUM'] - tmp['max']
    mydf['ent_inum-median_gb_prov'] = mydf['INUM'] - tmp['median']

    inum_gb_rgyear = mydf.groupby('RGYEAR')['INUM'].agg([sum, min, max, np.median, np.ptp]).reset_index()
    tmp = pd.merge(mydf, inum_gb_rgyear, how='left', on='RGYEAR')
    mydf['ent_inum/sum_gb_rgyear'] = mydf['INUM'] / tmp['sum']
    mydf['ent_inum-min_gb_rgyear'] = mydf['INUM'] - tmp['min']
    mydf['ent_inum-max_gb_rgyear'] = mydf['INUM'] - tmp['max']
    mydf['ent_inum-median_gb_rgyear'] = mydf['INUM'] - tmp['median']

    inum_gb_hy = mydf.groupby('HY')['INUM'].agg([sum, min, max, np.median, np.ptp]).reset_index()
    tmp = pd.merge(mydf, inum_gb_hy, how='left', on='HY')
    mydf['ent_inum/sum_gb_hy'] = mydf['INUM'] / tmp['sum']
    mydf['ent_inum-min_gb_hy'] = mydf['INUM'] - tmp['min']
    mydf['ent_inum-max_gb_hy'] = mydf['INUM'] - tmp['max']
    mydf['ent_inum-median_gb_hy'] = mydf['INUM'] - tmp['median']

    inum_gb_etype = mydf.groupby('ETYPE')['INUM'].agg([sum, min, max, np.median, np.ptp]).reset_index()
    tmp = pd.merge(mydf, inum_gb_etype, how='left', on='ETYPE')
    mydf['ent_inum/sum_gb_etype'] = mydf['INUM'] / tmp['sum']
    mydf['ent_inum-min_gb_etype'] = mydf['INUM'] - tmp['min']
    mydf['ent_inum-max_gb_etype'] = mydf['INUM'] - tmp['max']
    mydf['ent_inum-median_gb_etype'] = mydf['INUM'] - tmp['median']
    #####
    
    #####
    enum_gb_prov = mydf.groupby('PROV')['ENUM'].agg([sum, min, max, np.median, np.ptp]).reset_index()
    tmp = pd.merge(mydf, enum_gb_prov, how='left', on='PROV')
    mydf['ent_enum/sum_gb_prov'] = mydf['ENUM'] / tmp['sum']
    mydf['ent_enum-min_gb_prov'] = mydf['ENUM'] - tmp['min']
    mydf['ent_enum-max_gb_prov'] = mydf['ENUM'] - tmp['max']
    mydf['ent_enum-median_gb_prov'] = mydf['ENUM'] - tmp['median']

    enum_gb_rgyear = mydf.groupby('RGYEAR')['ENUM'].agg([sum, min, max, np.median, np.ptp]).reset_index()
    tmp = pd.merge(mydf, enum_gb_rgyear, how='left', on='RGYEAR')
    mydf['ent_enum/sum_gb_rgyear'] = mydf['ENUM'] / tmp['sum']
    mydf['ent_enum-min_gb_rgyear'] = mydf['ENUM'] - tmp['min']
    mydf['ent_enum-max_gb_rgyear'] = mydf['ENUM'] - tmp['max']
    mydf['ent_enum-median_gb_rgyear'] = mydf['ENUM'] - tmp['median']

    enum_gb_hy = mydf.groupby('HY')['ENUM'].agg([sum, min, max, np.median, np.ptp]).reset_index()
    tmp = pd.merge(mydf, enum_gb_hy, how='left', on='HY')
    mydf['ent_enum/sum_gb_hy'] = mydf['ENUM'] / tmp['sum']
    mydf['ent_enum-min_gb_hy'] = mydf['ENUM'] - tmp['min']
    mydf['ent_enum-max_gb_hy'] = mydf['ENUM'] - tmp['max']
    mydf['ent_enum-median_gb_hy'] = mydf['ENUM'] - tmp['median']

    enum_gb_etype = mydf.groupby('ETYPE')['ENUM'].agg([sum, min, max, np.median, np.ptp]).reset_index()
    tmp = pd.merge(mydf, enum_gb_etype, how='left', on='ETYPE')
    mydf['ent_enum/sum_gb_etype'] = mydf['ENUM'] / tmp['sum']
    mydf['ent_enum-min_gb_etype'] = mydf['ENUM'] - tmp['min']
    mydf['ent_enum-max_gb_etype'] = mydf['ENUM'] - tmp['max']
    mydf['ent_enum-median_gb_etype'] = mydf['ENUM'] - tmp['median']
    #####
    
    #####
    finzb_gb_prov = mydf.groupby('PROV')['FINZB'].agg([sum, min, max, np.mean, np.ptp]).reset_index()
    tmp = pd.merge(mydf, finzb_gb_prov, how='left', on='PROV')
    mydf['ent_finzb/sum_gb_prov'] = mydf['FINZB'] / tmp['sum']
    mydf['ent_finzb-min_gb_prov'] = mydf['FINZB'] - tmp['min']
    mydf['ent_finzb-max_gb_prov'] = mydf['FINZB'] - tmp['max']
    mydf['ent_finzb-mean_gb_prov'] = mydf['FINZB'] - tmp['mean']

    finzb_gb_rgyear = mydf.groupby('RGYEAR')['FINZB'].agg([sum, min, max, np.mean, np.ptp]).reset_index()
    tmp = pd.merge(mydf, finzb_gb_rgyear, how='left', on='RGYEAR')
    mydf['ent_finzb/sum_gb_rgyear'] = mydf['FINZB'] / tmp['sum']
    mydf['ent_finzb-min_gb_rgyear'] = mydf['FINZB'] - tmp['min']
    mydf['ent_finzb-max_gb_rgyear'] = mydf['FINZB'] - tmp['max']
    mydf['ent_finzb-mean_gb_rgyear'] = mydf['FINZB'] - tmp['mean']

    finzb_gb_hy = mydf.groupby('HY')['FINZB'].agg([sum, min, max, np.mean, np.ptp]).reset_index()
    tmp = pd.merge(mydf, finzb_gb_hy, how='left', on='HY')
    mydf['ent_finzb/sum_gb_hy'] = mydf['FINZB'] / tmp['sum']
    mydf['ent_finzb-min_gb_hy'] = mydf['FINZB'] - tmp['min']
    mydf['ent_finzb-max_gb_hy'] = mydf['FINZB'] - tmp['max']
    mydf['ent_finzb-mean_gb_hy'] = mydf['FINZB'] - tmp['mean']

    finzb_gb_etype = mydf.groupby('ETYPE')['FINZB'].agg([sum, min, max, np.mean, np.ptp]).reset_index()
    tmp = pd.merge(mydf, finzb_gb_etype, how='left', on='ETYPE')
    mydf['ent_finzb/sum_gb_etype'] = mydf['FINZB'] / tmp['sum']
    mydf['ent_finzb-min_gb_etype'] = mydf['FINZB'] - tmp['min']
    mydf['ent_finzb-max_gb_etype'] = mydf['FINZB'] - tmp['max']
    mydf['ent_finzb-mean_gb_etype'] = mydf['FINZB'] - tmp['mean']
    #####
    
    ##### bad
    fstinum_gb_prov = mydf.groupby('PROV')['FSTINUM'].agg([sum, min, max, np.median, np.ptp]).reset_index()
    tmp = pd.merge(mydf, fstinum_gb_prov, how='left', on='PROV')
    mydf['ent_fstinum/sum_gb_prov'] = mydf['FSTINUM'] / tmp['sum']
    mydf['ent_fstinum-min_gb_prov'] = mydf['FSTINUM'] - tmp['min']
    mydf['ent_fstinum-max_gb_prov'] = mydf['FSTINUM'] - tmp['max']
    mydf['ent_fstinum-median_gb_prov'] = mydf['FSTINUM'] - tmp['median']

    fstinum_gb_rgyear = mydf.groupby('RGYEAR')['FSTINUM'].agg([sum, min, max, np.median, np.ptp]).reset_index()
    tmp = pd.merge(mydf, fstinum_gb_rgyear, how='left', on='RGYEAR')
    mydf['ent_fstinum/sum_gb_rgyear'] = mydf['FSTINUM'] / tmp['sum']
    mydf['ent_fstinum-min_gb_rgyear'] = mydf['FSTINUM'] - tmp['min']
    mydf['ent_fstinum-max_gb_rgyear'] = mydf['FSTINUM'] - tmp['max']
    mydf['ent_fstinum-median_gb_rgyear'] = mydf['FSTINUM'] - tmp['median']

    fstinum_gb_hy = mydf.groupby('HY')['FSTINUM'].agg([sum, min, max, np.median, np.ptp]).reset_index()
    tmp = pd.merge(mydf, fstinum_gb_hy, how='left', on='HY')
    mydf['ent_fstinum/sum_gb_hy'] = mydf['FSTINUM'] / tmp['sum']
    mydf['ent_fstinum-min_gb_hy'] = mydf['FSTINUM'] - tmp['min']
    mydf['ent_fstinum-max_gb_hy'] = mydf['FSTINUM'] - tmp['max']
    mydf['ent_fstinum-median_gb_hy'] = mydf['FSTINUM'] - tmp['median']

    fstinum_gb_etype = mydf.groupby('ETYPE')['FSTINUM'].agg([sum, min, max, np.median, np.ptp]).reset_index()
    tmp = pd.merge(mydf, fstinum_gb_etype, how='left', on='ETYPE')
    mydf['ent_fstinum/sum_gb_etype'] = mydf['FSTINUM'] / tmp['sum']
    mydf['ent_fstinum-min_gb_etype'] = mydf['FSTINUM'] - tmp['min']
    mydf['ent_fstinum-max_gb_etype'] = mydf['FSTINUM'] - tmp['max']
    mydf['ent_fstinum-median_gb_etype'] = mydf['FSTINUM'] - tmp['median']
    #####
    
    ##### bad
    tzinum_gb_prov = mydf.groupby('PROV')['TZINUM'].agg([sum, min, max, np.median, np.ptp]).reset_index()
    tmp = pd.merge(mydf, tzinum_gb_prov, how='left', on='PROV')
    mydf['ent_tzinum/sum_gb_prov'] = mydf['TZINUM'] / tmp['sum']
    mydf['ent_tzinum-min_gb_prov'] = mydf['TZINUM'] - tmp['min']
    mydf['ent_tzinum-max_gb_prov'] = mydf['TZINUM'] - tmp['max']
    mydf['ent_tzinum-median_gb_prov'] = mydf['TZINUM'] - tmp['median']

    tzinum_gb_rgyear = mydf.groupby('RGYEAR')['TZINUM'].agg([sum, min, max, np.median, np.ptp]).reset_index()
    tmp = pd.merge(mydf, tzinum_gb_rgyear, how='left', on='RGYEAR')
    mydf['ent_tzinum/sum_gb_rgyear'] = mydf['TZINUM'] / tmp['sum']
    mydf['ent_tzinum-min_gb_rgyear'] = mydf['TZINUM'] - tmp['min']
    mydf['ent_tzinum-max_gb_rgyear'] = mydf['TZINUM'] - tmp['max']
    mydf['ent_tzinum-median_gb_rgyear'] = mydf['TZINUM'] - tmp['median']

    tzinum_gb_hy = mydf.groupby('HY')['TZINUM'].agg([sum, min, max, np.median, np.ptp]).reset_index()
    tmp = pd.merge(mydf, tzinum_gb_hy, how='left', on='HY')
    mydf['ent_tzinum/sum_gb_hy'] = mydf['TZINUM'] / tmp['sum']
    mydf['ent_tzinum-min_gb_hy'] = mydf['TZINUM'] - tmp['min']
    mydf['ent_tzinum-max_gb_hy'] = mydf['TZINUM'] - tmp['max']
    mydf['ent_tzinum-median_gb_hy'] = mydf['TZINUM'] - tmp['median']

    tzinum_gb_etype = mydf.groupby('ETYPE')['TZINUM'].agg([sum, min, max, np.median, np.ptp]).reset_index()
    tmp = pd.merge(mydf, tzinum_gb_etype, how='left', on='ETYPE')
    mydf['ent_tzinum/sum_gb_etype'] = mydf['TZINUM'] / tmp['sum']
    mydf['ent_tzinum-min_gb_etype'] = mydf['TZINUM'] - tmp['min']
    mydf['ent_tzinum-max_gb_etype'] = mydf['TZINUM'] - tmp['max']
    mydf['ent_tzinum-median_gb_etype'] = mydf['TZINUM'] - tmp['median']
    #####
    
#     #####
#     mydf['mpnum*inum'] = mydf['MPNUM'] * mydf['INUM']
#     mydf['mpnum*enum'] = mydf['MPNUM'] * mydf['ENUM']
#     mydf['mpnum*fstinum'] = mydf['MPNUM'] * mydf['FSTINUM']
#     mydf['mpnum*tzinum'] = mydf['MPNUM'] * mydf['TZINUM']
#     mydf['inum*enum'] = mydf['INUM'] * mydf['ENUM']
#     mydf['inum*fstinum'] = mydf['INUM'] * mydf['FSTINUM']
#     mydf['inum*tzinum'] = mydf['INUM'] * mydf['TZINUM']
#     mydf['enum*fstinum'] = mydf['ENUM'] * mydf['FSTINUM']
#     mydf['enum*tzinum'] = mydf['ENUM'] * mydf['TZINUM']
#     mydf['fstinum*tzinum'] = mydf['FSTINUM'] * mydf['TZINUM']  
#     #####
    
#     ##### bad
#     mydf['mpnum*inum*enum'] = mydf['MPNUM'] * mydf['INUM'] * mydf['ENUM']
#     mydf['mpnum*inum*fstinum'] = mydf['MPNUM'] * mydf['INUM'] * mydf['FSTINUM']
#     mydf['mpnum*inum*tzinum'] = mydf['MPNUM'] * mydf['INUM'] * mydf['TZINUM']
#     mydf['mpnum*enum*fstinum'] = mydf['MPNUM'] * mydf['ENUM'] * mydf['FSTINUM']
#     mydf['mpnum*enum*tzinum'] = mydf['MPNUM'] * mydf['ENUM'] * mydf['TZINUM']
#     mydf['mpnum*fstinum*tzinum'] = mydf['MPNUM'] * mydf['FSTINUM'] * mydf['TZINUM']
#     mydf['inum*enum*fstinum'] = mydf['INUM'] * mydf['ENUM'] * mydf['FSTINUM']
#     mydf['inum*enum*tzinum'] = mydf['INUM'] * mydf['ENUM'] * mydf['TZINUM']
#     mydf['inum*fstinum*tzinum'] = mydf['INUM'] * mydf['FSTINUM'] * mydf['TZINUM']
#     mydf['enum*fstinum*tzinum'] = mydf['ENUM'] * mydf['FSTINUM'] * mydf['TZINUM']
#     #####
    
#     #####
#     mydf['mpnum*inum*enum*fstinum'] = mydf['MPNUM'] * mydf['INUM'] * mydf['ENUM'] * mydf['FSTINUM']
#     mydf['mpnum*inum*enum*tzinum'] = mydf['MPNUM'] * mydf['INUM'] * mydf['ENUM'] * mydf['TZINUM']
#     mydf['mpnum*inum*fstinum*tzinum'] = mydf['MPNUM'] * mydf['INUM'] * mydf['FSTINUM'] * mydf['TZINUM']
#     mydf['mpnum*enum*fstinum*tzinum'] = mydf['MPNUM'] * mydf['ENUM'] * mydf['FSTINUM'] * mydf['TZINUM']
#     mydf['inum*enum*fstinum*tzinum'] = mydf['INUM'] * mydf['ENUM'] * mydf['FSTINUM'] * mydf['TZINUM']
    
#     mydf['mpnum*inum*enum*fstinum*tzinum'] = mydf['MPNUM'] * mydf['INUM'] * mydf['ENUM'] * mydf['FSTINUM'] * mydf['TZINUM']
#     #####
    
#     #####
#     mydf['mpnum+inum+enum+fstinum+tzinum'] = mydf['MPNUM'] + mydf['INUM'] + mydf['ENUM'] + mydf['FSTINUM'] + mydf['TZINUM']
    
#     mydf['mpnum+inum*enum*fstinum*tzinum'] = mydf['MPNUM'] + mydf['inum*enum*fstinum*tzinum']
#     mydf['inum+mpnum*enum*fstinum*tzinum'] = mydf['INUM'] + mydf['mpnum*enum*fstinum*tzinum']
#     mydf['enum+mpnum*inum*fstinum*tzinum'] = mydf['ENUM'] + mydf['mpnum*inum*fstinum*tzinum']
#     mydf['fstinum+mpnum*inum*enum*tzinum'] = mydf['FSTINUM'] + mydf['mpnum*inum*enum*tzinum']
#     mydf['tzinum+mpnum*inum*enum*fstinum'] = mydf['TZINUM'] + mydf['mpnum*inum*enum*fstinum']
#     #####
    
    return mydf

In [89]:
def get_alter_feature(df):
    df = df.copy()

    alt_no = df.groupby(['EID', 'ALTERNO']).size().reset_index()
    alt_no = alt_no.groupby('EID')[0].agg([sum, len]).reset_index()
    alt_no.columns = ['EID', 'alt_count', 'alt_types_count']

    alt_no_oh = df.groupby(['EID', 'ALTERNO']).size().unstack().reset_index()
    alt_no_oh.columns = [i if i == 'EID' else 'alt_' + i for i in alt_no_oh.columns]

    df['date'] = df['ALTDATE'].apply(translate_date)
    date = df.groupby('EID')['date'].agg([min, max, np.ptp, np.std]).reset_index()
    date.columns = ['EID', 'alt_date_min', 'alt_date_max', 'alt_date_ptp', 'alt_date_std']

    df['altbe'] = df['ALTBE'].str.extract('(\d+\.?\d*)').astype(float)
    df['altaf'] = df['ALTAF'].str.extract('(\d+\.?\d*)').astype(float)
    alt_be_af = df.groupby('EID')['altbe', 'altaf'].agg([min, max, np.mean]).reset_index()
    alt_be_af.columns = ['EID', 'alt_be_min', 'alt_be_max', 'alt_be_mean', 'alt_af_min', 'alt_af_max', 'alt_af_mean']

    mydf = pd.merge(alt_no, alt_no_oh, how='left', on='EID')
    mydf = pd.merge(mydf, date, how='left', on='EID')
    mydf = pd.merge(mydf, alt_be_af, how='left', on='EID')

    return mydf

In [90]:
def get_right_feature(df):
    df = df.copy()
    
    rig_type = df.groupby(['EID', 'RIGHTTYPE']).size().reset_index()
    rig_type = rig_type.groupby('EID')[0].agg([sum, len]).reset_index()
    rig_type.columns = ['EID', 'rig_count', 'rig_types_count']
    
    rig_type_oh_rate = df.groupby(['EID', 'RIGHTTYPE']).size().unstack().reset_index()
    rig_type_oh_rate.iloc[:, 1:] = rig_type_oh_rate.iloc[:, 1:].div(rig_type['rig_count'], axis='index')
    rig_type_oh_rate.columns = [i if i == 'EID' else 'rig_rate_' + str(i) for i in rig_type_oh_rate.columns]
    
    df['ask_month'] = (pd.to_datetime(df['ASKDATE']).dt.to_period("M") - (pd.to_datetime('2010-01-01').to_period("M"))).fillna(-999).astype(int).replace(-999, np.NaN)
    ask_date = df.groupby('EID')['ask_month'].agg([max, min, np.ptp, np.std]).reset_index()
    ask_date.columns = ['EID', 'rig_askdate_max', 'rig_askdate_min', 'rig_askdate_ptp', 'rig_askdate_std']

    df['get_month'] = (pd.to_datetime(df['FBDATE']).dt.to_period("M") - (pd.to_datetime('2010-01-01').to_period("M"))).fillna(-999).astype(int).replace(-999, np.NaN)
    get_date = df.groupby('EID')['get_month'].agg([max, min, np.ptp, np.std]).reset_index()
    get_date.columns = ['EID', 'rig_getdate_max', 'rig_getdate_min', 'rig_getdate_ptp', 'rig_getdate_std']
    
    # bad
    unget = df[df.FBDATE.isnull()]
    unget = unget.groupby('EID').size().reset_index()
    unget.columns = ['EID', 'rig_unget_num']
    
    right_1year = df[df['ASKDATE'] >= '2015-01'].groupby('EID')['ASKDATE'].count().reset_index()
    right_1year.columns = ['EID', 'ask_num(1year)']
    right_2year = df[df['ASKDATE'] >= '2014-01'].groupby('EID')['ASKDATE'].count().reset_index()
    right_2year.columns = ['EID', 'ask_num(2year)']
    right_5year = df[df['ASKDATE'] >= '2010-01'].groupby('EID')['ASKDATE'].count().reset_index()
    right_5year.columns = ['EID', 'ask_num(5year)']
    right_end_1year = df[df['FBDATE'] >= '2015-01'].groupby('EID')['FBDATE'].count().reset_index()
    right_end_1year.columns = ['EID', 'get_num(1year)']
    right_end_2year = df[df['FBDATE'] >= '2014-01'].groupby('EID')['FBDATE'].count().reset_index()
    right_end_2year.columns = ['EID', 'get_num(2year)']
    right_end_5year = df[df['FBDATE'] >= '2010-01'].groupby('EID')['FBDATE'].count().reset_index()
    right_end_5year.columns = ['EID', 'get_num(5year)']
    
    mydf = pd.merge(rig_type, rig_type_oh_rate, how='left', on='EID')
    mydf = pd.merge(mydf, ask_date, how='left', on='EID')
    mydf = pd.merge(mydf, get_date, how='left', on='EID')
    mydf = pd.merge(mydf, unget, how='left', on='EID')
    mydf = pd.merge(mydf, right_1year, how='left', on='EID')
    mydf = pd.merge(mydf, right_2year, how='left', on='EID')
    mydf = pd.merge(mydf, right_5year, how='left', on='EID')
    mydf = pd.merge(mydf, right_end_1year, how='left', on='EID')
    mydf = pd.merge(mydf, right_end_2year, how='left', on='EID')
    mydf = pd.merge(mydf, right_end_5year, how='left', on='EID')
    
    # bad
    mydf['ask_rate(1year)'] = mydf['ask_num(1year)'] / mydf['rig_count']
    mydf['ask_rate(2year)'] = mydf['ask_num(2year)'] / mydf['rig_count']
    mydf['ask_rate(5year)'] = mydf['ask_num(5year)'] / mydf['rig_count']
    mydf['get_rate(1year)'] = mydf['get_num(1year)'] / mydf['rig_count']
    mydf['get_rate(2year)'] = mydf['get_num(2year)'] / mydf['rig_count']
    mydf['get_rate(5year)'] = mydf['get_num(5year)'] / mydf['rig_count']

    return mydf

In [91]:
def get_recruit_feature(df):
    df = df.copy()
    
    rec_wz = df.groupby(['EID', 'WZCODE']).size().reset_index()
    rec_wz = rec_wz.groupby('EID')[0].agg([sum, len]).reset_index()
    rec_wz.columns = ['EID', 'rec_wz_count', 'rec_wz_types_count']
    
    # bad
    rec_wz_oh = df.groupby(['EID', 'WZCODE']).size().unstack().reset_index()
    rec_wz_oh.columns = [i if i == 'EID' else 'rec_wz_' + i for i in rec_wz_oh.columns]
    
    # bad
    rec_pos = df.groupby(['EID', 'POSCODE']).size().reset_index()
    rec_pos = rec_pos.groupby('EID')[0].agg([sum, len]).reset_index()
    rec_pos.columns = ['EID', 'rec_pos_count', 'rec_pos_types_count']
    
    df['recdate'] = (pd.to_datetime(df['RECDATE']).dt.to_period("M") - (pd.to_datetime('2010-01-01').to_period("M"))).fillna(-999).astype(int).replace(-999, np.NaN)
    rec_date = df.groupby('EID')['recdate'].agg([max, min, np.ptp, np.std]).reset_index()
    rec_date.columns = ['EID', 'rec_date_max', 'rec_date_min', 'rec_date_ptp', 'rec_date_std']
    
    # bad
    df['pnum'] = df['PNUM'].str.extract('(\d+)').fillna(1).astype(int)  # 若干=1
    rec_num = df.groupby('EID')['pnum'].agg([sum, max, min, np.ptp, np.std]).reset_index()
    rec_num.columns = ['EID' if i == 'EID' else 'rec_num_' + i for i in rec_num.columns]
    
    mydf = pd.merge(rec_wz, rec_wz_oh, how='left', on='EID')
    mydf = pd.merge(mydf, rec_pos, how='left', on='EID')
    mydf = pd.merge(mydf, rec_date, how='left', on='EID')
    mydf = pd.merge(mydf, rec_num, how='left', on='EID')

    return mydf

In [92]:
def get_branch_feature(df):
    df = df.copy()
    
    bra_num = df.groupby('EID')['TYPECODE'].size().reset_index()
    bra_num.columns = ['EID', 'bra_count']
    
    # bad
    bra_home = df.groupby(['EID', 'IFHOME']).size().unstack().reset_index()
    bra_home.columns = ['EID', 'bra_nothome', 'bra_home']
    
    bra_year = df.groupby('EID')['B_REYEAR'].agg([min, max, np.ptp, np.std]).reset_index()
    bra_year.columns = [i if i == 'EID' else 'bra_year_' + i for i in bra_year.columns]
    
    bra_endyear = df.groupby('EID')['B_ENDYEAR'].agg([min, max, np.ptp, np.std]).reset_index()
    bra_endyear.columns = [i if i == 'EID' else 'bra_endyear_' + i for i in bra_endyear.columns]
    
    bra_end_num = df[~df['B_ENDYEAR'].isnull()].groupby('EID').size().reset_index()
    bra_end_num.columns = ['EID', 'bra_end_num']
    bra_notend_num = df[df['B_ENDYEAR'].isnull()].groupby('EID').size().reset_index()
    bra_notend_num.columns = ['EID', 'bra_notend_num']
    
    mydf = pd.merge(bra_num, bra_home, how='left', on='EID')
    mydf = pd.merge(mydf, bra_year, how='left', on='EID')
    mydf = pd.merge(mydf, bra_endyear, how='left', on='EID')
    mydf = pd.merge(mydf, bra_end_num, how='left', on='EID')
    mydf = pd.merge(mydf, bra_notend_num, how='left', on='EID')

    return mydf

In [93]:
def get_invest_feature(df):
    df = df.copy()
    
    inv_num = df.groupby('EID').size().reset_index()
    inv_num.columns = ['EID', 'inv_count']
    
    # bad
    inv_home = df.groupby(['EID', 'IFHOME']).size().unstack().reset_index()
    inv_home.columns = ['EID', 'inv_nothome_num', 'inv_home_num']
    
    # bad
    inv_bl = df.groupby('EID')['BTBL'].agg([sum, min, max, np.ptp, np.std]).reset_index()
    inv_bl.columns = [i if i == 'EID' else 'inv_bl_' + i for i in inv_bl.columns]
    
    inv_year = df.groupby('EID')['BTYEAR'].agg([min, max, np.ptp, np.std]).reset_index()
    inv_year.columns = [i if i == 'EID' else 'inv_year_' + i for i in inv_year.columns]
    
    # bad
    inv_endyear = df.groupby('EID')['BTENDYEAR'].agg([min, max, np.ptp, np.std]).reset_index()
    inv_endyear.columns = [i if i == 'EID' else 'inv_endyear_' + i for i in inv_endyear.columns]
    
    # bad
    inved_num = df.groupby('BTEID').size().reset_index()
    inved_num.columns = ['EID', 'inved_num']
    
    inved_home = df.groupby(['BTEID', 'IFHOME']).size().unstack().reset_index()
    inved_home.columns = ['EID', 'inved_nothome_num', 'inved_home_num']
    
    # bad
    inved_bl = df.groupby('BTEID')['BTBL'].agg([sum, min, max, np.ptp, np.std]).reset_index()
    inved_bl.columns = ['EID' if i == 'BTEID' else 'inved_bl_' + i for i in inved_bl.columns]
    
    inved_year = df.groupby('BTEID')['BTYEAR'].agg([min, max, np.ptp, np.std]).reset_index()
    inved_year.columns = ['EID' if i == 'BTEID' else 'inved_year_' + i for i in inved_year.columns]
    
    inved_endyear = df.groupby('BTEID')['BTENDYEAR'].agg([min, max, np.ptp, np.std]).reset_index()
    inved_endyear.columns = ['EID' if i == 'BTEID' else 'inved_endyear_' + i for i in inved_endyear.columns]
    
    mydf = pd.merge(inv_num, inv_home, how='left', on='EID')
    mydf = pd.merge(mydf, inv_bl, how='left', on='EID')
    mydf = pd.merge(mydf, inv_year, how='left', on='EID')
    mydf = pd.merge(mydf, inv_endyear, how='left', on='EID')
    mydf = pd.merge(mydf, inved_num, how='left', on='EID')
    mydf = pd.merge(mydf, inved_home, how='left', on='EID')
    mydf = pd.merge(mydf, inved_bl, how='left', on='EID')
    mydf = pd.merge(mydf, inved_year, how='left', on='EID')
    mydf = pd.merge(mydf, inved_endyear, how='left', on='EID')
    
    return mydf

In [94]:
def get_lawsuit_feature(df):
    df = df.copy()
    
    law_num = df.groupby('EID').size().reset_index()
    law_num.columns = ['EID', 'law_count']
    
    # bad
    df['lawdate'] = df['LAWDATE'].apply(lambda x: x.replace('年', '-').replace('月', '')).apply(translate_date)
    law_date = df.groupby('EID')['lawdate'].agg([min, max, np.ptp, np.std]).reset_index()
    law_date.columns = [i if i == 'EID' else 'law_date_' + i for i in law_date.columns]
    
    # bad
    law_amout = df.groupby('EID')['LAWAMOUNT'].agg([sum, min, max, np.mean, np.ptp, np.std]).reset_index()
    law_amout.columns = [i if i == 'EID' else 'law_amout_' + i for i in law_amout.columns]
    
    mydf = pd.merge(law_num, law_date, how='left', on='EID')
    mydf = pd.merge(mydf, law_amout, how='left', on='EID')
    
    return mydf

In [95]:
def get_project_feature(df):
    df = df.copy()
    
    pro_num = df.groupby('EID').size().reset_index()
    pro_num.columns = ['EID', 'pro_count']
    
    df['djdate'] = df['DJDATE'].apply(translate_date)
    pro_date = df.groupby('EID')['djdate'].agg([min, max, np.ptp, np.std]).reset_index()
    pro_date.columns = [i if i == 'EID' else 'pro_date_' + i for i in pro_date.columns]
    
    # bad
    pro_home = df.groupby(['EID', 'IFHOME']).size().unstack().reset_index()
    pro_home.columns = ['EID', 'pro_nothome_num', 'pro_home_num']
    
    mydf = pd.merge(pro_num, pro_date, how='left', on='EID')
    mydf = pd.merge(mydf, pro_home, how='left', on='EID')
    
    return mydf

In [96]:
def get_qualification_feature(df):
    df = df.copy()
    
    qua_num = df.groupby('EID').size().reset_index()
    qua_num.columns = ['EID', 'qua_count']
    
    # bad
    qua_type = df.groupby(['EID', 'ADDTYPE']).size().unstack().reset_index()
    qua_type.columns = [i if i == 'EID' else 'qua_type_' + str(i) for i in qua_type.columns]
    
    # bad
    df['begindate'] = df['BEGINDATE'].apply(lambda x: x.replace(u'年', '-').replace(u'月', '')).apply(translate_date)
    qua_begindate = df.groupby('EID')['begindate'].agg([min, max, np.ptp, np.std]).reset_index()
    qua_begindate.columns = [i if i == 'EID' else 'qua_begindate_' + i for i in qua_begindate.columns]
    
    # bad
    df['expirydate'] = df['EXPIRYDATE'].apply(lambda x: x.replace(u'年', '-').replace(u'月', '') if not pd.isnull(x) else np.nan)
    df['expirydate'] = (pd.to_datetime(df['expirydate']).dt.to_period("M") - (pd.to_datetime('2010-01-01').to_period("M"))).fillna(-999).astype(int).replace(-999, np.NaN)
    qua_expirydate = df.groupby('EID')['expirydate'].agg([min, max, np.ptp, np.std]).reset_index()
    qua_expirydate.columns = [i if i == 'EID' else 'qua_expirydate_' + i for i in qua_expirydate.columns]
    
    mydf = pd.merge(qua_num, qua_type, how='left', on='EID')
    mydf = pd.merge(mydf, qua_begindate, how='left', on='EID')
    mydf = pd.merge(mydf, qua_expirydate, how='left', on='EID')
    
    return mydf

In [97]:
def get_breakfaith_feature(df):
    df = df.copy()
    
    bre_num = df.groupby('EID').size().reset_index()
    bre_num.columns = ['EID', 'bre_count']
    
    # bad
    df['fbdate'] = df['FBDATE'].apply(lambda x: x.replace('年', '-').replace('月', '')).apply(translate_date)
    bre_date = df.groupby('EID')['fbdate'].agg([min, max, np.ptp, np.std]).reset_index()
    bre_date.columns = [i if i == 'EID' else 'bre_date_' + i for i in bre_date.columns]
    
    df['sxenddate'] = (pd.to_datetime(df['SXENDDATE']).dt.to_period("M") - (pd.to_datetime('2010-01-01').to_period("M"))).fillna(-999).astype(int).replace(-999, np.NaN)
    bre_enddate = df.groupby('EID')['sxenddate'].agg([min, max, np.ptp, np.std]).reset_index()
    bre_enddate.columns = [i if i == 'EID' else 'bre_enddate_' + i for i in bre_enddate.columns]
    
    mydf = pd.merge(bre_num, bre_date, how='left', on='EID')
    mydf = pd.merge(mydf, bre_enddate, how='left', on='EID')

    return mydf

In [193]:
entbase_feat = get_entbase_feature(entbase)

In [99]:
alter_feat = get_alter_feature(alter)



In [100]:
right_feature = get_right_feature(right)

In [101]:
recruit_feat = get_recruit_feature(recruit)



In [102]:
branch_feat = get_branch_feature(branch)

In [103]:
invest_feat = get_invest_feature(invest)

In [104]:
lawsuit_feat = get_lawsuit_feature(lawsuit)

In [105]:
project_feat = get_project_feature(project)

In [106]:
qualification_feat = get_qualification_feature(qualification)

In [107]:
breakfaith_feat = get_breakfaith_feature(breakfaith)

In [194]:
dataset = pd.merge(entbase_feat, alter_feat, on='EID', how='left')
dataset = pd.merge(dataset, right_feature, on='EID', how='left')
dataset = pd.merge(dataset, recruit_feat, on='EID', how='left')
dataset = pd.merge(dataset, branch_feat, on='EID', how='left')
dataset = pd.merge(dataset, invest_feat, on='EID', how='left')
dataset = pd.merge(dataset, lawsuit_feat, on='EID', how='left')
dataset = pd.merge(dataset, project_feat, on='EID', how='left')
dataset = pd.merge(dataset, qualification_feat, on='EID', how='left')
dataset = pd.merge(dataset, breakfaith_feat, on='EID', how='left')

In [195]:
trainset = pd.merge(train, dataset, on='EID', how='left')
testset = pd.merge(test, dataset, on='EID', how='left')

In [196]:
train_feature = trainset.drop(['TARGET', 'ENDDATE'], axis=1)
train_label = trainset.TARGET.values
test_feature = testset
test_index = testset.EID.values
print train_feature.shape, train_label.shape, test_feature.shape

(218264, 278) (218264L,) (218247, 278)


In [197]:
# EID 前面的字母代表不同省份，已提供了 PROV 列，因此字母是冗余信息，直接舍弃
train_feature['EID'] = train_feature['EID'].str.extract('(\d+)').astype(int)
test_feature['EID'] = test_feature['EID'].str.extract('(\d+)').astype(int)

  from ipykernel import kernelapp as app
  app.launch_new_instance()


In [217]:
# pd.concat([train_feature, trainset.TARGET], axis=1).to_csv('../data/output/feat/train_xxy_local6864_online6923.csv', index=False)
# test_feature.to_csv('../data/output/feat/test_xxy_local6864_online6923.csv', index=False)

In [186]:
config = {
    'rounds': 10000,
    'folds': 3
}

params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
#     'objective': 'rank:pairwise',
    'stratified': True,
    'scale_pos_weights ': 0,
    'max_depth': 9,
    'min_child_weight': 1,
    'gamma': 1,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'lambda': 1,

    'eta': 0.01,
    'seed': 20,
    'silent': 1,
    'eval_metric': 'auc'
}

In [177]:
def xgb_cv(train_feature, train_label, params, rounds):
    start = time.clock()
    print train_feature.columns
    params['scale_pos_weights '] = float(len(train_label[train_label == 0])) / len(train_label[train_label == 1])
    dtrain = xgb.DMatrix(train_feature, label=train_label)
    num_round = rounds
    print 'run cv: ' + 'round: ' + str(rounds)
    res = xgb.cv(params, dtrain, num_round, verbose_eval=10, early_stopping_rounds=100)
    elapsed = (time.clock() - start)
    print 'Time used:', elapsed, 's'
    return len(res), res.loc[len(res) - 1, 'test-auc-mean']


def xgb_predict(train_feature, train_label, test_feature, rounds, params):
    params['scale_pos_weights '] = float(len(train_label[train_label == 0])) / len(train_label[train_label == 1])
    dtrain = xgb.DMatrix(train_feature, label=train_label)
    dtest = xgb.DMatrix(test_feature, label=np.zeros(test_feature.shape[0]))
    watchlist = [(dtrain, 'train')]
    num_round = rounds
    model = xgb.train(params, dtrain, num_round, watchlist, verbose_eval=50)
    predict = model.predict(dtest)
    return model, predict


def store_result(test_index, pred, threshold, name):
    result = pd.DataFrame({'EID': test_index, 'FORTARGET': 0, 'PROB': pred})
    mask = result['PROB'] >= threshold
    result.at[mask, 'FORTARGET'] = 1
    # result['PROB'] = result['PROB'].apply(lambda x: round(x, 4))
    result.to_csv('../data/output/sub/' + name + '.csv', index=0)
    return result

In [187]:
iterations, best_score = xgb_cv(train_feature, train_label, params, config['rounds'])

Index([u'EID', u'PROV', u'RGYEAR', u'HY', u'ZCZB', u'ETYPE', u'MPNUM', u'INUM',
       u'ENUM', u'FINZB',
       ...
       u'qua_expirydate_std', u'bre_count', u'bre_date_min', u'bre_date_max',
       u'bre_date_ptp', u'bre_date_std', u'bre_enddate_min',
       u'bre_enddate_max', u'bre_enddate_ptp', u'bre_enddate_std'],
      dtype='object', length=310)
run cv: round: 10000


Will train until cv error hasn't decreased in 100 rounds.
[0]	cv-test-auc:0.631895666667+0.00709779826582	cv-train-auc:0.652329333333+0.00471109303194
[10]	cv-test-auc:0.663725+0.00239870395561	cv-train-auc:0.703206+0.00420763187553
[20]	cv-test-auc:0.666320333333+0.0034847747257	cv-train-auc:0.709755+0.00233518193438
[30]	cv-test-auc:0.668016666667+0.00302323935907	cv-train-auc:0.713953666667+0.00225188518555
[40]	cv-test-auc:0.669386+0.00293171212775	cv-train-auc:0.717448+0.00141758385995
[50]	cv-test-auc:0.670225333333+0.00302096864525	cv-train-auc:0.720183+0.00100285326278
[60]	cv-test-auc:0.671043333333+0.00281076458558	cv-train-auc:0.723117+0.000738941585422
[70]	cv-test-auc:0.671638333333+0.00300688325162	cv-train-auc:0.725617333333+0.000604099513509
[80]	cv-test-auc:0.672291666667+0.00311884875484	cv-train-auc:0.728247+0.000942299669249
[90]	cv-test-auc:0.672901333333+0.00294634828597	cv-train-auc:0.730574333333+0.00104526115822
[100]	cv-test-auc:0.673334666667+0.00278652427866

[900]	cv-test-auc:0.686389333333+0.00222767147987	cv-train-auc:0.822189+0.0019811920654
[910]	cv-test-auc:0.686403666667+0.0022225994291	cv-train-auc:0.822892666667+0.00192095554926
[920]	cv-test-auc:0.686399+0.00221135117066	cv-train-auc:0.823614+0.00191880031964
[930]	cv-test-auc:0.686411+0.00219476893241	cv-train-auc:0.824349666667+0.00189730727319
[940]	cv-test-auc:0.686413666667+0.00218879210728	cv-train-auc:0.825183666667+0.00202109249885
[950]	cv-test-auc:0.686439333333+0.00218582011052	cv-train-auc:0.825943333333+0.00209528619101
[960]	cv-test-auc:0.686436333333+0.00219058079564	cv-train-auc:0.826722666667+0.00213803092172
[970]	cv-test-auc:0.686438+0.00216758406219	cv-train-auc:0.827564333333+0.00201560848271
[980]	cv-test-auc:0.686458666667+0.00217154082521	cv-train-auc:0.828281666667+0.00202617378875
[990]	cv-test-auc:0.686469+0.00216303305569	cv-train-auc:0.828974+0.00200574491565
[1000]	cv-test-auc:0.686467333333+0.00218605494492	cv-train-auc:0.829660666667+0.0019365951794

Time used: 1871.1815087 s


In [188]:
import winsound
winsound.Beep(600,1000)

In [189]:
model, pred = xgb_predict(train_feature, train_label, test_feature, iterations, params)

[0]	train-auc:0.660541
[50]	train-auc:0.711973
[100]	train-auc:0.721101
[150]	train-auc:0.729243
[200]	train-auc:0.736644
[250]	train-auc:0.743824
[300]	train-auc:0.749417
[350]	train-auc:0.755179
[400]	train-auc:0.759895
[450]	train-auc:0.764829
[500]	train-auc:0.769332
[550]	train-auc:0.773301
[600]	train-auc:0.777194
[650]	train-auc:0.780988
[700]	train-auc:0.784439
[750]	train-auc:0.788210
[800]	train-auc:0.791807
[850]	train-auc:0.795155
[900]	train-auc:0.798286
[950]	train-auc:0.801571
[1000]	train-auc:0.804901
[1050]	train-auc:0.808207
[1100]	train-auc:0.811005
[1150]	train-auc:0.814199
[1181]	train-auc:0.815727


In [190]:
importance = pd.DataFrame(model.get_fscore().items(), columns=['feature','importance']).sort_values('importance', ascending=False)
importance.to_csv('../data/output/feat_imp/importance-1128-%f.csv' % best_score, index = False)

In [191]:
res = store_result(test_index, pred, 0.21, '1128-xgb-%f' % best_score)