In [153]:
from __future__ import division
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score

In [154]:
train = pd.read_csv('../data/input/train.csv')
enbase = pd.read_csv('../data/input/1entbase.csv')
alter = pd.read_csv('../data/input/2alter.csv')
branch = pd.read_csv('../data/input/3branch.csv')
invest = pd.read_csv('../data/input/4invest.csv')
right = pd.read_csv('../data/input/5right.csv')
project = pd.read_csv('../data/input/6project.csv')
lawsuit = pd.read_csv('../data/input/7lawsuit.csv')
breakfaith = pd.read_csv('../data/input/8breakfaith.csv')
recruit = pd.read_csv('../data/input/9recruit.csv')
qualification = pd.read_csv('../data/input/10qualification.csv')
test = pd.read_csv('../data/input/evaluation_public.csv')

In [155]:
enbase = enbase.fillna(value={'ZCZB': 0, 'MPNUM': 0, 'INUM': 0, 'ENUM': 0, 'FINZB': 0, 'FSTINUM': 0, 'TZINUM': 0})  # 未处理 HY；ZCZB 为 0 表示缺失或错误

In [156]:
enbase.shape

(436798, 12)

In [157]:
def translate_date(date):
    year = int(date[:4])
    month = int(date[-2:])
    return (year - 2010) * 12 + month

In [158]:
def get_alter_feature(df):
    df = df.copy()

    alt_no = df.groupby(['EID', 'ALTERNO']).size().reset_index()
    alt_no = alt_no.groupby('EID')[0].agg([sum, len]).reset_index()
    alt_no.columns = ['EID', 'alt_count', 'alt_types_count']

    alt_no_oh = df.groupby(['EID', 'ALTERNO']).size().unstack().reset_index()
    alt_no_oh.columns = [i if i == 'EID' else 'alt_' + i for i in alt_no_oh.columns]

    df['date'] = df['ALTDATE'].apply(translate_date)
    date = df.groupby('EID')['date'].agg([min, max, np.ptp, np.std]).reset_index()
    date.columns = ['EID', 'alt_date_min', 'alt_date_max', 'alt_date_ptp', 'alt_date_std']

    df['altbe'] = df['ALTBE'].str.extract('(\d+\.?\d*)').astype(float)
    df['altaf'] = df['ALTAF'].str.extract('(\d+\.?\d*)').astype(float)
    alt_be_af = df.groupby('EID')['altbe', 'altaf'].agg([min, max, np.mean]).reset_index()
    alt_be_af.columns = ['EID', 'alt_be_min', 'alt_be_max', 'alt_be_mean', 'alt_af_min', 'alt_af_max', 'alt_af_mean']

    mydf = pd.merge(alt_no, alt_no_oh, how='left', on='EID')
    mydf = pd.merge(mydf, date, how='left', on='EID')
    mydf = pd.merge(mydf, alt_be_af, how='left', on='EID')

    return mydf

In [159]:
def get_right_feature(df):
    df = df.copy()
    
    rig_type = df.groupby(['EID', 'RIGHTTYPE']).size().reset_index()
    rig_type = rig_type.groupby('EID')[0].agg([sum, len]).reset_index()
    rig_type.columns = ['EID', 'rig_count', 'rig_types_count']
    
    rig_type_oh_rate = df.groupby(['EID', 'RIGHTTYPE']).size().unstack().reset_index()
    rig_type_oh_rate.iloc[:, 1:] = rig_type_oh_rate.iloc[:, 1:].div(rig_type['rig_count'], axis='index')
    rig_type_oh_rate.columns = [i if i == 'EID' else 'rig_rate_' + str(i) for i in rig_type_oh_rate.columns]
    
    df['ask_month'] = (pd.to_datetime(df['ASKDATE']).dt.to_period("M") - (pd.to_datetime('2010-01-01').to_period("M"))).fillna(-999).astype(int).replace(-999, np.NaN)
    ask_date = df.groupby('EID')['ask_month'].agg([max, min, np.ptp, np.std]).reset_index()
    ask_date.columns = ['EID', 'rig_askdate_max', 'rig_askdate_min', 'rig_askdate_ptp', 'rig_askdate_std']

    df['get_month'] = (pd.to_datetime(df['FBDATE']).dt.to_period("M") - (pd.to_datetime('2010-01-01').to_period("M"))).fillna(-999).astype(int).replace(-999, np.NaN)
    get_date = df.groupby('EID')['get_month'].agg([max, min, np.ptp, np.std]).reset_index()
    get_date.columns = ['EID', 'rig_getdate_max', 'rig_getdate_min', 'rig_getdate_ptp', 'rig_getdate_std']
    
    # bad
    unget = df[df.FBDATE.isnull()]
    unget = unget.groupby('EID').size().reset_index()
    unget.columns = ['EID', 'rig_unget_num']
    
    right_1year = df[df['ASKDATE'] >= '2015-01'].groupby('EID')['ASKDATE'].count().reset_index()
    right_1year.columns = ['EID', 'ask_num(1year)']
    right_2year = df[df['ASKDATE'] >= '2014-01'].groupby('EID')['ASKDATE'].count().reset_index()
    right_2year.columns = ['EID', 'ask_num(2year)']
    right_5year = df[df['ASKDATE'] >= '2010-01'].groupby('EID')['ASKDATE'].count().reset_index()
    right_5year.columns = ['EID', 'ask_num(5year)']
    right_end_1year = df[df['FBDATE'] >= '2015-01'].groupby('EID')['FBDATE'].count().reset_index()
    right_end_1year.columns = ['EID', 'get_num(1year)']
    right_end_2year = df[df['FBDATE'] >= '2014-01'].groupby('EID')['FBDATE'].count().reset_index()
    right_end_2year.columns = ['EID', 'get_num(2year)']
    right_end_5year = df[df['FBDATE'] >= '2010-01'].groupby('EID')['FBDATE'].count().reset_index()
    right_end_5year.columns = ['EID', 'get_num(5year)']
    
    mydf = pd.merge(rig_type, rig_type_oh_rate, how='left', on='EID')
    mydf = pd.merge(mydf, ask_date, how='left', on='EID')
    mydf = pd.merge(mydf, get_date, how='left', on='EID')
    mydf = pd.merge(mydf, unget, how='left', on='EID')
    mydf = pd.merge(mydf, right_1year, how='left', on='EID')
    mydf = pd.merge(mydf, right_2year, how='left', on='EID')
    mydf = pd.merge(mydf, right_5year, how='left', on='EID')
    mydf = pd.merge(mydf, right_end_1year, how='left', on='EID')
    mydf = pd.merge(mydf, right_end_2year, how='left', on='EID')
    mydf = pd.merge(mydf, right_end_5year, how='left', on='EID')
    
    # bad
    mydf['ask_rate(1year)'] = mydf['ask_num(1year)'] / mydf['rig_count']
    mydf['ask_rate(2year)'] = mydf['ask_num(2year)'] / mydf['rig_count']
    mydf['ask_rate(5year)'] = mydf['ask_num(5year)'] / mydf['rig_count']
    mydf['get_rate(1year)'] = mydf['get_num(1year)'] / mydf['rig_count']
    mydf['get_rate(2year)'] = mydf['get_num(2year)'] / mydf['rig_count']
    mydf['get_rate(5year)'] = mydf['get_num(5year)'] / mydf['rig_count']

    return mydf

In [161]:
alter_feat = get_alter_feature(alter)



In [162]:
right_feature = get_right_feature(right)

In [163]:
dataset = pd.merge(enbase, alter_feat, on='EID', how='left')
dataset = pd.merge(dataset, right_feature, on='EID', how='left')

In [164]:
trainset = pd.merge(train, dataset, on='EID', how='left')
testset = pd.merge(test, dataset, on='EID', how='left')

In [165]:
# EID 前面的字母代表不同省份，已提供了 PROV 列，因此字母是冗余信息，直接舍弃
trainset['EID'] = trainset['EID'].str.extract('(\d+)').astype(int)
testset['EID'] = testset['EID'].str.extract('(\d+)').astype(int)

  from ipykernel import kernelapp as app
  app.launch_new_instance()


In [166]:
train_feature = trainset.drop(['TARGET', 'ENDDATE'], axis=1)
train_label = trainset.TARGET.values
test_feature = testset
test_index = testset.EID.values
print train_feature.shape, train_label.shape, test_feature.shape

(218264, 66) (218264L,) (218247, 66)


In [167]:
config = {
    'rounds': 10000,
    'folds': 3
}

params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
#     'objective': 'rank:pairwise',
    'stratified': True,
    'scale_pos_weights ': 0,
    'max_depth': 6,
    'min_child_weight': 1,
    'gamma': 1,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'lambda': 1,

    'eta': 0.02,
    'seed': 20,
    'silent': 1,
    'eval_metric': 'auc'
}

In [168]:
def xgb_cv(train_feature, train_label, params, rounds):
    params['scale_pos_weights '] = float(len(train_label[train_label == 0])) / len(train_label[train_label == 1])
    dtrain = xgb.DMatrix(train_feature, label=train_label)
    num_round = rounds
    print 'run cv: ' + 'round: ' + str(rounds)
    res = xgb.cv(params, dtrain, num_round, verbose_eval=10, early_stopping_rounds=100)
    return len(res)


def xgb_predict(train_feature, train_label, test_feature, rounds, params):
    params['scale_pos_weights '] = float(len(train_label[train_label == 0])) / len(train_label[train_label == 1])
    dtrain = xgb.DMatrix(train_feature, label=train_label)
    dtest = xgb.DMatrix(test_feature, label=np.zeros(test_feature.shape[0]))
    watchlist = [(dtrain, 'train')]
    num_round = rounds
    model = xgb.train(params, dtrain, num_round, watchlist, verbose_eval=50)
    predict = model.predict(dtest)
    return model, predict


def store_result(test_index, pred, threshold, name):
    result = pd.DataFrame({'EID': test_index, 'FORTARGET': 0, 'PROB': pred})
    mask = result['PROB'] >= threshold
    result.at[mask, 'FORTARGET'] = 1
    # result['PROB'] = result['PROB'].apply(lambda x: round(x, 4))
    result.to_csv('../data/output/sub/' + name + '.csv', index=0)
    return result

In [169]:
iterations = xgb_cv(train_feature, train_label, params, config['rounds'])

run cv: round: 10000


Will train until cv error hasn't decreased in 100 rounds.
[0]	cv-test-auc:0.633604+0.00164348592936	cv-train-auc:0.640705333333+0.00294726826438
[10]	cv-test-auc:0.65302+0.00156421929409	cv-train-auc:0.664669666667+0.00148143316495
[20]	cv-test-auc:0.655704333333+0.00217260785437	cv-train-auc:0.668525666667+0.0013279661977
[30]	cv-test-auc:0.657563+0.00246564028196	cv-train-auc:0.670435333333+0.00146821940534
[40]	cv-test-auc:0.658307+0.00247828058675	cv-train-auc:0.672164666667+0.00121746795532
[50]	cv-test-auc:0.659323666667+0.00267813645823	cv-train-auc:0.674248+0.00125572635023
[60]	cv-test-auc:0.660368333333+0.00294625596685	cv-train-auc:0.675843333333+0.00128116205931
[70]	cv-test-auc:0.661177666667+0.00275393855334	cv-train-auc:0.67723+0.00151482958337
[80]	cv-test-auc:0.662302+0.00253011343619	cv-train-auc:0.678832666667+0.00147435168426
[90]	cv-test-auc:0.663286666667+0.0025979461546	cv-train-auc:0.680235666667+0.00157965657308
[100]	cv-test-auc:0.663935666667+0.00275833456193

[900]	cv-test-auc:0.674616666667+0.00373436298301	cv-train-auc:0.746143+0.000991709971043
[910]	cv-test-auc:0.674647333333+0.00375146205573	cv-train-auc:0.746824333333+0.000951458295927
[920]	cv-test-auc:0.674706666667+0.00376369077133	cv-train-auc:0.747345333333+0.00103476964694
[930]	cv-test-auc:0.674764+0.00375000382222	cv-train-auc:0.747974333333+0.00106620021051
[940]	cv-test-auc:0.674756333333+0.00374715904238	cv-train-auc:0.748472+0.00105476158444
[950]	cv-test-auc:0.674779+0.00374114697208	cv-train-auc:0.749005333333+0.000945167474872
[960]	cv-test-auc:0.674809666667+0.00374115760457	cv-train-auc:0.749514+0.00091644567033
[970]	cv-test-auc:0.674820666667+0.00374217702533	cv-train-auc:0.750035666667+0.000882870067199
[980]	cv-test-auc:0.674849+0.00373675429573	cv-train-auc:0.750574+0.000874853511547
[990]	cv-test-auc:0.674858333333+0.00370442351191	cv-train-auc:0.751067666667+0.000850762141978
[1000]	cv-test-auc:0.674865+0.00368339580641	cv-train-auc:0.751606666667+0.00075451411

In [170]:
import winsound
winsound.Beep(600,1000)

In [None]:
model, pred = xgb_predict(train_feature, train_label, test_feature, iterations, params)

In [None]:
importance = pd.DataFrame(model.get_fscore().items(), columns=['feature','importance']).sort_values('importance', ascending=False)
importance.to_csv('../data/output/feat_imp/importance.csv', index = False)

In [None]:
res = store_result(test_index, pred, 0.5, '1122-xgb-entbase_only.csv')