In [8]:
import pandas as pd
import numpy as np
import scipy 
import scipy.stats
import matplotlib.pyplot as plt
import datetime
import time
import sklearn
from sklearn import metrics,ensemble
from sklearn.model_selection import KFold,train_test_split,cross_val_score,GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectFromModel
import lightgbm as lgb
from numpy import sort
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

## trd 预处理

In [2]:
def trd_add(get_path,store_path):
    data_trd = pd.read_csv(get_path)
    week_day_dict = {0: '0',1: '0',2: '0',3: '0',4: '1',5: '1',6: '1'}
    data_trd['weekday'] = data_trd['trx_tm'].apply(lambda x: week_day_dict[datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S").weekday()])
    data_trd['day_time'] = data_trd['trx_tm'].apply(lambda x: x[11:13])
    data_trd['day_time'] = data_trd['day_time'].apply(lambda x: 0 if int(x)>=7 and int(x)<=17 else 1)
    
    if 'flag' in data_trd.keys():
        data_trd.drop('flag',axis=1,inplace=True)
    
    column_set = ['Dat_Flg1_Cd','Dat_Flg3_Cd','Trx_Cod1_Cd','Trx_Cod2_Cd','weekday','day_time']
    for i in column_set:
        subset = pd.get_dummies(data_trd[i])
        subset.columns = [i+'_'+str(jj) for jj in subset.keys()]
        data_trd =pd.concat([data_trd,subset],axis = 1)
    data_trd.drop(['Dat_Flg1_Cd','Dat_Flg3_Cd','Trx_Cod1_Cd','Trx_Cod2_Cd','trx_tm','weekday','day_time'],axis=1,inplace=True)
   
    key_set= data_trd.keys()[2:]
    all_trd = data_trd.groupby(['id']).\
                            agg({'cny_trx_amt':{'sum','max','min','std','mean'}})
    for i in key_set:
        subdata = data_trd.groupby(['id']).agg({i:'sum'})
        all_trd = pd.concat([all_trd,subdata],axis=1)
    all_trd.to_csv(store_path)

In [3]:
trd_add('train/trd.csv','train/all_trd.csv')
trd_add('b/trd_b.csv','b/all_trd_b.csv')

In [4]:
all_trd = pd.read_csv('train/all_trd.csv')
all_trd_1 = pd.read_csv('b/all_trd_b.csv')
all_trd.drop('Trx_Cod2_Cd_211',axis=1,inplace=True)
all_trd_1.drop('Trx_Cod2_Cd_124',axis=1,inplace=True)
all_trd.to_csv('train/all_trd.csv',index=False)
all_trd_1.to_csv('b/all_trd_b.csv',index=False)
print(all_trd.shape,all_trd_1.shape)

(31993, 73) (3190, 73)


## tag 预处理

In [5]:
def tag_add(get_path,store_path):
    data_tag = pd.read_csv(get_path)
    column_set1 = ['gdr_cd','mrg_situ_cd','edu_deg_cd','acdm_deg_cd','deg_cd']
    column_set2 = ['fin_rsk_ases_grd_cd','confirm_rsk_ases_lvl_typ_cd',\
          'cust_inv_rsk_endu_lvl_cd','l6mon_daim_aum_cd','tot_ast_lvl_cd',\
          'pot_ast_lvl_cd','bk1_cur_year_mon_avg_agn_amt_cd','pl_crd_lmt_cd',\
          'hld_crd_card_grd_cd','l1y_crd_card_csm_amt_dlm_cd','perm_crd_lmt_cd',\
          'cur_debit_crd_lvl']
    for i in column_set1:
        subset = pd.get_dummies(data_tag[i])
        subset.columns = [i+'_'+str(jj) for jj in subset.keys()]
        data_tag =pd.concat([data_tag,subset],axis = 1)
        data_tag.drop(i,axis=1,inplace=True)
    for i in column_set2:
        subset = pd.get_dummies(data_tag[i])
        subset.columns = [i+'_'+str(jj) for jj in subset.keys()]
        data_tag =pd.concat([data_tag,subset],axis = 1)
    for i in data_tag.keys()[1:]:
        data_tag.loc[data_tag[i]=='\\N',i]=np.nan
        data_tag[i] = data_tag[i].astype("float")
        
    subsub_split= pd.DataFrame()
    for i in ['age','job_year','cur_debit_min_opn_dt_cnt','cur_credit_min_opn_dt_cnt']:
        sub_split = pd.qcut(data_tag[i],5,labels=False,duplicates='drop')
        sub_split.columns = [i+"_split"]
        subsub_split =pd.concat([subsub_split,sub_split],axis = 1)
    subsub_split.columns = [i+"_split" for i in subsub_split.columns]
    data_tag =pd.concat([data_tag,subsub_split],axis = 1)
    data_tag.to_csv(store_path,index=False)

In [6]:
tag_add('train/tag.csv','train/all_tag.csv')
tag_add('b/tag_b.csv','b/all_tag_b.csv')

In [7]:
all_tag = pd.read_csv('train/all_tag.csv')
all_tag_1 = pd.read_csv('b/all_tag_b.csv')
all_tag_1.drop('fin_rsk_ases_grd_cd_10',axis=1,inplace=True)
all_tag.drop(['edu_deg_cd_J','fin_rsk_ases_grd_cd_11','fin_rsk_ases_grd_cd_6',\
'fin_rsk_ases_grd_cd_7','fin_rsk_ases_grd_cd_8','fin_rsk_ases_grd_cd_9',\
'confirm_rsk_ases_lvl_typ_cd_10','confirm_rsk_ases_lvl_typ_cd_13',\
'confirm_rsk_ases_lvl_typ_cd_8','confirm_rsk_ases_lvl_typ_cd_9',\
'cust_inv_rsk_endu_lvl_cd_9','tot_ast_lvl_cd_0','bk1_cur_year_mon_avg_agn_amt_cd_10',\
'hld_crd_card_grd_cd_60','hld_crd_card_grd_cd_70','perm_crd_lmt_cd_-1'],axis=1,inplace=True)
all_tag.to_csv('train/all_tag.csv',index=False)
all_tag_1.to_csv('b/all_tag_b.csv',index=False)
print(all_tag.shape,all_tag_1.shape)

(39923, 192) (4000, 191)


## Merge data

In [8]:
all_trd = pd.read_csv('train/all_trd.csv')
all_trd_1 = pd.read_csv('b/all_trd_b.csv')
all_tag = pd.read_csv('train/all_tag.csv')
all_tag_1 = pd.read_csv('b/all_tag_b.csv')

In [9]:
all_dataset = pd.merge(all_trd,all_tag,on='id',how='outer')
test_all_dataset = pd.merge(all_trd_1,all_tag_1,on='id',how='outer')

In [10]:
_dict={"frs_agn_dt_cnt":4,"cur_debit_cnt":2}
test_subsub_split= pd.DataFrame()
subsub_split= pd.DataFrame()
for i in _dict.keys():
    sub_split = pd.qcut(all_dataset[i],_dict[i],labels=False,duplicates='drop')
    test_sub_split = pd.qcut(test_all_dataset[i],_dict[i],labels=False,duplicates='drop')
    
    subsub_split =pd.concat([subsub_split,sub_split],axis = 1)
    test_subsub_split =pd.concat([test_subsub_split,test_sub_split],axis = 1)
    
subsub_split.columns = [i+"_split" for i in subsub_split.columns]
test_subsub_split.columns = [i+"_split" for i in test_subsub_split.columns]

all_dataset =pd.concat([all_dataset,subsub_split],axis = 1)
test_all_dataset =pd.concat([test_all_dataset,test_subsub_split],axis = 1)

In [11]:
all_dataset.to_csv('train/all_data.csv',index=False)
test_all_dataset.to_csv('b/all_data_b.csv',index=False)

## Missing Value handling

In [12]:
all_dataset=pd.read_csv('train/all_data.csv')
test_all_dataset=pd.read_csv('b/all_data_b.csv')

In [13]:
for i in all_dataset.keys()[1:]:
    all_dataset.loc[all_dataset[i]=='\\N',i]=np.nan
    all_dataset[i] = all_dataset[i].astype("float")
for i in test_all_dataset.keys()[1:]:
    test_all_dataset.loc[test_all_dataset[i]=='\\N',i]=np.nan
    test_all_dataset[i] = test_all_dataset[i].astype("float")

In [14]:
all_dataset['null_value'] = all_dataset.isnull().sum(axis=1)
test_all_dataset['null_value'] = test_all_dataset.isnull().sum(axis=1)

In [15]:
for i in all_dataset.keys():
    if "\\N" in i and "gdr" not in i and "tot" not in i:
        all_dataset.drop(i,axis=1,inplace=True)
        test_all_dataset.drop(i,axis=1,inplace=True)

In [16]:
print(all_dataset.shape)
keyset = list(all_dataset.keys()[1:])
keyset.remove("flag")
for i in keyset:
    while test_all_dataset[i].max()<all_dataset[i].max()-max(20,test_all_dataset[i].max()/5):
        print(i,test_all_dataset[i].max(),all_dataset[i].max())
        line_num=np.argmax(all_dataset[i],axis=1)
        all_dataset.drop(line_num,inplace=True)
print(all_dataset.shape)

(39923, 257)
('cny_trx_amt', 'sum') 3600835.5800000015 9916500.0
('cny_trx_amt', 'sum') 3600835.5800000015 7767903.679999998
('cny_trx_amt', 'sum') 3600835.5800000015 5537054.509999998
('cny_trx_amt', 'std') 1148006.8415341163 5000000.002014289
('cny_trx_amt', 'std') 1148006.8415341163 4601854.9908630615
('cny_trx_amt', 'std') 1148006.8415341163 3342169.999098176
('cny_trx_amt', 'std') 1148006.8415341163 2852540.1698367707
('cny_trx_amt', 'std') 1148006.8415341163 2828427.1250997256
('cny_trx_amt', 'std') 1148006.8415341163 2377539.80450011
('cny_trx_amt', 'std') 1148006.8415341163 1414213.5852338357
('cny_trx_amt', 'min') 30000.0 99294.07
('cny_trx_amt', 'min') 30000.0 60000.0
('cny_trx_amt', 'min') 30000.0 50000.0
('cny_trx_amt', 'min') 30000.0 49000.0
('cny_trx_amt', 'min') 30000.0 45000.0
('cny_trx_amt', 'min') 30000.0 40300.0
('cny_trx_amt', 'mean') 122380.786 149480.26499999998
('cny_trx_amt', 'max') 5000000.0 9900000.0
('cny_trx_amt', 'max') 5000000.0 7500000.0
Dat_Flg1_Cd_B 174

In [17]:
all_dataset.to_csv('train/all_data.csv',index=False)
test_all_dataset.to_csv('b/all_data_b.csv',index=False)

## Feature Selection

In [18]:
all_dataset=pd.read_csv('train/all_data.csv')
y = all_dataset['flag'].as_matrix()
X_all = all_dataset.drop(['id','flag'],axis=1,inplace=False)
X=X_all.as_matrix()
X_data, X_test, y_data, y_test = train_test_split(X, y, test_size=0.25)

In [19]:
model = lgb.LGBMClassifier()
model.fit(X_data, y_data)
predictions = model.predict(X_test)
accuracy = metrics.accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

roc_score = roc_auc_score(y_test, predictions)
print("AUC: %.2f%%" % (roc_score * 100.0))

dict_rank={}
feature_scores = pd.DataFrame(list(X_all.columns), columns=['feature'])
feature_scores['score'] = model.feature_importances_
feature_scores['rank'] = feature_scores['score'].rank(method = 'min', ascending=False)
feature_scores = feature_scores.sort_values('score',ascending=False)
for i in feature_scores.feature:
    dict_rank[i]=dict_rank.get(i,0)+float(feature_scores.loc[(feature_scores['feature'] == i)]['score'])
print(dict_rank)
thresholds = sort(model.feature_importances_)
for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_data)
    selection_model = lgb.LGBMClassifier()
    selection_model.fit(select_X_train, y_data)
    select_X_test = selection.transform(X_test)
    if (select_X_test.shape[1])<=0:
        break
    predictions = selection_model.predict(select_X_test)
    accuracy = metrics.accuracy_score(y_test, predictions)
    roc_score = roc_auc_score(y_test, predictions)
    print("Thresh=%.3f, n=%d, Accuracy: %.8f%%,AUC: %.8f%%" % (thresh, select_X_train.shape[1], accuracy*100.0,roc_score*100.0))

Accuracy: 81.46%
AUC: 65.16%
{'cur_credit_min_opn_dt_cnt': 187.0, "('cny_trx_amt', 'std')": 140.0, 'cur_debit_min_opn_dt_cnt': 116.0, "('cny_trx_amt', 'max')": 105.0, 'Dat_Flg3_Cd_B': 95.0, "('cny_trx_amt', 'sum')": 93.0, "('cny_trx_amt', 'min')": 93.0, 'l1y_crd_card_csm_amt_dlm_cd': 80.0, "('cny_trx_amt', 'mean')": 78.0, 'age': 65.0, 'perm_crd_lmt_cd': 65.0, 'Trx_Cod2_Cd_309': 59.0, 'Trx_Cod1_Cd_3': 59.0, 'job_year': 54.0, 'day_time_0': 49.0, 'frs_agn_dt_cnt': 41.0, 'Trx_Cod2_Cd_308': 40.0, 'Trx_Cod2_Cd_102': 40.0, 'Dat_Flg1_Cd_C': 39.0, 'weekday_1': 39.0, 'Dat_Flg3_Cd_A': 38.0, 'day_time_1': 38.0, 'Trx_Cod2_Cd_136': 38.0, 'pot_ast_lvl_cd': 37.0, 'weekday_0': 37.0, 'Trx_Cod1_Cd_1': 35.0, 'Trx_Cod2_Cd_310': 32.0, 'Trx_Cod2_Cd_134': 32.0, 'Trx_Cod2_Cd_116': 31.0, 'Trx_Cod2_Cd_133': 28.0, 'Trx_Cod2_Cd_111': 28.0, 'Trx_Cod2_Cd_103': 27.0, 'fr_or_sh_ind': 26.0, 'Dat_Flg1_Cd_B': 24.0, 'Trx_Cod2_Cd_213': 23.0, 'hav_car_grp_ind': 23.0, 'Trx_Cod1_Cd_2': 23.0, 'Trx_Cod2_Cd_117': 23.0, 'hld_crd_

Thresh=5.000, n=105, Accuracy: 81.31724485%,AUC: 64.88354546%
Thresh=5.000, n=105, Accuracy: 81.31724485%,AUC: 64.88354546%
Thresh=5.000, n=105, Accuracy: 81.31724485%,AUC: 64.88354546%
Thresh=6.000, n=100, Accuracy: 81.31724485%,AUC: 64.94580401%
Thresh=6.000, n=100, Accuracy: 81.31724485%,AUC: 64.94580401%
Thresh=6.000, n=100, Accuracy: 81.31724485%,AUC: 64.94580401%
Thresh=6.000, n=100, Accuracy: 81.31724485%,AUC: 64.94580401%
Thresh=6.000, n=100, Accuracy: 81.31724485%,AUC: 64.94580401%
Thresh=7.000, n=95, Accuracy: 81.08597285%,AUC: 64.57815949%
Thresh=7.000, n=95, Accuracy: 81.08597285%,AUC: 64.57815949%
Thresh=7.000, n=95, Accuracy: 81.08597285%,AUC: 64.57815949%
Thresh=7.000, n=95, Accuracy: 81.08597285%,AUC: 64.57815949%
Thresh=7.000, n=95, Accuracy: 81.08597285%,AUC: 64.57815949%
Thresh=7.000, n=95, Accuracy: 81.08597285%,AUC: 64.57815949%
Thresh=7.000, n=95, Accuracy: 81.08597285%,AUC: 64.57815949%
Thresh=7.000, n=95, Accuracy: 81.08597285%,AUC: 64.57815949%
Thresh=7.000, n=

In [20]:
del_list=[]
for i in dict_rank:
    if dict_rank[i]<=1:
        del_list.append(i)

In [21]:
for i in del_list:
    all_dataset.drop(i,axis=1,inplace=True)

In [22]:
all_dataset.to_csv('train/all_data.csv',index=False)

## GridSearch

In [23]:
dataset = pd.read_csv('train/all_data.csv')
y_data = dataset['flag'].values
dataset.drop(['flag','id'], axis=1, inplace=True)
X_data = dataset.values

estimators = [50,100,200]
max_depth = [2,4,8,16]
learning_rate = [0.01,0.1,1]
param_grid = dict(max_depth=max_depth, n_estimators=estimators, learning_rate=learning_rate)
model = lgb.LGBMClassifier()
grid_search = GridSearchCV(model, param_grid, verbose=2, cv=10)
grid_result = grid_search.fit(X_data, y_data)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Fitting 10 folds for each of 36 candidates, totalling 360 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] learning_rate=0.01, max_depth=2, n_estimators=50 ................
[CV] . learning_rate=0.01, max_depth=2, n_estimators=50, total=   0.4s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV] learning_rate=0.01, max_depth=2, n_estimators=50 ................
[CV] . learning_rate=0.01, max_depth=2, n_estimators=50, total=   0.5s
[CV] learning_rate=0.01, max_depth=2, n_estimators=50 ................
[CV] . learning_rate=0.01, max_depth=2, n_estimators=50, total=   0.4s
[CV] learning_rate=0.01, max_depth=2, n_estimators=50 ................
[CV] . learning_rate=0.01, max_depth=2, n_estimators=50, total=   0.5s
[CV] learning_rate=0.01, max_depth=2, n_estimators=50 ................
[CV] . learning_rate=0.01, max_depth=2, n_estimators=50, total=   0.5s
[CV] learning_rate=0.01, max_depth=2, n_estimators=50 ................
[CV] . learning_rate=0.01, max_depth=2, n_estimators=50, total=   0.5s
[CV] learning_rate=0.01, max_depth=2, n_estimators=50 ................
[CV] . learning_rate=0.01, max_depth=2, n_estimators=50, total=   0.5s
[CV] learning_rate=0.01, max_depth=2, n_estimators=50 ................
[CV] . learning_rate=0.01, max_depth=2, n_estimators=50, total=   0.4s
[CV] l

[CV] learning_rate=0.01, max_depth=16, n_estimators=200 ..............
[CV]  learning_rate=0.01, max_depth=16, n_estimators=200, total=   2.4s
[CV] learning_rate=0.01, max_depth=16, n_estimators=200 ..............
[CV]  learning_rate=0.01, max_depth=16, n_estimators=200, total=   2.4s
[CV] learning_rate=0.01, max_depth=16, n_estimators=200 ..............
[CV]  learning_rate=0.01, max_depth=16, n_estimators=200, total=   2.6s
[CV] learning_rate=0.01, max_depth=16, n_estimators=200 ..............
[CV]  learning_rate=0.01, max_depth=16, n_estimators=200, total=   2.6s
[CV] learning_rate=0.1, max_depth=2, n_estimators=50 .................
[CV] .. learning_rate=0.1, max_depth=2, n_estimators=50, total=   0.4s
[CV] learning_rate=0.1, max_depth=2, n_estimators=50 .................
[CV] .. learning_rate=0.1, max_depth=2, n_estimators=50, total=   0.4s
[CV] learning_rate=0.1, max_depth=2, n_estimators=50 .................
[CV] .. learning_rate=0.1, max_depth=2, n_estimators=50, total=   0.4s
[C

[CV] learning_rate=0.1, max_depth=16, n_estimators=200 ...............
[CV]  learning_rate=0.1, max_depth=16, n_estimators=200, total=   1.6s
[CV] learning_rate=0.1, max_depth=16, n_estimators=200 ...............
[CV]  learning_rate=0.1, max_depth=16, n_estimators=200, total=   1.8s
[CV] learning_rate=0.1, max_depth=16, n_estimators=200 ...............
[CV]  learning_rate=0.1, max_depth=16, n_estimators=200, total=   1.6s
[CV] learning_rate=0.1, max_depth=16, n_estimators=200 ...............
[CV]  learning_rate=0.1, max_depth=16, n_estimators=200, total=   1.6s
[CV] learning_rate=0.1, max_depth=16, n_estimators=200 ...............
[CV]  learning_rate=0.1, max_depth=16, n_estimators=200, total=   1.6s
[CV] learning_rate=0.1, max_depth=16, n_estimators=200 ...............
[CV]  learning_rate=0.1, max_depth=16, n_estimators=200, total=   1.7s
[CV] learning_rate=0.1, max_depth=16, n_estimators=200 ...............
[CV]  learning_rate=0.1, max_depth=16, n_estimators=200, total=   1.6s
[CV] l

[CV] learning_rate=1, max_depth=16, n_estimators=100 .................
[CV] .. learning_rate=1, max_depth=16, n_estimators=100, total=   0.8s
[CV] learning_rate=1, max_depth=16, n_estimators=100 .................
[CV] .. learning_rate=1, max_depth=16, n_estimators=100, total=   0.9s
[CV] learning_rate=1, max_depth=16, n_estimators=100 .................
[CV] .. learning_rate=1, max_depth=16, n_estimators=100, total=   1.0s
[CV] learning_rate=1, max_depth=16, n_estimators=100 .................
[CV] .. learning_rate=1, max_depth=16, n_estimators=100, total=   1.0s
[CV] learning_rate=1, max_depth=16, n_estimators=200 .................
[CV] .. learning_rate=1, max_depth=16, n_estimators=200, total=   1.5s
[CV] learning_rate=1, max_depth=16, n_estimators=200 .................
[CV] .. learning_rate=1, max_depth=16, n_estimators=200, total=   1.5s
[CV] learning_rate=1, max_depth=16, n_estimators=200 .................
[CV] .. learning_rate=1, max_depth=16, n_estimators=200, total=   1.6s
[CV] l

[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed:  5.9min finished


Best: 0.783604 using {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 50}
0.775434 (0.000113) with: {'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 50}
0.775434 (0.000113) with: {'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 100}
0.781166 (0.004618) with: {'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 200}
0.775434 (0.000113) with: {'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 50}
0.768572 (0.032251) with: {'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 100}
0.762312 (0.087453) with: {'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 200}
0.760427 (0.045080) with: {'learning_rate': 0.01, 'max_depth': 8, 'n_estimators': 50}
0.756983 (0.082889) with: {'learning_rate': 0.01, 'max_depth': 8, 'n_estimators': 100}
0.770633 (0.087147) with: {'learning_rate': 0.01, 'max_depth': 8, 'n_estimators': 200}
0.760427 (0.045080) with: {'learning_rate': 0.01, 'max_depth': 16, 'n_estimators': 50}
0.757234 (0.082901) with: {'learning_rate': 0.01, 'ma

In [24]:
#'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 50

## Run test set

In [2]:
all_dataset = pd.read_csv('train/all_data.csv')
test_all_dataset = pd.read_csv('b/all_data_b.csv')

In [3]:
key_set = list(all_dataset.keys())
key_set.remove('flag')

In [4]:
test_all_dataset = test_all_dataset[key_set]
test_all_dataset.to_csv('b/all_data_b.csv',index=False)

In [5]:
print(all_dataset.shape,test_all_dataset.shape)

(39779, 151) (4000, 150)


In [17]:
def feature_selection():
    data_set = pd.read_csv('train/all_data.csv')
    kf = KFold(n_splits=8,shuffle=True)
    models = [0] * kf.get_n_splits()

    y = data_set['flag']
    data_set.drop(['flag','id'], axis=1, inplace=True)
    X = data_set.fillna(0)

    X_data = X.values
    y_data = y.values
    
    test_set=pd.read_csv('b/all_data_b.csv')
    test_set.drop(['id'], axis=1, inplace=True)
    X_test = test_set.values
    
    
    roc_scores = []
    accuracy = 0
    preds_store = np.zeros((X_test.shape[0]))
    preds_data = np.zeros((X_test.shape[0]))
    i=0
    for train_index, val_index in kf.split(X_data,y_data):
        X_train, X_val = X_data[train_index], X_data[val_index]
        y_train, y_val = y_data[train_index], y_data[val_index]
        eval_set = [(X_train, y_train), (X_val, y_val)]
        models[i] = sklearn.ensemble.GradientBoostingClassifier(learning_rate= 0.1,
                          max_depth=16,
                          n_estimators=150)
        models[i].fit(X_train, y_train)
        score = models[i].predict_proba(X_val)[:, 1]
        roc = roc_auc_score(y_val, score)
        roc_scores.append(roc)
    
        preds_data = models[i].predict_proba(X_test)[:,1]
        preds_store = preds_store+preds_data
        i+=1
    print(i)
    preds_store=preds_store/8
    print("mean_roc_score: %f"%(np.mean(roc_scores)))
    return (preds_store)

In [18]:
data_set = pd.read_csv('b/all_data_b.csv')
dict_rank={}
test_preds = pd.DataFrame({"id": data_set['id']})
test_preds['pred'] = feature_selection()
test_preds.to_csv('b/answer.csv',index=False)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [37]:
def write_into(path):
    a = open(path, "r")
    data_x= pd.read_csv(filepath_or_buffer = path, sep = ',')["id"].values
    data_y= pd.read_csv(filepath_or_buffer = path, sep = ',')["pred"].values
    f=open('b/answer.txt', 'a')
    for i in range(0,len(a.readlines())-1):
        f.write(str(data_x[i]))
        f.write('\t')
        f.write(str(float(data_y[i])-0.004))
        f.write('\n')
    f.close()

In [38]:
write_into('b/answer.csv')