In [1]:
# -*- coding: utf-8 -*-
import numpy as np 
import pandas as pd 
import lightgbm as lgb
import xgboost as xgb
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from scipy import sparse
import warnings
import time
import sys
import os
import re
import datetime
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns',None)
pd.set_option('max_colwidth',100)


In [2]:
train = pd.read_csv('./data/jinnan_round1_train_20181227.csv', encoding='gb18030')
test = pd.read_csv('./data/jinnan_round1_testA_20181227.csv', encoding='gb18030')

In [3]:
for df in [train, test]:
    df.drop(['B3', 'B13', 'A13', 'A18', 'A23'], axis=1, inplace=True)

In [4]:
# 删除缺失率超过90%的列
good_cols = list(train.columns)
for col in train.columns:
    rate = train[col].value_counts(normalize=True, dropna=False).values[0]
    if rate > 0.9:
        good_cols.remove(col)
        print(col,rate)

A1 0.9863896848137536
A2 0.9699140401146131
A3 0.9570200573065902
A4 0.9570200573065902
B2 0.9842406876790831


In [5]:
# 删除异常值
train = train[train['收率']>0.87]

In [6]:
# 合并数据集
target = train['收率']
del train['收率']
data = pd.concat([train,test],axis=0,ignore_index=True)
data = data.fillna(-1)

In [7]:
data

Unnamed: 0,样本id,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A14,A15,A16,A17,A19,A20,A21,A22,A24,A25,A26,A27,A28,B1,B2,B4,B5,B6,B7,B8,B9,B10,B11,B12,B14
0,sample_1528,300,-1.0,405.0,700,13:30:00,38.0,-1,-1.0,15:30:00,100,16:30:00,102.0,17:30:00,103.0,18:30:00,104.0,300,21:00-21:30,50.0,9.0,22:00:00,75,22:30:00,70.0,6:30-7:00,350.0,3.50,7:00-8:00,8:00:00,65,11:30:00,45.0,11:30-13:00,14:00-15:30,-1,800.0,400
1,sample_1698,300,-1.0,405.0,700,14:00:00,29.0,-1,-1.0,16:00:00,101,17:00:00,103.0,18:00:00,104.0,19:00:00,105.0,200,19:00-20:00,50.0,9.0,20:00:00,80,21:00:00,73.0,21:00-22:00,320.0,3.50,22:00-23:00,23:00:00,80,6:00:00,45.0,6:00-7:30,7:30-9:00,9:00-10:00,1200.0,400
2,sample_639,300,-1.0,405.0,700,14:00:00,29.0,-1,-1.0,16:00:00,102,17:00:00,103.0,18:00:00,104.0,19:00:00,105.0,200,19:00-19:30,50.0,9.0,20:00:00,79,21:00:00,73.0,21:00-22:00,320.0,3.50,22:00-23:00,23:00:00,80,1:00:00,45.0,1:00-2:30,2:30-4:00,4:00-5:00,1200.0,400
3,sample_483,300,-1.0,405.0,700,1:30:00,38.0,-1,-1.0,3:00:00,100,4:00:00,102.0,5:00:00,103.0,6:00:00,104.0,200,6:30-7:00,50.0,10.0,7:30:00,70,8:00:00,78.0,13:30-14:30,290.0,3.50,14:30-15:30,15:30:00,65,18:00:00,45.0,19:00-20:30,21:30-23:00,-1,800.0,400
4,sample_617,300,-1.0,405.0,700,22:00:00,29.0,-1,-1.0,0:00:00,101,1:00:00,103.0,2:00:00,104.0,3:00:00,105.0,200,3:00-4:00,50.0,9.0,4:00:00,80,5:00:00,73.0,5:00-6:00,320.0,3.50,6:00-7:00,7:00:00,80,9:00:00,45.0,9:00-10:30,10:30-12:00,12:00-13:00,1200.0,420
5,sample_373,300,-1.0,405.0,700,2:00:00,39.0,2:30:00,80.0,3:30:00,100,4:30:00,103.0,5:30:00,104.0,6:30:00,102.0,300,11:30-12:00,50.0,9.0,12:00:00,70,12:30:00,75.0,17:30-18:00,-1.0,3.50,18:00-20:00,20:00:00,65,3:00:00,45.0,3:00-4:30,5:30-7:00,-1,800.0,420
6,sample_577,300,-1.0,405.0,700,8:00:00,29.0,-1,-1.0,10:00:00,101,11:00:00,103.0,12:00:00,104.0,13:00:00,105.0,200,13:00-14:00,50.0,9.0,14:00:00,80,15:00:00,73.0,15:00-16:00,320.0,3.50,16:00-17:00,17:00:00,80,19:00:00,45.0,19:00-20:30,20:30-22:00,22:00-23:00,1200.0,400
7,sample_212,300,-1.0,405.0,700,6:00:00,29.0,-1,-1.0,8:00:00,101,9:00:00,102.0,10:00:00,103.0,11:00:00,103.0,200,11:00-12:00,50.0,9.0,12:00:00,78,13:00:00,73.0,13:00-14:00,320.0,3.50,14:00-15:00,15:00:00,80,17:00:00,45.0,17:00-18:30,18:30-20:00,20:00-21:00,1200.0,400
8,sample_521,300,-1.0,405.0,700,17:30:00,21.0,-1,-1.0,19:00:00,100,20:00:00,101.0,21:00:00,102.0,22:00:00,105.0,300,22:30-23:00,50.0,9.0,23:30:00,70,0:00:00,78.0,2:00-3:00,340.0,3.50,3:00-4:30,4:30:00,65,6:30:00,45.0,6:30-8:00,8:30-10:00,-1,800.0,420
9,sample_1026,300,-1.0,405.0,700,11:00:00,21.0,-1,-1.0,12:30:00,100,13:30:00,102.0,14:30:00,103.0,15:30:00,105.0,200,16:00-16:30,50.0,9.0,17:00:00,70,17:30:00,78.0,22:00-23:00,280.0,3.50,23:00-0:00,0:00:00,64,2:00:00,50.0,6:00-7:30,8:00-9:30,-1,800.0,400


In [8]:
def timeTranSecond(t):
    try:
        t,m,s=t.split(":")
    except:
        if t=='1900/1/9 7:00':
            return 7*3600/3600
        elif t=='1900/1/1 2:30':
            return (2*3600+30*60)/3600
        elif t==-1:
            return -1
        else:
            return 0
    
    try:
        tm = (int(t)*3600+int(m)*60+int(s))/3600
    except:
        return (30*60)/3600
    
    return tm
for f in ['A5','A7','A9','A11','A14','A16','A24','A26','B5','B7']:
    try:
        data[f] = data[f].apply(timeTranSecond)
    except:
        print(f,'应该在前面被删除了！')

def getDuration(se):
    try:
        sh,sm,eh,em=re.findall(r"\d+\.?\d*",se)
    except:
        if se == -1:
            return -1 
        
    try:
        if int(sh)>int(eh):
            tm = (int(eh)*3600+int(em)*60-int(sm)*60-int(sh)*3600)/3600 + 24
        else:
            tm = (int(eh)*3600+int(em)*60-int(sm)*60-int(sh)*3600)/3600
    except:
        if se=='19:-20:05':
            return 1
        elif se=='15:00-1600':
            return 1
    
    return tm
for f in ['A20','A28','B4','B9','B10','B11']:
    data[f] = data.apply(lambda df: getDuration(df[f]), axis=1)


cate_columns = [f for f in data.columns if f != '样本id']


In [9]:
#label encoder
for f in cate_columns:
    data[f] = data[f].map(dict(zip(data[f].unique(), range(0, data[f].nunique()))))
train = data[:train.shape[0]]
test  = data[train.shape[0]:]

In [10]:
train['target'] = list(target)
train['intTarget'] = pd.cut(train['target'], 5, labels=False)
train = pd.get_dummies(train, columns=['intTarget'])
li = train.columns[-5:]
mean_features = []
li

Index(['intTarget_0', 'intTarget_1', 'intTarget_2', 'intTarget_3',
       'intTarget_4'],
      dtype='object')

In [11]:
for f1 in cate_columns:
    rate = train[f1].value_counts(normalize=True, dropna=False).values[0]
    if rate < 0.50:
        for f2 in li:
            col_name = f1+"_"+f2+'_mean'
            mean_features.append(col_name)
            order_label = train.groupby([f1])[f2].mean()
            for df in [train, test]:
                df[col_name] = df[f1].map(order_label)

In [12]:
train.drop(li, axis=1, inplace=True)

train.drop(['样本id','target'], axis=1, inplace=True)
test = test[train.columns]
X_train = train.values
y_train = target.values
X_test = test.values

In [13]:
param = {'num_leaves': 120,
         'min_data_in_leaf': 30, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.01,
         "min_child_samples": 30,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'mse',
         "lambda_l1": 0.1,
         "verbosity": -1}
folds = KFold(n_splits=5, shuffle=True, random_state=2018)
oof_lgb = np.zeros(len(train))
predictions_lgb = np.zeros(len(test))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_+1))
    trn_data = lgb.Dataset(X_train[trn_idx], y_train[trn_idx])
    val_data = lgb.Dataset(X_train[val_idx], y_train[val_idx])

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=200, early_stopping_rounds = 100)
    oof_lgb[val_idx] = clf.predict(X_train[val_idx], num_iteration=clf.best_iteration)
    
    predictions_lgb += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.8f}".format(mean_squared_error(oof_lgb, target)))

fold n°1
Training until validation scores don't improve for 100 rounds.
[200]	training's l2: 0.000231916	valid_1's l2: 0.000256668
[400]	training's l2: 0.000173504	valid_1's l2: 0.00021685
[600]	training's l2: 0.00014793	valid_1's l2: 0.000205709
[800]	training's l2: 0.000134189	valid_1's l2: 0.000200248
[1000]	training's l2: 0.000125604	valid_1's l2: 0.000198354
Early stopping, best iteration is:
[1006]	training's l2: 0.00012539	valid_1's l2: 0.000198219
fold n°2
Training until validation scores don't improve for 100 rounds.
[200]	training's l2: 0.000227514	valid_1's l2: 0.000262397
[400]	training's l2: 0.00016908	valid_1's l2: 0.000234039
[600]	training's l2: 0.000145023	valid_1's l2: 0.000225499
[800]	training's l2: 0.000131566	valid_1's l2: 0.000220931
[1000]	training's l2: 0.00012293	valid_1's l2: 0.00021894
[1200]	training's l2: 0.000116468	valid_1's l2: 0.00021794
[1400]	training's l2: 0.000111946	valid_1's l2: 0.000217295
[1600]	training's l2: 0.000108309	valid_1's l2: 0.000216

In [14]:
##### xgb
xgb_params = {'eta': 0.005, 'max_depth': 10, 'subsample': 0.8, 'colsample_bytree': 0.8, 
          'objective': 'reg:linear', 'eval_metric': 'rmse', 'silent': True, 'nthread': 4}

folds = KFold(n_splits=5, shuffle=True, random_state=2018)
oof_xgb = np.zeros(len(train))
predictions_xgb = np.zeros(len(test))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_+1))
    trn_data = xgb.DMatrix(X_train[trn_idx], y_train[trn_idx])
    val_data = xgb.DMatrix(X_train[val_idx], y_train[val_idx])

    watchlist = [(trn_data, 'train'), (val_data, 'valid_data')]
    clf = xgb.train(dtrain=trn_data, num_boost_round=20000, evals=watchlist, early_stopping_rounds=200, verbose_eval=100, params=xgb_params)
    oof_xgb[val_idx] = clf.predict(xgb.DMatrix(X_train[val_idx]), ntree_limit=clf.best_ntree_limit)
    predictions_xgb += clf.predict(xgb.DMatrix(X_test), ntree_limit=clf.best_ntree_limit) / folds.n_splits
    
print("CV score: {:<8.8f}".format(mean_squared_error(oof_xgb, target)))

fold n°1
[0]	train-rmse:0.422934	valid_data-rmse:0.423824
Multiple eval metrics have been passed: 'valid_data-rmse' will be used for early stopping.

Will train until valid_data-rmse hasn't improved in 200 rounds.
[100]	train-rmse:0.257119	valid_data-rmse:0.258091
[200]	train-rmse:0.156792	valid_data-rmse:0.157712
[300]	train-rmse:0.096224	valid_data-rmse:0.09703
[400]	train-rmse:0.059811	valid_data-rmse:0.060521
[500]	train-rmse:0.038036	valid_data-rmse:0.038868
[600]	train-rmse:0.025087	valid_data-rmse:0.026516
[700]	train-rmse:0.017474	valid_data-rmse:0.019803
[800]	train-rmse:0.013043	valid_data-rmse:0.016346
[900]	train-rmse:0.010541	valid_data-rmse:0.014693
[1000]	train-rmse:0.009119	valid_data-rmse:0.013931
[1100]	train-rmse:0.008319	valid_data-rmse:0.013603
[1200]	train-rmse:0.007799	valid_data-rmse:0.013457
[1300]	train-rmse:0.007441	valid_data-rmse:0.013371
[1400]	train-rmse:0.007182	valid_data-rmse:0.013351
[1500]	train-rmse:0.006986	valid_data-rmse:0.013345
[1600]	train-rms

In [15]:
# 将lgb和xgb的结果进行stacking
train_stack = np.vstack([oof_lgb,oof_xgb]).transpose()
test_stack = np.vstack([predictions_lgb, predictions_xgb]).transpose()

folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=4590)
oof_stack = np.zeros(train_stack.shape[0])
predictions = np.zeros(test_stack.shape[0])

for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack,target)):
    print("fold {}".format(fold_))
    trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values
    val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values
    
    clf_3 = BayesianRidge()
    clf_3.fit(trn_data, trn_y)
    
    oof_stack[val_idx] = clf_3.predict(val_data)
    predictions += clf_3.predict(test_stack) / 10

fold 0
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9


In [16]:
mean_squared_error(target.values, oof_stack)

0.00019148960557363283

In [18]:
oof_stack[test_stack.shape[0]]

0.9364528436633934

In [19]:
predictions

array([0.92627381, 0.88607891, 0.93197994, 0.91637087, 0.91625421,
       0.9273154 , 0.92879629, 0.90028933, 0.93220258, 0.92523737,
       0.96828644, 0.96755611, 0.93753057, 0.95719872, 0.89987956,
       0.92651512, 0.89236596, 0.93363336, 0.95868525, 0.89648996,
       0.90755504, 0.97205596, 0.92964634, 0.94688992, 0.97141773,
       0.9440304 , 0.89807021, 0.92624112, 0.97150243, 0.90342941,
       0.97152122, 0.88996766, 0.89651606, 0.89511903, 0.91964799,
       0.89786548, 0.91637087, 0.89968604, 0.88901099, 0.90010532,
       0.89907215, 0.95752255, 0.97251967, 0.93614183, 0.91606626,
       0.89918627, 0.90858521, 0.90259315, 0.90062335, 0.92467017,
       0.8935002 , 0.92083482, 0.89348962, 0.89767706, 0.89158784,
       0.93510017, 0.92651512, 0.90038082, 0.93140167, 0.9440304 ,
       0.9331489 , 0.95799331, 0.91637087, 0.94556558, 0.93425905,
       0.92763814, 0.89507264, 0.93028916, 0.9449114 , 0.91489346,
       0.93489859, 0.97484962, 0.92434841, 0.8982415 , 0.92234

In [20]:
len(target.values)

1381

In [21]:
import time
time_name = time.strftime('%Y%m%d%H%M',time.localtime(time.time()))

sub_df = pd.read_csv('./data/jinnan_round1_submit_20181227.csv', header=None)
sub_df[1] = predictions
sub_df[1] = sub_df[1].apply(lambda x: round(x, 3))
sub_df.to_csv("./data/" + time_name + ".csv", index=False, header=None)
