In [1]:
import os
import zipfile
import time
import pickle
import gc

import pandas as pd
import numpy as np
from tqdm import tqdm

from utils import load_pickle, dump_pickle, get_feature_value, feature_spearmanr, feature_target_spearmanr, addCrossFeature, calibration
from utils import raw_data_path, feature_data_path, cache_pkl_path, analyse

In [2]:
all_data_path = feature_data_path + 'all_data_all_features_new_0512.pkl'
all_data = load_pickle(all_data_path)

target = 'is_trade'

features = load_pickle('all_features_day_4567.pkl')
categorical_feature = load_pickle('categorical_feature.pkl')

len(features), len(categorical_feature)

(253, 1)

In [3]:
train_data = all_data[(all_data.is_trade >= 0)]
test_data = all_data[all_data.is_trade == -2]

print(train_data.shape)
print(test_data.shape)

(5369478, 573)
(1209768, 573)


In [4]:
from sklearn.metrics import log_loss
import lightgbm as lgb

lgb_train_data = lgb.Dataset(
    train_data[features], label=train_data[target], feature_name=features, categorical_feature=categorical_feature)


param = {'application': 'binary',
         'metric': 'binary_logloss',

         'learning_rate': 0.05,

         'max_depth': 5,
         'num_leaves': 20,

         'min_data_in_leaf': 200,
         'min_sum_hessian_in_leaf': 0.001,
         'min_gain_to_split': 0.1,

         'feature_fraction': 0.8,
         'bagging_fraction': 0.7,
         'bagging_freq': 1,

         'lambda_l2': 10,
         'max_bin': 63,

         'device': 'gpu',
         'gpu_use_dp': True,
         
#          'num_threads': 1,
         }


valid_sets = [lgb_train_data,]

bst = lgb.train(param, lgb_train_data, 
                num_boost_round=2200, 
                categorical_feature=categorical_feature,
                valid_sets=valid_sets, verbose_eval=20,)


test_data['predicted_score'] = bst.predict(test_data[features])

test_data[['instance_id', 'predicted_score']].to_csv(
    '20180514-2200-day-4567.txt', index=False, sep=' ')


[20]	training's binary_logloss: 0.233822
[40]	training's binary_logloss: 0.117862
[100]	training's binary_logloss: 0.0694651
[120]	training's binary_logloss: 0.0683176
[140]	training's binary_logloss: 0.0677381
[160]	training's binary_logloss: 0.0673753
[180]	training's binary_logloss: 0.067126
[200]	training's binary_logloss: 0.0669357
[220]	training's binary_logloss: 0.066784
[240]	training's binary_logloss: 0.0666637
[260]	training's binary_logloss: 0.0665491
[280]	training's binary_logloss: 0.0664565
[300]	training's binary_logloss: 0.066372
[320]	training's binary_logloss: 0.0662915
[340]	training's binary_logloss: 0.0662156
[360]	training's binary_logloss: 0.0661457
[380]	training's binary_logloss: 0.0660867
[400]	training's binary_logloss: 0.06603
[420]	training's binary_logloss: 0.0659752
[440]	training's binary_logloss: 0.0659229
[460]	training's binary_logloss: 0.0658731
[480]	training's binary_logloss: 0.0658247
[500]	training's binary_logloss: 0.0657756
[520]	training's bin

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [5]:
# predict_train = bst.predict(train_data[features])
predict_test = bst.predict(test_data[features])

# train_ctr = float(sum(predict_train)/float(len(predict_train)))
test_ctr = float(sum(predict_test)/float(len(predict_test)))

# train_ctr, 
test_ctr

0.034462307352375605

In [6]:
test_data['predicted_score'] = bst.predict(test_data[features])
ctr = test_data.groupby(['hour'])['predicted_score'].mean().reset_index().rename(columns={0: 'ctr'})
ctr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,hour,predicted_score
0,12,0.041335
1,13,0.039952
2,14,0.036357
3,15,0.036461
4,16,0.036538
5,17,0.037059
6,18,0.037062
7,19,0.036812
8,20,0.034777
9,21,0.030587


In [7]:

p1 = pd.read_csv('20180513-2900-day-47.txt', sep=' ')
p2 = pd.read_csv('201805012_7all_2050.txt', sep=' ')

p2['predicted_score'] = p2['predicted_score'] * 0.5 + p1['predicted_score'] * 0.5


p2[['instance_id', 'predicted_score']].to_csv(
    '20180513-combination.txt', index=False, sep=' ')

test_ctr = float(sum(p2['predicted_score'])/float(len(p2['predicted_score'])))

test_ctr

0.03949020931050776

In [None]:
p2.shape

## 检查提交数据 

In [None]:
test_data[-10:]

# 最终版本

In [None]:
201805012_7all_2050.txt
20180513-2900-day-47.txt