In [1]:
import os
import zipfile
import time
import pickle
import gc

import pandas as pd
import numpy as np
from tqdm import tqdm

from utils import load_pickle, dump_pickle, get_feature_value, feature_spearmanr, feature_target_spearmanr, addCrossFeature, calibration
from utils import raw_data_path, feature_data_path, cache_pkl_path, analyse

In [2]:
all_data_path = feature_data_path + 'all_data_all_features.pkl'
all_data = load_pickle(all_data_path)

target = 'is_trade'

features = load_pickle('all_features.pkl')
categorical_feature = load_pickle('categorical_feature.pkl')

# 只使用原始特征
# features = load_pickle('original_features.pkl')
# categorical_feature = ['user_gender_id', 'user_occupation_id']

len(features), len(categorical_feature)

(292, 15)

### 取7号上午数据进行训练和测试，以11点为分割

In [3]:
data = all_data[(all_data.day == 7) & (all_data.is_trade != -1)]

train_data = data[(data.hour < 11)]
test_data = data[data.hour >= 11]

print(train_data.shape)
print(test_data.shape)

(950233, 312)
(126942, 312)


In [7]:
from sklearn.metrics import log_loss
import lightgbm as lgb

lgb_train_data = lgb.Dataset(
    train_data[features], label=train_data[target], feature_name=features, categorical_feature=categorical_feature)
lgb_test_data = lgb_train_data.create_valid(
    test_data[features], label=test_data[target])

param = {'application': 'binary',
         'metric': 'binary_logloss',

         'learning_rate': 0.005,
         
         'boosting':'rf',

         'max_depth': 100,
         'num_leaves': 18,

         'min_data_in_leaf': 500,
         'min_sum_hessian_in_leaf': 0.01,
         'min_gain_to_split': 0.1,

         'feature_fraction': 0.9,
         'bagging_fraction': 0.8,
         'bagging_freq': 1,

         'lambda_l2': 1.0,
         'max_bin': 63,

#          'device': 'gpu',
#          'gpu_use_dp': False,
         }


valid_sets = [lgb_train_data, lgb_test_data]

bst = lgb.train(param, lgb_train_data, valid_sets=valid_sets, 
                num_boost_round=2000, early_stopping_rounds=100, verbose_eval=20, 
                categorical_feature=categorical_feature
               )

loss_train = log_loss(train_data[target], bst.predict(train_data[features]))
loss_test = log_loss(test_data[target], bst.predict(test_data[features]))

loss_train, loss_test



Training until validation scores don't improve for 100 rounds.
[20]	training's binary_logloss: 0.232141	valid_1's binary_logloss: 0.226506
[40]	training's binary_logloss: 0.232135	valid_1's binary_logloss: 0.226499
[60]	training's binary_logloss: 0.232134	valid_1's binary_logloss: 0.226501
[80]	training's binary_logloss: 0.232141	valid_1's binary_logloss: 0.226506
[100]	training's binary_logloss: 0.232141	valid_1's binary_logloss: 0.226506
[120]	training's binary_logloss: 0.232142	valid_1's binary_logloss: 0.226505
[140]	training's binary_logloss: 0.232141	valid_1's binary_logloss: 0.226503
Early stopping, best iteration is:
[44]	training's binary_logloss: 0.232133	valid_1's binary_logloss: 0.226497


(0.23213265947035749, 0.22649706671461134)

## 输出临时文件，用于组合

In [20]:
test_data['predicted_score'] = bst.predict(test_data[features])

test_data[['instance_id', 'predicted_score']].to_csv(
    'hour-1-depth-5-iter-1061-loss-16138.txt', index=False, sep=' ')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


## 特征重要度

In [21]:
importance = pd.DataFrame(bst.feature_importance())
importance.columns = ['importance_20']
features = pd.DataFrame(features)
features.columns = ['features_20']
pd.set_option('max_rows',500)
merge = features.join(importance)
merge = merge.sort_values(by=['importance_20'], ascending=False).reset_index()
merge

Unnamed: 0,index,features_20,importance_20
0,138,item_property_topic,1665
1,4,item_sales_level,570
2,51,user_click_time_gap_after,323
3,61,user_category2_label_last_click_interval_day,263
4,11,user_star_level,244
5,146,item_id_smooth_CTR,232
6,15,shop_id,189
7,225,user_age_level_category2_label_smooth_CTR,180
8,276,item_id_click_hour,178
9,0,item_id,176


### 取6号和7号上午数据进行训练和测试，以7号11点为分割

In [45]:
data = all_data[all_data.is_trade != -1]

train_data = data[(data.day == 6) | ((data.hour < 11) & (data.day == 7))]
test_data = data[(data.hour >= 11) & (data.day == 7)]

print(train_data.shape)
print(test_data.shape)

(2884676, 169)
(126942, 169)
