# Avazu CTR 测试数据特征工程

In [1]:
# 首先 import 必要的模块
import pandas as pd 
import numpy as np

import random

import matplotlib.pyplot as plt
%matplotlib inline

## 1、数据准备

### 1.1 读取数据

In [3]:
# Directory that holds the input CSV files
dpath = './data/'

# Test-set file; the `id` column is loaded as text rather than parsed as a number
filename = "test1.csv"
test = pd.read_csv(
    dpath + filename,
    dtype={'id': str},
)

In [4]:
# Disabled: export of the loaded frame to 'test_tiny.csv' (kept for reference only).
#test.to_csv(dpath + 'test_tiny.csv', index=False, header=True)
# Preview the first rows to check that the columns were parsed as expected.
test.head()

Unnamed: 0,id,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,10000174058809263569,14103100,1005,0,235ba823,f6ebf28e,f028772b,ecad2386,7801e8d9,07d7df22,...,1,0,8330,320,50,761,3,175,100075,23
1,10000182526920855428,14103100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,1,0,22676,320,50,2616,0,35,100083,51
2,10000554139829213984,14103100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,1,0,22676,320,50,2616,0,35,100083,51
3,10001094637809798845,14103100,1005,0,85f751fd,c4e18dd6,50e219e0,51cedd4e,aefc06bd,0f2161f8,...,1,0,18648,320,50,1092,3,809,100156,61
4,10001377041558670745,14103100,1005,0,85f751fd,c4e18dd6,50e219e0,9c13b419,2347f47a,f95efa07,...,1,0,23160,320,50,2667,0,47,-1,221


In [5]:
# Summarise the column dtypes and memory footprint of the loaded test set.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4577464 entries, 0 to 4577463
Data columns (total 23 columns):
id                  object
hour                int64
C1                  int64
banner_pos          int64
site_id             object
site_domain         object
site_category       object
app_id              object
app_domain          object
app_category        object
device_id           object
device_ip           object
device_model        object
device_type         int64
device_conn_type    int64
C14                 int64
C15                 int64
C16                 int64
C17                 int64
C18                 int64
C19                 int64
C20                 int64
C21                 int64
dtypes: int64(13), object(10)
memory usage: 803.2+ MB


### 1.2 数据分类

In [6]:
# Keep the id column aside so it can be attached to the output
ID = test['id']
# Test-set features: every column except the id
X_test = test.drop(['id'], axis=1)

# Feature names kept for later use (e.g. visualisation)
feat_names = X_test.columns

# Names of the numeric columns (each column listed exactly once)
numerical_features = ['hour', 'C1', 'banner_pos',
                      'device_type', 'device_conn_type',
                      'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']
# Names of the categorical (string-valued) columns
categorical_features = ['site_id', 'site_domain', 'site_category',
                        'app_id', 'app_domain', 'app_category',
                        'device_id', 'device_ip', 'device_model']

### 1.3 数据处理

LightGBM 虽然支持类别型（离散）特征，但传入数据集的数据类型必须是数值型，所以需要把 DataFrame 中 object 类型的列转换为 int 型。这里用 LabelEncoder 完成。

In [7]:
# Encoding utilities
from sklearn import preprocessing

# Replace every string-valued categorical column with integer codes.
# NOTE(review): each encoder is fitted on the test data alone. If the model was
# trained on codes produced by encoders fitted on other data, the integer codes
# generated here may not line up with the training ones — confirm against the
# training notebook.
for col in categorical_features:
    # Convert the column to a plain list once and reuse it for fit and transform
    col_values = X_test[col].values.tolist()

    le = preprocessing.LabelEncoder()
    X_test[col] = le.fit_transform(col_values)

In [8]:
# Inspect the frame after label encoding: the former string columns now hold integer codes.
X_test.head()

Unnamed: 0,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,14103100,1005,0,393,3234,20,3643,94,0,193125,...,1,0,8330,320,50,761,3,175,100075,23
1,14103100,1005,0,355,3197,1,3643,94,0,193125,...,1,0,22676,320,50,2616,0,35,100083,51
2,14103100,1005,0,355,3197,1,3643,94,0,193125,...,1,0,22676,320,50,2616,0,35,100083,51
3,14103100,1005,0,1491,2590,5,1224,137,3,193125,...,1,0,18648,320,50,1092,3,809,100156,61
4,14103100,1005,0,1491,2590,5,2403,21,26,193125,...,1,0,23160,320,50,2667,0,47,-1,221


## 2、测试模型

### 2.1 导入模型

In [9]:
# Load the previously trained model
import pickle

# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load model files that come from a trusted source.
# The context manager closes the file handle as soon as the model is loaded.
with open("Train_LightGBM.pkl", 'rb') as model_file:
    model = pickle.load(model_file)

### 2.2 预测叶子节点

In [10]:
# Request leaf assignments (pred_leaf=True) instead of prediction scores.
# NOTE(review): presumably one row per sample and one column per tree of the
# loaded model — confirm against the LightGBM predict documentation.
lgb_leaves = model.predict(X_test, pred_leaf=True)
print(lgb_leaves)

[[ 85  69  40 ...  81  24  94]
 [ 85  16  34 ... 114  24  94]
 [ 85  11  34 ... 114  24  94]
 ...
 [128 102  61 ...  90  24  86]
 [ 85  67   5 ... 120  16  94]
 [ 85  54  32 ...  37  24  94]]


In [11]:
# Dimensions of the leaf-index matrix (rows, columns).
lgb_leaves.shape

(4577464, 25)

### 2.3 保存结果作为新的测试集

In [12]:
# Wrap the leaf-index matrix in a DataFrame
out_df = pd.DataFrame(lgb_leaves)

# One label per column of the matrix: 'Class_1', 'Class_2', ...
columns = np.array(
    ['Class_' + str(position) for position in range(1, lgb_leaves.shape[1] + 1)],
    dtype=object,
)
out_df.columns = columns

out_df.head()

Unnamed: 0,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9,Class_10,...,Class_16,Class_17,Class_18,Class_19,Class_20,Class_21,Class_22,Class_23,Class_24,Class_25
0,85,69,40,117,102,25,102,72,8,59,...,44,11,77,129,58,3,119,81,24,94
1,85,16,34,58,100,31,97,72,0,59,...,44,11,28,120,58,3,15,114,24,94
2,85,11,34,27,100,31,97,72,0,93,...,44,11,28,5,108,3,9,114,24,94
3,85,10,57,12,110,67,108,13,1,56,...,98,11,100,100,54,3,6,4,50,94
4,85,11,37,10,27,11,27,7,1,33,...,71,11,37,19,33,3,9,28,54,94


In [13]:
# Column layout of the exported frame: the row id followed by the leaf-index columns
out_feat_names = ['id'] + list(columns)

# Attach the id column in front of the leaf-index features.
# The frame is rebuilt from lgb_leaves (not from out_df itself) so that
# re-running this cell does not prepend a second id column.
out_df = pd.concat([ID, pd.DataFrame(lgb_leaves)], axis=1, ignore_index=True)
out_df.columns = out_feat_names

In [14]:
# Write the id plus leaf-index columns to 'test_GBDT.csv' under dpath, without the DataFrame index.
out_df.to_csv(dpath + "test_GBDT.csv", index=False)

In [15]:
# Final check of the exported frame: id first, then the Class_* columns.
out_df.head()

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9,...,Class_16,Class_17,Class_18,Class_19,Class_20,Class_21,Class_22,Class_23,Class_24,Class_25
0,10000174058809263569,85,69,40,117,102,25,102,72,8,...,44,11,77,129,58,3,119,81,24,94
1,10000182526920855428,85,16,34,58,100,31,97,72,0,...,44,11,28,120,58,3,15,114,24,94
2,10000554139829213984,85,11,34,27,100,31,97,72,0,...,44,11,28,5,108,3,9,114,24,94
3,10001094637809798845,85,10,57,12,110,67,108,13,1,...,98,11,100,100,54,3,6,4,50,94
4,10001377041558670745,85,11,37,10,27,11,27,7,1,...,71,11,37,19,33,3,9,28,54,94
