# 对训练集和测试集一起做独热和哈希编码

In [1]:
# 首先导入必要的工具包
import pandas as pd
import numpy as np

## 1、准备数据

### 1.1 读取数据

In [2]:
# 数据路径
dpath = './data/'

# 读取训练数据
file_train = 'train_tiny.csv'
train = pd.read_csv(dpath + file_train, dtype={'id':str})

In [3]:
train.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,10014385711019128754,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,-1,79
1,10019341288757450780,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15707,320,50,1722,0,35,-1,79
2,10024331030544393307,1,14102100,1002,0,f727f4e0,fcb30e54,50e219e0,ecad2386,7801e8d9,...,0,0,6616,320,50,576,2,35,100131,32
3,10065138335302585931,0,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,03528b27,2347f47a,...,1,2,6559,320,50,571,2,39,-1,32
4,10077489368961027234,0,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,e96166c7,2347f47a,...,1,0,18993,320,50,2161,0,35,100148,157


In [4]:
train.shape

(500000, 24)

In [5]:
file_test = 'test.csv'

# 读取测试数据
test = pd.read_csv(dpath + file_test, dtype={'id': str})

In [6]:
test.head()

Unnamed: 0,id,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,10000174058809263569,14103100,1005,0,235ba823,f6ebf28e,f028772b,ecad2386,7801e8d9,07d7df22,...,1,0,8330,320,50,761,3,175,100075,23
1,10000182526920855428,14103100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,1,0,22676,320,50,2616,0,35,100083,51
2,10000554139829213984,14103100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,1,0,22676,320,50,2616,0,35,100083,51
3,10001094637809798845,14103100,1005,0,85f751fd,c4e18dd6,50e219e0,51cedd4e,aefc06bd,0f2161f8,...,1,0,18648,320,50,1092,3,809,100156,61
4,10001377041558670745,14103100,1005,0,85f751fd,c4e18dd6,50e219e0,9c13b419,2347f47a,f95efa07,...,1,0,23160,320,50,2667,0,47,-1,221


In [7]:
test.shape

(4577464, 23)

### 1.2 数据划分

为了保证训练数据和测试数据编码之后的特征维度依然相同，我们需要先把训练集和测试集合并在一起做独热和哈希编码，处理完之后再把两者分开。

In [8]:
# 保存训练集的id，click和所有特征
id_train = train['id']
y_train = train['click']
X_train = train.drop(['id', 'click'], axis=1)

# 保存测试集的id和所有特征
id_test = test['id']
X_test = test.drop(['id'], axis=1)

# 添加新的特征label作为训练集和测试集的分类
X_train['label'] = 1
X_test['label'] = -1

In [9]:
X_train.head()

Unnamed: 0,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,...,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21,label
0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,...,0,15704,320,50,1722,0,35,-1,79,1
1,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,...,0,15707,320,50,1722,0,35,-1,79,1
2,14102100,1002,0,f727f4e0,fcb30e54,50e219e0,ecad2386,7801e8d9,07d7df22,2ad16ba3,...,0,6616,320,50,576,2,35,100131,32,1
3,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,03528b27,2347f47a,8ded1f7a,a99f214a,...,2,6559,320,50,571,2,39,-1,32,1
4,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,e96166c7,2347f47a,0f2161f8,fd47d3a8,...,0,18993,320,50,2161,0,35,100148,157,1


In [10]:
X_test.head()

Unnamed: 0,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,...,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21,label
0,14103100,1005,0,235ba823,f6ebf28e,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,...,0,8330,320,50,761,3,175,100075,23,-1
1,14103100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,...,0,22676,320,50,2616,0,35,100083,51,-1
2,14103100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,...,0,22676,320,50,2616,0,35,100083,51,-1
3,14103100,1005,0,85f751fd,c4e18dd6,50e219e0,51cedd4e,aefc06bd,0f2161f8,a99f214a,...,0,18648,320,50,1092,3,809,100156,61,-1
4,14103100,1005,0,85f751fd,c4e18dd6,50e219e0,9c13b419,2347f47a,f95efa07,a99f214a,...,0,23160,320,50,2667,0,47,-1,221,-1


### 1.3 合并数据

In [11]:
# 将测试数据添加到训练数据后
X_all = X_train.append(X_test, ignore_index=True)

In [12]:
X_all.shape

(5077464, 23)

In [13]:
# 查看离散型特征分布
for col in X_all.columns:
    print('\n%s特征的不同取值和出现次数' % col)
    print(X_all[col].value_counts())


hour特征的不同取值和出现次数
14103109    299162
14103115    294435
14103107    291074
14103111    271846
14103113    271542
14103106    268602
14103116    259785
14103114    258779
14103110    250802
14103118    223281
14103112    221738
14103117    214982
14103108    208737
14103119    158623
14103105    145333
14103104    129317
14103120    128648
14103102    117478
14103121    104979
14103103     98818
14103101     94948
14103122     91011
14103100     90990
14103123     82554
14102209      5549
14102210      5468
14102813      5308
14102212      5064
14102211      4799
14102814      4751
             ...  
14102721      1104
14102300      1102
14102223      1097
14103000      1096
14102121      1087
14102702      1073
14102621      1071
14103001      1047
14102601      1031
14102400      1013
14102200      1011
14102501      1007
14103023      1002
14102800       986
14102523       958
14102701       936
14102504       923
14102123       908
14102623       907
14102600       894
14102500     

6b9769f2    22464
431b3174    12495
f1542531    12337
0489ce3f    10338
c6563308    10300
1cf29716    10252
a8536f3a    10248
ceffea69    10239
ddd2926e    10218
75bb1b58    10195
488a9a3e    10172
57cd4006    10071
8a014cbb     9821
9b1fe278     9755
ee0389c1     5685
07875ea4     5667
bca8f26d     5046
ac77b71a     4833
ff1c4f79     4700
e54c1344     4460
884297d2     4399
95b2935e     4256
df72a310     4073
693bff3e     3763
ff65d711     3605
930ec31d     3492
d90a7774     3377
7ed30f6c     3360
af62faf4     3335
2f323f36     3313
            ...  
686b1119        1
97c0c24a        1
f1b6031d        1
a25f92dc        1
a2517f6d        1
6d690457        1
bf73ee17        1
de8acd82        1
3d1bf83c        1
5cb1aa46        1
55fd986e        1
49172973        1
e0563f12        1
89209436        1
ca272bc0        1
311f07f9        1
9e45dd5a        1
62ce2f54        1
8213c67a        1
1c081ba5        1
dec14b2d        1
43a17414        1
e9f074e8        1
6386d28f        1
7bc7cbd0  

In [14]:
featnames_onehot = ['C1','banner_pos', 
                'site_category', 'app_category', 'device_type', 
                'device_conn_type', 'C15', 'C16', 
                 'C18', 'C19', 'C21']
featnames_hash = ['hour', 'site_id', 'site_domain', 'app_id', 
              'app_domain', 'device_id', 'device_ip', 
              'device_model', 'C14', 'C17', 'C20']

## 2、特征编码

In [15]:
# 数值型数据列名
numerical_features = ['hour', 'C1', 'banner_pos', 
                      'device_type', 'device_type', 'device_conn_type', 
                      'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']
#类别型数据列名
categorical_features = ['site_id', 'site_domain', 'site_category', 
                        'app_id', 'app_domain', 'app_category', 
                        'device_id', 'device_ip', 'device_model']

### 2.1 标签编码

In [16]:
# 导入编码工具包
from sklearn import preprocessing

for col in categorical_features:
    le = preprocessing.LabelEncoder()
    le.fit(X_all[col].values.tolist())

    X_all[col] = le.transform(X_all[col].values.tolist())

In [17]:
X_all.head()

Unnamed: 0,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,...,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21,label
0,14102100,1005,0,387,3552,1,4242,114,0,241357,...,0,15704,320,50,1722,0,35,-1,79,1
1,14102100,1005,0,387,3552,1,4242,114,0,241357,...,0,15707,320,50,1722,0,35,-1,79,1
2,14102100,1002,0,3036,3682,5,4242,114,0,60704,...,0,6616,320,50,576,2,35,100131,32,1
3,14102100,1005,0,1665,2879,5,58,28,18,241357,...,2,6559,320,50,571,2,39,-1,32,1
4,14102100,1005,0,1665,2879,5,4175,28,4,360469,...,0,18993,320,50,2161,0,35,100148,157,1


In [18]:
X_all.shape

(5077464, 23)

### 2.2 独热编码

对维度不高的特征进行独热编码

In [19]:
feats_onehot = pd.DataFrame()
new_df = pd.DataFrame()
for col in featnames_onehot:
    new_df = pd.get_dummies(X_all[col])
    print("%s encoding is completed." % col)
    feats_onehot = pd.concat([feats_onehot, new_df], axis=1, ignore_index=True)
    print("%s onehot is connected." % col)

C1 encoding is completed.
C1 onehot is connected.
banner_pos encoding is completed.
banner_pos onehot is connected.
site_category encoding is completed.
site_category onehot is connected.
app_category encoding is completed.
app_category onehot is connected.
device_type encoding is completed.
device_type onehot is connected.
device_conn_type encoding is completed.
device_conn_type onehot is connected.
C15 encoding is completed.
C15 onehot is connected.
C16 encoding is completed.
C16 onehot is connected.
C18 encoding is completed.
C18 onehot is connected.
C19 encoding is completed.
C19 onehot is connected.
C21 encoding is completed.
C21 onehot is connected.


In [20]:
# 设置独热编码列名
columns_onehot = np.empty(feats_onehot.shape[1], dtype=object)
for i in range(feats_onehot.shape[1]):
    columns_onehot[i] = 'OneHot_' + str(i+1)
feats_onehot.columns = columns_onehot
feats_onehot.head()

Unnamed: 0,OneHot_1,OneHot_2,OneHot_3,OneHot_4,OneHot_5,OneHot_6,OneHot_7,OneHot_8,OneHot_9,OneHot_10,...,OneHot_215,OneHot_216,OneHot_217,OneHot_218,OneHot_219,OneHot_220,OneHot_221,OneHot_222,OneHot_223,OneHot_224
0,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
feats_onehot.shape

(5077464, 224)

### 2.3 哈希编码

对高维度的特征进行哈希编码

In [22]:
X_all[featnames_hash].shape

(5077464, 11)

In [23]:
unique_id = np.unique(X_all[['site_id']])
print("Total site id:", len(unique_id))
print(unique_id)

Total site id: 3158
[   0    1    2 ... 3155 3156 3157]


In [24]:
from sklearn.feature_extraction import FeatureHasher
feats_hash = pd.DataFrame()
for col in featnames_hash:
    fh = FeatureHasher(n_features=10, input_type='string')
    df = fh.fit_transform(X_all[col].astype(str))
    df = pd.DataFrame(df.toarray(), dtype='float16')
    print("Feature %s is hashed." % col)
    feats_hash = pd.concat([feats_hash, df], axis=1, ignore_index=True)
    print("Feature %s is connected." % col)
print("Completed!")

Feature hour is hashed.
Feature hour is connected.
Feature site_id is hashed.
Feature site_id is connected.
Feature site_domain is hashed.
Feature site_domain is connected.
Feature app_id is hashed.
Feature app_id is connected.
Feature app_domain is hashed.
Feature app_domain is connected.
Feature device_id is hashed.
Feature device_id is connected.
Feature device_ip is hashed.
Feature device_ip is connected.
Feature device_model is hashed.
Feature device_model is connected.
Feature C14 is hashed.
Feature C14 is connected.
Feature C17 is hashed.
Feature C17 is connected.
Feature C20 is hashed.
Feature C20 is connected.
Completed!


In [25]:
# 设置哈希编码列名
columns_hash = np.empty(feats_hash.shape[1], dtype=object)
for i in range(feats_hash.shape[1]):
    columns_hash[i] = 'Hash_' + str(i+1)
feats_hash.columns = columns_hash
feats_hash.head()

Unnamed: 0,Hash_1,Hash_2,Hash_3,Hash_4,Hash_5,Hash_6,Hash_7,Hash_8,Hash_9,Hash_10,...,Hash_101,Hash_102,Hash_103,Hash_104,Hash_105,Hash_106,Hash_107,Hash_108,Hash_109,Hash_110
0,0.0,1.0,-1.0,0.0,0.0,0.0,0.0,-3.0,0.0,-3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,-1.0
1,0.0,1.0,-1.0,0.0,0.0,0.0,0.0,-3.0,0.0,-3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,-1.0
2,0.0,1.0,-1.0,0.0,0.0,0.0,0.0,-3.0,0.0,-3.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,-2.0
3,0.0,1.0,-1.0,0.0,0.0,0.0,0.0,-3.0,0.0,-3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,-1.0
4,0.0,1.0,-1.0,0.0,0.0,0.0,0.0,-3.0,0.0,-3.0,...,0.0,0.0,-1.0,-1.0,0.0,0.0,0.0,-2.0,0.0,-2.0


In [26]:
feats_hash.shape

(5077464, 110)

## 3、处理编码后的特征

In [27]:
# # 区分训练集和测试集的函数
# def split_data(dataset):
#     train_set = pd.DataFrame(columns=dataset.columns)
#     test_set = pd.DataFrame(columns=dataset.columns)
#     for index, row in dataset.iterrows():
#         if row['label'] == 1:
#             train_set = train_set.append(row).drop(['label'], axis=1)
#         elif row['label'] == -1:
#             test_set = test_set.append(row).drop(['label'], axis=1)
#         else:
#             print("Wrong label!!!")
#             return
#     return train_set, test_set

### 3.1 保存编码结果

**标签编码**

In [27]:
# 划分训练集和测试集
columns = X_all.columns
train_label = pd.DataFrame(X_all.loc[0:499999])
test_label = pd.DataFrame(X_all.loc[500000:5077463])

In [28]:
# 合并训练集id和click列
train_label = pd.concat([id_train, y_train, train_label], axis=1)
# 删除label列
train_label = train_label.drop(['label'], axis=1)
train_label.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,10014385711019128754,0,14102100,1005,0,387,3552,1,4242,114,...,1,0,15704,320,50,1722,0,35,-1,79
1,10019341288757450780,0,14102100,1005,0,387,3552,1,4242,114,...,1,0,15707,320,50,1722,0,35,-1,79
2,10024331030544393307,1,14102100,1002,0,3036,3682,5,4242,114,...,0,0,6616,320,50,576,2,35,100131,32
3,10065138335302585931,0,14102100,1005,0,1665,2879,5,58,28,...,1,2,6559,320,50,571,2,39,-1,32
4,10077489368961027234,0,14102100,1005,0,1665,2879,5,4175,28,...,1,0,18993,320,50,2161,0,35,100148,157


In [29]:
train_label.shape

(500000, 24)

In [30]:
test_label.head()

Unnamed: 0,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,...,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21,label
500000,14103100,1005,0,428,3595,21,4242,114,0,241357,...,0,8330,320,50,761,3,175,100075,23,-1
500001,14103100,1005,0,387,3552,1,4242,114,0,241357,...,0,22676,320,50,2616,0,35,100083,51,-1
500002,14103100,1005,0,387,3552,1,4242,114,0,241357,...,0,22676,320,50,2616,0,35,100083,51,-1
500003,14103100,1005,0,1665,2879,5,1435,169,4,241357,...,0,18648,320,50,1092,3,809,100156,61,-1
500004,14103100,1005,0,1665,2879,5,2781,28,28,241357,...,0,23160,320,50,2667,0,47,-1,221,-1


In [31]:
# 重置测试集索引
test_label = test_label.reset_index(drop=True)
test_label.head()

Unnamed: 0,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,...,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21,label
0,14103100,1005,0,428,3595,21,4242,114,0,241357,...,0,8330,320,50,761,3,175,100075,23,-1
1,14103100,1005,0,387,3552,1,4242,114,0,241357,...,0,22676,320,50,2616,0,35,100083,51,-1
2,14103100,1005,0,387,3552,1,4242,114,0,241357,...,0,22676,320,50,2616,0,35,100083,51,-1
3,14103100,1005,0,1665,2879,5,1435,169,4,241357,...,0,18648,320,50,1092,3,809,100156,61,-1
4,14103100,1005,0,1665,2879,5,2781,28,28,241357,...,0,23160,320,50,2667,0,47,-1,221,-1


In [32]:
# 合并测试集集id
test_label = pd.concat([id_test, test_label], axis=1)
# 删除label列
test_label = test_label.drop(['label'], axis=1)
test_label.head()

Unnamed: 0,id,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,10000174058809263569,14103100,1005,0,428,3595,21,4242,114,0,...,1,0,8330,320,50,761,3,175,100075,23
1,10000182526920855428,14103100,1005,0,387,3552,1,4242,114,0,...,1,0,22676,320,50,2616,0,35,100083,51
2,10000554139829213984,14103100,1005,0,387,3552,1,4242,114,0,...,1,0,22676,320,50,2616,0,35,100083,51
3,10001094637809798845,14103100,1005,0,1665,2879,5,1435,169,4,...,1,0,18648,320,50,1092,3,809,100156,61
4,10001377041558670745,14103100,1005,0,1665,2879,5,2781,28,28,...,1,0,23160,320,50,2667,0,47,-1,221


In [33]:
test_label.shape

(4577464, 23)

In [34]:
# 保存训练集和测试集
train_label.to_csv(dpath + "train_label.csv", index=False, header=True)
test_label.to_csv(dpath + "test_label.csv", index=False, header=True)

**合并独热编码和哈希编码**

In [35]:
# 划分独热编码训练集和测试集
train_onehot = pd.DataFrame(feats_onehot.loc[0:499999])
test_onehot = pd.DataFrame(feats_onehot.loc[500000:5077463])

# 重置独热编码测试集索引
test_onehot = test_onehot.reset_index(drop=True)

# 划分哈希编码训练集和测试集
train_hash = pd.DataFrame(feats_hash.loc[0:499999])
test_hash = pd.DataFrame(feats_hash.loc[500000:5077463])

# 重置哈希编码测试集索引
test_hash = test_hash.reset_index(drop=True)

In [36]:
# 合并训练集独热编码和哈希编码结果
train_oh_hash = pd.concat([id_train, y_train, train_onehot, train_hash], axis=1)
train_oh_hash.head()

Unnamed: 0,id,click,OneHot_1,OneHot_2,OneHot_3,OneHot_4,OneHot_5,OneHot_6,OneHot_7,OneHot_8,...,Hash_101,Hash_102,Hash_103,Hash_104,Hash_105,Hash_106,Hash_107,Hash_108,Hash_109,Hash_110
0,10014385711019128754,0,0,0,1,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,-1.0
1,10019341288757450780,0,0,0,1,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,-1.0
2,10024331030544393307,1,0,1,0,0,0,0,0,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,-2.0
3,10065138335302585931,0,0,0,1,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,-1.0
4,10077489368961027234,0,0,0,1,0,0,0,0,1,...,0.0,0.0,-1.0,-1.0,0.0,0.0,0.0,-2.0,0.0,-2.0


In [44]:
# 保存训练集
train_oh_hash.to_csv(dpath + "train_oh_hash.csv", index=False, header=True)

In [38]:
# 合并测试集独热编码和哈希编码结果
test_oh_hash = pd.concat([id_test, test_onehot, test_hash], axis=1)
test_oh_hash.head()

Unnamed: 0,id,OneHot_1,OneHot_2,OneHot_3,OneHot_4,OneHot_5,OneHot_6,OneHot_7,OneHot_8,OneHot_9,...,Hash_101,Hash_102,Hash_103,Hash_104,Hash_105,Hash_106,Hash_107,Hash_108,Hash_109,Hash_110
0,10000174058809263569,0,0,1,0,0,0,0,1,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,-3.0
1,10000182526920855428,0,0,1,0,0,0,0,1,0,...,1.0,0.0,0.0,-1.0,0.0,0.0,0.0,-1.0,0.0,-3.0
2,10000554139829213984,0,0,1,0,0,0,0,1,0,...,1.0,0.0,0.0,-1.0,0.0,0.0,0.0,-1.0,0.0,-3.0
3,10001094637809798845,0,0,1,0,0,0,0,1,0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0,0.0,-2.0
4,10001377041558670745,0,0,1,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,-1.0


In [41]:
# 保存测试集
test_oh_hash.to_csv(dpath + "test_oh_hash.csv", index=False, header=True)

**合并标签编码和哈希编码**

In [42]:
# 合并训练集标签编码和哈希编码结果
train_lb_hash = pd.concat([id_train, y_train, train_label[featnames_onehot], train_hash], axis=1)
train_lb_hash.head()

Unnamed: 0,id,click,C1,banner_pos,site_category,app_category,device_type,device_conn_type,C15,C16,...,Hash_101,Hash_102,Hash_103,Hash_104,Hash_105,Hash_106,Hash_107,Hash_108,Hash_109,Hash_110
0,10014385711019128754,0,1005,0,1,0,1,0,320,50,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,-1.0
1,10019341288757450780,0,1005,0,1,0,1,0,320,50,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,-1.0
2,10024331030544393307,1,1002,0,5,0,0,0,320,50,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,-2.0
3,10065138335302585931,0,1005,0,5,18,1,2,320,50,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,-1.0
4,10077489368961027234,0,1005,0,5,4,1,0,320,50,...,0.0,0.0,-1.0,-1.0,0.0,0.0,0.0,-2.0,0.0,-2.0


In [52]:
train_lb_hash.columns[13: 123]

Index(['Hash_1', 'Hash_2', 'Hash_3', 'Hash_4', 'Hash_5', 'Hash_6', 'Hash_7',
       'Hash_8', 'Hash_9', 'Hash_10',
       ...
       'Hash_101', 'Hash_102', 'Hash_103', 'Hash_104', 'Hash_105', 'Hash_106',
       'Hash_107', 'Hash_108', 'Hash_109', 'Hash_110'],
      dtype='object', length=110)

In [45]:
# 保存训练集
train_lb_hash.to_csv(dpath + "train_lb_hash.csv", index=False, header=True)

In [46]:
# 合并测试集标签编码和哈希编码结果
test_lb_hash = pd.concat([id_test, test_label[featnames_onehot], test_hash], axis=1)
test_lb_hash.head()

Unnamed: 0,id,C1,banner_pos,site_category,app_category,device_type,device_conn_type,C15,C16,C18,...,Hash_101,Hash_102,Hash_103,Hash_104,Hash_105,Hash_106,Hash_107,Hash_108,Hash_109,Hash_110
0,10000174058809263569,1005,0,21,0,1,0,320,50,3,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,-3.0
1,10000182526920855428,1005,0,1,0,1,0,320,50,0,...,1.0,0.0,0.0,-1.0,0.0,0.0,0.0,-1.0,0.0,-3.0
2,10000554139829213984,1005,0,1,0,1,0,320,50,0,...,1.0,0.0,0.0,-1.0,0.0,0.0,0.0,-1.0,0.0,-3.0
3,10001094637809798845,1005,0,5,4,1,0,320,50,3,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0,0.0,-2.0
4,10001377041558670745,1005,0,5,28,1,0,320,50,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,-1.0


In [47]:
# 保存测试集
test_lb_hash.to_csv(dpath + "test_lb_hash.csv", index=False, header=True)