### 贝叶斯平滑的代码参考：https://github.com/BladeCoda/Tencent2017_Final_Coda_Allegro/blob/master/ZHLsmooth.py

In [1]:
import time
import datetime
import gc
from collections import Counter

import pandas as pd
import numpy as np
import scipy.special as special

为防止过拟合，这里只使用**7号之前**的样本来统计转化率。经试验发现，用7号的数据构造转化率特征时非常容易**过拟合**，因此不使用7号的数据。

In [None]:
# Load the full dataset that was saved by the Step_1 data-splitting stage
data = pd.read_csv('../Temp/data.csv')

In [None]:
# Load the output of the Step_5 processing stage
train_day_7_test = pd.read_csv('../Temp/train_day_7_test_step_5.csv')

In [None]:
# Keep only the rows whose `day` is not 7; the conversion-rate statistics
# below are counted on this subset.
train_before_7 = data[data['day'] != 7]

In [2]:
# Bayesian smoothing: iterative estimation of the (alpha, beta) parameters
class BayesianSmoothing(object):
    """Estimate smoothing parameters ``alpha`` and ``beta`` from count data.

    The estimates are refined by repeated fixed-point updates built from
    digamma differences over per-group impression and click counts. The
    current estimates are stored on the instance as ``alpha`` and ``beta``.
    """

    def __init__(self, alpha, beta):
        """Store the initial values of ``alpha`` and ``beta``."""
        self.alpha = alpha
        self.beta = beta

    def update(self, imps, clks, iter_num, epsilon, verbose=True):
        """Run up to ``iter_num`` fixed-point updates of ``alpha``/``beta``.

        Args:
            imps: Sequence of impression counts, one entry per group.
            clks: Sequence of click/conversion counts, aligned with ``imps``.
            iter_num: Maximum number of iterations.
            epsilon: Stop as soon as both parameters move by less than this
                amount in one iteration.
            verbose: When true (the default) print the new parameters and the
                iteration index after every non-converged iteration.

        Raises:
            ValueError: If ``imps`` and ``clks`` do not have the same shape.
        """
        # Convert once so every iteration works on arrays instead of
        # indexing Python sequences element by element.
        imps = np.asarray(imps, dtype=float)
        clks = np.asarray(clks, dtype=float)
        if imps.shape != clks.shape:
            raise ValueError(
                "imps and clks must have the same shape, got %s and %s"
                % (imps.shape, clks.shape)
            )
        if imps.size == 0:
            # No observations: nothing to learn, keep the current parameters.
            return

        for i in range(iter_num):
            new_alpha, new_beta = self.__fixed_point_iteration(imps, clks, self.alpha, self.beta)
            # On convergence the previous estimates are kept (they differ
            # from the new ones by less than epsilon).
            if abs(new_alpha - self.alpha) < epsilon and abs(new_beta - self.beta) < epsilon:
                break
            if verbose:
                print(new_alpha, new_beta, i)
            self.alpha = new_alpha
            self.beta = new_beta

    def __fixed_point_iteration(self, imps, clks, alpha, beta):
        """Return the next ``(alpha, beta)`` pair from one fixed-point step.

        If the denominator sum is exactly zero (for example when every
        impression count is zero) the inputs ``alpha`` and ``beta`` are
        returned unchanged instead of dividing by zero.
        """
        imps = np.asarray(imps, dtype=float)
        clks = np.asarray(clks, dtype=float)
        digamma = special.digamma

        # Vectorized sums of digamma differences over all groups.
        numerator_alpha = float(np.sum(digamma(clks + alpha) - digamma(alpha)))
        numerator_beta = float(np.sum(digamma(imps - clks + beta) - digamma(beta)))
        denominator = float(np.sum(digamma(imps + alpha + beta) - digamma(alpha + beta)))

        if denominator == 0.0:
            return alpha, beta

        return alpha * (numerator_alpha / denominator), beta * (numerator_beta / denominator)

In [None]:
# Build the smoothed conversion-rate feature for item_id
item_all_list = list(set(data['item_id'].values))
bs = BayesianSmoothing(1, 1)

# Row counts per item and is_trade == 1 row counts per item, both taken
# from train_before_7 only.
dic_i = dict(Counter(train_before_7['item_id'].values))
dic_cov = dict(Counter(train_before_7.loc[train_before_7['is_trade'] == 1, 'item_id'].values))
l = list(set(train_before_7['item_id'].values))

# Aligned lists over the ids in l: I holds the row counts, C the trade
# counts (0 when the item never traded).
I = [dic_i[item] for item in l]
C = [dic_cov.get(item, 0) for item in l]

bs.update(I, C, 10000, 0.001)

# Smoothed rate = (trades + alpha) / (rows + alpha + beta). Ids missing from
# a count dict contribute 0, so ids absent from train_before_7 get
# alpha / (alpha + beta).
dic_smooth = {
    item: (dic_cov.get(item, 0) + bs.alpha) / (dic_i.get(item, 0) + bs.alpha + bs.beta)
    for item in item_all_list
}

train_day_7_test['item_smooth'] = train_day_7_test['item_id'].map(dic_smooth)

In [None]:
# Build the smoothed conversion-rate feature for user_id
user_all_list = list(set(data['user_id'].values))
bs = BayesianSmoothing(1, 1)

# Row counts per user and is_trade == 1 row counts per user, both taken
# from train_before_7 only.
dic_i = dict(Counter(train_before_7['user_id'].values))
dic_cov = dict(Counter(train_before_7.loc[train_before_7['is_trade'] == 1, 'user_id'].values))
l = list(set(train_before_7['user_id'].values))

# Aligned lists over the ids in l: I holds the row counts, C the trade
# counts (0 when the user never traded).
I = [dic_i[user] for user in l]
C = [dic_cov.get(user, 0) for user in l]

bs.update(I, C, 1000, 0.0001)

# Smoothed rate = (trades + alpha) / (rows + alpha + beta). Ids missing from
# a count dict contribute 0, so ids absent from train_before_7 get
# alpha / (alpha + beta).
dic_smooth = {
    user: (dic_cov.get(user, 0) + bs.alpha) / (dic_i.get(user, 0) + bs.alpha + bs.beta)
    for user in user_all_list
}

train_day_7_test['user_smooth'] = train_day_7_test['user_id'].map(dic_smooth)

In [None]:
# Build the smoothed conversion-rate feature for shop_id
shop_all_list = list(set(data['shop_id'].values))
print(len(shop_all_list))
bs = BayesianSmoothing(1, 1)

# Row counts per shop and is_trade == 1 row counts per shop, both taken
# from train_before_7 only; the sizes are printed as a sanity check.
dic_i = dict(Counter(train_before_7['shop_id'].values))
print(len(dic_i))
dic_cov = dict(Counter(train_before_7.loc[train_before_7['is_trade'] == 1, 'shop_id'].values))
print(len(dic_cov))
l = list(set(train_before_7['shop_id'].values))

# Aligned lists over the ids in l: I holds the row counts, C the trade
# counts (0 when the shop never traded).
I = [dic_i[shop] for shop in l]
C = [dic_cov.get(shop, 0) for shop in l]

bs.update(I, C, 100000, 0.001)

# Smoothed rate = (trades + alpha) / (rows + alpha + beta). Ids missing from
# a count dict contribute 0, so ids absent from train_before_7 get
# alpha / (alpha + beta).
dic_smooth = {
    shop: (dic_cov.get(shop, 0) + bs.alpha) / (dic_i.get(shop, 0) + bs.alpha + bs.beta)
    for shop in shop_all_list
}

train_day_7_test['shop_smooth'] = train_day_7_test['shop_id'].map(dic_smooth)

In [None]:
# Build the smoothed conversion-rate feature for user_gender_id
user_gender_all_list = list(set(data['user_gender_id'].values))
print(len(user_gender_all_list))
bs = BayesianSmoothing(1, 1)

# Row counts per gender id and is_trade == 1 row counts per gender id, both
# taken from train_before_7 only; the sizes are printed as a sanity check.
dic_i = dict(Counter(train_before_7['user_gender_id'].values))
print(len(dic_i))
dic_cov = dict(Counter(train_before_7.loc[train_before_7['is_trade'] == 1, 'user_gender_id'].values))
print(len(dic_cov))
l = list(set(train_before_7['user_gender_id'].values))

# Aligned lists over the ids in l: I holds the row counts, C the trade
# counts (0 when the gender id never traded).
I = [dic_i[user_gender] for user_gender in l]
C = [dic_cov.get(user_gender, 0) for user_gender in l]

bs.update(I, C, 10000000, 0.000000001)

# Smoothed rate = (trades + alpha) / (rows + alpha + beta). Ids missing from
# a count dict contribute 0, so ids absent from train_before_7 get
# alpha / (alpha + beta).
dic_smooth = {
    user_gender: (dic_cov.get(user_gender, 0) + bs.alpha)
    / (dic_i.get(user_gender, 0) + bs.alpha + bs.beta)
    for user_gender in user_gender_all_list
}

train_day_7_test['user_gender_smooth'] = train_day_7_test['user_gender_id'].map(dic_smooth)

In [None]:
# Build the smoothed conversion-rate feature for user_occupation_id
user_occu_all_list = list(set(data['user_occupation_id'].values))
print(len(user_occu_all_list))
bs = BayesianSmoothing(1, 1)

# Row counts per occupation id and is_trade == 1 row counts per occupation
# id, both taken from train_before_7 only; the sizes are printed as a
# sanity check.
dic_i = dict(Counter(train_before_7['user_occupation_id'].values))
print(len(dic_i))
dic_cov = dict(Counter(train_before_7.loc[train_before_7['is_trade'] == 1, 'user_occupation_id'].values))
print(len(dic_cov))
l = list(set(train_before_7['user_occupation_id'].values))

# Aligned lists over the ids in l: I holds the row counts, C the trade
# counts (0 when the occupation id never traded).
I = [dic_i[user_occu] for user_occu in l]
C = [dic_cov.get(user_occu, 0) for user_occu in l]

bs.update(I, C, 2000000, 0.0001)

# Smoothed rate = (trades + alpha) / (rows + alpha + beta). Ids missing from
# a count dict contribute 0, so ids absent from train_before_7 get
# alpha / (alpha + beta).
dic_smooth = {
    user_occu: (dic_cov.get(user_occu, 0) + bs.alpha)
    / (dic_i.get(user_occu, 0) + bs.alpha + bs.beta)
    for user_occu in user_occu_all_list
}

train_day_7_test['user_occupation_smooth'] = train_day_7_test['user_occupation_id'].map(dic_smooth)

In [None]:
# Build the smoothed conversion-rate feature for user_age_level
user_age_all_list = list(set(data['user_age_level'].values))
print(len(user_age_all_list))
bs = BayesianSmoothing(1, 1)

# Row counts per age level and is_trade == 1 row counts per age level, both
# taken from train_before_7 only; the sizes are printed as a sanity check.
dic_i = dict(Counter(train_before_7['user_age_level'].values))
print(len(dic_i))
dic_cov = dict(Counter(train_before_7.loc[train_before_7['is_trade'] == 1, 'user_age_level'].values))
print(len(dic_cov))
l = list(set(train_before_7['user_age_level'].values))

# Aligned lists over the values in l: I holds the row counts, C the trade
# counts (0 when the age level never traded).
I = [dic_i[user_age] for user_age in l]
C = [dic_cov.get(user_age, 0) for user_age in l]

bs.update(I, C, 1000000, 0.0001)

# Smoothed rate = (trades + alpha) / (rows + alpha + beta). Values missing
# from a count dict contribute 0, so values absent from train_before_7 get
# alpha / (alpha + beta).
dic_smooth = {
    user_age: (dic_cov.get(user_age, 0) + bs.alpha)
    / (dic_i.get(user_age, 0) + bs.alpha + bs.beta)
    for user_age in user_age_all_list
}

train_day_7_test['user_age_smooth'] = train_day_7_test['user_age_level'].map(dic_smooth)

In [None]:
# Build the smoothed conversion-rate feature for item_city_id
city_all_list = list(set(data['item_city_id'].values))
print(len(city_all_list))
bs = BayesianSmoothing(1, 1)

# Row counts per city id and is_trade == 1 row counts per city id, both
# taken from train_before_7 only; the sizes are printed as a sanity check.
dic_i = dict(Counter(train_before_7['item_city_id'].values))
print(len(dic_i))
dic_cov = dict(Counter(train_before_7.loc[train_before_7['is_trade'] == 1, 'item_city_id'].values))
print(len(dic_cov))
l = list(set(train_before_7['item_city_id'].values))

# Aligned lists over the ids in l: I holds the row counts, C the trade
# counts (0 when the city id never traded).
I = [dic_i[city] for city in l]
C = [dic_cov.get(city, 0) for city in l]

bs.update(I, C, 100000, 0.0001)

# Smoothed rate = (trades + alpha) / (rows + alpha + beta). Ids missing from
# a count dict contribute 0, so ids absent from train_before_7 get
# alpha / (alpha + beta).
dic_smooth = {
    city: (dic_cov.get(city, 0) + bs.alpha) / (dic_i.get(city, 0) + bs.alpha + bs.beta)
    for city in city_all_list
}

train_day_7_test['city_smooth'] = train_day_7_test['item_city_id'].map(dic_smooth)

In [None]:
# Build the smoothed conversion-rate feature for item_brand_id
brand_all_list = list(set(data['item_brand_id'].values))
print(len(brand_all_list))
bs = BayesianSmoothing(1, 1)

# Row counts per brand id and is_trade == 1 row counts per brand id, both
# taken from train_before_7 only; the sizes are printed as a sanity check.
dic_i = dict(Counter(train_before_7['item_brand_id'].values))
print(len(dic_i))
dic_cov = dict(Counter(train_before_7.loc[train_before_7['is_trade'] == 1, 'item_brand_id'].values))
print(len(dic_cov))
l = list(set(train_before_7['item_brand_id'].values))

# Aligned lists over the ids in l: I holds the row counts, C the trade
# counts (0 when the brand id never traded).
I = [dic_i[brand] for brand in l]
C = [dic_cov.get(brand, 0) for brand in l]

bs.update(I, C, 100000, 0.0001)

# Smoothed rate = (trades + alpha) / (rows + alpha + beta). Ids missing from
# a count dict contribute 0, so ids absent from train_before_7 get
# alpha / (alpha + beta).
dic_smooth = {
    brand: (dic_cov.get(brand, 0) + bs.alpha) / (dic_i.get(brand, 0) + bs.alpha + bs.beta)
    for brand in brand_all_list
}

train_day_7_test['brand_smooth'] = train_day_7_test['item_brand_id'].map(dic_smooth)

In [None]:
# Build the smoothed conversion-rate feature for hour
hour_all_list = list(set(data['hour'].values))
print(len(hour_all_list))
bs = BayesianSmoothing(1, 1)

# Row counts per hour value and is_trade == 1 row counts per hour value,
# both taken from train_before_7 only; the sizes are printed as a sanity
# check.
dic_i = dict(Counter(train_before_7['hour'].values))
print(len(dic_i))
dic_cov = dict(Counter(train_before_7.loc[train_before_7['is_trade'] == 1, 'hour'].values))
print(len(dic_cov))
l = list(set(train_before_7['hour'].values))

# Aligned lists over the values in l: I holds the row counts, C the trade
# counts (0 when the hour value never traded).
I = [dic_i[hour] for hour in l]
C = [dic_cov.get(hour, 0) for hour in l]

bs.update(I, C, 1000000, 0.0001)

# Smoothed rate = (trades + alpha) / (rows + alpha + beta). Values missing
# from a count dict contribute 0, so values absent from train_before_7 get
# alpha / (alpha + beta).
dic_smooth = {
    hour: (dic_cov.get(hour, 0) + bs.alpha) / (dic_i.get(hour, 0) + bs.alpha + bs.beta)
    for hour in hour_all_list
}

train_day_7_test['hour_smooth'] = train_day_7_test['hour'].map(dic_smooth)

In [None]:
# Save the result of the processing above (written without the index column)
train_day_7_test.to_csv('../Temp/train_day_7_test_step_6.csv', index=False)