In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
import math

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
# -1代表缺失值，找出含有缺失值的列
col_with_missing = []
for col in train.columns:
    if [-1] in train[col].values:
        col_with_missing.append(col)

In [4]:
# 找出含有缺失值的行
row_with_missing = set()
for col in col_with_missing:
    for i in train[col][train[col] == -1].index:
        row_with_missing.add(i)
# 含缺失值的样本占总数据集的百分比
print('%.2f%%' % (len(row_with_missing) / len(train) *100))

79.01%


79%的行数据均有缺失值，因此不能直接删除含有缺失值的行

In [5]:
# 含缺失值的分类型数据列
col_with_missing_cat = [col for col in col_with_missing if 'cat' in col]
# 含缺失值的连续型数据列
col_with_missing_num = [col for col in col_with_missing if 'cat' not in col]

In [6]:
# 将分类型数据的缺失值替换为众数
for col in col_with_missing_cat:
    train[col][train[col] == -1] = train[col].value_counts().index[0]
# 将连续型数据的缺失值替换为均值
for col in col_with_missing_num:
    train[col][train[col] == -1] = train[col].mean()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [7]:
y_train = train['target']
train_data = train.drop('target', axis=1)

In [8]:
# 为GBDT和LR划分训练集
train_x, val_x, train_y, val_y = train_test_split(train_data, y_train, test_size=0.2)
train_x_1, train_x_2, train_y_1, train_y_2 = train_test_split(train_x, train_y, test_size=0.5)
# GBDT
gbdt = GradientBoostingClassifier(n_estimators = 100)
gbdt.fit(train_x_1, train_y_1)
# 对GBDT输出进行OneHot编码
gbdt_enc = OneHotEncoder(categories='auto')
gbdt_enc.fit(gbdt.apply(train_x_1)[:, :, 0])
# LR训练
gbdt_lr = LogisticRegression(solver='lbfgs', max_iter=1000)
gbdt_lr.fit(gbdt_enc.transform(gbdt.apply(train_x_2)[:, :, 0]), train_y_2)
# 预测
val = gbdt_lr.predict_proba(gbdt_enc.transform(gbdt.apply(val_x)[:, :, 0]))[:, 1]

In [9]:
val_d = gbdt_lr.predict(gbdt_enc.transform(gbdt.apply(val_x)[:, :, 0]))
score = accuracy_score(val_y, val_d)

In [10]:
pred = gbdt_lr.predict_proba(gbdt_enc.transform(gbdt.apply(test)[:, :, 0]))[:, 1]

NE等于预测的log loss除以background CTR的熵

In [11]:
def log_loss(Y, P):
    sum=0.0
    for x in map(lambda y,p:(1-y)/2*math.log(1-p)+(1+y)/2*math.log(p),Y,P):
        sum+=x
    return -sum/len(Y)

In [49]:
# 平均历史点击率
p = y_train.mean()
# background CTR的熵
background_ctr_ce = -(p*math.log(p)+(1-p)*math.log(1-p))

In [51]:
nce = log_loss(val_y, val) / background_ctr_ce

In [54]:
print('Normalized Cross-Entropy: %.4f' % nce)

Normalized Cross-Entropy: 0.9610
