# 2021 科大讯飞比赛
比赛题目：基于用户画像的商品推荐挑战赛<br/>
比赛链接：https://challenge.xfyun.cn/topic/info?type=user-portrait&ch=xf-web-gw

使用方法：GBDT+LR


## 1. 导入数据

In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import matplotlib.pyplot as plt
import gc
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import seaborn as sns
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import roc_auc_score, log_loss, mean_squared_error
import time
import datetime
# from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_auc_score, log_loss
from scipy import sparse
from tqdm import tqdm_notebook
import re
## 存储文件
import pickle

import warnings
warnings.filterwarnings("ignore")

In [2]:
# 训练数据
df_train = pd.read_csv('data/train.csv')
# 测试数据
df_apply_new = pd.read_csv('data/apply_new.csv')
# 合并训练集，验证集
data = pd.concat([df_train,df_apply_new],axis=0,ignore_index=True)
data['label'] = data['label'].fillna(str(-1))

In [3]:
data.head()

Unnamed: 0,id,label,gender,age,appid,time,province,city,model,make
0,1016588,0,,NULL 2,"[4457057,9952871,8942704,11273992,12410356,129...","[1.606747390128E12,1.606747390128E12,1.6067473...",广西,北海,华为,华为 mate20pro
1,1295808,1,,5,"[10577375,13567578,4437795,8934804,9352464,133...","[1.605842042532E12,1.592187596698E12,1.5598650...",广东,广州,OPPO,r11
2,1110160,0,,,"[11171956,9454883,9361934,10578048,10234462,12...","[1.607351673175E12,1.607351673175E12,1.6073516...",内蒙古,锡林郭勒盟,小米,小米 红米note2
3,1132597,0,,2,"[4457927,9412324,12292192,9231799,11977927,852...","[1.56015519913E12,1.56015519913E12,1.582942163...",四川,成都,vivo,vivo x20
4,1108714,0,,,"[5737867,5105608,13792904,5454488,13098817,141...","[1.591494981671E12,1.616071068225E12,1.6160710...",湖南,长沙,vivo,x23


In [4]:
data.shape # 数据维度

(400000, 10)

In [5]:
data.columns  # 列信息

Index(['id', 'label', 'gender', 'age', 'appid', 'time', 'province', 'city',
       'model', 'make'],
      dtype='object')

In [6]:
data.isnull().sum() #检查空值

id               0
label            0
gender      326138
age          51501
appid            0
time             0
province         0
city             0
model            0
make             0
dtype: int64

In [7]:
df_apply_new.isnull().sum() #检查空值

id              0
gender      83776
age         12871
appid           0
time            0
province        0
city            0
model           0
make            0
dtype: int64

## 2. 特征工程-数据清洗、特征构建

### 2.1 数据预处理
（1）统计**appid**中的个数并作为一个指标appid_num<br/>
（2）填补缺失值age(Null == 3)、gender(NaN == 3)<br/>
（3）province和city相加后成为一个新的指标p_c

In [12]:
# 处理Age
# 缺失值填充
data['age'] = data['age'].fillna(0)
data['age']
a = data['age'].copy()
# 统一字符类型转化成str()
a = a.apply(lambda x: str(x).lower())

def clean_data(string):
    # 对数据清洗
    string = re.sub(r"[^0-9()]", "", string)
    return string.strip().lower()
a = a.apply(lambda x: clean_data(x))
data['age'] = a
data['age']

0          2
1          5
2          0
3          2
4          0
          ..
399995    10
399996    30
399997     0
399998     0
399999    10
Name: age, Length: 400000, dtype: object

In [13]:
# 处理Gender
# 缺失值填充
data['gender'] = data['gender'].fillna(str(2))
data['gender']
g = data['gender'].copy()
# 统一字符类型转化成str()
g = g.apply(lambda x: str(x).lower())

def clean_data(string):
    # 对数据清洗
    string = re.sub(r"[^0-9()]", "", string)
    return string.strip().lower()
g = g.apply(lambda x: clean_data(x))
data['gender'] = g
data['gender']

0         2
1         2
2         2
3         2
4         2
         ..
399995    2
399996    2
399997    2
399998    2
399999    2
Name: gender, Length: 400000, dtype: object

In [14]:
# 处理appid
appid_num = data['appid']
def get_appid_num(string):
    # 对数据清洗
    string = string.split(',')
    return len(string)
appid_num = appid_num.apply(lambda x: get_appid_num(x))
data['appid_num'] = appid_num
data['appid_num']

0         59
1         62
2         49
3         71
4         67
          ..
399995    44
399996    31
399997     6
399998    20
399999    26
Name: appid_num, Length: 400000, dtype: int64

In [15]:
data

Unnamed: 0,id,label,gender,age,appid,time,province,city,model,make,appid_num
0,1016588,0,2,2,"[4457057,9952871,8942704,11273992,12410356,129...","[1.606747390128E12,1.606747390128E12,1.6067473...",广西,北海,华为,华为 mate20pro,59
1,1295808,1,2,5,"[10577375,13567578,4437795,8934804,9352464,133...","[1.605842042532E12,1.592187596698E12,1.5598650...",广东,广州,OPPO,r11,62
2,1110160,0,2,0,"[11171956,9454883,9361934,10578048,10234462,12...","[1.607351673175E12,1.607351673175E12,1.6073516...",内蒙古,锡林郭勒盟,小米,小米 红米note2,49
3,1132597,0,2,2,"[4457927,9412324,12292192,9231799,11977927,852...","[1.56015519913E12,1.56015519913E12,1.582942163...",四川,成都,vivo,vivo x20,71
4,1108714,0,2,0,"[5737867,5105608,13792904,5454488,13098817,141...","[1.591494981671E12,1.616071068225E12,1.6160710...",湖南,长沙,vivo,x23,67
...,...,...,...,...,...,...,...,...,...,...,...
399995,1499996,-1,2,10,"[91325,456871,13820427,12291830,13516301,14111...","[1.62029119906E12,1.62029119906E12,1.594622958...",山东,临沂,OPPO,r11splus,44
399996,1499997,-1,2,30,"[11871458,10982847,12783381,12545416,13329883,...","[1.608810345864E12,1.608810345864E12,1.6118417...",安徽,池州,OPPO,a5,31
399997,1499998,-1,2,0,"[10567612,10978146,9381689,10278852,10882324,8...","[1.620363880145E12,1.565525861104E12,1.6194418...",山东,菏泽,vivo,vivo y66i,6
399998,1499999,-1,2,0,"[10757291,13055501,11185398,10982847,303703,10...","[1.606532499309E12,1.606532499309E12,1.6065324...",四川,雅安,vivo,vivo x20,20


In [16]:
data_pre = data[['id', 'label', 'gender', 'age', 'province','city', 'model', 'make', 'appid_num']]
data_pre

Unnamed: 0,id,label,gender,age,province,city,model,appid_num
0,1016588,0,2,2,广西,北海,华为,59
1,1295808,1,2,5,广东,广州,OPPO,62
2,1110160,0,2,0,内蒙古,锡林郭勒盟,小米,49
3,1132597,0,2,2,四川,成都,vivo,71
4,1108714,0,2,0,湖南,长沙,vivo,67
...,...,...,...,...,...,...,...,...
399995,1499996,-1,2,10,山东,临沂,OPPO,44
399996,1499997,-1,2,30,安徽,池州,OPPO,31
399997,1499998,-1,2,0,山东,菏泽,vivo,6
399998,1499999,-1,2,0,四川,雅安,vivo,20


In [17]:
# labelencoder 转化
encoder = ['province', 'city', 'model']
lbl = LabelEncoder()

for feat in encoder:
    lbl.fit(data_pre[feat])
    data_pre[feat] = lbl.transform(data_pre[feat])
data_pre


Unnamed: 0,id,label,gender,age,province,city,model,appid_num
0,1016588,0,2,2,13,42,50,59
1,1295808,1,2,5,12,113,12,62
2,1110160,0,2,0,2,285,64,49
3,1132597,0,2,2,6,127,28,71
4,1108714,0,2,0,22,289,28,67
...,...,...,...,...,...,...,...,...
399995,1499996,-1,2,10,10,12,12,44
399996,1499997,-1,2,30,9,173,12,31
399997,1499998,-1,2,0,10,241,28,6
399998,1499999,-1,2,0,6,299,28,20


In [18]:
## 存储文件
import pickle

##存储中间特征矩阵便于再次访问
with open('train_temp.pkl', 'wb') as file:
    pickle.dump(data_pre, file)

## 3. 训练模型

In [3]:
## 读取特征矩阵
with open('train_temp.pkl', 'rb') as file:
    data = pickle.load(file)

print('前10行的信息：\n', data.head(-10))

前10行的信息：
              id label gender age  province  city  model  make  appid_num  \
0       1016588     0      2   2        13    42     49   948         59   
1       1295808     1      2   5        12   113     11   484         62   
2       1110160     0      2   0         2   285     63  1110         49   
3       1132597     0      2   2         6   127     27   583         71   
4       1108714     0      2   0        22   289     27   657         67   
...         ...   ...    ...  ..       ...   ...    ...   ...        ...   
399985  1499986    -1      2   0        28   151     49   973         43   
399986  1499987    -1      2  30        10   184     11   198         71   
399987  1499988    -1      2  20        10   183     49   970         53   
399988  1499989    -1      2  10        16   309     11   200         38   
399989  1499990    -1      2  40        33   308    101  1219         22   

        model_make  
0              997  
1              495  
2             

In [4]:

"""
### 本次案例中：将所有的样本作为训练集。
### 使用全部的样本作为训练集，通过交叉验证的方法划分为：测试集+验证集（实质上没有使用测试集）
"""
total_train = data[data.label!= '-1']  # data.click != -1的样本为需要预测的样本集合
X_train = total_train
Y_train = total_train['label'] ##标签
x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.2, random_state=5)
# print(x_train.shape, x_test.shape)

In [5]:
print(x_train.shape, x_test.shape)

(240000, 10) (60000, 10)


In [6]:
print(y_train.shape, y_test.shape)

(240000,) (60000,)


In [7]:
# 这里只使用了ID特征 (特征：num_feature)，非ID特征之间是没有进行GBDT特征转化的过程
# num_feature = ['province', 'city', 'model']
num_feature = ['province', 'city', 'model', 'make', 'model_make']
pre_feature = ['gender', 'age' , 'appid_num']

In [8]:

"""
### 用gbdt训练类别型变量，得到叶子节点拼接类别型，最后使用LR模型
"""
#用gbdt训练类别型变量，得到叶子节点拼接类别型，最后使用LR模型
# 模型部分

# Lightgbm参数学习的网站：https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html
# n_estimators: 适合的提升树的数量
# num_leaves: 基学习器的最大树叶
lgb_clf = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=48, max_depth=-1, learning_rate=0.05, n_estimators=200,
                           max_bin=425, subsample_for_bin=50000, objective='binary', min_split_gain=0,
                           min_child_weight=5, min_child_samples=150, subsample=0.8, subsample_freq=1,
                           colsample_bytree=1, reg_alpha=3, reg_lambda=5, seed=1000, n_jobs=10, silent=True)


In [9]:
# 这里只使用了ID特征 (特征：num_feature)，非ID特征之间是没有进行GBDT特征转化的过程
train_csr = X_train[num_feature]
test_csr = x_test[num_feature]

train_csr

Unnamed: 0,province,city,model,make,model_make
0,13,42,49,948,997
1,12,113,11,484,495
2,2,285,63,1110,1173
3,6,127,27,583,610
4,22,289,27,657,684
...,...,...,...,...,...
299995,30,251,11,505,516
299996,18,70,11,128,139
299997,25,52,11,507,518
299998,11,83,11,198,209


In [10]:
# 只提取最后100维数据，相当于embedding的维度是100维，相当于gbdt决策树
lgb_clf.fit(train_csr, Y_train.astype('int'))
new_feature_train = lgb_clf.predict(train_csr, pred_leaf = True)[:, -100:]
new_feature_test = lgb_clf.predict(test_csr, pred_leaf= True)[:, -100:]

new_feature_train

array([[47,  2, 44, ..., 19, 20,  0],
       [ 0,  3, 23, ..., 14, 46,  0],
       [22, 20, 19, ..., 19, 12, 31],
       ...,
       [ 0,  2,  9, ..., 19, 36,  0],
       [ 0, 46, 31, ..., 42,  0,  0],
       [ 0,  3, 39, ..., 31, 41,  0]])

In [11]:
total_test = data[data.label == '-1']  # data.click != -1的样本为需要预测的样本集合
total_test

# 得到待训练的10000个样本的数据
test_new = lgb_clf.predict(total_test[num_feature], pred_leaf = True)[:, -100:]
test_new

array([[ 0, 25, 23, ..., 36,  0,  0],
       [ 6, 11, 44, ..., 19, 12, 43],
       [ 0, 22, 25, ..., 43,  0,  0],
       ...,
       [ 0, 47, 44, ..., 19,  0,  0],
       [ 0, 20, 19, ...,  3,  0,  0],
       [ 0,  2,  1, ..., 23, 37,  0]])

In [12]:
### 重命名GBDT的结果，在原始的X_train的特征中增加new_feature_train.shape[1]个决策树的特征
for i in range(new_feature_train.shape[1]):
    X_train['gbdt_'+str(i)] = new_feature_train[:, i]
    x_test['gbdt_'+str(i)] = new_feature_test[:, i]
    total_test['gbdt_'+str(i)] = test_new[:, i]

In [13]:
### 拼接GBDT的结果的新的类别变量
### 这里将利用100维度向量作为特征
cate_feature = pre_feature + [i for i in X_train.columns if 'gbdt_' in i]

In [14]:
cate_feature

['gender',
 'age',
 'appid_num',
 'gbdt_0',
 'gbdt_1',
 'gbdt_2',
 'gbdt_3',
 'gbdt_4',
 'gbdt_5',
 'gbdt_6',
 'gbdt_7',
 'gbdt_8',
 'gbdt_9',
 'gbdt_10',
 'gbdt_11',
 'gbdt_12',
 'gbdt_13',
 'gbdt_14',
 'gbdt_15',
 'gbdt_16',
 'gbdt_17',
 'gbdt_18',
 'gbdt_19',
 'gbdt_20',
 'gbdt_21',
 'gbdt_22',
 'gbdt_23',
 'gbdt_24',
 'gbdt_25',
 'gbdt_26',
 'gbdt_27',
 'gbdt_28',
 'gbdt_29',
 'gbdt_30',
 'gbdt_31',
 'gbdt_32',
 'gbdt_33',
 'gbdt_34',
 'gbdt_35',
 'gbdt_36',
 'gbdt_37',
 'gbdt_38',
 'gbdt_39',
 'gbdt_40',
 'gbdt_41',
 'gbdt_42',
 'gbdt_43',
 'gbdt_44',
 'gbdt_45',
 'gbdt_46',
 'gbdt_47',
 'gbdt_48',
 'gbdt_49',
 'gbdt_50',
 'gbdt_51',
 'gbdt_52',
 'gbdt_53',
 'gbdt_54',
 'gbdt_55',
 'gbdt_56',
 'gbdt_57',
 'gbdt_58',
 'gbdt_59',
 'gbdt_60',
 'gbdt_61',
 'gbdt_62',
 'gbdt_63',
 'gbdt_64',
 'gbdt_65',
 'gbdt_66',
 'gbdt_67',
 'gbdt_68',
 'gbdt_69',
 'gbdt_70',
 'gbdt_71',
 'gbdt_72',
 'gbdt_73',
 'gbdt_74',
 'gbdt_75',
 'gbdt_76',
 'gbdt_77',
 'gbdt_78',
 'gbdt_79',
 'gbdt_80',
 'gbd

In [15]:
total_data = pd.concat((X_train, total_test), axis = 0)
total_data

Unnamed: 0,id,label,gender,age,province,city,model,make,appid_num,model_make,...,gbdt_90,gbdt_91,gbdt_92,gbdt_93,gbdt_94,gbdt_95,gbdt_96,gbdt_97,gbdt_98,gbdt_99
0,1016588,0,2,2,13,42,49,948,59,997,...,46,0,41,41,43,43,0,19,20,0
1,1295808,1,2,5,12,113,11,484,62,495,...,46,45,15,41,43,13,0,14,46,0
2,1110160,0,2,0,2,285,63,1110,49,1173,...,46,4,15,8,27,1,0,19,12,31
3,1132597,0,2,2,6,127,27,583,71,610,...,46,16,15,41,27,13,0,19,0,0
4,1108714,0,2,0,22,289,27,657,67,684,...,46,4,15,24,43,1,0,19,34,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399995,1499996,-1,2,10,10,12,11,487,44,498,...,45,0,31,41,43,25,0,16,0,0
399996,1499997,-1,2,30,9,173,11,185,31,196,...,46,9,15,41,43,13,0,29,0,0
399997,1499998,-1,2,0,10,241,27,624,6,651,...,46,4,15,41,43,13,0,19,0,0
399998,1499999,-1,2,0,6,299,27,583,20,610,...,46,4,15,33,27,1,0,3,0,0


In [16]:
### CTR预估常用方法，转换为One-hot高维稀疏数据，为了节省内存，使用CSR矩阵存储
total_data = pd.concat((X_train, total_test), axis = 0)
base_train_csr = sparse.csr_matrix((len(X_train), 0))
# base_test_csr = sparse.csr_matrix((len(x_test), 0))
base_test_csr = sparse.csr_matrix((len(total_test), 0)) # 测试用例

enc = OneHotEncoder()

"""
# 测试one-hot编码的过程：

total_data['adid'].values
# 用全部的样本取定义one-hot的语料
enc.fit(total_data['adid'].values.reshape(-1, 1))
# 针对训练集X_train得到该用户的one-hat向量
enc.transform(X_train['adid'].values.reshape(-1, 1))
# 转化成array()矩阵形式:
enc.transform(X_train['adid'].values.reshape(-1, 1)).toarray()

"""
for feature in cate_feature:
    enc.fit(total_data[feature].values.reshape(-1, 1))
    base_train_csr = sparse.hstack((base_train_csr, enc.transform(X_train[feature].values.reshape(-1, 1))), 'csr', 'bool')
#     base_test_csr = sparse.hstack((base_test_csr, enc.transform(x_test[feature].values.reshape(-1, 1))),'csr', 'bool')
    base_test_csr = sparse.hstack((base_test_csr, enc.transform(total_test[feature].values.reshape(-1, 1))),'csr', 'bool')
print('one-hot prepared !')


one-hot prepared !


In [17]:
print('训练集shape', base_train_csr.shape, '测试集shape', base_test_csr.shape)

训练集shape (300000, 5208) 测试集shape (100000, 5208)


In [None]:

"""
### LR模型  调参C

## 查看生成的one-hot()向量矩阵的形式：
base_train_csr.toarray()  
模型最后，可以用{'True', 'False'}输入到LR模型中做下游分析。

"""

from sklearn.linear_model import LogisticRegression



print('训练集shape', base_train_csr.shape, '测试集shape', base_test_csr.shape)
# 使用验证集调参
for c in [0.05, 0.1, 0.001, 0.01, 0.2, 0.005]:
    print(c)
    model = LogisticRegression(C=c, verbose=10)  # C = 5
    model.fit(base_train_csr, Y_train.astype('int'))
    train_pred = model.predict_proba(base_test_csr)[:, 1]
    print('得到epcoh参数的过程loss', mean_squared_error(train_pred, y_test.array))
    # print('得到epcoh参数的过程loss', log_loss(train_pred, y_test.array))
    print('\n')



# 最后得到了使用的参数为0.2


## 4. 运用模型进行预测

In [18]:
from sklearn.linear_model import LogisticRegression

c = 0.2
model = LogisticRegression(C=c, verbose=10)  # C = 5
model.fit(base_train_csr, Y_train.astype('int'))
train_pred = model.predict_proba(base_test_csr)[:, 1]

train_pred

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   22.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   22.0s finished


array([0.48426341, 0.40380697, 0.44373756, ..., 0.72073121, 0.00092719,
       0.05344095])

In [19]:
prediction = model.predict(base_test_csr)
prediction

array([0, 0, 0, ..., 1, 0, 0])

In [20]:
# 读入文件并写入预测值
label_submission = pd.read_csv('data/submit_sample.csv')
label_submission.head()

Unnamed: 0,user_id,category_id
0,1400001,0
1,1400002,1
2,1400003,0
3,1400004,0
4,1400005,0


In [21]:
label_submission['category_id']=prediction

In [22]:
label_submission.to_csv("submission_GBDT_LR_V2.csv",index=False)