# 2021 科大讯飞比赛
比赛题目：基于用户画像的商品推荐挑战赛<br/>
比赛链接：https://challenge.xfyun.cn/topic/info?type=user-portrait&ch=xf-web-gw

内容：数据预处理，并得到train_temp_deep_learning.pkl文件

In [1]:
import pandas as pd
import numpy as np
import gc

import time
import datetime
# from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from scipy import sparse
from tqdm import tqdm_notebook
import re

import warnings
warnings.filterwarnings("ignore")

In [2]:
# 训练数据
df_train = pd.read_csv('data/train.csv')
# 测试数据
df_apply_new = pd.read_csv('data/apply_new.csv')
# 合并训练集，验证集
data = pd.concat([df_train,df_apply_new],axis=0,ignore_index=True)
data['label'] = data['label'].fillna(str(-1))

In [3]:
data.head()

Unnamed: 0,id,label,gender,age,appid,time,province,city,model,make
0,1016588,0,,NULL 2,"[4457057,9952871,8942704,11273992,12410356,129...","[1.606747390128E12,1.606747390128E12,1.6067473...",广西,北海,华为,华为 mate20pro
1,1295808,1,,5,"[10577375,13567578,4437795,8934804,9352464,133...","[1.605842042532E12,1.592187596698E12,1.5598650...",广东,广州,OPPO,r11
2,1110160,0,,,"[11171956,9454883,9361934,10578048,10234462,12...","[1.607351673175E12,1.607351673175E12,1.6073516...",内蒙古,锡林郭勒盟,小米,小米 红米note2
3,1132597,0,,2,"[4457927,9412324,12292192,9231799,11977927,852...","[1.56015519913E12,1.56015519913E12,1.582942163...",四川,成都,vivo,vivo x20
4,1108714,0,,,"[5737867,5105608,13792904,5454488,13098817,141...","[1.591494981671E12,1.616071068225E12,1.6160710...",湖南,长沙,vivo,x23


In [4]:
data.shape # 数据维度

(400000, 10)

In [5]:
data.columns  # 列信息

Index(['id', 'label', 'gender', 'age', 'appid', 'time', 'province', 'city',
       'model', 'make'],
      dtype='object')

In [6]:
data.isnull().sum() #检查空值

id               0
label            0
gender      326138
age          51501
appid            0
time             0
province         0
city             0
model            0
make             0
dtype: int64

In [7]:
df_apply_new.isnull().sum() #检查空值

id              0
gender      83776
age         12871
appid           0
time            0
province        0
city            0
model           0
make            0
dtype: int64

## 2. 特征工程-数据清洗、特征构建

### 2.1 数据预处理
（1）统计**appid**中的个数并作为一个指标appid_num<br/>
（2）填补缺失值age(Null == 3)、gender(NaN == 3)<br/>
（3）province和city相加后成为一个新的指标p_c

In [8]:
# 处理Age
# 缺失值填充
data['age'] = data['age'].fillna(0)
data['age']
a = data['age'].copy()
# 统一字符类型转化成str()
a = a.apply(lambda x: str(x).lower())

def clean_data(string):
    # 对数据清洗
    string = re.sub(r"[^0-9()]", "", string)
    return string.strip().lower()
a = a.apply(lambda x: clean_data(x))
data['age'] = a
data['age']

0          2
1          5
2          0
3          2
4          0
          ..
399995    10
399996    30
399997     0
399998     0
399999    10
Name: age, Length: 400000, dtype: object

In [9]:
# 处理Gender
# 缺失值填充
data['gender'] = data['gender'].fillna(str(2))
data['gender']
g = data['gender'].copy()
# 统一字符类型转化成str()
g = g.apply(lambda x: str(x).lower())

def clean_data(string):
    # 对数据清洗
    string = re.sub(r"[^0-9()]", "", string)
    return string.strip().lower()
g = g.apply(lambda x: clean_data(x))
data['gender'] = g
data['gender']

0         2
1         2
2         2
3         2
4         2
         ..
399995    2
399996    2
399997    2
399998    2
399999    2
Name: gender, Length: 400000, dtype: object

In [10]:
# 处理appid
appid_num = data['appid']
def get_appid_num(string):
    # 对数据清洗
    string = string.split(',')
    return len(string)
appid_num = appid_num.apply(lambda x: get_appid_num(x))
data['appid_num'] = appid_num
data['appid_num']

0         59
1         62
2         49
3         71
4         67
          ..
399995    44
399996    31
399997     6
399998    20
399999    26
Name: appid_num, Length: 400000, dtype: int64

In [11]:
data

Unnamed: 0,id,label,gender,age,appid,time,province,city,model,make,appid_num
0,1016588,0,2,2,"[4457057,9952871,8942704,11273992,12410356,129...","[1.606747390128E12,1.606747390128E12,1.6067473...",广西,北海,华为,华为 mate20pro,59
1,1295808,1,2,5,"[10577375,13567578,4437795,8934804,9352464,133...","[1.605842042532E12,1.592187596698E12,1.5598650...",广东,广州,OPPO,r11,62
2,1110160,0,2,0,"[11171956,9454883,9361934,10578048,10234462,12...","[1.607351673175E12,1.607351673175E12,1.6073516...",内蒙古,锡林郭勒盟,小米,小米 红米note2,49
3,1132597,0,2,2,"[4457927,9412324,12292192,9231799,11977927,852...","[1.56015519913E12,1.56015519913E12,1.582942163...",四川,成都,vivo,vivo x20,71
4,1108714,0,2,0,"[5737867,5105608,13792904,5454488,13098817,141...","[1.591494981671E12,1.616071068225E12,1.6160710...",湖南,长沙,vivo,x23,67
...,...,...,...,...,...,...,...,...,...,...,...
399995,1499996,-1,2,10,"[91325,456871,13820427,12291830,13516301,14111...","[1.62029119906E12,1.62029119906E12,1.594622958...",山东,临沂,OPPO,r11splus,44
399996,1499997,-1,2,30,"[11871458,10982847,12783381,12545416,13329883,...","[1.608810345864E12,1.608810345864E12,1.6118417...",安徽,池州,OPPO,a5,31
399997,1499998,-1,2,0,"[10567612,10978146,9381689,10278852,10882324,8...","[1.620363880145E12,1.565525861104E12,1.6194418...",山东,菏泽,vivo,vivo y66i,6
399998,1499999,-1,2,0,"[10757291,13055501,11185398,10982847,303703,10...","[1.606532499309E12,1.606532499309E12,1.6065324...",四川,雅安,vivo,vivo x20,20


In [27]:
# 得到预处理的数据
data_pre = data[['id', 'label', 'gender', 'age', 'province','city', 'model', 'appid_num', 'appid', 'time']]
data_pre

Unnamed: 0,id,label,gender,age,province,city,model,appid_num,appid,time
0,1016588,0,2,2,广西,北海,华为,59,"[4457057,9952871,8942704,11273992,12410356,129...","[1.606747390128E12,1.606747390128E12,1.6067473..."
1,1295808,1,2,5,广东,广州,OPPO,62,"[10577375,13567578,4437795,8934804,9352464,133...","[1.605842042532E12,1.592187596698E12,1.5598650..."
2,1110160,0,2,0,内蒙古,锡林郭勒盟,小米,49,"[11171956,9454883,9361934,10578048,10234462,12...","[1.607351673175E12,1.607351673175E12,1.6073516..."
3,1132597,0,2,2,四川,成都,vivo,71,"[4457927,9412324,12292192,9231799,11977927,852...","[1.56015519913E12,1.56015519913E12,1.582942163..."
4,1108714,0,2,0,湖南,长沙,vivo,67,"[5737867,5105608,13792904,5454488,13098817,141...","[1.591494981671E12,1.616071068225E12,1.6160710..."
...,...,...,...,...,...,...,...,...,...,...
399995,1499996,-1,2,10,山东,临沂,OPPO,44,"[91325,456871,13820427,12291830,13516301,14111...","[1.62029119906E12,1.62029119906E12,1.594622958..."
399996,1499997,-1,2,30,安徽,池州,OPPO,31,"[11871458,10982847,12783381,12545416,13329883,...","[1.608810345864E12,1.608810345864E12,1.6118417..."
399997,1499998,-1,2,0,山东,菏泽,vivo,6,"[10567612,10978146,9381689,10278852,10882324,8...","[1.620363880145E12,1.565525861104E12,1.6194418..."
399998,1499999,-1,2,0,四川,雅安,vivo,20,"[10757291,13055501,11185398,10982847,303703,10...","[1.606532499309E12,1.606532499309E12,1.6065324..."


In [31]:
# 1. 列拆分：https://blog.csdn.net/Asher117/article/details/84346073

#方法二(用apply或map都行）
data_pre['appid'].map(lambda x: x.split(','))

0         [[4457057, 9952871, 8942704, 11273992, 1241035...
1         [[10577375, 13567578, 4437795, 8934804, 935246...
2         [[11171956, 9454883, 9361934, 10578048, 102344...
3         [[4457927, 9412324, 12292192, 9231799, 1197792...
4         [[5737867, 5105608, 13792904, 5454488, 1309881...
                                ...                        
399995    [[91325, 456871, 13820427, 12291830, 13516301,...
399996    [[11871458, 10982847, 12783381, 12545416, 1332...
399997    [[10567612, 10978146, 9381689, 10278852, 10882...
399998    [[10757291, 13055501, 11185398, 10982847, 3037...
399999    [[11219658, 10757721, 11755428, 13343061, 1098...
Name: appid, Length: 400000, dtype: object

In [17]:


d1 = pd.DataFrame({'ticker' : ['spx 5/25/2001 p500', 'spx 5/25/2001 p600', 'spx 5/25/2001 p700'],
                  'a':[1,2,3]})
d1

Unnamed: 0,ticker,a
0,spx 5/25/2001 p500,1
1,spx 5/25/2001 p600,2
2,spx 5/25/2001 p700,3


In [24]:
a = d1['ticker'].str.split().tolist()
a

[['spx', '5/25/2001', 'p500'],
 ['spx', '5/25/2001', 'p600'],
 ['spx', '5/25/2001', 'p700']]

In [26]:
d1['ticker'] = a
d1

Unnamed: 0,ticker,a
0,"[spx, 5/25/2001, p500]",1
1,"[spx, 5/25/2001, p600]",2
2,"[spx, 5/25/2001, p700]",3


Unnamed: 0,id,label,gender,age,province,city,model,make,appid_num
0,1016588,0,2,2,广西,北海,华为,华为 mate20pro,59
1,1295808,1,2,5,广东,广州,OPPO,r11,62
2,1110160,0,2,0,内蒙古,锡林郭勒盟,小米,小米 红米note2,49
3,1132597,0,2,2,四川,成都,vivo,vivo x20,71
4,1108714,0,2,0,湖南,长沙,vivo,x23,67
...,...,...,...,...,...,...,...,...,...
399995,1499996,-1,2,10,山东,临沂,OPPO,r11splus,44
399996,1499997,-1,2,30,安徽,池州,OPPO,a5,31
399997,1499998,-1,2,0,山东,菏泽,vivo,vivo y66i,6
399998,1499999,-1,2,0,四川,雅安,vivo,vivo x20,20


In [30]:
# labelencoder 转化
encoder = ['province', 'city', 'model', 'make']
lbl = LabelEncoder()

for feat in encoder:
    lbl.fit(data_pre[feat])
    data_pre[feat] = lbl.transform(data_pre[feat])
data_pre


Unnamed: 0,id,label,gender,age,province,city,model,make,appid_num
0,1016588,0,2,2,13,42,49,948,59
1,1295808,1,2,5,12,113,11,484,62
2,1110160,0,2,0,2,285,63,1110,49
3,1132597,0,2,2,6,127,27,583,71
4,1108714,0,2,0,22,289,27,657,67
...,...,...,...,...,...,...,...,...,...
399995,1499996,-1,2,10,10,12,11,487,44
399996,1499997,-1,2,30,9,173,11,185,31
399997,1499998,-1,2,0,10,241,27,624,6
399998,1499999,-1,2,0,6,299,27,583,20


In [31]:
data_pre['model_make'] = data_pre['model'] + data_pre['make']
data_pre

Unnamed: 0,id,label,gender,age,province,city,model,make,appid_num,model_make
0,1016588,0,2,2,13,42,49,948,59,997
1,1295808,1,2,5,12,113,11,484,62,495
2,1110160,0,2,0,2,285,63,1110,49,1173
3,1132597,0,2,2,6,127,27,583,71,610
4,1108714,0,2,0,22,289,27,657,67,684
...,...,...,...,...,...,...,...,...,...,...
399995,1499996,-1,2,10,10,12,11,487,44,498
399996,1499997,-1,2,30,9,173,11,185,31,196
399997,1499998,-1,2,0,10,241,27,624,6,651
399998,1499999,-1,2,0,6,299,27,583,20,610


In [32]:
## 存储文件
import pickle

##存储中间特征矩阵便于再次访问
with open('train_temp_2.pkl', 'wb') as file:
    pickle.dump(data_pre, file)