In [1]:
import numpy as np
import pandas as pd
pd.set_option('max_columns', 100)
pd.set_option('max_rows', 100)
from tqdm import tqdm
tqdm.pandas(desc='pandas bar')
import pickle
from sklearn.preprocessing import LabelEncoder
import time

# 读数据
item读取有些麻烦, 需要按行来读. 其他数据正常使用pandas来读.

In [2]:
data_path = r'./data_for_ctr_predict/'
user = pd.read_table(data_path+'user_info.txt', sep='\t', names=['user_id', 'user_device', 'user_system', 'user_province', 'user_city', 'user_age', 'user_gender'])
#item = pd.read_table(data_path+'doc_info.txt', sep='\t', names=['item_id', 'item_title', 'item_time', 'item_picture', 'item_cluster1', 'item_cluster2', 'item_keywords'])
train = pd.read_table(data_path+'train_data.txt', sep='\t', names=['user_id', 'item_id', 'time', 'network', 'refresh', 'position', 'label', 'duration'])
test = pd.read_table(data_path+'test_data.txt', sep='\t', names=['id', 'user_id', 'item_id', 'time', 'network', 'refresh'])

In [3]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [4]:
data_path = r'./data_for_ctr_predict/'
with open(data_path+'doc_info.txt', 'r') as f:
    lines = f.readlines()

item = []
for line in tqdm(lines):
    unit = line.split('\t')
    if len(unit)<7:
        for _ in range(7-len(unit)):
            unit.append(np.nan)
    item.append(unit)
#item = np.array(item)

100%|██████████| 633391/633391 [00:01<00:00, 323044.32it/s]


In [5]:
item = pd.DataFrame(item, columns=['item_id', 'item_title', 'item_time', 'item_picture', 'item_cluster1', 'item_cluster2', 'item_keywords'])
item['item_id'] = item['item_id'].astype('int64')

# 检查数据

## 检查是否存在冷启动
验证了一下, 训练集和测试集内都不存在冷启动问题

In [5]:
len(set(train['item_id'].unique())-set(item['item_id'].unique())), len(set(train['user_id'].unique())-set(user['user_id'].unique()))

(0, 0)

In [6]:
len(set(test['item_id'].unique())-set(item['item_id'].unique())), len(set(test['user_id'].unique())-set(user['user_id'].unique()))

(0, 0)

## 检查空缺值

In [7]:
user.isnull().sum() / user.shape[0]

user_id          0.000000
user_device      0.044036
user_system      0.044077
user_province    0.059972
user_city        0.062450
user_age         0.038931
user_gender      0.038074
dtype: float64

In [8]:
user['user_age'].fillna('A_0_24:0.25,A_25_29:0.25,A_30_39:0.25,A_40+:0.25', inplace=True)
user['user_gender'].fillna('female:0.5,male:0.5', inplace=True)

In [9]:
item.isnull().sum() / item.shape[0]

item_id          0.000000
item_title       0.000000
item_time        0.000382
item_picture     0.000382
item_cluster1    0.000444
item_cluster2    0.000445
item_keywords    0.017337
dtype: float64

In [10]:
train.isnull().sum() / train.shape[0]

user_id     0.0
item_id     0.0
time        0.0
network     0.0
refresh     0.0
position    0.0
label       0.0
duration    0.0
dtype: float64

In [11]:
test.isnull().sum() / test.shape[0]

id         0.0
user_id    0.0
item_id    0.0
time       0.0
network    0.0
refresh    0.0
dtype: float64

# 处理数据

## 处理user特征

In [12]:
user.head(10)

Unnamed: 0,user_id,user_device,user_system,user_province,user_city,user_age,user_gender
0,1000372820,TAS-AN00,Android,广东,广州,"A_0_24:0.404616,A_25_29:0.059027,A_30_39:0.516...","female:0.051339,male:0.948661"
1,1000652892,PACM00,Android,河北,唐山,"A_0_24:0.615458,A_25_29:0.086233,A_30_39:0.141...","female:0.280295,male:0.719705"
2,1000908852,MI6X,Android,上海,上海,"A_0_24:0.123255,A_25_29:0.208225,A_30_39:0.298...","female:0.000000,male:1.000000"
3,1001168798,iPhone11,IOS,,,"A_0_24:0.436296,A_25_29:0.489370,A_30_39:0.061...","female:0.870710,male:0.129290"
4,1001305614,M2103K19C,Android,江苏,苏州,"A_0_24:0.006632,A_25_29:0.043408,A_30_39:0.350...","female:0.000000,male:1.000000"
5,1001309700,LYA-AL10,Android,黑龙江,哈尔滨,"A_0_24:0.413768,A_25_29:0.437501,A_30_39:0.141...","female:0.000000,male:1.000000"
6,1001339860,RedmiNote7Pro,Android,天津,天津,"A_0_24:0.067328,A_25_29:0.029283,A_30_39:0.303...","female:0.155445,male:0.844555"
7,1001384888,M2007J22C,Android,河北,石家庄,"A_0_24:0.008414,A_25_29:0.027505,A_30_39:0.161...","female:0.000000,male:1.000000"
8,100142658,JEF-AN20,Android,重庆,重庆,"A_0_24:0.160670,A_25_29:0.695923,A_30_39:0.109...","female:1.000000,male:0.000000"
9,1001439274,HMA-AL00,Android,山东,淄博,"A_0_24:0.000172,A_25_29:0.007701,A_30_39:0.778...","female:0.000000,male:1.000000"


In [7]:
def get_multi_hot(x, k):
    str_list = x.split(',')
    res = [float(s.split(':')[1]) for s in str_list]
    for _ in range(k-len(res)):
        print('加0')
        res.append(0)
    if len(res) == 16:
        res = np.array(res)
        res = res[[0,4,8,12]]
        res = list(res)
    if (k == 2) & (len(res) == 4):
        res = np.array(res)
        res = res[[0,2]]
        res = list(res)
    if len(res) != k:
        print(x)
    return res

In [8]:
# 将age和gender都处理成列表的形式
user['new_age'] = user['user_age'].progress_apply(lambda x: get_multi_hot(x, 4))
user['new_gender'] = user['user_gender'].progress_apply(lambda x: get_multi_hot(x, 2))

pandas bar: 100%|██████████| 1538384/1538384 [00:06<00:00, 223144.72it/s]
pandas bar:  15%|█▍        | 229271/1538384 [00:00<00:05, 256565.57it/s]

加0


pandas bar:  24%|██▍       | 370733/1538384 [00:01<00:03, 326880.01it/s]

加0
加0


pandas bar:  45%|████▍     | 687717/1538384 [00:02<00:03, 265994.55it/s]

加0
加0
加0


pandas bar:  54%|█████▍    | 826915/1538384 [00:02<00:02, 324407.12it/s]

加0
加0


pandas bar:  63%|██████▎   | 968551/1538384 [00:03<00:01, 344760.41it/s]

加0


pandas bar:  70%|██████▉   | 1074430/1538384 [00:03<00:01, 349307.49it/s]

加0


pandas bar:  83%|████████▎ | 1282698/1538384 [00:04<00:00, 293741.73it/s]

加0
加0
加0
加0


pandas bar:  95%|█████████▍| 1457728/1538384 [00:05<00:00, 337088.26it/s]

加0
加0


pandas bar: 100%|██████████| 1538384/1538384 [00:05<00:00, 282314.60it/s]


加0


In [9]:
user.drop(['user_age', 'user_gender'], axis=1, inplace=True)
user.rename(columns={'new_age': 'user_age', 'new_gender': 'user_gender'}, inplace=True)

In [10]:
user[['user_device', 'user_system', 'user_province', 'user_city']] = user[['user_device', 'user_system', 'user_province', 'user_city']].fillna('nan')

In [11]:
user.isnull().sum()

user_id          0
user_device      0
user_system      0
user_province    0
user_city        0
user_age         0
user_gender      0
dtype: int64

In [12]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
user[['user_device', 'user_system', 'user_province', 'user_city']] = user[['user_device', 'user_system', 'user_province', 'user_city']].apply(encoder.fit_transform)

In [13]:
uid_dict = dict(zip(user.user_id, user.index))

In [14]:
user['new_id'] = user.index
user.drop(['user_id'], axis=1, inplace=True)
user.rename(columns={'new_id': 'user_id'}, inplace=True)
user = user[['user_id', 'user_device', 'user_system', 'user_province', 'user_city', 'user_gender', 'user_age']]

In [20]:
user[['user_id', 'user_device', 'user_system', 'user_province', 'user_city']].nunique()

user_id          1538384
user_device         3096
user_system            3
user_province        328
user_city            768
dtype: int64

## 处理item特征

In [24]:
item.head(10)

Unnamed: 0,item_id,item_title,item_time,item_picture,item_cluster1,item_cluster2,item_keywords
0,349635709,"拿到c1驾照后,实习期扣分了会怎样?扣12分驾照会吊销么?",1572519971000,9,汽车,汽车/用车,"上班族:8.469502,买车:8.137443,二手车:9.022247,副页:11.21..."
1,361653323,"疫情谣言粉碎机丨接种新冠疫苗后用麻药或致死?盘点最新疫情谣言,别被忽悠了",1624522285000,1,健康,健康/疾病防护治疗及西医用药,"医生:14.760494,吸烟:16.474872,板蓝根:15.597788,板蓝根^^熏..."
2,426732705,"实拍本田飞度:空间真大,8万出头工薪族可选,但内饰能忍?",1610808303000,9,汽车,汽车/买车,"155n:8.979802,polo:7.951116,中控台:5.954278,中网:7...."
3,430221183,搭载135kw电机比亚迪秦plus纯电动版外观更精致,1612581556000,2,汽车,汽车/买车,"etc:12.055207,代表:8.878175,内饰:5.342025,刀片:9.453..."
4,441756326,【提车作业】不顾他人眼光帕萨特phev俘获30老男人浪子心,1618825835000,23,汽车,汽车/买车,"丰田凯美瑞:12.772149,充电器:8.394001,品牌:8.436843,城市:7...."
5,443485341,"魏延有反骨之心都能重用,赵云忠心为什么却不被重用?",1619484501000,4,历史,历史/中国史,"三国:8.979797,五虎将:13.072728,人才:7.532783,保镖:6.811..."
6,447124796,"高考志愿|14个省份新高考录取有变化,这些专业傻傻分不清楚,填志愿看仔细",1624506618000,1,教育,教育/高考,"兴趣:11.558689,家长:11.350382,就业:12.176434,考生:11.5..."
7,448023100,三国煮酒论英雄,1624601704000,1,文化艺术,文化艺术/读书,"历史:9.184261,故事:9.405677\n"
8,452701283,懂我所需—吉利帝豪,1622081779000,19,汽车,汽车/买车,"ec7:14.355725,交互:10.203005,人机交互:10.540608,人机交互..."
9,452933213,"美丽的非洲长河?可以从这条河中,人的形象与命运?",1622121296000,3,旅游,旅游/旅游攻略,"东非:8.944922,东非高原:11.758570,两河:9.025832,伊兹:7.49..."


In [25]:
def convert_time(x):
    try:
        return int(x)
    except:
        return int(1563404932000)

item['item_time'] = item['item_time'].apply(convert_time)

In [26]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)
item['item_time'].astype('int64').describe()

count          633391.000
mean    1624972374821.468
std        1291307341.395
min     1563404932000.000
25%     1624678967000.000
50%     1624975577000.000
75%     1625301111000.000
max     1625672120000.000
Name: item_time, dtype: float64

In [27]:
time_dict = dict(zip(item.item_id, item.item_time))

2019-7-18--2021-7-7

In [28]:
item = item[['item_id', 'item_picture', 'item_cluster1', 'item_cluster2', 'item_keywords']]

In [29]:
encoder = LabelEncoder()
item[['item_picture', 'item_cluster1', 'item_cluster2', 'item_keywords']] = item[['item_picture', 'item_cluster1', 'item_cluster2', 'item_keywords']].fillna('nan')
item[['item_picture', 'item_cluster1', 'item_cluster2']] = item[['item_picture', 'item_cluster1', 'item_cluster2']].apply(encoder.fit_transform)

In [30]:
item.isnull().sum()

item_id          0
item_picture     0
item_cluster1    0
item_cluster2    0
item_keywords    0
dtype: int64

### 处理keywords

In [31]:
# 处理关键字信息
def chuli_k(x, prob=False):
    if x == 'nan':
        return x
    else:
        try:
            y = x.split(',')
        except:
            print(x)
        z1 = [] # 存概率
        z2 = [] # 存keyword
        for kw in y: 
            if len(kw.split(':'))==2:
                # 还要去掉^^
                word = kw.split(':')[0].replace('^^','')
                if word != '':
                    z2.append(word)
                    z1.append(kw.split(':')[1])
            else:
                if ':' in kw:
                    # 异形^^:^^火力^^小队:15.721525 处理这种情况
                    word = kw.split(':',1)[1].split(':')[0].replace('^^','')
                    if word != '':
                        z2.append(word)
                        z1.append(kw.split(':',1)[1].split(':')[1])
        if prob:
            return z1
        else:
            return z2
item['keywords'] = item['item_keywords'].progress_apply(lambda x: chuli_k(x))
item['keywords_p'] = item['item_keywords'].progress_apply(lambda x: chuli_k(x, prob=True))

pandas bar: 100%|██████████| 633391/633391 [00:24<00:00, 26293.89it/s]
pandas bar: 100%|██████████| 633391/633391 [00:23<00:00, 26547.91it/s]


In [32]:
# 先得到keywords
keywords = []
def get_k(x):
    if x != 'nan':
        keywords.extend(x)
item['keywords'].progress_apply(lambda x: get_k(x))
keywords = set(keywords)

# 对keywords进行label_encode
# 处理关键字信息，并得到encode的字典,label_encode时间太久了，用dict来做
keywords = list(keywords)
ind = list(range(1,len(keywords)+1))
keys = dict(zip(keywords,ind))
def chuli_kk(x):
    if x == 'nan':
        return x
    else:
        z = []
        for i in range(len(x)):
            z.append(keys[x[i]])
        return z
item['keywords'] = item['keywords'].progress_apply(lambda x: chuli_kk(x))

pandas bar: 100%|██████████| 633391/633391 [00:01<00:00, 590368.97it/s]
pandas bar: 100%|██████████| 633391/633391 [00:11<00:00, 54457.77it/s]


In [33]:
iid_dict = dict(zip(item.item_id, item.index))

In [34]:
item['new_id'] = item.index
item.drop(['item_id'], axis=1, inplace=True)
item.rename(columns={'new_id': 'item_id'}, inplace=True)
item = item[['item_id', 'item_picture', 'item_cluster1', 'item_cluster2', 'keywords', 'keywords_p']]

In [21]:
item.nunique()

item_id          633391
item_picture        160
item_cluster1        44
item_cluster2       370
dtype: int64

## 处理训练集和测试集

In [6]:
train.head(10)

Unnamed: 0,user_id,item_id,time,network,refresh,position,label,duration
0,1000014754,463510256,1624843756147,5,0,16,0,0
1,1000014754,463852707,1624843756147,5,0,13,1,80
2,1000014754,464757134,1625052999841,5,0,13,1,1050
3,1000014754,464617167,1625052999841,5,0,16,1,286
4,1000014754,465426190,1625382421168,5,0,5,0,0
5,1000014754,465815972,1625382421168,5,0,4,1,285
6,1000014754,465991958,1625382421168,5,0,0,1,353
7,1000014754,463067100,1624757147178,5,0,13,0,0
8,1000014754,464264603,1625011694982,5,1,18,0,0
9,1000014754,464295109,1625009731701,5,0,19,1,215


In [11]:
def get_interval(x):
    item_id = x['item_id']
    return x['time'] - time_dict[item_id]

In [12]:
interval = train.progress_apply(get_interval, axis=1)
interval.loc[interval<0] = 0
interval.describe()

pandas bar: 100%|██████████| 189766959/189766959 [42:23<00:00, 74622.54it/s]


count     189766959.000
mean      103783111.404
std       283037560.188
min               0.000
25%        44566409.000
50%        87798774.000
75%       154696761.000
max     62182241539.000
dtype: float64

大于10天的间隔都记为一类, 小于3天的按4小时来记, 大于3天小于10天的按天来记

In [19]:
with open(r'./interval.pkl', 'wb') as f:
    pickle.dump(interval, f)
#with open(r'./interval.pkl', 'rb') as f:
#    interval = pickle.load(f)

In [13]:
def convert_interval(x):
    x = int(x / 1000)
    if x < 144000:
        t = time.localtime(x)
        return int(((t.tm_mday-1) * 24 + t.tm_hour - 8) / 4)
    elif x < 748800:
        t = time.localtime(x)
        return 24 -3 +t.tm_mday
    else:
        return 32

interval = interval.progress_apply(convert_interval)

pandas bar: 100%|██████████| 189766959/189766959 [07:18<00:00, 433176.73it/s]


In [14]:
test['interval'] = test.apply(get_interval, axis=1)
test['interval'].loc[test['interval']<0] = 0
test['interval'] = test['interval'].progress_apply(convert_interval)

pandas bar: 100%|██████████| 50000/50000 [00:00<00:00, 410639.43it/s]


In [15]:
train['interval'] = interval

In [17]:
interval.describe()

count   189766959.000
mean            9.733
std             9.312
min             0.000
25%             3.000
50%             6.000
75%            24.000
max            32.000
dtype: float64

In [3]:
#day = train['time'].progress_apply(lambda x: time.localtime(int(x)).tm_wday)
hour = train['time'].progress_apply(lambda x: time.localtime(int(x)).tm_hour)

pandas bar: 100%|██████████| 189766959/189766959 [06:59<00:00, 452315.17it/s]


In [20]:
with open(r'./day_hour.pkl', 'rb') as f:
    day, hour = pickle.load((f))

In [21]:
#train['day'] = day
train['hour'] = hour

In [22]:
#test['day'] = test['time'].progress_apply(lambda x: time.localtime(int(x)).tm_wday)
test['hour'] = test['time'].progress_apply(lambda x: time.localtime(int(x)).tm_hour)

pandas bar: 100%|██████████| 50000/50000 [00:00<00:00, 434336.84it/s]


In [6]:
train = reduce_mem_usage(train)

Mem. usage decreased to 5429.28 Mb (53.1% reduction)


In [7]:
train['cvr'] = train['duration']
train.loc[train['label']==0, 'cvr'] = 0

In [9]:
train.loc[train['cvr']<0, 'cvr'] = 1

In [10]:
def get_log(x):
    if x < 1:
        return x
    else:
        return np.log10(x) + 1

train['cvr_log'] = train['cvr'].progress_apply(get_log)

pandas bar: 100%|██████████| 189766959/189766959 [04:11<00:00, 753556.13it/s]


In [11]:
train['cvr_log'].describe()

count    1.897670e+08
mean     4.162415e-01
std      1.042156e+00
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      9.791912e+00
Name: cvr_log, dtype: float64

In [10]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189766959 entries, 0 to 189766958
Data columns (total 8 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   item_id   int32
 2   time      int64
 3   network   int8 
 4   refresh   int16
 5   position  int16
 6   label     int8 
 7   duration  int32
dtypes: int16(2), int32(2), int64(2), int8(2)
memory usage: 5.3 GB


In [27]:
train['network'].min(), train['network'].max()

(2, 5)

In [28]:
train['refresh'].min(), train['refresh'].max()

(0, 650)

In [8]:
train['day'].min(), train['day'].max()

(0, 6)

In [12]:
train['hour'].min(), train['hour'].max()

(0, 23)

In [26]:
train.shape, test.shape

((189766959, 8), (50000, 6))

- 训练集的结束时间为2021.6.24 22:56:53, 训练集的结束时间为2021.7.6 23:59:59
- 测试集的开始时间为2021.7.7 00:00:04, 测试集的结束时间为2021.7.7 23:41:22
- 验证集截取7.6一天的数据

In [27]:
train['time'].max(), train['time'].min()

(1625587199972, 1624546613949)

In [28]:
test['time'].max(), test['time'].min()

(1625672482794, 1625587204226)

In [12]:
with open(r'./train_online_ml.pkl', 'wb') as f:
    pickle.dump(train[['user_id', 'item_id', 'network', 'refresh', 'label', 'cvr_log']].to_numpy(), f, pickle.HIGHEST_PROTOCOL)

In [13]:
validation = train.loc[train['time']>1625500811000]
validation.shape

(15192000, 10)

In [None]:
train = train.loc[set(train.index) - set(validation.index)]
train.shape

In [None]:
train = train[['user_id', 'item_id', 'network', 'refresh', 'label', 'cvr_log']]
validation = validation[['user_id', 'item_id', 'network', 'refresh', 'label', 'cvr_log']]
test = test[['user_id', 'item_id', 'network', 'refresh']]

In [23]:
len(set(test['user_id']) - set(train['user_id']))

1718

In [31]:
user['user_id'].nunique()

1538384

# 存储数据

In [36]:
with open(r'./train.pkl', 'wb') as f:
    pickle.dump(train.to_numpy(), f, pickle.HIGHEST_PROTOCOL)
with open(r'./test.pkl', 'wb') as f:
    pickle.dump(test.to_numpy(), f, pickle.HIGHEST_PROTOCOL)
with open(r'./validation.pkl', 'wb') as f:
    pickle.dump(validation.to_numpy(), f, pickle.HIGHEST_PROTOCOL)
with open(r'./user_feature.pkl', 'wb') as f:
    pickle.dump(user, f, pickle.HIGHEST_PROTOCOL)
with open(r'./item_feature.pkl', 'wb') as f:
    pickle.dump(item, f, pickle.HIGHEST_PROTOCOL)
with open(r'./id_dict.pkl', 'wb') as f:
    pickle.dump((uid_dict, iid_dict), f, pickle.HIGHEST_PROTOCOL)
    

In [35]:
with open(r'./item_feature.pkl', 'wb') as f:
    pickle.dump(item, f, pickle.HIGHEST_PROTOCOL)

In [17]:
import json
info_dict = {'feat_field': {}, 'vocabulary_size':{}, 'feat_type':{'dense':[], 'sparse':[], 'multi-hot':[]}, 'sample_num': train.shape[0], 'user_num': user.shape[0], 'item_num': item.shape[0]}

user_feats = list(user.columns)
user_feats.remove('user_age')
user_feats.remove('user_gender')
for feat in user_feats:
    info_dict['vocabulary_size'][feat] = user[feat].nunique()
info_dict['vocabulary_size']['user_age'] = 4
info_dict['vocabulary_size']['user_gender'] = 2

for feat in list(item.columns):
    info_dict['vocabulary_size'][feat] = item[feat].nunique()

for feat in ['network', 'refresh']:
    info_dict['vocabulary_size'][feat] = train[feat].nunique()
info_dict['vocabulary_size']['network'] = 6
info_dict['vocabulary_size']['day'] = 7
info_dict['vocabulary_size']['hour'] = 24
info_dict['feat_type']['sparse'] = user_feats + list(item.columns) + ['network', 'refresh']
info_dict['feat_type']['multi-hot'] = ['user_age', 'user_gender']
info_dict['feat_field']['user'] = list(user.columns)
info_dict['feat_field']['item'] = list(item.columns)
info_dict['feat_field']['inter'] = list(train.columns)

with open(r'./info.json', 'w') as f:
    json.dump(info_dict, f)

NameError: name 'item' is not defined

In [38]:
dict(user.nunique())

{'user_id': 1538384,
 'user_device': 3096,
 'user_system': 3,
 'user_province': 328,
 'user_city': 768}

In [6]:
with open('./doc_statis.pkl', 'rb') as f:
    item_static = pickle.load(f) 

In [7]:
item_static.columns

Index(['doc_id', 'doc_exposure_cnt', 'doc_click_cnt', 'doc_consumption_min',
       'doc_consumption_mean', 'doc_consumption_max', 'doc_consumption_sum',
       'doc_age_mean', 'doc_age0_rat', 'doc_age1_cnt', 'doc_age2_cnt',
       'doc_age3_cnt', 'doc_gender_cnt', 'doc_show_position_mean',
       'doc_refresh_min', 'doc_refresh_mean', 'doc_refresh_max',
       'doc_refresh_sum', 'doc_freshness'],
      dtype='object')