In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

## 处理用户信息表数据

In [2]:
user_info = pd.read_csv("../data/user_info.csv")
user_info.head()

Unnamed: 0,user_id,age_range,gender
0,376517,6.0,1.0
1,234512,5.0,0.0
2,344532,5.0,0.0
3,186135,5.0,0.0
4,30230,5.0,0.0


In [3]:
user_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424170 entries, 0 to 424169
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   user_id    424170 non-null  int64  
 1   age_range  421953 non-null  float64
 2   gender     417734 non-null  float64
dtypes: float64(2), int64(1)
memory usage: 9.7 MB


In [4]:
user_info.isna().sum()

user_id         0
age_range    2217
gender       6436
dtype: int64

In [5]:
user_info.nunique()

user_id      424170
age_range         9
gender            3
dtype: int64

In [6]:
# 删除带有空值的行
user_info.dropna(axis=0,inplace=True)

### 删除无效的行

In [7]:
drop_data = user_info[(user_info.age_range == 0) | (user_info.gender == 2)]
drop_data.shape

(99868, 3)

In [8]:
drop_index = drop_data.index.values
user_info = user_info.drop(drop_index)
user_info.shape

(317840, 3)

In [9]:
# 这里类型是 str 的话后面查数据查不到
# user_info.user_id = user_info.user_id.astype('str')
# user_info.age_range = user_info.age_range.astype('str')
# user_info.gender = user_info.gender.astype('str')
# user_info.info()

In [10]:
user_info.age_range[user_info.age_range == 8] =7

## 处理⽤户⾏为复购表数据

In [11]:
user_merchant = pd.read_csv("../data/user_merchant.csv")
user_merchant.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260864 entries, 0 to 260863
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   user_id      260864 non-null  int64
 1   merchant_id  260864 non-null  int64
 2   label        260864 non-null  int64
dtypes: int64(3)
memory usage: 6.0 MB


In [12]:
user_merchant.isna().sum()

user_id        0
merchant_id    0
label          0
dtype: int64

In [13]:
user_merchant.user_id = user_merchant.user_id.astype("str")
user_merchant.merchant_id = user_merchant.merchant_id .astype("str")
user_merchant.label = user_merchant.label .astype("str")
user_merchant.describe(include="all")

Unnamed: 0,user_id,merchant_id,label
count,260864,260864,260864
unique,212062,1993,2
top,221133,4044,0
freq,18,3379,244912


### 查看数据是否不平衡

In [14]:
user_merchant.label.value_counts()[1] / user_merchant.label.count()

0.06115063788027478

In [15]:
# 划分特征值和标签
X = user_merchant.drop({'label'},axis =1)
Y = user_merchant['label']

In [16]:
# 欠采样方式处理数据
from imblearn.under_sampling import NearMiss
nm1 = NearMiss(version=1)
X_resampled_nm1, y_resampled = nm1.fit_resample(X, Y)
user_merchant = pd.concat([X_resampled_nm1,y_resampled],axis=1)

In [17]:
user_info = user_info.loc[user_info.user_id.isin(user_merchant.user_id.unique()),:]
user_info

Unnamed: 0,user_id,age_range,gender
0,376517,6.0,1.0
27,208701,5.0,0.0
34,184971,4.0,0.0
50,142152,5.0,0.0
53,170951,5.0,0.0
...,...,...,...
423987,60364,2.0,0.0
424019,202723,3.0,0.0
424032,319682,2.0,0.0
424037,389654,4.0,0.0


## 处理用户行为表数据

In [18]:
user_log = pd.read_csv("../data/user_log.csv")
user_log.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54925330 entries, 0 to 54925329
Data columns (total 7 columns):
 #   Column       Dtype  
---  ------       -----  
 0   user_id      int64  
 1   item_id      int64  
 2   cat_id       int64  
 3   seller_id    int64  
 4   brand_id     float64
 5   time_stamp   int64  
 6   action_type  int64  
dtypes: float64(1), int64(6)
memory usage: 2.9 GB


In [19]:
# 利用用户商铺表失衡数据处理的结果过滤用户行为表
user_log = user_log.loc[user_log.user_id.isin(user_merchant.user_id.values),:]
user_log = user_log.loc[user_log.seller_id.isin(user_merchant.merchant_id.values),:]
user_log.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1972402 entries, 1691 to 54924616
Data columns (total 7 columns):
 #   Column       Dtype  
---  ------       -----  
 0   user_id      int64  
 1   item_id      int64  
 2   cat_id       int64  
 3   seller_id    int64  
 4   brand_id     float64
 5   time_stamp   int64  
 6   action_type  int64  
dtypes: float64(1), int64(6)
memory usage: 120.4 MB


In [20]:
# 查看是否有缺失值
user_log.isna().sum()

user_id           0
item_id           0
cat_id            0
seller_id         0
brand_id       2938
time_stamp        0
action_type       0
dtype: int64

In [21]:
# 计算缺失率
user_log['brand_id'].isna().sum() / user_log['brand_id'].count()

0.0014917764427275645

In [22]:
# 缺失值所占比率很小，所以我们可以把他删掉
user_log.dropna(inplace=True)
user_log.isna().sum()

user_id        0
item_id        0
cat_id         0
seller_id      0
brand_id       0
time_stamp     0
action_type    0
dtype: int64

In [23]:
user_log.shape

(1969464, 7)

## 用户特征提取

In [24]:
#用户交互总次数
user_feaut = user_log.groupby("user_id")["action_type"].count().to_frame()
user_feaut.head()

Unnamed: 0_level_0,action_type
user_id,Unnamed: 1_level_1
1,21
4,42
7,6
14,313
17,23


In [25]:
#用户各种行为总次数统计（点击、加购、收藏和购买）
user_feaut_2 =  pd.pivot_table(user_log,index="user_id",columns="action_type",values="cat_id",aggfunc="count")
user_feaut_2.head()

action_type,0,1,2,3
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,16.0,,5.0,
4,41.0,,1.0,
7,2.0,,4.0,
14,290.0,,18.0,5.0
17,18.0,,1.0,4.0


In [26]:
user_feaut = user_feaut.merge(user_feaut_2,on="user_id")
user_feaut.head()

Unnamed: 0_level_0,action_type,0,1,2,3
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,21,16.0,,5.0,
4,42,41.0,,1.0,
7,6,2.0,,4.0,
14,313,290.0,,18.0,5.0
17,23,18.0,,1.0,4.0


In [27]:
user_feaut.columns =["total_log","click","add_car","buy","collect"]
user_feaut.head()

Unnamed: 0_level_0,total_log,click,add_car,buy,collect
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,21,16.0,,5.0,
4,42,41.0,,1.0,
7,6,2.0,,4.0,
14,313,290.0,,18.0,5.0
17,23,18.0,,1.0,4.0


In [28]:
user_feaut_2 = user_log.groupby("user_id")["seller_id","item_id","cat_id","brand_id"].nunique()
user_feaut_2.head()

Unnamed: 0_level_0,seller_id,item_id,cat_id,brand_id
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,4,4,3,4
4,6,20,10,6
7,2,5,3,2
14,40,177,34,38
17,4,8,5,4


In [29]:
user_feaut_2.columns = ["seller_count","item_count","cat_count","brand_count"]
user_feaut_2.head()

Unnamed: 0_level_0,seller_count,item_count,cat_count,brand_count
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,4,4,3,4
4,6,20,10,6
7,2,5,3,2
14,40,177,34,38
17,4,8,5,4


In [30]:
user_feaut = user_feaut.merge(user_feaut_2,on="user_id") 
user_feaut.head()

Unnamed: 0_level_0,total_log,click,add_car,buy,collect,seller_count,item_count,cat_count,brand_count
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,21,16.0,,5.0,,4,4,3,4
4,42,41.0,,1.0,,6,20,10,6
7,6,2.0,,4.0,,2,5,3,2
14,313,290.0,,18.0,5.0,40,177,34,38
17,23,18.0,,1.0,4.0,4,8,5,4


In [31]:
#从时间戳中提月和日
user_log["month"] = user_log.time_stamp // 100;
user_log["day"] = user_log.time_stamp % 100;
user_log.head()

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,month,day
1691,26516,416965,1401,586,5579.0,1107,0,11,7
1692,26516,416965,1401,586,5579.0,1107,0,11,7
1693,26516,416965,1401,586,5579.0,1107,0,11,7
1694,26516,352345,177,2565,8149.0,1101,0,11,1
1718,26516,142231,177,2565,8149.0,1101,0,11,1


In [32]:
# 用户平均每天交互、购买的次数 ，用户平均每月交互、购买的次数
user_feaut_2 = user_log.groupby("user_id")["month","time_stamp"].nunique()
user_feaut_2.columns = ["month_count","day_count"]
user_feaut_2.head()

Unnamed: 0_level_0,month_count,day_count
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2,3
4,5,9
7,2,3
14,7,48
17,3,6


In [33]:
user_feaut = user_feaut.merge(user_feaut_2,on="user_id") 
user_feaut.head()

Unnamed: 0_level_0,total_log,click,add_car,buy,collect,seller_count,item_count,cat_count,brand_count,month_count,day_count
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,21,16.0,,5.0,,4,4,3,4,2,3
4,42,41.0,,1.0,,6,20,10,6,5,9
7,6,2.0,,4.0,,2,5,3,2,2,3
14,313,290.0,,18.0,5.0,40,177,34,38,7,48
17,23,18.0,,1.0,4.0,4,8,5,4,3,6


In [34]:
user_feaut["month_avg_log"] = user_feaut.total_log/user_feaut.month_count
user_feaut["month_avg_buy"] =np.where(user_feaut.buy.isna(),0,user_feaut.buy/user_feaut.month_count)
user_feaut["day_avg_log"] = user_feaut.total_log/user_feaut.day_count
user_feaut["day_avg_buy"] =np.where(user_feaut.buy.isna(),0,user_feaut.buy/user_feaut.day_count)
user_feaut.head()

Unnamed: 0_level_0,total_log,click,add_car,buy,collect,seller_count,item_count,cat_count,brand_count,month_count,day_count,month_avg_log,month_avg_buy,day_avg_log,day_avg_buy
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,21,16.0,,5.0,,4,4,3,4,2,3,10.5,2.5,7.0,1.666667
4,42,41.0,,1.0,,6,20,10,6,5,9,8.4,0.2,4.666667,0.111111
7,6,2.0,,4.0,,2,5,3,2,2,3,3.0,2.0,2.0,1.333333
14,313,290.0,,18.0,5.0,40,177,34,38,7,48,44.714286,2.571429,6.520833,0.375
17,23,18.0,,1.0,4.0,4,8,5,4,3,6,7.666667,0.333333,3.833333,0.166667


In [35]:
#删除特征提取的临时数据框
del user_feaut_2

## 商铺特征提取

In [36]:
#商铺下所有交互总次数
shop_feaut = user_log.groupby("seller_id")["user_id"].count().to_frame()
shop_feaut.head()

Unnamed: 0_level_0,user_id
seller_id,Unnamed: 1_level_1
2,214
8,305
9,242
10,1658
13,681


In [37]:
shop_feaut.columns=["total_count"]
shop_feaut.head()

Unnamed: 0_level_0,total_count
seller_id,Unnamed: 1_level_1
2,214
8,305
9,242
10,1658
13,681


In [38]:
#商铺下各种行为总次数统计（点击、加购、收藏和购买）
shop_feaut_2 = pd.pivot_table(user_log,index="seller_id",columns="action_type",values="user_id",aggfunc="count")
shop_feaut_2.head()

action_type,0,1,2,3
seller_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,181.0,,23.0,10.0
8,258.0,,30.0,17.0
9,219.0,,14.0,9.0
10,1529.0,,72.0,57.0
13,567.0,,100.0,14.0


In [39]:
shop_feaut= shop_feaut.merge(shop_feaut_2,on="seller_id")
shop_feaut.head()

Unnamed: 0_level_0,total_count,0,1,2,3
seller_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,214,181.0,,23.0,10.0
8,305,258.0,,30.0,17.0
9,242,219.0,,14.0,9.0
10,1658,1529.0,,72.0,57.0
13,681,567.0,,100.0,14.0


In [40]:
shop_feaut.columns = ["total_count","click","add_car","buy","collect"]
shop_feaut.head()

Unnamed: 0_level_0,total_count,click,add_car,buy,collect
seller_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,214,181.0,,23.0,10.0
8,305,258.0,,30.0,17.0
9,242,219.0,,14.0,9.0
10,1658,1529.0,,72.0,57.0
13,681,567.0,,100.0,14.0


In [41]:
#商铺下交互的总用户数，多少被交互的商品，商品类别和商品品牌数量
shop_feaut_2 = user_log.groupby("seller_id")["user_id","item_id","cat_id","brand_id"].nunique()
shop_feaut_2.head()

Unnamed: 0_level_0,user_id,item_id,cat_id,brand_id
seller_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,82,48,7,1
8,83,51,8,1
9,72,103,13,15
10,410,185,13,2
13,187,28,5,1


In [42]:
shop_feaut_2.columns=["user_count","item_count","cat_count","brand_count"]
shop_feaut= shop_feaut.merge(shop_feaut_2,on="seller_id")
shop_feaut.head()

Unnamed: 0_level_0,total_count,click,add_car,buy,collect,user_count,item_count,cat_count,brand_count
seller_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2,214,181.0,,23.0,10.0,82,48,7,1
8,305,258.0,,30.0,17.0,83,51,8,1
9,242,219.0,,14.0,9.0,72,103,13,15
10,1658,1529.0,,72.0,57.0,410,185,13,2
13,681,567.0,,100.0,14.0,187,28,5,1


In [43]:
#商铺月平均有多少用户交互
shop_feaut_2= user_log.groupby("seller_id")["month"].nunique()
shop_feaut= shop_feaut.merge(shop_feaut_2,on="seller_id")
shop_feaut.head()

Unnamed: 0_level_0,total_count,click,add_car,buy,collect,user_count,item_count,cat_count,brand_count,month
seller_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2,214,181.0,,23.0,10.0,82,48,7,1,7
8,305,258.0,,30.0,17.0,83,51,8,1,7
9,242,219.0,,14.0,9.0,72,103,13,15,7
10,1658,1529.0,,72.0,57.0,410,185,13,2,7
13,681,567.0,,100.0,14.0,187,28,5,1,7


In [44]:
shop_feaut["month_avg_user"]=shop_feaut.user_count/shop_feaut.month
shop_feaut.head()

Unnamed: 0_level_0,total_count,click,add_car,buy,collect,user_count,item_count,cat_count,brand_count,month,month_avg_user
seller_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2,214,181.0,,23.0,10.0,82,48,7,1,7,11.714286
8,305,258.0,,30.0,17.0,83,51,8,1,7,11.857143
9,242,219.0,,14.0,9.0,72,103,13,15,7,10.285714
10,1658,1529.0,,72.0,57.0,410,185,13,2,7,58.571429
13,681,567.0,,100.0,14.0,187,28,5,1,7,26.714286


In [45]:
#获取用户详细信息
user_log = user_log.merge(user_info,on="user_id")
user_log.head()

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,month,day,age_range,gender
0,26516,416965,1401,586,5579.0,1107,0,11,7,4.0,1.0
1,26516,416965,1401,586,5579.0,1107,0,11,7,4.0,1.0
2,26516,416965,1401,586,5579.0,1107,0,11,7,4.0,1.0
3,26516,352345,177,2565,8149.0,1101,0,11,1,4.0,1.0
4,26516,142231,177,2565,8149.0,1101,0,11,1,4.0,1.0


In [46]:
#商铺下交互的用户按年龄段和性别分别统计
shop_feaut_2 = pd.pivot_table(user_log,index="seller_id",columns="age_range",values="user_id",aggfunc="nunique")
shop_feaut_2.head()

age_range,1.0,2.0,3.0,4.0,5.0,6.0,7.0
seller_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2,,10.0,19.0,14.0,9.0,8.0,2.0
8,,6.0,23.0,17.0,8.0,7.0,2.0
9,,8.0,21.0,19.0,6.0,11.0,2.0
10,,17.0,108.0,80.0,55.0,40.0,10.0
13,,25.0,51.0,19.0,19.0,21.0,7.0


In [47]:
shop_feaut_2.columns=["less18","between18and24","between25and29","between30and34","between35and39","between40and49","grate50"]
shop_feaut_2.head()

Unnamed: 0_level_0,less18,between18and24,between25and29,between30and34,between35and39,between40and49,grate50
seller_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2,,10.0,19.0,14.0,9.0,8.0,2.0
8,,6.0,23.0,17.0,8.0,7.0,2.0
9,,8.0,21.0,19.0,6.0,11.0,2.0
10,,17.0,108.0,80.0,55.0,40.0,10.0
13,,25.0,51.0,19.0,19.0,21.0,7.0


In [48]:
shop_feaut= shop_feaut.merge(shop_feaut_2,on="seller_id")
shop_feaut.head()

Unnamed: 0_level_0,total_count,click,add_car,buy,collect,user_count,item_count,cat_count,brand_count,month,month_avg_user,less18,between18and24,between25and29,between30and34,between35and39,between40and49,grate50
seller_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2,214,181.0,,23.0,10.0,82,48,7,1,7,11.714286,,10.0,19.0,14.0,9.0,8.0,2.0
8,305,258.0,,30.0,17.0,83,51,8,1,7,11.857143,,6.0,23.0,17.0,8.0,7.0,2.0
9,242,219.0,,14.0,9.0,72,103,13,15,7,10.285714,,8.0,21.0,19.0,6.0,11.0,2.0
10,1658,1529.0,,72.0,57.0,410,185,13,2,7,58.571429,,17.0,108.0,80.0,55.0,40.0,10.0
13,681,567.0,,100.0,14.0,187,28,5,1,7,26.714286,,25.0,51.0,19.0,19.0,21.0,7.0


In [49]:
shop_feaut_2 = pd.pivot_table(user_log,index="seller_id",columns="gender",values="user_id",aggfunc="nunique")
shop_feaut_2.head()

gender,0.0,1.0
seller_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,44.0,18.0
8,60.0,3.0
9,18.0,49.0
10,187.0,123.0
13,77.0,65.0


In [50]:
shop_feaut_2.columns=["gender_F","gender_M"]
shop_feaut= shop_feaut.merge(shop_feaut_2,on="seller_id")
shop_feaut.head()

Unnamed: 0_level_0,total_count,click,add_car,buy,collect,user_count,item_count,cat_count,brand_count,month,month_avg_user,less18,between18and24,between25and29,between30and34,between35and39,between40and49,grate50,gender_F,gender_M
seller_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2,214,181.0,,23.0,10.0,82,48,7,1,7,11.714286,,10.0,19.0,14.0,9.0,8.0,2.0,44.0,18.0
8,305,258.0,,30.0,17.0,83,51,8,1,7,11.857143,,6.0,23.0,17.0,8.0,7.0,2.0,60.0,3.0
9,242,219.0,,14.0,9.0,72,103,13,15,7,10.285714,,8.0,21.0,19.0,6.0,11.0,2.0,18.0,49.0
10,1658,1529.0,,72.0,57.0,410,185,13,2,7,58.571429,,17.0,108.0,80.0,55.0,40.0,10.0,187.0,123.0
13,681,567.0,,100.0,14.0,187,28,5,1,7,26.714286,,25.0,51.0,19.0,19.0,21.0,7.0,77.0,65.0


In [51]:
#删除提取商铺特征的临时数据框
del shop_feaut_2

In [52]:
#合并提取的特征到电商--用户行为复购表
user_feaut = user_feaut.reset_index()
user_merchant = user_merchant.merge(user_feaut,on="user_id",how="left")
shop_feaut = shop_feaut.reset_index()
user_merchant = user_merchant.merge(shop_feaut,right_on="seller_id",left_on="merchant_id" ,how="left")

In [53]:
#删除用户，商铺特征提取表
del user_feaut
del shop_feaut

In [55]:
user_merchant.isna().sum()

user_id               0
merchant_id           0
label                 0
total_log             0
click_x             234
add_car_x         31892
buy_x                 9
collect_x         17321
seller_count          0
item_count_x          0
cat_count_x           0
brand_count_x         0
month_count           0
day_count             0
month_avg_log         0
month_avg_buy         0
day_avg_log           0
day_avg_buy           0
seller_id             0
total_count           0
click_y               0
add_car_y         30638
buy_y                 0
collect_y            42
user_count            0
item_count_y          0
cat_count_y           0
brand_count_y         0
month                 0
month_avg_user        0
less18            31719
between18and24      295
between25and29       19
between30and34       13
between35and39       72
between40and49      126
grate50            2205
gender_F              0
gender_M             37
dtype: int64