# Ate Model
Created by Vulcan626 on 2023/11/11

In [1]:
# 导入包
import numpy as np
import pandas as pd

In [2]:
# 导入数据
df = pd.read_csv('dataset/UserBehavior.csv')
df.columns = ['user_id', 'item_id', 'category_id', 'behavior_type', 'timestamp']
print(df.columns)

Index(['user_id', 'item_id', 'category_id', 'behavior_type', 'timestamp'], dtype='object')


## 数据集分析
1. 该数据集包含了2017年11月25日至2017年12月3日之间，有行为的约一百万随机用户的所有行为（行为包括点击、购买、加购、喜欢）。
2. 数据集的每一行表示一条用户行为，由用户ID、商品ID、商品类目ID、行为类型和时间戳组成，并以逗号分隔。

In [3]:
# 行为和时间戳的数据类型
print(type(df['behavior_type'].loc[0]))
print(type(df['timestamp'].loc[0]))

<class 'str'>
<class 'numpy.int64'>


In [4]:
# 各行为类型数量统计
from collections import Counter
print(Counter(df['behavior_type']))

Counter({'pv': 89716263, 'cart': 5530446, 'fav': 2888258, 'buy': 2015839})


In [5]:
# 将行为类型转换为数字再次统计
df['behavior_type'] = df['behavior_type'].map({'pv': 1, 'cart': 2, 'fav': 3, 'buy': 4, 'p': 1, 'c': 2, 'f': 3, 'b': 4})
print(Counter(df['behavior_type']))

Counter({1: 89716263, 2: 5530446, 3: 2888258, 4: 2015839})


In [6]:
df['behavior_type1'] = df['behavior_type'].apply(lambda x: list(str(x)))
print(df['behavior_type1'].loc[0])

['1']


In [7]:
print(df['timestamp'].loc[2])

1511593493


In [8]:
df['timestamp1'] = df['timestamp'].apply(lambda x: list(str(x).split('.0')))
print(df['timestamp1'].loc[0])

['1511561733']


## 构造序列特征

In [9]:
a = df.groupby('user_id')['behavior_type1'].sum()
b = df.groupby('user_id')['timestamp1'].sum()
print(b)

KeyboardInterrupt: 

In [None]:
data = pd.DataFrame()
data['time_list'] = b
data['Behavior_list'] = a
print(data.head(10))

In [None]:
print(data['Behavior_list'].loc[1])
print(data['time_list'].loc[1])

In [None]:
# 点击次数
data['is_pv'] = data['Behavior_list'].apply(lambda x: Counter(x)['1'])
print(data['is_pv'].loc[1])

In [None]:
# 是否加购
data['is_cart'] = data['Behavior_list'].apply(lambda x: 1 if '2' in x else 0)
print(data['is_cart'].loc[13])

In [None]:
# 是否购买
data['is_buy'] = data['Behavior_list'].apply(lambda x: 1 if '5' in x else 0)
print(data['is_buy'].loc[13])

In [None]:
# 是否收藏
data['is_fav'] = data['Behavior_list'].apply(lambda x: 1 if '3' in x else 0)
print(data['is_fav'].loc[13])

In [None]:
# Behavior_list长度特征
data['Behavior_list_len'] = data['Behavior_list'].apply(lambda x: len(x))
print(data['Behavior_list_len'].loc[13])

In [None]:
# 是否收藏，如果收藏了有几次
data['fav_much'] = data['Behavior_list'].apply(lambda x: 0 if '3' not in x else Counter(x)['3'])
print(data['fav_much'])

In [None]:
# 是否加购，如果加购了有几次
data['cart_much'] = data['Behavior_list'].apply(lambda x: 0 if '2' not in x else Counter(x)['2'])
print(data['cart_much'])

In [None]:
print(data.columns)

In [None]:
data['label'] = data['is_buy']

del data['is_buy']

In [None]:
print(data.shape)

In [None]:
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns

#查看缺失值情况
msno.matrix(data, labels=True)

In [None]:
# 'fav_much', 'cart_much'特征分布可视化
f,ax=plt.subplots(1,2,figsize=(12,5))
sns.set_palette(["#9b59b6","#3498db",])        #设置所有图的颜色，使用hls色彩空间
sns.distplot(data['fav_much'],bins=30,kde=True,label='123',ax=ax[0])
sns.distplot(data['cart_much'],bins=30,kde=True,label='12',ax=ax[1])

In [None]:
data.index = list(range(data.shape[0]))

In [None]:
#导入需要的建模包
from sklearn.model_selection import train_test_split, StratifiedKFold,cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


drop = ['time_list', 'Behavior_list',"label","time_list_len"]
train_col = [x for x in data.columns if x not in drop]
labels = data['label']

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(data[train_col],data['label'], test_size = 0.20, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

In [None]:
print(train_col)

In [None]:
# 未调参随机森林
rfc = RandomForestClassifier(n_estimators=2000, max_depth=5)
rfc.fit(X_train, Y_train)

y_pred = rfc.predict(X_test)
print('Model accuracy score with 1000 decision-trees : {0:0.4f}'. format(accuracy_score(Y_test, y_pred)))

In [None]:
# GridSearchCV调参
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(max_features='auto', oob_score=True, random_state=1, n_jobs=-1)

param_grid = { "min_samples_leaf" : [1, 5, 10], "max_depth" : [4,5,6], "n_estimators": [100,500,1000,1500]}

gs = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)

gs = gs.fit(X_train, Y_train)

In [None]:
print(gs.grid_scores_)

In [None]:
a = gs.grid_scores_
mean_acc = [x[1] for x in a]
params =[x[0] for x in a]

print(max(mean_acc))
print(params)

In [None]:
# 5-fold 交叉验证
forest = RandomForestClassifier(max_depth=6, min_samples_leaf=10, n_estimators=1000)
def muti_score(model):
    #    warnings.filterwarnings('ignore')
    precision = cross_val_score(model, X_train, Y_train, scoring='precision', cv=5)
    recall = cross_val_score(model, X_train, Y_train, scoring='recall', cv=5)
    f1_score = cross_val_score(model, X_train, Y_train, scoring='f1', cv=5)
    auc = cross_val_score(model, X_train, Y_train, scoring='roc_auc', cv=5)
    print("precision:",precision.mean())
    print("recall:",recall.mean())
    print("F1_score:",f1_score.mean())
    print("AUC:",auc.mean())

muti_score(forest)