# 1 导入必要的库

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# 用于推荐系统的库
from surprise import Dataset, Reader, SVD, KNNBasic, NMF
from surprise.model_selection import train_test_split as surprise_train_test_split
from surprise.model_selection import cross_validate
import implicit
from scipy.sparse import csr_matrix
import lightfm
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, auc_score

  from .autonotebook import tqdm as notebook_tqdm


# 2. 加载预处理后的数据

In [3]:
df = pd.read_csv('../datasets/walmart_preprocessed.csv')

print(f"数据集形状: {df.shape}")
print(f"总用户数: {df['User_ID'].nunique()}")
print(f"总产品数: {df['Product_ID'].nunique()}")
print(f"总交互数: {len(df)}")

数据集形状: (550068, 20)
总用户数: 5891
总产品数: 3631
总交互数: 550068


# 3 特征工程

## 3.1 创建用户特征矩阵

In [5]:
print("\n构建用户特征矩阵...")
user_features = df.drop_duplicates('User_ID').set_index('User_ID')

# 选择用户特征
user_cols = ['Gender_Code', 'Occupation', 'City_Code', 'Marital_Status', 
            'Stay_Years', 'Purchase_Count', 'Avg_User_Purchase']

# 处理年龄特征 - 将分类变量转为数值编码
age_mapping = {'0-17': 0, '18-25': 1, '26-35': 2, '36-45': 3, '46-50': 4, '51-55': 5, '55+': 6}
user_features['Age_Code'] = user_features['Age'].map(age_mapping)
user_cols.append('Age_Code')

# 保留基本用户特征
user_features = user_features[user_cols]
print(f"用户特征矩阵形状: {user_features.shape}")
print(f"用户特征: {user_features.columns.tolist()}")


构建用户特征矩阵...
用户特征矩阵形状: (5891, 8)
用户特征: ['Gender_Code', 'Occupation', 'City_Code', 'Marital_Status', 'Stay_Years', 'Purchase_Count', 'Avg_User_Purchase', 'Age_Code']


## 3.2 创建商品特征矩阵

In [6]:
print("\n构建商品特征矩阵...")
# 计算每个产品的统计特征
product_features = df.groupby('Product_ID').agg({
    'Purchase': ['mean', 'std', 'count'],
    'Product_Category': 'first'
}).reset_index()
product_features.columns = ['Product_ID', 'Avg_Price', 'Price_Std', 'Purchase_Count', 'Category']

# 创建产品类别的One-Hot编码
product_category_dummies = pd.get_dummies(product_features['Category'], prefix='category')
product_features = pd.concat([product_features, product_category_dummies], axis=1)

# 设置索引
product_features.set_index('Product_ID', inplace=True)
print(f"产品特征矩阵形状: {product_features.shape}")


构建商品特征矩阵...
产品特征矩阵形状: (3631, 24)


## 3.3 创建交互矩阵 (用户-商品-评分)

In [7]:
print("\n构建交互矩阵...")
interactions = df[['User_ID', 'Product_ID', 'Purchase']]

# 将购买金额标准化到[0-5]区间作为评分,更符合推荐系统的评分范围
min_purchase = interactions['Purchase'].min()
max_purchase = interactions['Purchase'].max()
interactions['Rating'] = 1 + 4 * (interactions['Purchase'] - min_purchase) / (max_purchase - min_purchase)

print(f"交互矩阵形状: {interactions.shape}")
print("评分分布:")
print(interactions['Rating'].describe())


构建交互矩阵...
交互矩阵形状: (550068, 4)
评分分布:
count    550068.000000
mean          2.545279
std           0.838960
min           1.000000
25%           1.970562
50%           2.342018
75%           3.011274
max           5.000000
Name: Rating, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interactions['Rating'] = 1 + 4 * (interactions['Purchase'] - min_purchase) / (max_purchase - min_purchase)


## 3.4 处理稀疏性问题

In [8]:
# 过滤掉少于5次交互的用户和商品
user_counts = interactions['User_ID'].value_counts()
product_counts = interactions['Product_ID'].value_counts()
active_users = user_counts[user_counts >= 5].index
active_products = product_counts[product_counts >= 5].index

print(f"\n过滤前交互数: {len(interactions)}")
filtered_interactions = interactions[
    interactions['User_ID'].isin(active_users) & 
    interactions['Product_ID'].isin(active_products)
]
print(f"过滤后交互数: {len(filtered_interactions)}")
print(f"保留的用户数: {len(active_users)}")
print(f"保留的产品数: {len(active_products)}")


过滤前交互数: 550068
过滤后交互数: 549320
保留的用户数: 5891
保留的产品数: 3277


In [9]:
# 更新用户和商品特征
user_features = user_features.loc[active_users]
product_features = product_features.loc[active_products]

## 3.5 特征缩放

In [10]:
print("\n特征缩放...")
# 对用户特征进行标准化
user_scaler = StandardScaler()
user_features_scaled = user_scaler.fit_transform(user_features)
user_features_scaled = pd.DataFrame(
    user_features_scaled, 
    index=user_features.index, 
    columns=user_features.columns
)

# 对产品特征进行标准化 (不包括One-Hot编码的类别)
num_cols = ['Avg_Price', 'Price_Std', 'Purchase_Count']
cat_cols = [col for col in product_features.columns if col.startswith('category_')]
product_scaler = StandardScaler()
product_features[num_cols] = product_scaler.fit_transform(product_features[num_cols])


特征缩放...


## 3.6 降维

In [11]:
# 对用户特征进行PCA
pca = PCA(n_components=min(5, len(user_features.columns)))
user_features_pca = pca.fit_transform(user_features_scaled)
print(f"用户特征PCA解释方差比例: {pca.explained_variance_ratio_}")
print(f"总解释方差: {sum(pca.explained_variance_ratio_):.4f}")

# 保存处理后的数据
interactions.to_csv('../datasets/interactions.csv', index=False)
user_features.to_csv('../datasets/user_features.csv')
product_features.to_csv('../datasets/product_features.csv')

用户特征PCA解释方差比例: [0.18955545 0.15916731 0.15433437 0.12470382 0.11657118]
总解释方差: 0.7443
