In [18]:
import pandas as pd
import numpy as np
from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler

In [4]:
# Load and preview the data.
# The path is a raw string because it contains Windows backslashes; in a normal
# string literal "\C", "\d" and "\s" are invalid escape sequences.
dataset = pd.read_csv(r"D:\Codes\data\shopping_trends.csv")
dataset.head()

Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,1,55,Male,Blouse,Clothing,53,Kentucky,L,Gray,Winter,3.1,Yes,Express,Yes,Yes,14,Venmo,Fortnightly
1,2,19,Male,Sweater,Clothing,64,Maine,L,Maroon,Winter,3.1,Yes,Express,Yes,Yes,2,Cash,Fortnightly
2,3,50,Male,Jeans,Clothing,73,Massachusetts,S,Maroon,Spring,3.1,Yes,Free Shipping,Yes,Yes,23,Credit Card,Weekly
3,4,21,Male,Sandals,Footwear,90,Rhode Island,M,Maroon,Spring,3.5,Yes,Next Day Air,Yes,Yes,49,PayPal,Weekly
4,5,45,Male,Blouse,Clothing,49,Oregon,M,Turquoise,Spring,2.7,Yes,Free Shipping,Yes,Yes,31,PayPal,Annually


In [8]:
# Data cleaning
# Missing values: count the null entries in each column.
dataset.isnull().sum()

Customer ID               0
Age                       0
Gender                    0
Item Purchased            0
Category                  0
Purchase Amount (USD)     0
Location                  0
Size                      0
Color                     0
Season                    0
Review Rating             0
Subscription Status       0
Shipping Type             0
Discount Applied          0
Promo Code Used           0
Previous Purchases        0
Payment Method            0
Frequency of Purchases    0
dtype: int64

In [9]:
# Duplicates: count rows that repeat an earlier row in full.
dataset.duplicated(keep='first').sum()

0

In [11]:
# Useless columns: inspect the number of distinct (non-null) values per column.
dataset.nunique(axis=0, dropna=True)

Customer ID               3900
Age                         53
Gender                       2
Item Purchased              25
Category                     4
Purchase Amount (USD)       81
Location                    50
Size                         4
Color                       25
Season                       4
Review Rating               26
Subscription Status          2
Shipping Type                6
Discount Applied             2
Promo Code Used              2
Previous Purchases          50
Payment Method               6
Frequency of Purchases       7
dtype: int64

In [12]:
# Data type conversion
# Print each column's non-null count and dtype to see which columns need casting.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3900 entries, 0 to 3899
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Customer ID             3900 non-null   int64  
 1   Age                     3900 non-null   int64  
 2   Gender                  3900 non-null   object 
 3   Item Purchased          3900 non-null   object 
 4   Category                3900 non-null   object 
 5   Purchase Amount (USD)   3900 non-null   int64  
 6   Location                3900 non-null   object 
 7   Size                    3900 non-null   object 
 8   Color                   3900 non-null   object 
 9   Season                  3900 non-null   object 
 10  Review Rating           3900 non-null   float64
 11  Subscription Status     3900 non-null   object 
 12  Shipping Type           3900 non-null   object 
 13  Discount Applied        3900 non-null   object 
 14  Promo Code Used         3900 non-null   

In [15]:
# Store the customer ID as a string instead of an integer, then re-check dtypes.
customer_id_col = 'Customer ID'
dataset[customer_id_col] = dataset[customer_id_col].astype(str)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3900 entries, 0 to 3899
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Customer ID             3900 non-null   object 
 1   Age                     3900 non-null   int64  
 2   Gender                  3900 non-null   object 
 3   Item Purchased          3900 non-null   object 
 4   Category                3900 non-null   object 
 5   Purchase Amount (USD)   3900 non-null   int64  
 6   Location                3900 non-null   object 
 7   Size                    3900 non-null   object 
 8   Color                   3900 non-null   object 
 9   Season                  3900 non-null   object 
 10  Review Rating           3900 non-null   float64
 11  Subscription Status     3900 non-null   object 
 12  Shipping Type           3900 non-null   object 
 13  Discount Applied        3900 non-null   object 
 14  Promo Code Used         3900 non-null   

In [22]:
# Data preprocessing
# Encode 'Frequency of Purchases' numerically. The values read as purchases per
# year (Weekly -> 52, Monthly -> 12, Annually -> 1); synonymous labels share a value.
frequency_mapping = {
    'Weekly': 52,
    'Bi-Weekly': 26,
    'Monthly': 12,
    'Quarterly': 4,
    'Every 3 Months': 4,
    'Annually': 1,
    'Fortnightly': 26
}

# Work on a copy so the cleaned `dataset` keeps its original labels.
dataset_new = dataset.copy()
frequency_labels = dataset_new['Frequency of Purchases']

# Series.map turns any label missing from the dict into NaN without warning,
# so fail loudly here instead of letting NaN leak into later steps.
unmapped_labels = set(frequency_labels.dropna().unique()) - set(frequency_mapping)
if unmapped_labels:
    raise ValueError(
        "Unmapped 'Frequency of Purchases' labels: "
        f"{sorted(map(str, unmapped_labels))}"
    )

dataset_new['Frequency of Purchases'] = frequency_labels.map(frequency_mapping)
dataset_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3900 entries, 0 to 3899
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Customer ID             3900 non-null   object 
 1   Age                     3900 non-null   int64  
 2   Gender                  3900 non-null   object 
 3   Item Purchased          3900 non-null   object 
 4   Category                3900 non-null   object 
 5   Purchase Amount (USD)   3900 non-null   int64  
 6   Location                3900 non-null   object 
 7   Size                    3900 non-null   object 
 8   Color                   3900 non-null   object 
 9   Season                  3900 non-null   object 
 10  Review Rating           3900 non-null   float64
 11  Subscription Status     3900 non-null   object 
 12  Shipping Type           3900 non-null   object 
 13  Discount Applied        3900 non-null   object 
 14  Promo Code Used         3900 non-null   

In [24]:
# Min-max normalise 'Previous Purchases' and 'Frequency of Purchases'.
scaled_columns = ['Previous Purchases', 'Frequency of Purchases']
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(dataset_new[scaled_columns])
dataset_new[scaled_columns] = scaled_values
# Summary statistics of the two rescaled columns.
dataset_new[scaled_columns].describe()

Unnamed: 0,Previous Purchases,Frequency of Purchases
count,3900.0,3900.0
mean,0.49697,0.322966
std,0.294839,0.329601
min,0.0,0.0
25%,0.244898,0.058824
50%,0.489796,0.215686
75%,0.755102,0.490196
max,1.0,1.0


In [123]:
# Combine the user profile with a truncated SVD of the rating matrix to build
# a table of predicted ratings (one row per customer, one column per item).
N_LATENT_FACTORS = 10  # number of singular values/vectors requested from svds

# Raw customer x item rating table; customer/item pairs with no row are filled with 0.
# NOTE(review): the 0 fill makes missing pairs count as zero ratings in the
# per-user mean below — confirm this is intended.
rating_matrix = dataset_new.pivot(index='Customer ID', columns='Item Purchased', values='Review Rating').fillna(0)

# Per-user mean across all item columns (0-filled entries included).
# This has to be computed before the user profile, which stores it as a column.
user_rating_mean = rating_matrix.mean(axis=1)

# User profile: per-customer mean of the two scaled behaviour columns plus the
# average rating. Rows are aligned explicitly to rating_matrix so they line up
# with the rows of U when multiplied positionally further down.
user_profile = (
    dataset_new.groupby('Customer ID')[['Previous Purchases', 'Frequency of Purchases']]
    .mean()
    .reindex(rating_matrix.index)
)
user_profile['Average Rating'] = user_rating_mean

# Centre each user's row by subtracting that user's mean rating.
rating_matrix_normalized = rating_matrix - user_rating_mean.values.reshape(-1, 1)

# Truncated SVD; svds returns the singular values as a 1-D array, so expand
# them into a diagonal matrix for the matrix products below.
U, sigma, Vt = svds(rating_matrix_normalized.values, k=N_LATENT_FACTORS)
sigma = np.diag(sigma)

# Weight each user's latent factors by the profile columns:
# (1 + Previous Purchases) * (1 + Frequency of Purchases) * Average Rating.
user_factors = np.dot(U, sigma)
user_profile_arr = user_profile.values
purchase_weight = 1 + user_profile_arr[:, [0]]   # column 0: Previous Purchases
frequency_weight = 1 + user_profile_arr[:, [1]]  # column 1: Frequency of Purchases
average_rating = user_profile_arr[:, [2]]        # column 2: Average Rating
user_factors = user_factors * purchase_weight * frequency_weight * average_rating

# Project back to item space and add each user's mean rating back.
predicted_ratings = np.dot(user_factors, Vt) + user_rating_mean.values.reshape(-1, 1)
predicted_ratings_df = pd.DataFrame(predicted_ratings, columns=rating_matrix.columns, index=rating_matrix.index)

predicted_ratings_df.head()

Item Purchased,Backpack,Belt,Blouse,Boots,Coat,Dress,Gloves,Handbag,Hat,Hoodie,...,Scarf,Shirt,Shoes,Shorts,Skirt,Sneakers,Socks,Sunglasses,Sweater,T-shirt
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.116403,0.082796,0.810733,0.11259,0.065368,0.08656,0.113344,0.102349,0.08817,0.112646,...,0.107105,0.073238,0.11222,0.105068,0.077251,0.115379,0.070614,0.078943,0.086405,0.112409
10,0.197705,0.1716,0.161067,0.200297,0.240873,0.159317,0.19979,0.2072,0.217806,0.20026,...,0.203968,0.231891,0.200546,0.205342,0.195804,0.198409,0.234766,0.187936,0.159732,0.200419
100,0.150957,0.14113,0.132986,0.152264,0.170629,0.131264,0.15201,0.155645,0.16059,0.152245,...,0.154079,0.166801,0.152388,0.154748,0.153756,0.151314,0.168035,0.150171,0.131687,0.152325
1000,0.124334,0.064457,0.072354,0.128448,0.220242,0.073914,0.127626,0.140472,0.162151,0.128387,...,0.134644,0.196283,0.128853,0.137077,0.051339,0.12543,0.20381,0.054882,0.073533,0.128646
1001,0.16875,0.077139,0.089222,0.175045,0.31549,0.091608,0.173788,0.193442,0.226611,0.174952,...,0.184525,0.278834,0.175666,0.188249,0.057069,0.170428,0.290351,0.062489,0.091026,0.175349
