In [9]:
import pandas as pd
import numpy as np
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('Matrix Factorization.ipynb'), '..')))
from ReadData.DatePreProcess import read_df, user_df_preprocess, item_df_preprocess

In [10]:
user_src_name = '../tianchi_fresh_comp_train_user_online.csv'
user_df = read_df(user_src_name, type='user')

In [11]:
item_src_name = '../tianchi_fresh_comp_train_item_online.csv'
item_df = read_df(item_src_name, type='item')

In [8]:
user_df.columns

Index(['user_id', 'item_id', 'behavior_type', 'user_geohash', 'item_category',
       'time', 'date', 'hr'],
      dtype='object')

In [12]:
df = pd.merge(user_df, item_df, how='left', on='item_id')

In [13]:
df

Unnamed: 0,user_id,item_id,behavior_type,user_geohash,item_category_x,time,date,hr,item_geohash,item_category_y
0,77404236,306224045,1,,9023,2014-11-24 19,2014-11-24,19,,
1,77404236,321344925,4,,3424,2014-11-25 20,2014-11-25,20,,
2,77404236,393135256,1,,11623,2014-12-06 23,2014-12-06,23,,
3,77404236,277463747,1,,451,2014-12-17 20,2014-12-17,20,,
4,77404236,38563179,1,,8432,2014-12-17 17,2014-12-17,17,,
...,...,...,...,...,...,...,...,...,...,...
105235,78598518,242255272,1,,1370,2014-12-06 13,2014-12-06,13,,
105236,78598518,242255272,1,,1370,2014-12-03 08,2014-12-03,08,,
105237,78598518,386374279,1,,11623,2014-12-16 15,2014-12-16,15,,
105238,78598518,192057635,4,,10507,2014-12-02 16,2014-12-02,16,,


In [12]:
user_number = len(user_df['user_id'].unique())
item_number = len(item_df['item_id'].unique())
print(user_number, item_number)

85 4467252


In [13]:
user_idx_list = user_df['user_id'].unique()
item_idx_list = item_df['item_id'].unique()

In [24]:
print(item_idx_list)

[100014006 100030948 100037683 ... 139591826 139595801 139596319]


In [14]:
embedding_dim = 16
user_matrix = np.random.rand(user_number, embedding_dim)
item_matrix = np.random.rand(embedding_dim, item_number)

In [15]:
user_matrix

array([[0.02514382, 0.13431704, 0.67182426, ..., 0.92629072, 0.6564474 ,
        0.71875196],
       [0.60811422, 0.68630949, 0.48807599, ..., 0.89001905, 0.45992455,
        0.22654268],
       [0.68947257, 0.37215287, 0.64420077, ..., 0.5679925 , 0.14810157,
        0.94552655],
       ...,
       [0.99264747, 0.38511346, 0.26744746, ..., 0.22501835, 0.86577634,
        0.58966431],
       [0.83794696, 0.57718346, 0.07283856, ..., 0.61312887, 0.32766151,
        0.66985173],
       [0.09859396, 0.40738821, 0.9469112 , ..., 0.704581  , 0.61856551,
        0.9278246 ]])

In [27]:
tar_matrix = np.zeros((user_number, item_number))

In [28]:
tar_matrix.shape

(85, 4467252)

In [32]:
tar_df = df.loc[df['behavior_type'] == 4][['user_id', 'item_id']]

In [39]:
for idx, rows in tar_df.iterrows():
    user_id, item_id = rows['user_id'], rows['item_id']
    print(user_id, item_id)

77404236 321344925
77413503 153796248
77413503 153796248
77413503 166644525
77413503 106606746
77429943 175363016
77429943 61903388
77429943 196437023
77429943 101627241
77429943 102722744
77429943 219749679
77429943 102722744
77429943 225622606
77429943 102722744
77429943 35532916
77429943 102722744
77429943 244664564
77429943 41588524
77429943 219749679
77429943 102722744
77429943 61903388
77429943 102722744
77429943 259280897
77429943 29670818
77429943 397745949
77429943 102722744
77429943 219749679
77429943 88297899
77429943 285795783
77429943 270145493
77429943 202481298
77429943 402313702
77429943 211287010
77429943 142754161
77429943 102722744
77429943 205651272
77429943 219749679
77429943 219749679
77429943 37027719
77429943 12960036
77429943 50179885
77429943 33933819
77429943 102722744
77429943 292015436
77429943 72338781
77429943 56543694
77429943 89766248
77429943 300936571
77429943 367074098
77429943 344869794
77429943 261167081
77446473 389242167
77467791 95479652
7746779

# 初始化矩阵

In [65]:
from collections import defaultdict


def init_matrix(user_df, item_df, embedding_dim=64):
    user_idx_list = user_df['user_id'].unique().tolist()
    item_idx_list = item_df['item_id'].unique().tolist()

    user_number, item_number = len(user_idx_list), len(item_idx_list)
    user_matrix = np.random.rand(user_number, embedding_dim)
    item_matrix = np.random.rand(embedding_dim, item_number)

    tar_df = pd.merge(user_df, item_df, how='inner', on='item_id')

    tar_matrix = np.zeros((user_number, item_number), dtype=int)

    repurchase_df = tar_df.loc[tar_df['behavior_type'] >= 2][['user_id', 'item_id']]

    for idx, rows in repurchase_df.iterrows():
        user_id, item_id = rows['user_id'], rows['item_id']
        user_idx = user_idx_list.index(user_id)
        item_idx = item_idx_list.index(item_id)

        tar_matrix[user_idx][item_idx] = 1

    return user_matrix, item_matrix, tar_matrix

In [66]:
user_df = read_df('../tianchi_fresh_comp_train_user_online.csv', type='user')
item_df = read_df('../tianchi_fresh_comp_train_item_online.csv', type='item')
embedding_dim = 16
user_matrix, item_matrix, tar_matrix = init_matrix(user_df, item_df, embedding_dim)

MemoryError: cannot allocate memory for array

In [23]:
tar_matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [25]:
item_matrix.shape

(16, 4467252)

In [26]:
user_matrix.shape

(85, 16)

# 交替最小二乘法

In [19]:
# 超参数
epoches = 30
lr = 0.001
Lambda = 0.01

In [63]:
def loss_func_optimized(user_matrix, item_matrix, tar_matrix):
    # 确保所有输入都是NumPy数组，如果不是，转换为NumPy数组
    user_matrix = np.array(user_matrix)
    item_matrix = np.array(item_matrix)
    tar_matrix = np.array(tar_matrix)
    
    
    # 使用矩阵运算计算损失
    diff = tar_matrix - np.dot(user_matrix, item_matrix.T)  # 注意转置item_matrix
    loss = np.sum(diff**2)
    
    return loss

In [64]:
loss_func_optimized(user_matrix, item_matrix, tar_matrix)

MemoryError: Unable to allocate 1.41 GiB for an array with shape (85, 4467252) and data type int32

In [None]:
from tqdm import tqdm
def matrix_factorization(user_matrix, item_matrix, tar_matrix, epoches, lr, Lambda):
    res = []
    embedding_dim = len(user_matrix[0])
    for epoch in tqdm(range(epoches)):
        for i in range(len(user_matrix)):
            for j in range(len(item_matrix[0])):
                eij = tar_matrix[i, j] - np.dot(user_matrix[i, :], item_matrix[:, j])
                