In [1]:
import numpy as np
from scipy import spatial
import torch 
import pandas as pd 
# torch.manual_seed(0)

In [2]:
files = ['Douban_Book', 'Movielens', 'Yelp']

In [3]:
with open(f'./data/Douban_Book/user_book.dat') as f:
    data = [list(map(int, line.strip().split('\t'))) for line in f.readlines()]

# User-Item matrix 

CF 的缺點：

* 如果沒有用戶的歷史數據就沒辦法做任何推薦
* 以及無論 user-based 或 item-based 都需要消耗大量的運算資源
* 大部分用戶有評分紀錄的資料都只佔所有資料中的很小一部分，matrix 相當稀疏，很難找到相似的資料
* 會有馬太效應，越熱門的物品越容易被推薦，所以通常都會降低熱門物品的權重

user-based 考慮的是 user 和 user 之間的相似程度

    給定一個用戶 A
    計算用戶 A 跟其他所有用戶的相似度
    找出最相似的 m 個用戶
    再找出這些用戶有評分但是用戶 A 沒有評分的物品（也可以額外限制至少要幾個用戶有評分過）
    以「相似用戶的相似度」和「該用戶對該物品的評分」來加權算出用戶 A 對這些未評分物品的評分
    最後推薦給 A 評分最高的 n 個物品

預測 user_4 對 item_a 的評分 =
(user_4_user_1_sim x user_1_item_a_rating + user_4_user_3_sim x user_3_item_a_rating) / (user_4_user_1_sim + user_4_user_3_sim)

user-based 的特點：

* 適合 user 遠少於 item 的系統，相似度的計算量會較少
* item 的時效性強、更多樣的系統，例如新聞、社交網站，適合用 user-based CF
* 不容易給出推薦理由
* 驚喜度較高

In [4]:
# matrix = torch.zeros((13024, 22347), dtype= int)
# one_hot_matrix = torch.zeros((13024, 22347), dtype= int)
# for info in data:
#     matrix[info[0]-1, info[1]-1] = info[2]
#     one_hot_matrix[info[0]-1, info[1]-1] = 1

In [5]:
# matrix

## Filtering & Spliting
filtering 部分要改

In [6]:
# filtered_matrix = matrix[torch.sum(one_hot_matrix, dim= 1) >= 3]

In [7]:
# random_index = torch.randperm(len(filtered_matrix))
# train_mat = filtered_matrix[random_index][:0.8*len(filtered_matrix)]
# val_mat = filtered_matrix[random_index][:0.8*len(filtered_matrix)]
# test_mat = filtered_matrix[random_index][:0.8*len(filtered_matrix)]

In [8]:
class CF:
    def __init__(self, path= './data/Douban_Book/user_book.dat'):
        
        
        names = ['user_id', 'item_id', 'rating']
        df = pd.read_csv(path, sep='\t', names=names)
        
        '''filtering'''
        grouped_df = df.groupby(['user_id'])['item_id'].count()
        filtered_user_id = grouped_df[grouped_df>3].index
        filtered_df = df.set_index('user_id').loc[filtered_user_id].reset_index()
        
        '''user_id to index_id'''
        self.user_dict = dict()
        for i, user_id in enumerate(filtered_user_id):
            self.user_dict[user_id] = i 
        
        self.n_user= filtered_df['user_id'].unique().shape[0]
        self.n_item = filtered_df['item_id'].unique().shape[0]
        self.data= np.array(filtered_df)
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
            
    def k_fold_data_split(self):
        
        np.random.shuffle(self.data)
        train_data = self.data[:int(0.7*len(data))]
        val_data = self.data[int(0.7*len(data)):int(0.8*len(data))]
        test_data = self.data[int(0.8*len(data)):]
        return train_data, val_data, test_data
    
    def user_item_matrix(self):
        
        train_data, val_data, test_data= self.k_fold_data_split()
        matrix = torch.zeros((self.n_user, self.n_item), dtype= int)
        
        for info in train_data:
            matrix[self.user_dict[info[0]], info[1]-1] = info[2]
            
        return matrix
    
    def similarity_matrix(self, method= 'cosine', kind= 'user', epsilon= 1e-9):
        
        user_item_matrix = self.user_item_matrix().to(self.device)
        similarity_matrix = torch.zeros((self.n_user, self.n_user), dtype= float)
        print('ok1')
        if method == 'cosine':
            
            '''slow calc
            for user1 in range(0, self.n_user):
                for user2 in range(user1, self.n_user):
                    similarity = 1 - spatial.distance.cosine(user_item_matrix[user1], user_item_matrix[user2])
                    similarity_matrix[user1, user2] = similarity
                print('ok')
            
            return similarity_matrix '''
            
            '''fast calc'''
            # epsilon -> small number for handling dived-by-zero errors
            if kind == 'user':
                print('ok2')
#                 sim = user_item_matrix.dot(user_item_matrix.T) + epsilon
                sim = torch.mm(user_item_matrix, user_item_matrix.t()) + epsilon
            elif kind == 'item':
                sim = user_item_matrix.T.dot(user_item_matrix) + epsilon
            print('ok3')
            norms = np.array([np.sqrt(np.diagonal(sim))])
            return (sim / norms / norms.T)
            
#     def k_nearest_neighbor(self, user1, k= 10)

In [9]:
simi = CF().similarity_matrix()

RuntimeError: CUDA out of memory. Tried to allocate 1.88 GiB (GPU 0; 2.00 GiB total capacity; 0 bytes already allocated; 1.65 GiB free; 0 bytes reserved in total by PyTorch)

In [None]:
names = ['user_id', 'item_id', 'rating']
df = pd.read_csv('./data/Douban_Book/user_book.dat', sep='\t', names=names)

In [None]:
filtered_df = df.groupby(['user_id'])['item_id'].count()

In [None]:
filtered_user_id = filtered_df[filtered_df>3].index
print(filtered_user_id)

In [None]:
df.set_index('user_id').loc[filtered_user_id].reset_index()