In [1]:
import numpy as np
# from scipy import spatial
import torch 
import pandas as pd 
# torch.manual_seed(0)

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
path = '/content/drive/MyDrive/python_data/社群網路與推薦系統/hw3/data/Douban_Book/user_book.dat'

In [4]:
files = ['Douban_Book', 'Movielens', 'Yelp']

In [5]:
# with open(f'./data/Douban_Book/user_book.dat') as f:
#     data = [list(map(int, line.strip().split('\t'))) for line in f.readlines()]

# User-Item matrix 

CF 的缺點：

* 如果沒有用戶的歷史數據就沒辦法做任何推薦
* 以及無論 user-based 或 item-based 都需要消耗大量的運算資源
* 大部分用戶有評分紀錄的資料都只佔所有資料中的很小一部分，matrix 相當稀疏，很難找到相似的資料
* 會有馬太效應，越熱門的物品越容易被推薦，所以通常都會降低熱門物品的權重

user-based 考慮的是 user 和 user 之間的相似程度

    給定一個用戶 A
    計算用戶 A 跟其他所有用戶的相似度
    找出最相似的 m 個用戶
    再找出這些用戶有評分但是用戶 A 沒有評分的物品（也可以額外限制至少要幾個用戶有評分過）
    以「相似用戶的相似度」和「該用戶對該物品的評分」來加權算出用戶 A 對這些未評分物品的評分
    最後推薦給 A 評分最高的 n 個物品

預測 user_4 對 item_a 的評分 =
(user_4_user_1_sim x user_1_item_a_rating + user_4_user_3_sim x user_3_item_a_rating) / (user_4_user_1_sim + user_4_user_3_sim)

user-based 的特點：

* 適合 user 遠少於 item 的系統，相似度的計算量會較少
* item 的時效性強、更多樣的系統，例如新聞、社交網站，適合用 user-based CF
* 不容易給出推薦理由
* 驚喜度較高

In [6]:
# matrix = torch.zeros((13024, 22347), dtype= int)
# one_hot_matrix = torch.zeros((13024, 22347), dtype= int)
# for info in data:
#     matrix[info[0]-1, info[1]-1] = info[2]
#     one_hot_matrix[info[0]-1, info[1]-1] = 1

In [7]:
# matrix

## Filtering & Spliting
filtering 部分要改

In [8]:
# filtered_matrix = matrix[torch.sum(one_hot_matrix, dim= 1) >= 3]

In [9]:
# random_index = torch.randperm(len(filtered_matrix))
# train_mat = filtered_matrix[random_index][:0.8*len(filtered_matrix)]
# val_mat = filtered_matrix[random_index][:0.8*len(filtered_matrix)]
# test_mat = filtered_matrix[random_index][:0.8*len(filtered_matrix)]

In [43]:
class CF:
    def __init__(self, path= './data/Douban_Book/user_book.dat'):
        
        
        names = ['user_id', 'item_id', 'rating']
        df = pd.read_csv(path, sep='\t', names=names)
        
        '''filtering'''
        grouped_df = df.groupby(['user_id'])['item_id'].count()
        filtered_user_id = grouped_df[grouped_df>3].index
        filtered_df = df.set_index('user_id').loc[filtered_user_id].reset_index()
        
        '''user_id to index_id'''
        self.user_dict = dict()
        for i, user_id in enumerate(filtered_user_id):
            self.user_dict[user_id] = i 
        
        self.n_user= filtered_df['user_id'].unique().shape[0]
        self.n_item = filtered_df['item_id'].unique().shape[0]
        self.data= np.array(filtered_df)
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
            
    def k_fold_data_split(self):
        
        np.random.shuffle(self.data)
        train_data = self.data[:int(0.7*len(self.data))]
        val_data = self.data[int(0.7*len(self.data)):int(0.8*len(self.data))]
        test_data = self.data[int(0.8*len(self.data)):]
        return train_data, val_data, test_data
    
    def get_user_item_matrix(self):
        
        train_data, _, self.test_data= self.k_fold_data_split()
        matrix = torch.zeros((self.n_user, self.n_item), dtype= float)
        
        for info in train_data:
            matrix[self.user_dict[info[0]], info[1]-1] = info[2]
        
        self.user_item_matrix = matrix 
        return matrix
    
    def get_similarity_matrix(self, method= 'cosine', kind= 'user', epsilon= 1e-9):
        
        user_item_matrix = self.get_user_item_matrix().to(self.device)
        # similarity_matrix = torch.zeros((self.n_user, self.n_user), dtype= float)
        print('ok1')
        if method == 'cosine':
            
            '''slow calc
            for user1 in range(0, self.n_user):
                for user2 in range(user1, self.n_user):
                    similarity = 1 - spatial.distance.cosine(user_item_matrix[user1], user_item_matrix[user2])
                    similarity_matrix[user1, user2] = similarity
                print('ok')
            
            return similarity_matrix '''
            
            '''fast calc'''
            # epsilon -> small number for handling dived-by-zero errors
            if kind == 'user':
                print('ok2')
#                 sim = user_item_matrix.dot(user_item_matrix.T) + epsilon
                sim = torch.mm(user_item_matrix, user_item_matrix.t()) + epsilon
            elif kind == 'item':
                sim = user_item_matrix.T.dot(user_item_matrix) + epsilon
            print('ok3')
            norms = torch.sqrt(torch.diagonal(sim))
            sim_matrix = sim/norms/norms.t()
            adjusted_sim_matrix = sim_matrix - torch.mean(sim_matrix, dim= 1) #減去平均
            return adjusted_sim_matrix
            
    def k_nearest_neighbor(self, sim_matrix, user_id, k= 10):
      # sim_matrix= self.similarity_matrix()
      top_k_neighbor_ids = torch.argsort(sim_matrix[self.user_dict[user_id]], dim= 0, descending=True)[1:k+1] #除了自己
      return top_k_neighbor_ids
    
    def predict(self):
      predicts = []
      ratings = []
      neighbor_dict = dict()
      sim_matrix = self.get_similarity_matrix()
      unique_user_ids = list(set([info[0] for info in self.test_data]))
      for user_id in unique_user_ids:
        neighbor_dict[user_id] = self.k_nearest_neighbor(sim_matrix= sim_matrix, user_id= user_id)

      for (user_id, item_id, rating) in self.test_data:
        # print(neighbor_dict[user_id]) #該user_id對應的k個最近鄰居
        neighbor_ratings = self.user_item_matrix[neighbor_dict[user_id]].t()[item_id] #k個鄰居在item對應的評分
        predict = torch.sum(neighbor_ratings).item()/len(neighbor_ratings)
        predicts.append(predict)
        ratings.append(rating)
      return predicts, ratings

# Main

In [44]:
CF_obj= CF(path= path)

In [24]:
# sim_matrix = CF_obj.similarity_matrix()

In [39]:
CF_obj.user_item_matrix[torch.tensor([ 7362,  2344,  8225,   590,  6198,  5831,  1918,  4371,  3033, 10836],
       device='cuda:0')].t()[1822]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=torch.float64)

In [45]:
CF_obj.predict()

[1;30;43m串流輸出內容已截斷至最後 5000 行。[0m
tensor([ 1967,  9383,  7704,  8020,   613, 10614,  1871,  8464,  5831,   338],
       device='cuda:0')
tensor([ 6974, 10480,  8581,   677,  2349,  1967,  9383,  5672,  4030,  4103],
       device='cuda:0')
tensor([ 9383,  3651,  8436,  2146,  7240, 10480,  9618,  3253,  7565,  8374],
       device='cuda:0')
tensor([1967, 9383, 8741, 6847, 4210, 7266, 2839, 8629,  224, 9459],
       device='cuda:0')
tensor([9383, 6129, 2223, 6993,  550, 8197, 1605, 5053, 2694, 7489],
       device='cuda:0')
tensor([ 9383,  6474, 11076,  5423,  4103,  4113,  4926,  6898,  5397,  4210],
       device='cuda:0')
tensor([ 5357,  1967,  9383,  1302, 11153, 11087,  9034,  7266,  2839,  6897],
       device='cuda:0')
tensor([ 9383, 11255,  5309,  6974,  1154,  4384,  7240,  9351,  3965,  9618],
       device='cuda:0')
tensor([9383, 8501, 4113, 5309, 4633, 6936, 9324, 6993, 8263, 7324],
       device='cuda:0')
tensor([1967, 9383, 1302, 4384, 7240, 6974, 2116, 6970, 2104, 8119],

([0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.2,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.3,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.5,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.4,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.3,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.5,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.4,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.3,
  0.0,