In [92]:
import pandas as pd
import operator
import random
import math
import heapq
import pickle 


In [3]:
data = pd.read_csv("./movieLensSmall/ratings.csv")

In [70]:

user_click = {}
for index, row in data.iterrows():
    user_click.setdefault(row.userId, {})
    user_click[row.userId].setdefault(row.movieId, {"rating": row.rating, "click_time": row.timestamp})
    

In [99]:
def cal_item_sim(user_click, similarity_cal_method = 0, alpha = 1):
    """
    Calculate the similarity dictionary across items

    Parameters:
        user_click: dict, {user_1:{click_item_1: {rating: , time: } 
                                   click_item_2: {rating: , time: } , etc.}, 
                           user_2:...}
        
        similarity_cal_method: int, chosen from [0,1,2,3]
            method 0: 
    Returns:
        the sorted similarity matrix across items
        item_sim_score_sorted: dict, {item_1:{item_2: similar score, 
                                              item_3: similar score, ... }
                                      item_2:{item_1: similar score,
                                              item_3: similar score, ...}}

    """
    if similarity_cal_method == 2:
        def sim_score_time_decay(click_time_1, click_time_2, alpha = alpha):
            time_dif = abs(click_time_1 - click_time_2)
            time_dif_day = time_dif / 60 / 60 / 24
            return 1 / (1 + alpha * time_dif_day)
        
    

    co_appear = {}
    item_click_user = {}
    item_user_click_times = {}
    for user,item_list in user_click.items():
        item_list = list(item_list.keys())
        item_click_number = len(item_list)
        for index_i in range(0,len(item_list)):
            item_id_i = item_list[index_i]
            item_click_user.setdefault(item_id_i, [])
            item_click_user[item_id_i].append(user_click[user][item_id_i]['rating'])
            
            item_user_click_times.setdefault(item_id_i, 0)
            item_user_click_times[item_id_i] += 1
            
            for index_j in range(index_i + 1, len(item_list)):
                item_id_j = item_list[index_j]
                co_appear.setdefault(item_id_i, {})
                co_appear.setdefault(item_id_j, {})
                co_appear[item_id_i].setdefault(item_id_j, 0)
                co_appear[item_id_j].setdefault(item_id_i, 0)

                if similarity_cal_method == 0:
                    sim_score = 1
                    
                elif similarity_cal_method == 1:
                    sim_score = 1/math.log(1+item_click_number)
                    
                elif similarity_cal_method == 2:
                    sim_score = sim_score_time_decay(user_click[user][item_id_i]['click_time'],
                                                    user_click[user][item_id_j]['click_time'])
                             
                elif similarity_cal_method == 3:
                    sim_score = user_click[user][item_id_i]['rating'] * user_click[user][item_id_j]['rating']
                
                co_appear[item_id_i][item_id_j] += sim_score
                co_appear[item_id_j][item_id_i] += sim_score
    item_sim_dict = {}
    
    for item_id_i, related_item in co_appear.items():
        for item_id_j, sim_score in related_item.items():
            item_sim_dict.setdefault(item_id_i, {})
            item_sim_dict[item_id_i].setdefault(item_id_j, 0)
            if similarity_cal_method != 3:
                item_sim_dict[item_id_i][item_id_j] = sim_score / math.sqrt(item_user_click_times[item_id_j] * item_user_click_times[item_id_i])
            
            elif similarity_cal_method == 3:
                item_sim_dict[item_id_i][item_id_j] = sim_score / (math.sqrt(sum([i**2 for i in item_click_user[item_id_i]]))*
                                                                 math.sqrt(sum([i**2 for i in item_click_user[item_id_j]])))
    
    return item_sim_dict

def cal_recom(user_click, item_sim_dict, recom_top_k = 5):
    """
    Calculate the similarity dictionary across items

    Parameters:
        user_click: dict, {user_1:{click_item_1: {rating: , click_time: } 
                                   click_item_2: {rating: , click_time: } , etc.}, 
                           user_2:...}
        

    Returns:
        the sorted similarity matrix across items
        item_sim_score_sorted: dict, {item_1:{item_2: similar score, 
                                              item_3: similar score, ... }
                                      item_2:{item_1: similar score,
                                              item_3: similar score, ...}}

    """
    user_recom_dict = {}
    item_list = list(data.movieId.value_counts().index)
    for user_id in user_click:
        recom_list = []
        for item_id in item_list:
            recom_score = 0
            if item_id not in user_click[user_id]:
                for (item_click_id, profile) in user_click[user_id].items():
                    if item_click_id in item_sim_dict[item_id]:
                        recom_score += profile['rating'] * item_sim_dict[item_id][item_click_id]
            recom_list.append([item_id, recom_score])
        recom_list = sorted(recom_list, reverse = True, key = lambda x: x[1])[:recom_top_k]
        user_recom_dict.setdefault(user_id, recom_list)
    return user_recom_dict
                    


In [103]:

user_click = {}
for index, row in data.iterrows():
    user_click.setdefault(row.userId, {})
    user_click[row.userId].setdefault(row.movieId, {"rating": row.rating, "click_time": row.timestamp})
    
item_sim_dict = cal_item_sim(user_click, similarity_cal_method = 3)
recommendation = cal_recom(user_click, item_sim_dict)

In [106]:
with open('recommendation_list_cosine_similarity_with_like_info.pkl', 'wb') as f:
    pickle.dump(recommendation, f)

In [105]:
recommendation

{1.0: [[2918, 352.6840101334495],
  [1036, 330.9498844896122],
  [1968, 328.18050722272784],
  [1527, 326.4373739321719],
  [1200, 323.1176482769818]],
 2.0: [[69122, 43.201512541719325],
  [2959, 41.68955451802748],
  [33794, 40.67491087379894],
  [80463, 40.51056942088245],
  [72998, 39.921421126330465]],
 3.0: [[2985, 18.26745559172653],
  [2641, 16.756006006502442],
  [2529, 16.652272811259074],
  [1129, 16.38869422354456],
  [1214, 16.362928405403192]],
 4.0: [[2918, 221.35822612266855],
  [1089, 215.95983808278518],
  [2987, 215.46262119275522],
  [1208, 213.92211641537804],
  [1394, 213.4245445783274]],
 5.0: [[480, 69.27048861467722],
  [356, 67.14734930003435],
  [500, 64.73674570100495],
  [377, 64.69351083913317],
  [593, 63.563467366244915]],
 6.0: [[648, 251.55533968006742],
  [551, 243.96918799292393],
  [1, 238.09537735336974],
  [442, 231.95902273683885],
  [586, 230.00112157230777]],
 7.0: [[2571, 187.79660626879053],
  [2115, 186.7741509871769],
  [1527, 185.976750335

In [101]:
recommendation

{1.0: [[2918, 222.6048776556851],
  [2011, 215.91201920996104],
  [1391, 215.1708295613149],
  [589, 214.60925402408404],
  [1527, 213.83243655122143]],
 2.0: [[2959, 28.228232871505934],
  [7438, 27.15378111175406],
  [112556, 26.181744224294636],
  [33794, 25.840992724859355],
  [80463, 25.83509391501096]],
 3.0: [[2641, 13.266244263442035],
  [3070, 12.491245761253133],
  [1127, 11.701818931534508],
  [2985, 11.686706840502394],
  [2916, 11.347288188844484]],
 4.0: [[2918, 137.37348334470647],
  [2716, 136.34908185223884],
  [2987, 134.8349013492319],
  [1089, 134.47443921565295],
  [1270, 132.03376435216427]],
 5.0: [[480, 55.408368273786145],
  [356, 54.4415928165091],
  [377, 54.012022386858774],
  [500, 50.98677210817008],
  [593, 50.937599395358255]],
 6.0: [[551, 193.57585910307438],
  [442, 192.34931535067938],
  [648, 192.0891203648311],
  [586, 191.9604483667801],
  [39, 190.3702170786149]],
 7.0: [[2571, 124.47367337844604],
  [1580, 118.08774063940385],
  [2716, 117.60219