In [35]:
def load_items(file_path):
    """Load the item attribute table from a pipe-delimited text file.

    Every non-blank line must have the form ``item_id|attribute_1|attribute_2``.
    An attribute whose text is exactly ``'None'`` is stored as Python ``None``;
    any other attribute is kept as the raw string read from the file.

    Args:
        file_path: Path of the attribute file to read.

    Returns:
        dict mapping ``int`` item id -> ``(attribute_1, attribute_2)``.

    Raises:
        ValueError: If a non-blank line does not contain exactly three
            ``|``-separated fields or its item id is not an integer.
    """
    items = {}
    with open(file_path, 'r') as f:
        # Stream the file instead of materialising every line first.
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) carry no record.
                continue
            item_id, attribute_1, attribute_2 = line.split('|')
            items[int(item_id)] = (
                None if attribute_1 == 'None' else attribute_1,
                None if attribute_2 == 'None' else attribute_2,
            )

    return items

# Item attribute table: int item_id -> (attribute_1, attribute_2), read from the data directory.
items = load_items('./data/itemAttribute.txt')

In [None]:
from tqdm import tqdm
import numpy as np  

In [1]:
# read rating data
#
# The training file alternates a user header line "user_id|num_ratings" with
# that user's rating lines "item_id score". This cell builds:
#   data[user_id]  -> {'num_ratings', 'ratings', 'sum', 'norm'} where 'ratings'
#                     ends up mean-centered and 'norm' is its L2 norm
#   item_rating    -> per-item {'num': rating count, 'sum': raw score total}
#   all_avg        -> mean of all raw scores
#   train_lines    -> number of rating lines read
data_path = './data/train.txt'
train_lines = 0
all_avg = 0
item_rating = {}
data = {}


def _center_user_ratings(user):
    """Mean-center one user's ratings in place and store their L2 norm.

    The mean uses the count declared in the user's header line
    ('num_ratings'), exactly as the rest of the notebook does.
    """
    avg = user['sum'] / user['num_ratings']
    user['ratings'].update({k: v - avg for k, v in user['ratings'].items()})
    user['norm'] = sum(x**2 for x in user['ratings'].values())**0.5


with open(data_path, 'r') as f:
    user_id = None
    # Stream the file line by line instead of holding it all in memory.
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines are neither headers nor ratings; skip them so they
            # do not crash the int parsing or inflate the rating count.
            continue
        if '|' in line:  # user line
            if user_id is not None:
                # A new header means the previous user's block is complete.
                _center_user_ratings(data[user_id])

            user_id, num_ratings = line.split('|')
            user_id = int(user_id)
            data[user_id] = {
                'num_ratings': int(num_ratings),
                'ratings': {},
                'sum': 0,
            }
        else:  # rating line
            item_id, score = map(int, line.split())
            data[user_id]['ratings'][item_id] = score
            data[user_id]['sum'] += score
            all_avg += score
            train_lines += 1
            if item_id not in item_rating:
                item_rating[item_id] = {'num': 0, 'sum': 0}
            item_rating[item_id]['num'] += 1
            item_rating[item_id]['sum'] += score

    # The last user has no following header line, so finalise it here.
    if user_id is not None:
        _center_user_ratings(data[user_id])

# all_avg held the running total of scores; turn it into the global mean.
if train_lines:
    all_avg /= train_lines

In [15]:
# calculate similarity between users
# (the original one-directional version took about 1 hour 20 mins)
#
# similarity[u][v] is the cosine similarity of the mean-centered rating
# vectors of users u and v (0 when either vector has zero norm).
#
# Each pair is computed once (i < j) but stored in BOTH directions, so that
# similarity[u] lists every other user as a candidate neighbour. Storing only
# the i < j half meant a user could never pick neighbours that appear before
# it in `data`, and the last user had no real neighbours at all.
# NOTE: this roughly doubles the number of stored entries compared with the
# one-directional layout; make sure the machine has memory for it.
similarity = {userid: {} for userid in data}
user_entries = list(data.items())
n_users = len(user_entries)
for i, (userid1, user1_data) in tqdm(enumerate(user_entries), total=n_users):
    ratings1 = user1_data['ratings']
    norm1 = user1_data['norm']
    row1 = similarity[userid1]
    for j in range(i + 1, n_users):
        userid2, user2_data = user_entries[j]
        norm2 = user2_data['norm']
        if norm1 == 0 or norm2 == 0:
            # A zero-norm vector has no direction; define the similarity as 0.
            cos_sim = 0
        else:
            ratings2 = user2_data['ratings']
            cos_sim = 0.0
            # Dot product over the items both users rated.
            for item, rating in ratings1.items():
                if item in ratings2:
                    cos_sim += rating * ratings2[item]
            cos_sim = cos_sim / (norm1 * norm2)
        row1[userid2] = cos_sim
        similarity[userid2][userid1] = cos_sim

100%|██████████| 19835/19835 [1:23:37<00:00,  3.95it/s]  


In [7]:
# save similarity matrix
# `similarity` is a nested dict, not an ndarray: np.save wraps it in a 0-d
# object array and pickles it, which is why the loading cell needs
# allow_pickle=True and .item() to get the dict back.
np.save('similarity.npy', similarity)

In [3]:
# load similarity matrix
# The file holds a pickled dict inside a 0-d object array; .item() unwraps it.
# NOTE(review): allow_pickle=True unpickles arbitrary objects, so only load a
# 'similarity.npy' that was produced by this notebook / a trusted source.
similarity = np.load('similarity.npy', allow_pickle=True).item()

In [4]:
# Inspect the largest similarity value stored for user id 0.
max(similarity[0].values())

0.27493

In [9]:
#read test data
# test_input maps the user id (kept as the string read from the file) to
# {'num_items': declared item count, 'items': [item id strings]}.
test_input = {}
with open('./data/test.txt', 'r') as f:
    for line in f:
        line = line.strip()  # drop the trailing newline
        if not line:
            # Skip blank lines; otherwise '' would be recorded as an item id
            # and crash the later int() conversion.
            continue
        if '|' in line:
            # Start of a new user: "userid|num_items"
            userid, num_items = line.split('|')
            test_input[userid] = {'num_items': int(num_items), 'items': []}
        else:
            # An item id belonging to the most recent user header
            test_input[userid]['items'].append(line)

In [6]:
import heapq

def find_top_n_keys(dictionary, n):
    """Return the keys of the ``n`` largest values in ``dictionary``.

    Keys are ordered from largest value to smallest. If the dictionary has
    fewer than ``n`` entries, all of its keys are returned.
    """
    # Rank the keys directly, using each key's value as the sort key.
    return heapq.nlargest(n, dictionary, key=dictionary.get)

In [14]:
# Predict test ratings with neighbourhood CF on top of a baseline estimate.
# For each (user, item): take the user's 500 most similar users, keep those
# who rated the item, average their deviation from their own baseline
# (weighted by similarity), and add the target user's baseline.
test_output = {}

for user_id, user_data in (tqdm(test_input.items())):
    user_id = int(user_id)  # test_input keys are strings; data/similarity use ints
    test_output[user_id] = {}
    user_sims = similarity[user_id]
    N_neighbor = find_top_n_keys(user_sims, 500)
    for item_id in user_data['items']:
        item_id = int(item_id)
        # Named `prediction` rather than `eval` so the builtin eval() is not
        # shadowed for the rest of the kernel session.
        prediction = 0
        sim_sum = 0
        for neighbor in N_neighbor:
            neighbor_data = data[neighbor]
            if item_id in neighbor_data['ratings']:
                sim = user_sims[neighbor]
                neighbor_mean = neighbor_data['sum']/neighbor_data['num_ratings']
                sim_sum += sim
                baseline = neighbor_mean + item_rating[item_id]['sum']/item_rating[item_id]['num'] - all_avg
                # Stored ratings are mean-centered, so adding the neighbour's
                # mean restores the raw score before subtracting the baseline.
                prediction += sim * (neighbor_data['ratings'][item_id] + neighbor_mean - baseline)
        if sim_sum != 0:
            prediction /= sim_sum
        user_mean = data[user_id]['sum']/data[user_id]['num_ratings']
        if item_id in item_rating:
            baseline = user_mean + item_rating[item_id]['sum']/item_rating[item_id]['num'] - all_avg
        else:
            # Item never seen in training: fall back to the user's mean alone.
            baseline = user_mean
        prediction += baseline
        # Only the upper bound is clamped; int() then truncates toward zero.
        if prediction > 100:
            prediction = 100
        test_output[user_id][item_id] = int(prediction)
            

100%|██████████| 19835/19835 [02:04<00:00, 159.60it/s]


In [15]:
# Write the baseline predictions: a "userid|count" header per user followed by
# one "item  rating" line per predicted item.
result_path = 'result_baseline.txt'
with open(result_path, 'w') as f:
    for userid, item_list in test_output.items():
        # Use the real number of predictions instead of a hard-coded 6 so the
        # header always matches the lines that follow.
        f.write(f"{userid}|{len(item_list)}\n")
        for item, rating in item_list.items():
            f.write(f"{item}  {rating}\n")

In [16]:
# Per-user bias: the user's average score (sum / declared rating count)
# minus the global average all_avg.
bias = {}
for userid, user_stats in data.items():
    bias[userid] = user_stats['sum']/user_stats['num_ratings'] - all_avg

In [20]:
# Scratch cell: builds {neighbour id: similarity} for the 20 users most similar
# to user 0, then immediately discards it by rebinding `dic` to None.
dic = {k:similarity[0][k] for k in find_top_n_keys(similarity[0], 20)}
dic = None

In [17]:
# Experimental / unfinished: start of a learned-neighbour-weight scheme.
# For every user, weight[userid] is initialised with the similarities of the
# user's 100 most similar users; a per-neighbour `grad` is then accumulated
# over co-rated items, but it is never applied back to `weight`.
weight = {}
lr = 0.2  # NOTE(review): assigned but never read in this cell
for userid in data.keys():
    weight[userid] = {k:similarity[userid][k] for k in find_top_n_keys(similarity[userid], 100)}
    for neighbor in weight[userid].keys():
        grad = 0
        for itemid in data[userid]['ratings'].keys():
            if itemid in data[neighbor]['ratings']:
                # NOTE(review): the item_rating lookup below uses `item_id`, which
                # is not assigned in this cell (it is whatever an earlier cell
                # left behind), not the loop variable `itemid` - probably a typo.
                grad += weight[userid][neighbor] * (data[neighbor]['ratings'][itemid] + \
                                                      data[neighbor]['sum']/data[neighbor]['num_ratings'] - \
                                                      bias[neighbor] - item_rating[item_id]['sum']/item_rating[item_id]['num'])
        if grad == 0: # no co-rated items
            continue
        # NOTE(review): same stale `item_id` as above; `grad` is not used afterwards.
        grad += bias[userid] + item_rating[item_id]['sum']/item_rating[item_id]['num']
        

    # for itemid in data[userid]['ratings'].keys():
    #     eval = 0
    #     for neighbor in weight[userid].keys():
    #         if itemid in data[neighbor]['ratings']:
    #             eval += weight[userid][neighbor] * (data[neighbor]['ratings'][itemid] + \
    #                                                  data[neighbor]['sum']/data[neighbor]['num_ratings'] - \
    #                                                  bias[neighbor] - item_rating[item_id]['sum']/item_rating[item_id]['num'])
        
    # NOTE(review): runs once per user, with `itemid` left over from the last
    # inner-loop iteration, and stores an item-keyed value into the dict that is
    # otherwise keyed by neighbour user id - confirm this is intended.
    weight[userid][itemid] = data[userid]['ratings'][itemid] + bias[userid]

19835

In [14]:
#the ordinary version
# Plain user-based CF without the baseline term: the prediction is the
# similarity-weighted average of the raw scores that the user's 500 most
# similar users gave to the item (0 when none of them rated it).
from tqdm import tqdm
test_output = {}

for user_id, user_data in (tqdm(test_input.items())):
    user_id = int(user_id)  # test_input keys are strings; data/similarity use ints
    test_output[user_id] = {}
    user_sims = similarity[user_id]
    N_neighbor = find_top_n_keys(user_sims, 500)
    for item_id in user_data['items']:
        item_id = int(item_id)
        # Named `prediction` rather than `eval` so the builtin eval() is not
        # shadowed for the rest of the kernel session.
        prediction = 0
        sim_sum = 0
        for neighbor in N_neighbor:
            neighbor_data = data[neighbor]
            if item_id in neighbor_data['ratings']:
                sim = user_sims[neighbor]
                sim_sum += sim
                # Stored ratings are mean-centered; adding the neighbour's mean
                # restores the raw score.
                prediction += sim * (neighbor_data['ratings'][item_id] + neighbor_data['sum']/neighbor_data['num_ratings'])
        if sim_sum != 0:
            prediction /= sim_sum
        test_output[user_id][item_id] = int(prediction)


print('evaluate done')

# Write the plain-CF predictions: a "userid|count" header per user followed by
# one "item  rating" line per predicted item.
result_path = 'result_normal.txt'
with open(result_path, 'w') as f:
    for userid, item_list in test_output.items():
        # Use the real number of predictions instead of a hard-coded 6 so the
        # header always matches the lines that follow.
        f.write(f"{userid}|{len(item_list)}\n")
        for item, rating in item_list.items():
            f.write(f"{item}  {rating}\n")

print('write done')

100%|██████████| 19835/19835 [02:07<00:00, 155.97it/s]


evaluate done
write done


In [44]:
# Dump the similarity rows of the first 11 users (enumerate index 0 through 10)
# to a text file: a "userid: " header, then one "other_user similarity" line
# per stored entry.
with open('sim_top_10.txt', 'w') as f:
    for i, (userid1, user1_data) in enumerate(similarity.items()):
        if i > 10:
            break
        f.write(f"{userid1}: \n ")
        for userid2, cos_sim in user1_data.items():
            f.write(f"{userid2} {cos_sim}\n")