In [1]:
import ast
import random
from copy import deepcopy

import numpy as np
import pandas as pd
import scipy.sparse as sp
import torch.utils.data as data

In [2]:
def load_all(dataset, data_path):
	"""Load the train, validation and test interactions of one dataset.

	Three tab-separated files are read. Each path is built by plain string
	concatenation ``data_path + '<dataset>.<suffix>'``, so ``data_path`` must
	already end with a path separator.

	* ``<dataset>.train.rating``  -- first three columns read as int32 and
	  named ``user``, ``item``, ``noisy``.
	* ``<dataset>.valid.rating``  -- same layout; only user and item are kept.
	* ``<dataset>.test.negative`` -- one (user, item) pair per line. For
	  ``dataset == "adressa"`` the first tab field is a Python tuple literal
	  such as ``(12, 34)``; for every other dataset the first two tab fields
	  are the user and the item. Further fields on the line are ignored and
	  blank lines are skipped.

	Args:
		dataset: Dataset name used as the file-name prefix. ``"adressa"``
			selects fixed user/item counts and the tuple-literal test format.
		data_path: String prefix prepended to every file name.

	Returns:
		A 9-tuple ``(train_data_list, valid_data_list, test_data_pos,
		train_pos, valid_pos, user_num, item_num, train_mat,
		train_data_noisy)`` where

		* ``train_data_list`` / ``valid_data_list`` are lists of
		  ``[user, item]`` pairs in file order,
		* ``test_data_pos``, ``train_pos`` and ``valid_pos`` map each user to
		  the list of its items (in file order) for the respective split,
		* ``user_num`` / ``item_num`` are the matrix dimensions (fixed for
		  ``"adressa"``, otherwise the largest training id plus one),
		* ``train_mat`` is a float32 ``scipy.sparse.dok_matrix`` of shape
		  ``(user_num, item_num)`` holding 1.0 at every training pair,
		* ``train_data_noisy`` is the third training column, row-aligned with
		  ``train_data_list``.

	Raises:
		IndexError: If a test pair lies outside ``(user_num, item_num)``.
		ValueError: If a test line cannot be parsed into integer ids.
	"""
	train_rating = data_path + '{}.train.rating'.format(dataset)
	valid_rating = data_path + '{}.valid.rating'.format(dataset)
	test_negative = data_path + '{}.test.negative'.format(dataset)

	def read_ratings(path):
		"""Read the first three tab-separated columns of ``path`` as int32."""
		return pd.read_csv(
			path,
			sep='\t', header=None, names=['user', 'item', 'noisy'],
			usecols=[0, 1, 2], dtype={0: np.int32, 1: np.int32, 2: np.int32})

	def group_by_user(pairs):
		"""Map each user to the list of its items, preserving input order."""
		grouped = {}
		for user, item in pairs:
			grouped.setdefault(user, []).append(item)
		return grouped

	################# load training data #################
	train_frame = read_ratings(train_rating)

	if dataset == "adressa":
		# Fixed sizes for this dataset, not derived from the training file.
		# NOTE(review): presumably the full-corpus counts -- confirm.
		user_num = 212231
		item_num = 6596
	else:
		user_num = train_frame['user'].max() + 1
		item_num = train_frame['item'].max() + 1
	print("user, item num")
	print(user_num, item_num)

	# load ratings as a dok matrix
	train_mat = sp.dok_matrix((user_num, item_num), dtype=np.float32)
	train_data_list = []
	train_data_noisy = []
	for user, item, noisy in train_frame.values.tolist():
		train_mat[user, item] = 1.0
		train_data_list.append([user, item])
		train_data_noisy.append(noisy)

	################# load validation data #################
	valid_data_list = [
		[user, item]
		for user, item, _ in read_ratings(valid_rating).values.tolist()]

	train_pos = group_by_user(train_data_list)
	valid_pos = group_by_user(valid_data_list)

	################# load testing data #################
	test_data_pos = {}
	with open(test_negative, 'r') as fd:
		for line in fd:
			if not line.strip():
				# A blank (e.g. trailing) line carries no pair; skip it
				# instead of failing on int('\n').
				continue
			arr = line.split('\t')
			if dataset == "adressa":
				# literal_eval only accepts Python literals, so text in the
				# data file can never be executed as code (eval() could).
				pair = ast.literal_eval(arr[0])
				u = pair[0]
				i = pair[1]
			else:
				u = int(arr[0])
				i = int(arr[1])
			# Earlier versions wrote every pair into a throw-away sparse
			# matrix, whose only observable effect was an IndexError for ids
			# outside the matrix. Keep that check (same accepted index range)
			# without paying for the matrix.
			if not (-user_num <= u < user_num and -item_num <= i < item_num):
				raise IndexError(
					'test pair ({}, {}) is out of range for shape ({}, {})'.format(
						u, i, user_num, item_num))
			test_data_pos.setdefault(u, []).append(i)

	return train_data_list, valid_data_list, test_data_pos, train_pos, valid_pos, user_num, item_num, train_mat, train_data_noisy

In [3]:
# Name of the dataset to convert; it is used below to build the data directory
# and, inside load_all, the '<dataset>.train.rating' style file names.
dataset = 'book'

In [4]:
# Directory prefix for all input and output files: './<dataset>/<dataset>/'.
# The trailing slash matters because file names are appended by string concatenation.
data_path = './{}/{}/'.format(dataset, dataset)

In [5]:
# Read every split of the selected dataset in one call. load_all also prints
# the user and item counts it settled on.
(
    train_data,
    valid_data,
    test_data_pos,
    train_pos,
    valid_pos,
    user_num,
    item_num,
    train_mat,
    train_data_noisy,
) = load_all(dataset, data_path)

user, item num
80464 98663


In [6]:
# Write the training positives as one line per user: "<user> <item> <item> ...".
# The with-block closes the file even if a write fails part-way through.
with open(data_path + 'train.txt', 'w') as f:
    for u, items in train_pos.items():
        f.write(" ".join(str(value) for value in [u, *items]) + '\n')

In [7]:
# Write the validation positives as one line per user: "<user> <item> <item> ...".
# The with-block closes the file even if a write fails part-way through.
with open(data_path + 'valid.txt', 'w') as f:
    for u, items in valid_pos.items():
        f.write(" ".join(str(value) for value in [u, *items]) + '\n')

In [8]:
# Write the test positives as one line per user: "<user> <item> <item> ...".
# The with-block closes the file even if a write fails part-way through.
with open(data_path + 'test.txt', 'w') as f:
    for u, items in test_data_pos.items():
        f.write(" ".join(str(value) for value in [u, *items]) + '\n')