In [1]:
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
from torch.autograd import Variable
import random
import torch.optim as optim
import pickle
import torch.utils.data
from torch.backends import cudnn
from scipy.sparse import csr_matrix
import math
import bottleneck as bn

In [2]:
# Load the raw interaction file and expand it into (user, item) pairs.
# Each row of the file is treated as one user (the row index is the user id)
# and every whitespace-separated token on the row is treated as an item id.
# NOTE(review): if the file stores a leading per-line count, that count is
# parsed as an item id too -- confirm the file format.
users_path = 'cul/users.dat'  # renamed from `dir`, which shadowed the builtin
rating_raw = pd.read_csv(users_path, engine='python', header=None)
rating_raw['user'] = rating_raw.index
rating_raw.columns = ['i', 'u']

# Single pass over the rows: collect the pairs and track the largest ids seen.
# `num_users` / `num_items` hold the maximum user / item id, not counts.
pairs = []
num_users = 0
num_items = 0
for user, line in zip(rating_raw['u'], rating_raw['i']):
    user = int(user)
    item_ids = [int(token) for token in line.split()]
    pairs.extend((user, item) for item in item_ids)
    if item_ids:
        num_items = max(num_items, max(item_ids))
    num_users = max(num_users, user)

# Total number of interactions.
count = len(pairs)

print(count)
print(num_users)
print(num_items)

pairs = np.array(pairs)
rating_bi = pd.DataFrame(pairs, columns=['u', 'i'])

142807
7946
25974


In [None]:
# Drop users that have `inter` or fewer interactions (a user is kept only when
# its number of rows is strictly greater than `inter`).
inter = 0
rating_bi = rating_bi.groupby('u').filter(lambda x: len(x) > inter)
rating_bi = rating_bi[rating_bi.columns[0:2]]
rating_bi = rating_bi.reset_index().drop(columns = ['index'])

# Remove "ghost" users/items: remap the surviving ids onto dense 0..n-1 ranges,
# preserving the ascending order of the original ids.
u = sorted(rating_bi.u.unique().tolist())
i = sorted(rating_bi.i.unique().tolist())

# Lookup tables old_id -> new_id. Each sorted unique id maps to its rank, which
# is what `list.index` returned before, without the quadratic scan.
idxi = np.zeros(max(i)+1).astype('int')
idxi[i] = np.arange(len(i))
idxu = np.zeros(max(u)+1).astype('int')
idxu[u] = np.arange(len(u))

# Apply both remappings with array indexing instead of a per-element lambda.
rating_bi['u'] = idxu[rating_bi['u'].to_numpy()]
rating_bi['i'] = idxi[rating_bi['i'].to_numpy()]

In [None]:
# Leave-last-two split: for every user, the last item in row order becomes the
# test item and the second-to-last becomes the validation item.
# The scan relies on all rows of one user being contiguous in `pairs`.
pairs = rating_bi.to_numpy()

u_val = {}
u_test = {}

u = -1   # user currently being scanned (-1 = none yet)
ii = -1  # second-to-last item seen for the current user
i = -1   # last item seen for the current user
for row in pairs:
    if row[0] != u:
        if u != -1:
            u_val[u] = ii
            u_test[u] = i

        u = row[0]
        # Reset at every user boundary; otherwise a user with a single
        # interaction would inherit the previous user's last item as its
        # validation item. Such users now get -1 ("no validation item").
        i = -1
    ii = i
    i = row[1]
# Flush the final user (skipped when `pairs` is empty, so no bogus user -1).
if u != -1:
    u_val[u] = ii
    u_test[u] = i

users = list(u_val.keys())
# (user, item) index pairs, stored as integers since they are ids.
pairs_val = np.array(
    [[user, u_val[user]] for user in users], dtype=np.int64
).reshape(-1, 2)
pairs_test = np.array(
    [[user, u_test[user]] for user in users], dtype=np.int64
).reshape(-1, 2)

In [None]:
# Row indices of `pairs` whose item equals the user's held-out validation or
# test item; these rows are removed to form the training pairs.
del_list = [
    idx for idx, (user, item) in enumerate(pairs)
    if item == u_val[user] or item == u_test[user]
]

pairs_train = np.delete(pairs, del_list, 0)
# Shuffles rows in place; no seed is set here, so the order differs per run.
np.random.shuffle(pairs_train)

num_users = pairs[:, 0].max() + 1
num_items = pairs[:, 1].max() + 1

# Binary user x item interaction matrix.
# NOTE(review): this and `train_nei` below are built from ALL `pairs`
# (including the held-out validation/test items), not from `pairs_train` --
# confirm that is intended.
train_matrix = np.zeros((int(num_users), int(num_items)), dtype=int)
train_matrix[pairs[:, 0].astype(int), pairs[:, 1].astype(int)] = 1

# user -> list of that user's items, grouped by scanning contiguous user rows.
train_nei = {}
u = -1
nei = []
for row in pairs:
    if row[0] != u:
        if u != -1:
            train_nei[u] = nei
        u = row[0]
        nei = []
    nei.append(row[1])
# Flush the final user; without this the last user never gets an entry.
if u != -1:
    train_nei[u] = nei

In [None]:
# Persist the split and training structures; file names carry the `inter`
# threshold used during filtering.
tag = str(inter)
for prefix, array in (
    ('cul/train_', pairs_train),
    ('cul/val_', pairs_val),
    ('cul/test_', pairs_test),
    ('cul/train_matrix_', train_matrix),
):
    torch.save(torch.LongTensor(array), prefix + tag + '.pt')

# `train_nei` is a dict, so np.save stores it as a pickled object array
# (np.save appends the '.npy' extension).
np.save('cul/train_nei_' + tag, train_nei)

In [None]:
# Reload the preprocessed artifacts for a given `inter` threshold.
# NOTE(review): the preprocessing cell above sets inter = 0, while this cell
# loads the inter = 5 files; on a fresh "Run All" those files exist only if
# the preprocessing was previously run with inter = 5 -- confirm.
inter = 5
train = torch.load('cul/train_' + str(inter) + '.pt')
val = torch.load('cul/val_' + str(inter) + '.pt')
test = torch.load('cul/test_' + str(inter) + '.pt')

train_matrix = torch.load('cul/train_matrix_' + str(inter) + '.pt')
# The neighbour dict was saved as a pickled 0-d object array; newer NumPy
# releases refuse to load it unless allow_pickle=True is passed explicitly.
# Unpickling executes arbitrary code, so only load files produced locally.
train_nei = np.load('cul/train_nei_' + str(inter) + '.npy', allow_pickle=True).item()

num_users = train_matrix.size()[0]
num_items = train_matrix.size()[1]

print(num_users)
print(num_items)
# Train rows plus one validation and one test row per user.
print(train.size()[0]+val.size()[0]*2)

In [None]:
# Build the per-user negative-sampling pool: for every user, the column
# indices where the interaction matrix is 0, right-padded with -1 so that all
# rows share the same width `neg_max`.
matrix = train_matrix.numpy()
row_sums = np.sum(matrix, axis = 1)
# Width of the pool: the zero count of the row with the smallest sum
# (assumes `matrix` holds only 0/1 entries).
neg_max = num_items - row_sums.min()
# NOTE(review): this is neg_max minus the row sum, which is neither the number
# of zeros in the row (num_items - row sum) nor the padding length -- confirm
# what the consumer of `neg_count` expects.
neg_count = neg_max - row_sums

# One row per user, filled directly. The previous scan over np.where output
# detected users by a change in row index, so a user without any zero entry
# was skipped and every later row shifted up by one; it also ran a Python-level
# loop over every zero cell of the matrix.
negs_np = np.full((matrix.shape[0], int(neg_max)), -1, dtype=int)
for user, row in enumerate(matrix):
    zero_cols = np.flatnonzero(row == 0)
    negs_np[user, :zero_cols.size] = zero_cols

In [None]:
# Persist the negative-sampling pool and the per-user counts; file names carry
# the `inter` threshold (np.save appends the '.npy' extension).
for prefix, array in (('cul/negs_np_', negs_np), ('cul/neg_count_', neg_count)):
    np.save(prefix + str(inter), array)