In [133]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import pickle

In [41]:
info = pd.read_csv('../data/raw/ml-100k/u.info', sep=' ', names=['num', 'name'])
info.head()
info = info['num'].values.tolist()
NUM_USERS = info[0]
NUM_ITEMS = info[1]
NUM_USERS, NUM_ITEMS

(943, 1682)

In [77]:
df = pd.read_csv('../data/raw/ml-100k/u.data', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
df = df.loc[df['rating'] > 3].drop('timestamp', axis=1)
df['user_id'] = df['user_id'] - 1
df['item_id'] = NUM_USERS + df['item_id'] - 1
print(len(df))
df.head()

55375


Unnamed: 0,user_id,item_id,rating
5,297,1416,4
7,252,1407,5
11,285,1956,5
12,199,1164,5
16,121,1329,5


In [71]:
genres = pd.read_csv('../data/raw/ml-100k/u.genre', sep='|', names=['genre', 'id'])
print(len(genres))
genres = genres['genre'].values.tolist()
genres

19


['unknown',
 'Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [78]:
items = pd.read_csv('../data/raw/ml-100k/u.item', sep='|', encoding="ISO-8859-1", names=['movie_id', 'movie title', 'release date', 'video release date', 'IMDB URL'] + genres)
items['movie_id'] = items['movie_id'] + NUM_USERS
print(len(items))
items.head()

1682


Unnamed: 0,movie_id,movie title,release date,video release date,IMDB URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,944,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,945,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,946,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,947,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,948,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [30]:
users = pd.read_csv('../data/raw/ml-100k/u.user', sep='|', names=['user id', 'age', 'gender', 'occupation', 'zip code'])
print(len(users))
users.head()

943


Unnamed: 0,user id,age,gender,occupation,zip code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [32]:
occupations = pd.read_csv('../data/raw/ml-100k/u.occupation', sep='|', names=['occupation'])
print(len(occupations))
occupations

21


Unnamed: 0,occupation
0,administrator
1,artist
2,doctor
3,educator
4,engineer
5,entertainment
6,executive
7,healthcare
8,homemaker
9,lawyer


In [79]:


dim = NUM_USERS + NUM_ITEMS

adjacency_matrix = np.zeros((dim, dim), dtype=int)
for _, row in df.iterrows():
    user_id = row['user_id']
    item_id = row['item_id']
    adjacency_matrix[user_id, item_id] = 1
    adjacency_matrix[item_id, user_id] = 1

In [89]:
user_movie_list = {i: adjacency_matrix[i, NUM_USERS:dim].nonzero()[0] + NUM_USERS for i in range(NUM_USERS)}
movie_user_list = {NUM_USERS + i: adjacency_matrix[NUM_USERS + i, :NUM_USERS].nonzero()[0] for i in range(NUM_ITEMS)}

In [90]:
# metapaths

# movie - user - movie
m_u_m = []
for user, movie_list in user_movie_list.items():
    m_u_m.extend([(m1, user, m2) for m1 in movie_list for m2 in movie_list])
m_u_m = np.array(m_u_m)

# user - movie - user
u_m_u = []
for movie, user_list in movie_user_list.items():
    u_m_u.extend([(u1, movie, u2) for u1 in user_list for u2 in user_list])
u_m_u = np.array(u_m_u)

In [111]:
m_u_m_metapath_map = {}
u_m_u_metapath_map = {}

for i in tqdm(range(NUM_USERS)):
    u_m_u_metapath_map[i] = u_m_u[np.where(u_m_u[:, 2] == i, )[0]]

for i in tqdm(range(NUM_ITEMS)):
    m_u_m_metapath_map[i] = m_u_m[np.where(m_u_m[:, 2] == NUM_USERS + i, )[0]]

100%|██████████| 943/943 [00:16<00:00, 57.52it/s]
100%|██████████| 1682/1682 [00:24<00:00, 70.06it/s]


In [122]:
train = pd.read_csv('../data/raw/ml-100k/ua.base', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
train = train.loc[train['rating'] > 3].drop(['timestamp', 'rating'], axis=1)
train['user_id'] = train['user_id'] - 1
train['item_id'] = NUM_USERS + train['item_id'] - 1
print(len(train))
train.head()

49906


Unnamed: 0,user_id,item_id
0,0,943
2,0,945
5,0,948
6,0,949
8,0,951


In [129]:
test = pd.read_csv('../data/raw/ml-100k/ua.test', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
test = test.loc[test['rating'] > 3].drop(['timestamp', 'rating'], axis=1)
test['user_id'] = test['user_id'] - 1
test['item_id'] = NUM_USERS + test['item_id'] - 1
print(len(test))
test.head()

5469


Unnamed: 0,user_id,item_id
0,0,962
1,0,975
2,0,1003
5,0,1102
6,0,1113


In [127]:
train_pos = []
for _, row in train.iterrows():
    train_pos.extend([(row['user_id'], row['item_id']), (row['item_id'], row['user_id'])])
train_pos = np.array(train_pos)

test_pos = []
for _, row in test.iterrows():
    test_pos.extend([(row['user_id'], row['item_id']), (row['item_id'], row['user_id'])])
test_pos = np.array(test_pos)

In [137]:
path = '../data/preprocessed/'

out_file = open(path + 'm_u_m_metapath_map.pickle', "wb")
pickle.dump(m_u_m_metapath_map, out_file)
out_file.close()

out_file = open(path + 'u_m_u_metapath_map.pickle', "wb")
pickle.dump(u_m_u_metapath_map, out_file)
out_file.close()

np.save(path + 'train_pos.npy', train_pos)
np.save(path + 'test_pos.npy', test_pos)

np.save(path + 'adjacency_matrix.npy', adjacency_matrix)