In [179]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import pickle

from sklearn.preprocessing import MinMaxScaler

In [155]:
# u.info is parsed as space-separated "<num> <name>" rows; only the counts
# are kept. The first two counts are taken as the number of users and items.
info_table = pd.read_csv('../data/raw/ml-100k/u.info', sep=' ', names=['num', 'name'])
info = info_table['num'].values.tolist()
NUM_USERS, NUM_ITEMS = info[0], info[1]
NUM_USERS, NUM_ITEMS

(943, 1682)

In [156]:
# Load every rating, keep only those above 3, and re-index the nodes so
# that users occupy ids [0, NUM_USERS) and items follow from NUM_USERS on.
rating_columns = ['user_id', 'item_id', 'rating', 'timestamp']
df = (
    pd.read_csv('../data/raw/ml-100k/u.data', sep='\t', names=rating_columns)
    .loc[lambda frame: frame['rating'] > 3]
    .drop(columns='timestamp')
    .assign(
        user_id=lambda frame: frame['user_id'] - 1,
        item_id=lambda frame: NUM_USERS + frame['item_id'] - 1,
    )
)
print(len(df))
df.head()

55375


Unnamed: 0,user_id,item_id,rating
5,297,1416,4
7,252,1407,5
11,285,1956,5
12,199,1164,5
16,121,1329,5


In [157]:
# Genre names in file order; they are used as column names when u.item is read.
genre_table = pd.read_csv('../data/raw/ml-100k/u.genre', sep='|', names=['genre', 'id'])
print(len(genre_table))
genres = genre_table['genre'].tolist()
genres

19


['unknown',
 'Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [170]:
# Five descriptive columns followed by one flag column per genre.
item_columns = ['movie_id', 'movie title', 'release date', 'video release date', 'IMDB URL'] + genres
items = pd.read_csv('../data/raw/ml-100k/u.item', sep='|', encoding="ISO-8859-1", names=item_columns)

# Keep only the id and the genre flags.
# 'release date' is dropped as well for now (marked temporary by the author).
items = items.drop(columns=['movie title', 'video release date', 'IMDB URL', 'release date'])
print(len(items))
items.head()

1682


Unnamed: 0,movie_id,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [190]:
# Map 0-based movie index -> feature vector (every column after 'movie_id').
movie_features = {
    movie_row['movie_id'] - 1: movie_row.values[1:]
    for _, movie_row in items.iterrows()
}

In [183]:
# Load user demographics, one-hot encode the categorical columns and
# drop 'zip code' (dropped for now; marked temporary by the author).
user_columns = ['user id', 'age', 'gender', 'occupation', 'zip code']
users = (
    pd.read_csv('../data/raw/ml-100k/u.user', sep='|', names=user_columns)
    .pipe(pd.get_dummies, columns=['occupation', 'gender'])
    .drop(columns='zip code')
)

# Rescale age to [0, 1]; the scaler returns a single-column 2-D array.
scaler = MinMaxScaler()
users['age'] = scaler.fit_transform(users[['age']])[:, 0]

print(len(users))
users.head()

943


Unnamed: 0,user id,age,occupation_administrator,occupation_artist,occupation_doctor,occupation_educator,occupation_engineer,occupation_entertainment,occupation_executive,occupation_healthcare,...,occupation_other,occupation_programmer,occupation_retired,occupation_salesman,occupation_scientist,occupation_student,occupation_technician,occupation_writer,gender_F,gender_M
0,1,0.257576,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,True
1,2,0.69697,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,True,False
2,3,0.242424,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True
3,4,0.257576,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,True
4,5,0.393939,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,True,False


In [187]:
# Map 0-based user index -> feature vector (every column after 'user id').
#
# The frame mixes int, float and bool columns, so iterating it row by row
# produces object-dtype rows (and, depending on the pandas version, float
# keys). Extracting the features once as a float64 matrix gives numeric
# vectors (bool dummies become 0.0 / 1.0) and plain int keys.
feature_matrix = users.drop('user id', axis=1).values.astype(float)

user_features = {
    int(user_id) - 1: feature_row
    for user_id, feature_row in zip(users['user id'].values, feature_matrix)
}

In [161]:
# Symmetric user/item adjacency matrix over a single id space:
# users first, then items (item ids in `df` already include the NUM_USERS offset).
dim = NUM_USERS + NUM_ITEMS

user_idx = df['user_id'].values
item_idx = df['item_id'].values

adjacency_matrix = np.zeros((dim, dim), dtype=int)
# Set both directions of every edge in one vectorized assignment instead of
# looping over the rows one at a time.
adjacency_matrix[user_idx, item_idx] = 1
adjacency_matrix[item_idx, user_idx] = 1

In [162]:
# Neighbour lists read off the adjacency matrix. Both dicts are keyed and
# valued by global node ids (users in [0, NUM_USERS), movies from NUM_USERS on).
user_movie_list = {}
for user in range(NUM_USERS):
    liked_columns = adjacency_matrix[user, NUM_USERS:dim].nonzero()[0]
    # Column positions are relative to the movie slice; shift back to global ids.
    user_movie_list[user] = liked_columns + NUM_USERS

movie_user_list = {}
for movie_offset in range(NUM_ITEMS):
    movie = NUM_USERS + movie_offset
    movie_user_list[movie] = adjacency_matrix[movie, :NUM_USERS].nonzero()[0]

In [163]:
# metapaths
# Every ordered pair of endpoints that share a middle node is emitted,
# including the pairs where both endpoints are the same node.

# movie - user - movie
m_u_m = []
for user, movie_list in user_movie_list.items():
    for first_movie in movie_list:
        for second_movie in movie_list:
            m_u_m.append((first_movie, user, second_movie))
m_u_m = np.array(m_u_m)

# user - movie - user
u_m_u = []
for movie, user_list in movie_user_list.items():
    for first_user in user_list:
        for second_user in user_list:
            u_m_u.append((first_user, movie, second_user))
u_m_u = np.array(u_m_u)

In [164]:
# Group the metapath instances by their starting node.
# Users are keyed by their node id.
u_m_u_metapath_map = {
    user: u_m_u[u_m_u[:, 0] == user]
    for user in tqdm(range(NUM_USERS))
}

# Movies are keyed by their 0-based movie index, while the rows they select
# start with the global node id NUM_USERS + index.
m_u_m_metapath_map = {
    movie_index: m_u_m[m_u_m[:, 0] == NUM_USERS + movie_index]
    for movie_index in tqdm(range(NUM_ITEMS))
}

  0%|          | 0/943 [00:00<?, ?it/s]

100%|██████████| 943/943 [00:14<00:00, 65.76it/s]
100%|██████████| 1682/1682 [00:22<00:00, 73.16it/s]


In [165]:
# Training split: keep ratings above 3 and shift both ids to 0-based.
# Item ids here are NOT offset by NUM_USERS.
train_raw = pd.read_csv(
    '../data/raw/ml-100k/ua.base',
    sep='\t',
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)
train = train_raw[train_raw['rating'] > 3][['user_id', 'item_id']] - 1
print(len(train))
train.head()

49906


Unnamed: 0,user_id,item_id
0,0,0
2,0,2
5,0,5
6,0,6
8,0,8


In [166]:
# Test split: keep ratings above 3 and shift both ids to 0-based.
# Item ids here are NOT offset by NUM_USERS.
test_raw = pd.read_csv(
    '../data/raw/ml-100k/ua.test',
    sep='\t',
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)
test = test_raw[test_raw['rating'] > 3][['user_id', 'item_id']] - 1
print(len(test))
test.head()

5469


Unnamed: 0,user_id,item_id
0,0,19
1,0,32
2,0,60
5,0,159
6,0,170


In [167]:
def _bidirectional_pairs(frame):
    """Return each (user_id, item_id) row of `frame` followed by its reverse.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 'user_id' and 'item_id'.

    Returns
    -------
    numpy.ndarray
        Array of shape (2 * len(frame), 2). Row 2k is (user_id, item_id) of
        the k-th input row and row 2k + 1 is (item_id, user_id). An empty
        frame yields an array of shape (0, 2).
    """
    forward = frame[['user_id', 'item_id']].values
    pairs = np.zeros((2 * len(forward), 2), dtype=forward.dtype)
    pairs[0::2] = forward          # even rows: (user, item)
    pairs[1::2] = forward[:, ::-1]  # odd rows: (item, user)
    return pairs


# NOTE(review): item ids in `train` / `test` are 0-based WITHOUT the NUM_USERS
# offset applied to `df`, so the reversed (item, user) rows share their id
# range with user ids. Confirm that the consumer of these arrays expects this.
train_pos = _bidirectional_pairs(train)
test_pos = _bidirectional_pairs(test)

In [191]:
# Persist every preprocessed artifact under `path`.
path = '../data/preprocessed/'

# File name -> object, written in this order.
pickled_artifacts = {
    'm_u_m_metapath_map.pickle': m_u_m_metapath_map,
    'u_m_u_metapath_map.pickle': u_m_u_metapath_map,
    'user_features.pickle': user_features,
    'movie_features.pickle': movie_features,
}

for file_name, artifact in pickled_artifacts.items():
    # The context manager closes the file even if pickling raises.
    with open(path + file_name, "wb") as out_file:
        pickle.dump(artifact, out_file)

np.save(path + 'train_pos.npy', train_pos)
np.save(path + 'test_pos.npy', test_pos)

np.save(path + 'adjacency_matrix.npy', adjacency_matrix)