In [119]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import pickle

from sklearn.preprocessing import MinMaxScaler

In [120]:
np.random.seed(1337)

In [121]:
info = pd.read_csv('../data/raw/ml-100k/u.info', sep=' ', names=['num', 'name'])
info.head()
info = info['num'].values.tolist()
NUM_USERS = info[0]
NUM_ITEMS = info[1]
NUM_USERS, NUM_ITEMS

(943, 1682)

In [122]:
df = pd.read_csv('../data/raw/ml-100k/u.data', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
df['user_id'] = df['user_id'] - 1
# add NUM_USERS to item_id, so it won't correlate with user_id
df['item_id'] = NUM_USERS + df['item_id'] - 1 
print(len(df))
df.head()

100000


Unnamed: 0,user_id,item_id,rating,timestamp
0,195,1184,3,881250949
1,185,1244,3,891717742
2,21,1319,1,878887116
3,243,993,2,880606923
4,165,1288,1,886397596


In [123]:
genres = pd.read_csv('../data/raw/ml-100k/u.genre', sep='|', names=['genre', 'id'])
print(len(genres))
genres = genres['genre'].values.tolist()
genres

19


['unknown',
 'Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [124]:
items = pd.read_csv('../data/raw/ml-100k/u.item', sep='|', encoding="ISO-8859-1", names=['movie_id', 'movie title', 'release date', 'video release date', 'IMDB URL'] + genres)
items = items.drop(['movie title', 'video release date', 'IMDB URL'], axis=1)
# convert release date column into number
items['release date'] = items['release date'].fillna('01-Jan-1970')
items['release date'] = pd.to_datetime(items['release date'], format=r'%d-%b-%Y')
items['release date'] = (items['release date'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

scaler = MinMaxScaler()
# and then scale this number
items[['release date']] = scaler.fit_transform(items[['release date']])

print(len(items))
items.head()

1682


Unnamed: 0,movie_id,release date,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0.950417,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0.950417,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,0.950417,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,0.950417,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0.950417,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [125]:
movie_features = {}

for _, row in items.iterrows():
    movie_features[row['movie_id'] - 1] = row.values[1:]

In [126]:
users = pd.read_csv('../data/raw/ml-100k/u.user', sep='|', names=['user id', 'age', 'gender', 'occupation', 'zip code'])

# one hot encode occupation and gender columns
users = pd.get_dummies(users, columns=['occupation', 'gender'])

# dropping the zip code, because it is hard to transform into number
users = users.drop('zip code', axis=1)

scaler = MinMaxScaler()
# scale the age
users[['age']] = scaler.fit_transform(users[['age']])

print(len(users))
users.head()

943


Unnamed: 0,user id,age,occupation_administrator,occupation_artist,occupation_doctor,occupation_educator,occupation_engineer,occupation_entertainment,occupation_executive,occupation_healthcare,...,occupation_other,occupation_programmer,occupation_retired,occupation_salesman,occupation_scientist,occupation_student,occupation_technician,occupation_writer,gender_F,gender_M
0,1,0.257576,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,True
1,2,0.69697,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,True,False
2,3,0.242424,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True
3,4,0.257576,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,True
4,5,0.393939,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,True,False


In [127]:
user_features = {}

for _, row in users.iterrows():
    user_features[row['user id'] - 1] = row.values[1:]

In [128]:
train = pd.read_csv('../data/raw/ml-100k/ua.base', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
train = train.loc[train['rating'] > 3].drop(['timestamp', 'rating'], axis=1)
train['user_id'] = train['user_id'] - 1
train['item_id'] = NUM_USERS + train['item_id'] - 1
print(len(train))
train.head()

49906


Unnamed: 0,user_id,item_id
0,0,943
2,0,945
5,0,948
6,0,949
8,0,951


In [129]:
test = pd.read_csv('../data/raw/ml-100k/ua.test', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
test = test.loc[test['rating'] > 3].drop(['timestamp', 'rating'], axis=1)
test['user_id'] = test['user_id'] - 1
test['item_id'] = NUM_USERS + test['item_id'] - 1
print(len(test))
test.head()

5469


Unnamed: 0,user_id,item_id
0,0,962
1,0,975
2,0,1003
5,0,1102
6,0,1113


In [130]:
train_pos = []
for _, row in train.iterrows():
    train_pos.extend([(row['user_id'], row['item_id'] - NUM_USERS)])
train_pos = np.array(train_pos)

test_pos = []
for _, row in test.iterrows():
    test_pos.extend([(row['user_id'], row['item_id'] - NUM_USERS)])
test_pos = np.array(test_pos)

In [131]:
def metapaths_from_dataframe(dataframe):
    '''Function to extract movie-user-movie and user-movie-user metapaths from the given graph'''

    dim = NUM_USERS + NUM_ITEMS

    # constructing the adjacency matrix from the dataframe
    adjmatrix = np.zeros((dim, dim), dtype=int)
    for _, row in dataframe.iterrows():
        user_id = row['user_id']
        item_id = row['item_id']
        adjmatrix[user_id, item_id] = 1
        adjmatrix[item_id, user_id] = 1

    user_movie_list = {i: adjmatrix[i, NUM_USERS:dim].nonzero()[0] for i in range(NUM_USERS)}
    movie_user_list = {i: adjmatrix[NUM_USERS + i, :NUM_USERS].nonzero()[0] for i in range(NUM_ITEMS)}

    # movie - user - movie
    m_u_m = []
    for user, movie_list in user_movie_list.items():
        m_u_m.extend([(m1, user, m2) for m1 in movie_list for m2 in movie_list])
    m_u_m = np.array(m_u_m)

    # user - movie - user
    u_m_u = []
    for movie, user_list in movie_user_list.items():
        u_m_u.extend([(u1, movie, u2) for u1 in user_list for u2 in user_list])
    u_m_u = np.array(u_m_u)

    m_u_m_metapath_map = {}
    u_m_u_metapath_map = {}

    for i in tqdm(range(NUM_USERS)):
        u_m_u_metapath_map[i] = u_m_u[np.where(u_m_u[:, 0] == i, )[0]]

    for i in tqdm(range(NUM_ITEMS)):
        m_u_m_metapath_map[i] = m_u_m[np.where(m_u_m[:, 0] == i, )[0]]

    return u_m_u_metapath_map, m_u_m_metapath_map

In [132]:
train_user_metapaths, train_movie_metapaths = metapaths_from_dataframe(train)

test_user_metapaths, test_movie_metapaths = metapaths_from_dataframe(test)

100%|██████████| 943/943 [00:12<00:00, 72.54it/s]
100%|██████████| 1682/1682 [00:22<00:00, 75.21it/s]
100%|██████████| 943/943 [00:00<00:00, 3654.74it/s]
100%|██████████| 1682/1682 [00:00<00:00, 14947.19it/s]


In [133]:
# saving all preprocessed data
path = '../data/preprocessed/'

out_file = open(path + 'user_features.pickle', "wb")
pickle.dump(user_features, out_file)
out_file.close()

out_file = open(path + 'movie_features.pickle', "wb")
pickle.dump(movie_features, out_file)
out_file.close()

In [134]:
train_path = '../data/preprocessed/train/'

out_file = open(train_path + 'user_metapaths.pickle', "wb")
pickle.dump(train_user_metapaths, out_file)
out_file.close()

out_file = open(train_path + 'movie_metapaths.pickle', "wb")
pickle.dump(train_movie_metapaths, out_file)
out_file.close()

np.save(train_path + 'train_pos.npy', train_pos)

In [135]:
test_path = '../data/preprocessed/test/'

out_file = open(test_path + 'user_metapaths.pickle', "wb")
pickle.dump(test_user_metapaths, out_file)
out_file.close()

out_file = open(test_path + 'movie_metapaths.pickle', "wb")
pickle.dump(test_movie_metapaths, out_file)
out_file.close()

np.save(test_path + 'test_pos.npy', test_pos)

In [136]:
def get_mean_length(metapath):
    '''Function to get mean number of metapath instances for every node, given the metapaths'''
    lengths = []
    for _, val in metapath.items():
        if len(val) > 0:
            lengths.append(len(val))
    return int(np.mean(lengths))

In [137]:
mean_train_user = get_mean_length(train_user_metapaths)
mean_train_movie = get_mean_length(train_movie_metapaths)

mean_test_user = get_mean_length(test_user_metapaths)
mean_test_movie = get_mean_length(test_movie_metapaths)

print(f'{mean_train_user=}')
print(f'{mean_train_movie=}')
print(f'{mean_test_user=}')
print(f'{mean_test_movie=}')

mean_train_user=5931
mean_train_movie=3789
mean_test_user=100
mean_test_movie=41


In [138]:
# preprocessing data for benchmark evaluation
benchmark_path = '../benchmark/data/'

benchmark = pd.read_csv('../data/raw/ml-100k/ub.test', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
benchmark = benchmark.loc[benchmark['rating'] > 3].drop(['timestamp', 'rating'], axis=1)
benchmark['user_id'] = benchmark['user_id'] - 1
benchmark['item_id'] = NUM_USERS + benchmark['item_id'] - 1

benchmark_user_metapaths, benchmark_movie_metapaths = metapaths_from_dataframe(benchmark)

out_file = open(benchmark_path + 'user_metapaths.pickle', "wb")
pickle.dump(benchmark_user_metapaths, out_file)
out_file.close()

out_file = open(benchmark_path + 'movie_metapaths.pickle', "wb")
pickle.dump(benchmark_movie_metapaths, out_file)
out_file.close()

mean_benchmark_user = get_mean_length(test_user_metapaths)
mean_benchmark_movie = get_mean_length(test_movie_metapaths)

print(f'{mean_benchmark_user=}')
print(f'{mean_benchmark_movie=}')

100%|██████████| 943/943 [00:00<00:00, 4493.92it/s]
100%|██████████| 1682/1682 [00:00<00:00, 12177.84it/s]


mean_benchmark_user=100
mean_benchmark_movie=41


In [139]:
# for benchmark taking only edges that are not present in the benchmark graph
benchmark_data = df.loc[df['user_id'].isin(benchmark['user_id'].values) & df['item_id'].isin(benchmark['item_id'].values)]
benchmark_data = benchmark_data.merge(benchmark, on=['user_id', 'item_id'], how='left', indicator=True).query('_merge == "left_only"').drop('_merge', axis=1)
benchmark_data = benchmark_data.drop(['timestamp'], axis=1)
benchmark_data['rating'] = np.where(benchmark_data['rating'] > 3, 1, 0)

benchmark_dataset = []
for _, row in benchmark_data.iterrows():
    benchmark_dataset.extend([(row['user_id'], row['item_id'] - NUM_USERS, row['rating'])])
benchmark_dataset = np.array(benchmark_dataset)

np.save(benchmark_path + 'benchmark_dataset.npy', benchmark_dataset)
benchmark_data

Unnamed: 0,user_id,item_id,rating
0,195,1184,0
1,185,1244,0
2,165,1288,0
3,297,1416,1
4,114,1207,0
...,...,...,...
89684,377,1020,0
89685,879,1418,0
89686,715,1146,1
89687,12,1167,0
