In [124]:
import sys
import os
import pickle

# Put the project root on the import path so the `src` package used by the
# import cell can be resolved from this notebook.
_project_root = '/home/samariddin/projects/recommender-system/'
sys.path.append(_project_root)

# Absolute path of the directory the kernel is currently running in.
abs_path = os.path.abspath('.')

# Directory holding the ml-100k dataset files, assembled piece by piece.
_data_dir_parts = ('/home', 'samariddin', 'projects', 'recommender-system', 'data', 'dataset', 'ml-100k')
path_to_data = os.path.join(*_data_dir_parts)

In [125]:
import numpy as np
import pandas as pd
import src.metrics as ml_metrics
from src.recommender.recommender import GlocalK

from collections import Counter
from itertools import combinations
import networkx as nx
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [126]:
def convert_categorical(df_X, _X):
	"""Replace column ``_X`` of ``df_X`` with one-hot indicator columns.

	The column's values are label-encoded, the codes are expanded into one
	dense 0/1 column per distinct code, and the original column is dropped.
	The indicator columns are named ``<_X>1``, ``<_X>2``, ... and are
	inserted at positions 1, 2, ... so that column 0 of the frame stays first.

	Parameters:
		df_X: DataFrame containing the column to convert.
		_X: name of the column to convert.

	Returns:
		A new DataFrame (the result of ``drop``) with the indicator columns
		inserted; the frame passed in is not modified.
	"""
	# Map every distinct value to an integer code.
	codes = LabelEncoder().fit_transform(np.array(df_X[_X]))
	# The one-hot encoder expects a 2-D column vector.
	code_column = codes.reshape(len(codes), 1)
	indicators = OneHotEncoder(sparse_output=False).fit_transform(code_column)

	result = df_X.drop(columns=_X)
	# `position` is both the 1-based suffix of the new column and its insert slot.
	for position in range(1, code_column.max() + 2):
		result.insert(loc=position, column=str(_X) + str(position), value=indicators[:, position - 1])
	return result

In [127]:
def extract(df, df_user, alpha_coefs=(0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.035, 0.04, 0.045), alpha_param=1682, output_dir="/home/samariddin/projects/recommender-system/data/extracted_features"):
	"""Add graph-centrality features to ``df_user`` and pickle one table per threshold.

	Users are linked in an undirected graph when the number of
	(``MID``, ``rate``) groups of ``df`` they both appear in is at least
	``alpha_coef * alpha_param``. Every linked user also receives a self-loop.
	For each coefficient six measures are computed on that graph, each divided
	by its maximum over the users: PageRank (``PR``), degree centrality
	(``CD``), closeness centrality (``CC``), betweenness centrality (``CB``),
	load centrality (``LC``) and average neighbor degree (``AND``).

	Parameters:
		df: ratings frame with columns ``UID``, ``MID`` and ``rate``.
		df_user: user frame with a ``UID`` column. It is modified in place:
			the six feature columns are added, and overwritten on every
			iteration over ``alpha_coefs``.
		alpha_coefs: iterable of threshold coefficients.
		alpha_param: multiplier turning a coefficient into a co-rating count.
			Original author's note: "1m = 3883, param*i_no" (presumably the
			number of items in the dataset - TODO confirm).
		output_dir: directory receiving ``features_alpha(<coef>).pkl``; it is
			created when missing.

	Returns:
		The feature table (every column of ``df_user`` except the first, NaN
		replaced by 0) built for the LAST coefficient in ``alpha_coefs``.

	Raises:
		ValueError: if ``alpha_coefs`` is empty (the original code failed with
			an UnboundLocalError on return in that case).
	"""
	alpha_coefs = list(alpha_coefs)
	if not alpha_coefs:
		raise ValueError("alpha_coefs must contain at least one coefficient")

	# Count, for every user pair, how many (movie, rating) groups they share.
	# This does not depend on alpha, so it is done once instead of once per
	# coefficient, and the counter is fed group by group to avoid materialising
	# one huge list of pairs.
	pair_counts = Counter()
	for _, group in df.groupby(['MID', 'rate']):
		# Sort the ids so (u, v) and (v, u) are counted as the same undirected
		# pair no matter how the rows of `df` happen to be ordered.
		pair_counts.update(combinations(sorted(group['UID']), 2))

	# (column name, function computing {node: score}) in the order the columns
	# are appended to df_user.
	measures = (
		('PR', lambda graph: nx.pagerank(graph.to_directed())),
		('CD', nx.degree_centrality),
		('CC', nx.closeness_centrality),
		('CB', nx.betweenness_centrality),
		('LC', nx.load_centrality),
		('AND', lambda graph: nx.average_neighbor_degree(graph, weight='weight')),
	)

	os.makedirs(output_dir, exist_ok=True)

	for alpha_coef in alpha_coefs:
		alpha = alpha_coef * alpha_param

		G = nx.Graph()
		for (first, second), shared in pair_counts.items():
			if shared >= alpha:
				G.add_edge(first, second, weight=1)
				G.add_edge(first, first, weight=1)
				G.add_edge(second, second, weight=1)

		for column, measure in measures:
			# Users that are not in the graph map to NaN here and become 0 below.
			scores = df_user['UID'].map(measure(G))
			df_user[column] = scores / float(scores.max())

		# fillna returns a new frame; filling a .loc slice in place can trigger
		# pandas' chained-assignment warning.
		X_train = df_user.loc[:, df_user.columns[1:]].fillna(0)
		X_train.to_pickle(os.path.join(output_dir, "features_alpha(" + str(alpha_coef) + ").pkl"))

	return X_train


In [128]:
def load_data(data_path="/home/samariddin/projects/recommender-system/data/dataset/", ratings_data="ml-100k/u1.base", users_data="ml-100k/u.user", test_data="ml-100k/u1.test", rating_sep='\t', users_sep='\\|', train_out_dir="/home/samariddin/projects/recommender-system/data/train_data"):
	"""Load ratings and users, build user features and assemble the rating matrices.

	Parameters:
		data_path: directory the three file names below are relative to.
		ratings_data: training ratings file (columns UID, MID, rate, time).
		users_data: user file (columns UID, age, gender, job, zip).
		test_data: test ratings file, same layout as ``ratings_data``.
		rating_sep: separator of the two ratings files.
		users_sep: separator (regular expression) of the user file.
		train_out_dir: directory receiving ``train_r_(<n_u>).pkl``; it is
			created when missing.

	Returns:
		``(n_m, n_u, train_r, train_m, test_r)`` where ``n_m`` / ``n_u`` are
		the numbers of distinct movie / user ids over train and test,
		``train_r`` is the (n_m, n_u) training rating matrix with the
		transposed user-feature table appended as extra rows, ``train_m`` is
		the (n_m, n_u) 0/1 mask of training ratings (it has no rows for the
		appended features), and ``test_r`` is the (n_m, n_u) test matrix.

	Side effects:
		``extract`` writes its feature pickles, and the final ``train_r`` is
		pickled into ``train_out_dir``.
	"""
	ratings_path = os.path.join(data_path, ratings_data)
	users_path = os.path.join(data_path, users_data)
	test_path = os.path.join(data_path, test_data)

	ratings = pd.read_csv(ratings_path,
						sep=rating_sep,
						engine='python',
						names=['UID', 'MID', 'rate', 'time'])
	df_user = pd.read_csv(users_path,
						sep=users_sep,
						engine='python',
						names=['UID', 'age', 'gender', 'job', 'zip'])
	train = np.loadtxt(ratings_path, skiprows=0, delimiter=rating_sep).astype("int32")
	test = np.loadtxt(test_path, skiprows=0, delimiter=rating_sep).astype("int32")
	total = np.concatenate((train, test), axis=0)

	# User features: one-hot job and gender, age bucketed into six bins and
	# then one-hot encoded, zip code dropped.
	df_user = convert_categorical(df_user, 'job')
	df_user = convert_categorical(df_user, 'gender')
	df_user['age'] = pd.cut(df_user['age'], [0, 10, 20, 30, 40, 50, 100],
							labels=['1', '2', '3', '4', '5', '6'])
	df_user = convert_categorical(df_user, 'age')
	df_user = df_user.drop(columns='zip')

	X_train = extract(ratings, df_user)
	# - - - - - - - - -

	# Prepare data
	n_u = np.unique(total[:, 0]).size  # num of users
	n_m = np.unique(total[:, 1]).size  # num of movies

	train_r = np.zeros((n_m, n_u), dtype="float32")
	test_r = np.zeros((n_m, n_u), dtype="float32")

	# Ids are 1-based in the files, hence the -1. Indexed assignment replaces
	# the per-row Python loops; it assumes each (user, movie) pair occurs at
	# most once per file.
	train_r[train[:, 1] - 1, train[:, 0] - 1] = train[:, 2]
	test_r[test[:, 1] - 1, test[:, 0] - 1] = test[:, 2]

	train_m = np.greater(train_r, 1e-12).astype("float32")  # masks indicating non-zero entries

	# Append the user features (one row per feature) below the movie rows of train_r
	train_r = np.concatenate((train_r, X_train.T), axis=0).astype('float32')

	# save the ndarray object to a file using pickle
	os.makedirs(train_out_dir, exist_ok=True)
	with open(os.path.join(train_out_dir, "train_r_(" + str(n_u) + ").pkl"), "wb") as f:
		pickle.dump(train_r, f)

	return n_m, n_u, train_r, train_m, test_r


In [129]:
def load_data_1m(path=path_to_data, seed=None):
    """Load ``ratings.dat`` / ``users.dat`` and build a random 80/20 train/test split.

    Parameters:
        path: directory containing ``ratings.dat`` and ``users.dat``
            ('::'-separated, no header).
            NOTE(review): the default is the notebook-level ``path_to_data``,
            which is set to the ml-100k directory - confirm it really contains
            these two files or pass the correct directory explicitly.
        seed: optional seed for the split. When None the legacy global NumPy
            random state is used, exactly as before, so results vary between
            runs unless ``np.random.seed`` was called.

    Returns:
        ``(max_movie_id, n_u, train_r, train_m, test_r, test_m)`` where the
        four matrices have shape (max_movie_id, n_u); ``*_r`` hold ratings and
        ``*_m`` are 0/1 masks of the non-zero entries.
    """
    ratings = pd.read_csv(os.path.join(path, 'ratings.dat'), sep='::', header=None,
                          names=['user_id', 'item_id', 'rating', 'timestamp'], engine='python')
    users = pd.read_csv(os.path.join(path, 'users.dat'), sep='::', header=None,
                        names=['user_id', 'gender', 'age', 'occupation', 'zip'], engine='python')

    n_u = users.shape[0]  # num of users
    n_ratings = ratings.shape[0]  # num of ratings

    max_movie_id = 3952  # maximum movie ID

    # Splitting the ratings into training (first 80% of a shuffle) and test sets
    train_size = int(0.8 * n_ratings)
    if seed is None:
        shuffled_idx = np.random.permutation(n_ratings)
    else:
        shuffled_idx = np.random.default_rng(seed).permutation(n_ratings)

    def _to_matrix(row_positions):
        """Scatter the selected rating rows into a (max_movie_id, n_u) matrix."""
        subset = ratings.iloc[row_positions]
        # Ratings whose item id exceeds the matrix height are skipped.
        subset = subset[subset['item_id'] <= max_movie_id]
        matrix = np.zeros((max_movie_id, n_u), dtype='float32')
        # Ids are 1-based; assumes each (user, item) pair occurs at most once.
        matrix[subset['item_id'].to_numpy() - 1,
               subset['user_id'].to_numpy() - 1] = subset['rating'].to_numpy()
        return matrix

    train_r = _to_matrix(shuffled_idx[:train_size])
    test_r = _to_matrix(shuffled_idx[train_size:])

    train_m = np.greater(train_r, 1e-12).astype('float32')  # masks indicating non-zero entries
    test_m = np.greater(test_r, 1e-12).astype('float32')

    return max_movie_id, n_u, train_r, train_m, test_r, test_m


In [130]:
# Build the matrices from the default ml-100k files. This also runs the graph
# feature extraction inside load_data() and writes its pickles to disk.
n_m, n_u, train_r, train_m, test_r = load_data()

In [149]:
# Root data directory; later cells append dataset and output sub-paths to it.
dataPath = "/home/samariddin/projects/recommender-system/data/"
# Disabled: load a previously extracted feature table and inspect its shape.
# X_train = pd.read_pickle(dataPath + "extracted_features/features_alpha(0.045).pkl").values.astype(float)
# X_train.shape

(943, 35)

In [132]:
# Instantiate and train the model
# `train_r` comes from load_data(): the rating rows followed by the appended
# user-feature rows.
recommender = GlocalK()
metrics = recommender.fit(train_r)
# Display what fit() returned (the recorded output shows epoch counts and the
# best RMSE of the pre-training and fine-tuning phases).
metrics

PRE-TRAINING finished.
FINE-TUNING finished.


{'epochs_p': 103,
 'epochs_f': 134,
 'best_rmse_p': 0.9673729,
 'best_rmse_f': 0.95729446}

In [133]:
# Recommend for all users
# `res` is later indexed as res.T[user_id - 1], i.e. it is treated as having
# one column per user.
res = recommender.predict(np.arange(n_u))

In [134]:
def get_top_n_recommendations(user_id: int, n: int):
	"""Return the ids of the ``n`` best-predicted movies the user has not rated.

	Reads the notebook-level ``res`` (predictions, one column per user),
	``train_m`` (training mask) and ``n_m`` (number of movies).

	Parameters:
		user_id: 1-based user id.
		n: maximum number of movie ids to return.

	Returns:
		List of 1-based movie ids, best prediction first. It is shorter than
		``n`` only when the user has fewer than ``n`` unrated movies.
	"""
	user_col = user_id - 1  # user_id starts from 1

	# Work on a copy: indexing `res` yields a view, so masking it in place
	# would overwrite the shared predictions used by the evaluation and save
	# cells. Only the first n_m rows are movies; any further rows of `res`
	# are not recommendable items.
	predicted_ratings = np.array(res[:n_m, user_col], copy=True)

	rated_movies = np.flatnonzero(train_m[:, user_col] > 0)  # movies already rated by user
	predicted_ratings[rated_movies] = -np.inf  # push rated movies to the end of the ranking
	already_rated = set(rated_movies.tolist())

	# Rank every movie, drop the rated ones, THEN truncate, so that filtering
	# can never leave fewer than n results while unrated movies remain.
	ranking = predicted_ratings.argsort()[::-1]
	unrated_ranking = [movie_id for movie_id in ranking if movie_id not in already_rated]
	top_n_movie_ids = [movie_id + 1 for movie_id in unrated_ranking[:n]]

	return top_n_movie_ids

In [135]:
# Top-10 recommendations for one example user (user ids are 1-based).
user_id = 1
topN = get_top_n_recommendations(user_id, 10)
topN

[170, 134, 357, 483, 479, 647, 478, 318, 603, 427]

In [136]:
# NOTE(review): this cell repeats the preceding cell verbatim and produces the
# same output; it can likely be deleted.
user_id = 1
topN = get_top_n_recommendations(user_id, 10)
topN

[170, 134, 357, 483, 479, 647, 478, 318, 603, 427]

In [137]:
# Evaluate the recommendations with MAP@k against a random baseline.
k=50
# One list per user holding the row indices of the k largest entries of that
# user's column.
# NOTE(review): test_r is zero wherever the user has no test rating, so for
# users with fewer than k test ratings the "ground truth" is padded with
# unrated movies - consider keeping only indices where test_r > 0.
ground_truth = np.argsort(-test_r, axis=0)[:k,:].T.tolist()
# NOTE(review): if `res` has more rows than test_r (e.g. appended feature
# rows), those row indices can show up here as well - verify.
recommended = np.argsort(-res, axis=0)[:k,:].T.tolist()
# Random baseline: k random movie indices for each of the n_u users. The array
# is already (n_u, k), so it must NOT be transposed; the previous `.T` produced
# k lists of n_u entries and the baseline was scored on only k users.
random = np.random.randint(0,n_m,(n_u, k)).tolist()

print("Baseline (random):\t", ml_metrics.mapk(ground_truth, random, k=k), "\nGlocalK:\t\t", ml_metrics.mapk(ground_truth, recommended, k=k))

Baseline (random):	 0.0020718804433759665 
GlocalK:		 0.014454699631168999


In [138]:
# Save the prediction matrix `res` to file.
# NOTE(review): despite its name, this frame wraps the model's predictions
# (`res`), not encoded features.
encoded_features_df = pd.DataFrame(res)
encoded_features_df.to_pickle(dataPath + 'recommendations/recommendations.pkl')

In [139]:
# Load movie ids and titles from u.item (pipe-separated, latin-1 encoded);
# only the first two columns are read and named MID / name.
item_df = pd.read_csv(dataPath+'dataset/ml-100k/u.item', sep='|', encoding='latin-1', header=None, usecols=[0,1], names=['MID', 'name'])


In [140]:
# Print the title of every recommended movie, best recommendation first.
for mid in topN:
    title_matches = item_df.loc[item_df['MID'] == mid, 'name']
    movie_name = title_matches.to_numpy()[0]
    print(movie_name)

Cinema Paradiso (1988)
Citizen Kane (1941)
One Flew Over the Cuckoo's Nest (1975)
Casablanca (1942)
Vertigo (1958)
Ran (1985)
Philadelphia Story, The (1940)
Schindler's List (1993)
Rear Window (1954)
To Kill a Mockingbird (1962)


In [141]:
# Load the training ratings; the timestamp column is named but excluded by
# usecols, so only UID, MID and rate are kept.
ratings_d = pd.read_csv(dataPath+"dataset/ml-100k/u1.base", delimiter='\t', names=['UID', 'MID', 'rate', 'timestamp'], usecols=[0, 1, 2])

In [142]:
# Titles of the movies `user_id` rated strictly above 4 in the training split.
# NOTE(review): the original comment said "about 4-5", but the `> 4` filter on
# integer ratings excludes 4 - switch to `>= 4` if 4-star ratings should count.
ratings_d = (
    ratings_d
    .astype({"UID": int, "rate": int})
    .loc[lambda frame: frame["rate"] > 4]
    .loc[lambda frame: frame["UID"] == user_id]
    .sort_values(by="rate", ascending=False)
    .drop_duplicates(subset=["MID"], keep="first")
    .drop(columns=["UID", "rate"])
)

for mid in ratings_d["MID"]:
    title_matches = item_df.loc[item_df["MID"] == mid, "name"]
    movie_name = title_matches.to_numpy()[0]
    print(movie_name)

Toy Story (1995)
Amadeus (1984)
Jean de Florette (1986)
Manon of the Spring (Manon des sources) (1986)
Monty Python and the Holy Grail (1974)
Wrong Trousers, The (1993)
Empire Strikes Back, The (1980)
Princess Bride, The (1987)
Aliens (1986)
12 Angry Men (1957)
Return of the Jedi (1983)
Terminator, The (1984)
Dead Man Walking (1995)
Graduate, The (1967)
Nikita (La Femme Nikita) (1990)
Back to the Future (1985)
Cyrano de Bergerac (1990)
When Harry Met Sally... (1989)
Sling Blade (1996)
Chasing Amy (1997)
Chasing Amy (1997)
Full Monty, The (1997)
Sleeper (1973)
Big Night (1996)
Godfather, The (1972)
Lone Star (1996)
Mighty Aphrodite (1995)
Mr. Holland's Opus (1995)
French Twist (Gazon maudit) (1995)
Antonia's Line (1995)
Crumb (1994)
Clerks (1994)
Eat Drink Man Woman (1994)
Hoop Dreams (1994)
Star Wars (1977)
Professional, The (1994)
Priest (1994)
Three Colors: Red (1994)
Searching for Bobby Fischer (1993)
Blade Runner (1982)
Welcome to the Dollhouse (1995)
Mystery Science Theater 3000: 