# Preload Stuff

In [None]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import collections
import ast
from scipy.stats import pearsonr
from IPython.display import display, clear_output
from sklearn.metrics import pairwise_distances
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

In [None]:
def load_jsons(data_path, file):
    """Load a JSON-lines file (one JSON object per line) into a dataFrame.

    The '.json' files are not proper JSON documents: every line holds a
    separate JSON object, so the file is parsed line by line.

    Arguments:
    data_path -- directory prefix, including the trailing path separator
    file      -- file name that is appended to data_path

    Output:
    a dataFrame with one row per non-empty line of the file
    """
    file_path = data_path + file
    # Read explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(file_path, encoding='utf-8') as jsons:
        # Blank lines (e.g. a trailing newline) are skipped because
        # json.loads rejects them.
        lines = [json.loads(json_line) for json_line in jsons if json_line.strip()]
    return pd.DataFrame(lines)

In [None]:
# Each entry of `cities` is used as a sub-directory name of 'data/'.
# os.listdir returns entries in arbitrary order and also lists plain files,
# so keep only directories and sort them to make `cities[:n]` reproducible.
cities = sorted(
    entry for entry in os.listdir('data/')
    if os.path.isdir(os.path.join('data/', entry))
)

# File names looked up inside a city directory.
review_file = 'review.json'
business_file = 'business.json'
user_file = 'user.json'
tip_file = 'tip.json'
checkin_file = 'checkin.json'

# Functions and Algorithms

In [None]:
def split_data(data, d=0.75):
    """Randomly partition the rows of a dataFrame into a training and a test set.

    The global numpy random generator is re-seeded with 5 on every call, so
    the same input always yields the same partition.

    Arguments:
    data -- any dataFrame.
    d    -- the (approximate) fraction of rows put in the training set

    Output:
    a tuple (training rows, test rows)
    """
    np.random.seed(seed=5)
    row_count = data.shape[0]
    # One uniform draw per row; a row goes to the training set when its draw
    # falls below d.
    in_training = np.random.rand(row_count) < d
    in_test = ~in_training
    return data[in_training], data[in_test]

def pivot_categories(df):
    """Build a business-by-category count matrix.

    Arguments:
    df -- a dataFrame containing at least the columns 'business_id' and 'category'

    Output:
    a matrix with one row per business_id and one column per category.
    Each cell holds the number of (business_id, category) rows in df,
    and 0 when the combination does not occur.
    """
    pivot_options = {
        'index': 'business_id',
        'columns': 'category',
        'aggfunc': 'size',
        'fill_value': 0,
    }
    return df.pivot_table(**pivot_options)

def pivot_ratings(df):
    """Create the utility matrix of user ratings for businesses.

    Arguments:
    df -- a dataFrame containing at least the columns 'business_id',
          'user_id' and 'stars'; expected to hold at most one row per
          (business_id, user_id) pair

    Output:
    a matrix with one row per business_id and one column per user_id holding
    the 'stars' value. np.nan means that the user did not rate the business.
    """
    utility = df.pivot(index='business_id', columns='user_id', values='stars')
    return utility

def create_similarity_matrix_jaccard(matrix):
    """Compute the pairwise Jaccard similarity between the rows of a matrix.

    Arguments:
    matrix -- a dataFrame with one row per item and one column per feature;
              the result is a Jaccard index when the cells are 0 or 1

    Output:
    a square dataFrame indexed by matrix.index on both axes holding
    shared / (size of row item + size of column item - shared).
    A pair of items without any feature yields np.nan (0 divided by 0).
    """
    # Number of features shared by every pair of items.
    shared = matrix @ matrix.T

    # Feature count per item, repeated on every row so that cell (i, j)
    # holds the count of item j.
    totals = matrix.sum(axis=1).values.astype(float)
    size_of_column_item = pd.DataFrame(
        np.tile(totals, (len(totals), 1)),
        index=shared.index,
        columns=shared.index,
    )
    size_of_row_item = size_of_column_item.T

    union = size_of_row_item + size_of_column_item - shared
    return shared / union

def predict_ratings(similarity, utility, to_predict):
    """Add a predicted rating to every (user, business) row of the input.

    Arguments:
    similarity -- a dataFrame that describes the similarity between businesses
    utility    -- a dataFrame that contains a rating for each user (columns)
                  and each business (rows); np.nan where a user did not rate
    to_predict -- a dataFrame containing at least the columns 'user_id' and
                  'business_id' for which to do the predictions

    Output:
    a copy of to_predict with an extra column 'predicted rating'.
    """
    def _predict_row(row):
        # Delegate the prediction for one (user, business) pair.
        return predict_ids(similarity, utility, row['user_id'], row['business_id'])

    # Work on a copy so the caller's frame is left untouched.
    predictions = to_predict.copy()
    predictions['predicted rating'] = to_predict.apply(_predict_row, axis=1)
    return predictions

### Helper functions for predict_ratings ###

def predict_ids(similarity, utility, user_id, itemId):
    """Predict the rating of one user for one item.

    Arguments:
    similarity -- a dataFrame with item-to-item similarities
    utility    -- a dataFrame of ratings with one column per user
    user_id    -- the user to predict for
    itemId     -- the item to predict the rating of

    Output:
    the value returned by predict_vectors, or 0 when the user is not a column
    of utility or the item is not in the index of similarity.
    """
    # Without data about the user or the item there is nothing to base a
    # prediction on.
    if user_id not in utility.columns or itemId not in similarity.index:
        return 0

    ratings_of_user = utility.loc[:, user_id]
    similarities_of_item = similarity[itemId]
    return predict_vectors(ratings_of_user, similarities_of_item)

def predict_vectors(user_ratings, similarities):
    """Predict a rating as the similarity-weighted mean of a user's ratings.

    Arguments:
    user_ratings -- a Series with the ratings of one user indexed by business;
                    np.nan marks businesses the user did not rate
    similarities -- a Series with the similarity between the target business
                    and other businesses, indexed by business

    Output:
    the weighted average of the user's ratings over all rated businesses that
    have a positive similarity, or 0 when no such business exists.
    """
    # select only businesses actually rated by user
    relevant_ratings = user_ratings.dropna()

    # Ignore rated businesses that have no entry in the similarity vector;
    # looking them up by label below would raise a KeyError.
    known = relevant_ratings.index.isin(similarities.index)
    relevant_ratings = relevant_ratings[known]

    # select corresponding similarities
    similarities_s = similarities[relevant_ratings.index]

    # select neighborhood: only positively similar businesses contribute
    similarities_s = similarities_s[similarities_s > 0.0]
    relevant_ratings = relevant_ratings[similarities_s.index]

    # if there's nothing left return a prediction of 0
    norm = similarities_s.sum()
    if norm == 0:
        return 0

    # compute a weighted average over the whole neighborhood
    return np.dot(relevant_ratings, similarities_s) / norm

def unpack(bus_atr):
    """Return the names of all attributes of a business that are True.

    Attribute values may be real Python values or their string
    representation (e.g. 'True' or "{'garage': False, 'street': True}").
    Strings are parsed with ast.literal_eval. A value that is a dictionary
    is replaced by its own entries, so its keys become attribute names and
    the parent name is dropped.

    Arguments:
    bus_atr -- a dictionary mapping attribute names to values, or None/np.nan
               when the business has no attributes

    Output:
    a list with the names of the attributes whose value is exactly True.
    """
    flattened = {}
    # pd.Series turns None into an empty collection and keeps dictionary keys
    # as labels, so every supported input can be walked the same way.
    for name, raw_value in pd.Series(bus_atr, dtype=object).items():
        value = raw_value
        if isinstance(raw_value, str):
            try:
                value = ast.literal_eval(raw_value)
            except (ValueError, SyntaxError):
                # Not a Python literal (e.g. an unquoted word): keep the raw
                # text, which can never count as True.
                value = raw_value

        if isinstance(value, dict):
            flattened.update(value)
        else:
            flattened[name] = value

    # Identity test on purpose: `1 == True` holds in Python, so an equality
    # test would also select numeric attributes with the value 1.
    return [name for name, value in flattened.items() if value is True]

def extract_categories(businesses):
    """List every (business, category) combination in long format.

    The categories of a business are the lower-cased entries of its
    comma-separated 'categories' string followed by the entries of its
    'attributes' list.

    Arguments:
    businesses -- a dataFrame containing at least the columns 'business_id',
                  'categories' (a string with ', ' as separator) and
                  'attributes' (a list of attribute names)

    Output:
    a dataFrame with the columns 'business_id' and 'category' and one row per
    combination. Rows keep the order of the input businesses. An empty input
    yields an empty dataFrame with the same two columns.
    """
    pairs = []
    rows = zip(businesses['business_id'], businesses['categories'], businesses['attributes'])
    for business_id, categories, attributes in rows:
        # A missing categories value (None/np.nan) contributes no labels
        # instead of failing on `.lower()`.
        labels = categories.lower().split(', ') if isinstance(categories, str) else []
        if isinstance(attributes, (list, tuple)):
            labels = labels + list(attributes)
        pairs.extend((business_id, label) for label in labels)
    return pd.DataFrame(pairs, columns=['business_id', 'category'])

def pearson_distance(matrix, id1, id2):
    """Compute the Pearson correlation coefficient between two rows.

    Despite its name this function returns the correlation itself, not a
    distance.

    Arguments:
    matrix -- a dataFrame with one row per item
    id1    -- index label of the first row
    id2    -- index label of the second row

    Output:
    the Pearson correlation over the features that have a value in both
    rows, or np.nan when fewer than two such features exist (no correlation
    can be computed from less than two points).
    """
    row1 = matrix.loc[id1]
    row2 = matrix.loc[id2]

    # only take the features that have values for both id1 and id2
    selected_features = row1.notna() & row2.notna()

    # pearsonr needs at least two observations
    if selected_features.sum() < 2:
        return np.nan

    return pearsonr(row1[selected_features], row2[selected_features])[0]

def pearson_similarity(matrix, id1, id2):
    """Compute the Pearson similarity between two rows.

    Arguments:
    matrix -- a dataFrame with one row per item
    id1    -- index label of the first row
    id2    -- index label of the second row

    Output:
    the value of pearson_distance for the two rows, or 0 when that value is
    NaN (no correlation could be computed).
    """
    similarity = pearson_distance(matrix, id1, id2)

    # NaN must be detected by value: an identity test against np.nan misses
    # NaN objects created elsewhere (e.g. by numpy or scipy).
    if pd.isna(similarity):
        return 0

    return similarity

def create_similarity_matrix_pearson(matrix):
    """Create the similarity matrix based on Pearson similarity.

    Arguments:
    matrix -- a dataFrame with one row per item

    Output:
    a square dataFrame indexed by matrix.index on both axes in which every
    cell holds pearson_similarity of the row label and the column label.
    """
    labels = matrix.index
    result = pd.DataFrame(0, index=labels, columns=labels, dtype=float)

    # Fill every cell, one (row label, column label) pair at a time.
    for row_label in labels:
        for column_label in labels:
            result.at[row_label, column_label] = pearson_similarity(matrix, row_label, column_label)
    return result

def select_neighborhood(similarities, ratings, k):
    """Select the k most similar rated items with a positive similarity.

    Arguments:
    similarities -- a Series of similarities indexed by item
    ratings      -- a Series of ratings indexed by item; missing ratings are
                    null
    k            -- the maximum size of the neighborhood

    Output:
    a Series with the (at most k) largest similarities among the rated
    items, from which all values that are not above 0 are removed.
    """
    rated_items = ratings.dropna().index
    candidates = similarities[similarities.index.isin(rated_items)]
    top_k = candidates.nlargest(n=k, keep='first')
    # The positivity filter is applied after taking the top k.
    return top_k[top_k > 0]

def weighted_mean(neighborhood, ratings):
    """Compute the mean of ratings weighted by the neighborhood similarities.

    Arguments:
    neighborhood -- a Series of similarities (weights) indexed by item
    ratings      -- a Series of ratings indexed by item

    Output:
    sum(rating * weight) / sum(weight) over the items of the neighborhood,
    or np.nan when the neighborhood is empty, the weights sum to 0 or the
    ratings of the neighborhood cannot be looked up.
    """
    if len(neighborhood) == 0:
        return np.nan

    weights = neighborhood.values
    total_weight = weights.sum()
    if total_weight == 0:
        return np.nan

    # Only lookup/shape problems are treated as "no prediction"; anything
    # else (including KeyboardInterrupt) propagates to the caller.
    try:
        weighted_ratings = ratings[neighborhood.index] * weights
    except (KeyError, TypeError, ValueError):
        return np.nan
    return weighted_ratings.values.sum() / total_weight
    
def predict_ratings_item_based(similarity, utility, user_item_pairs, k=100):
    """Predict ratings with a top-k item-based neighborhood.

    Arguments:
    similarity      -- a dataFrame with item-to-item similarities
    utility         -- a dataFrame of ratings with one row per business and
                       one column per user
    user_item_pairs -- a dataFrame containing at least the columns 'user_id'
                       and 'business_id'
    k               -- the maximum neighborhood size (default 100)

    Output:
    a copy of user_item_pairs with an extra float column 'predicted rating'.
    The prediction is np.nan when the business is not a column of similarity,
    the user is not a column of utility, or no neighborhood is available.
    """
    ratings_test_c = user_item_pairs.copy()

    predictions = []
    for business_id, user_id in zip(ratings_test_c['business_id'], ratings_test_c['user_id']):
        if business_id not in similarity.columns or user_id not in utility.columns:
            predictions.append(np.nan)
            continue
        neighborhood = select_neighborhood(similarity[business_id], utility[user_id], k)
        predictions.append(weighted_mean(neighborhood, utility[user_id]))

    # Assign by position: the index of the input may contain duplicate
    # labels, which makes label-based cell assignment ambiguous.
    ratings_test_c['predicted rating'] = np.array(predictions, dtype=float)
    return ratings_test_c

# def recommended(predictions, treshold):
#     for row in predictions.iterrows():
#          predicted_item_based_recommended = predictions.loc[predictions['predicted rating'].sort_values(ascending = False)]
#     return predicted_item_based_recommended[['user_id', 'business_id']][:treshold]

def cosine_distance(matrix, id1, id2):
    """Compute the cosine similarity between two rows of a matrix.

    Despite its name this function returns a similarity, not a distance.
    Only the features that have a value in both rows are used.

    Arguments:
    matrix -- a dataFrame with one row per item
    id1    -- index label of the first row
    id2    -- index label of the second row

    Output:
    0.0 when the rows share no features, 1 when both rows contain only zeros
    on the shared features, 0 when exactly one of the norms is zero, and
    otherwise the dot product divided by the product of the norms.
    """
    row1 = matrix.loc[id1]
    row2 = matrix.loc[id2]

    # features that are filled in for both rows
    shared = row1.notna() & row2.notna()
    if not shared.any():
        return 0.0

    vector1 = row1[shared]
    vector2 = row2[shared]

    # two all-zero vectors are treated as identical
    both_all_zero = (list(vector1.unique()) == [0]) and (list(vector2.unique()) == [0])
    if both_all_zero:
        return 1

    norm_product = np.sqrt((vector1 ** 2).sum()) * np.sqrt((vector2 ** 2).sum())
    if norm_product == 0:
        return 0

    dot_product = (vector1 * vector2).sum()
    return dot_product / norm_product

def create_similarity_matrix_cosine(matrix):
    """Create the similarity matrix based on cosine similarity.

    Arguments:
    matrix -- a dataFrame with one row per item

    Output:
    a square dataFrame indexed by matrix.index on both axes in which the cell
    at row x and column y holds cosine_distance(matrix, y, x).
    """
    similarity_matrix_cosine = pd.DataFrame(0, index=matrix.index, columns=matrix.index, dtype=float)

    # Fill every cell with the cosine similarity. `.at` writes into the frame
    # itself; chained indexing (`frame[y][x] = ...`) may write to a temporary
    # copy and leave the frame unchanged.
    for x in similarity_matrix_cosine.index:
        for y in similarity_matrix_cosine.columns:
            similarity_matrix_cosine.at[x, y] = cosine_distance(matrix, y, x)
    return similarity_matrix_cosine

def mse(predicted_ratings):
    """Compute the mean squared error between actual and predicted ratings.

    Arguments:
    predicted_ratings -- a dataFrame containing the columns 'stars' (actual
                         rating) and 'predicted rating'

    Output:
    the mean of the squared differences between the two columns.
    """
    actual = predicted_ratings['stars']
    predicted = predicted_ratings['predicted rating']
    squared_errors = (actual - predicted).pow(2)
    return squared_errors.mean()

def rmse(predicted_ratings):
    """Compute the root mean squared error between actual and predicted ratings.

    Arguments:
    predicted_ratings -- a dataFrame containing the columns 'stars' (actual
                         rating) and 'predicted rating'

    Output:
    the square root of the mean of the squared differences.
    """
    diff = predicted_ratings['stars'] - predicted_ratings['predicted rating']
    # Take the root AFTER averaging; rooting every squared difference first
    # would yield the mean absolute error instead.
    return (diff ** 2).mean() ** 0.5

# Dataset load and Transform

In [None]:
# Upper bound on how many entries of `cities` are loaded by the cells below
# (used as `cities[:number_of_cities]`).
number_of_cities = 100

In [None]:
# Load the review file of every selected city and stack them into one frame.
reviews = pd.concat(
    [
        load_jsons('./data/' + city + '/', review_file)
        for city in cities[:number_of_cities]
    ]
)
# Keep one review per (user, business) pair: after sorting on 'stars' the
# last duplicate is the one with the highest rating.
reviews = (
    reviews
    .sort_values('stars')
    .drop_duplicates(subset=['user_id', 'business_id'], keep='last')
)

In [None]:
# Load the business file of every selected city and stack them into one frame.
businesses = pd.concat(
    [
        load_jsons('./data/' + city + '/', business_file)
        for city in cities[:number_of_cities]
    ]
)
# Keep only businesses whose categories mention 'Restaurants'; businesses
# without categories are dropped (na=False).
businesses = businesses[
    businesses['categories'].str.contains('Restaurants', na=False)
]


In [None]:
# Keep only the reviews of the selected restaurants.
restaurant_ids = businesses.business_id.tolist()
reviews = reviews[reviews['business_id'].isin(restaurant_ids)]
# Keep only the reviews of users that have more than 3 reviews left.
reviews = reviews[reviews.groupby('user_id')['user_id'].transform('size') > 3]
# Roughly 80% of the reviews go to the training set, the rest to the test set.
reviews_training, reviews_test = split_data(reviews, d=0.80)




# Utility matrices for both algorithms

In [None]:
# Utility matrix (businesses x users) built from the training reviews only.
utility_reviews = pivot_ratings(reviews_training)
display(utility_reviews)

In [None]:
# Utility matrix built from ALL reviews (training and test together).
total_utility = pivot_ratings(reviews)
display(total_utility)

# CbF

In [None]:
# Work on a copy with a fresh 0..n-1 index so the original frame stays intact.
businesses2 = businesses.copy()
businesses2.index = range(businesses.shape[0])
# Replace every raw 'attributes' value by the list of attribute names that
# unpack() returns for it.
businesses2['attributes'] = businesses2['attributes'].apply(unpack)

In [None]:
# Long (business, category) list -> business-by-category count matrix.
businesses_cats = extract_categories(businesses2)
businesses_cats_matrix = pivot_categories(businesses_cats)
# Every selected business has 'Restaurants' among its categories, so this
# column cannot distinguish businesses.
businesses_cats_matrix = businesses_cats_matrix.drop(columns=['restaurants'])
# Drop rare features: keep the columns whose total count is at least 1% of
# the number of businesses.
businesses_cats_matrix = businesses_cats_matrix.loc[
    :, businesses_cats_matrix.sum() >= businesses2.shape[0] / 100
]

display(businesses_cats_matrix)

In [None]:
# Business-to-business similarity based on shared categories/attributes.
similarity_categories = create_similarity_matrix_jaccard(businesses_cats_matrix)
display(similarity_categories)

In [None]:
# Content-based predictions for the test reviews, using the training ratings
# and the category-based similarities.
predicted_ratings_cbf = predict_ratings(similarity_categories, utility_reviews, reviews_test[['user_id', 'business_id', 'stars']])
display(predicted_ratings_cbf)

# Item-based CF

In [None]:
# Replace every missing rating by the constant 0 before computing distances.
mp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
utility_imputed = mp.fit(utility_reviews).transform(utility_reviews)

# One minus the pairwise "correlation" distance between the rows (businesses)
# of the imputed matrix. NOTE(review): presumably this equals the Pearson
# correlation -- confirm against the definition of the metric.
pearson_sim = 1-pairwise_distances(utility_imputed, metric="correlation")

# Attach the business ids as row and column labels.
similarity_cf = pd.DataFrame(pearson_sim, index=utility_reviews.index, columns=utility_reviews.index)
display(similarity_cf)

In [None]:
# Min-max scaled version of the CF similarities, shown for inspection.
# NOTE(review): the predictions in the following cell use the unscaled
# `similarity_cf`, not `similarity_scaled` -- confirm that this is intended.
scaler = MinMaxScaler()
similarity_scaled = scaler.fit(similarity_cf).transform(similarity_cf)
display(pd.DataFrame(similarity_scaled,index=utility_reviews.index, columns=utility_reviews.index))

In [None]:
# Item-based collaborative filtering predictions for the test reviews, using
# the training ratings and the rating-based similarities.
predicted_ratings_cf = predict_ratings(similarity_cf, utility_reviews, reviews_test[['user_id', 'business_id', 'stars']])
display(predicted_ratings_cf)

# Evaluation

In [None]:
# Baseline: a uniformly random prediction between 0 and 5 for every test review.
ratings_rndm_test = reviews_test.copy()
ratings_rndm_test['predicted rating'] = np.random.uniform(0,5,size=len(ratings_rndm_test))
mse_random = mse(ratings_rndm_test)

# Error of both recommenders on the same test reviews.
mse_mean_cf = mse(predicted_ratings_cf)
mse_mean_cbf = mse(predicted_ratings_cbf)

print(f'MSE for random prediction: {mse_random:.2f}')
print(f'MSE for content based filtering prediction : {mse_mean_cbf:.2f}')
print(f'MSE for item-based collaborative filtering prediction : {mse_mean_cf:.2f}')

In [None]:
# Same comparison as above, using the value returned by rmse() for the random
# baseline and for both recommenders.
rmse_random = rmse(ratings_rndm_test)

rmse_mean_cf = rmse(predicted_ratings_cf)
rmse_mean_cbf = rmse(predicted_ratings_cbf)

print(f'RMSE for random prediction: {rmse_random:.2f}')
print(f'RMSE for content based filtering prediction : {rmse_mean_cbf:.2f}')
print(f'RMSE for item-based collaborative filtering prediction : {rmse_mean_cf:.2f}')

# Intra-list Similarity

In [None]:
# Pick one user at random among all users that still have reviews.
random_user2 = reviews.user_id.unique()
random_user = np.random.choice(random_user2)

In [None]:
# Candidate list: every selected restaurant paired with the chosen user.
business_ids = businesses.business_id
column_names = ["user_id", "business_id"]
test = pd.DataFrame(columns = column_names)
test.business_id = business_ids
test['user_id'] = random_user
test = test.reset_index(drop=True)

# Flag the candidates the user already reviewed: after a left merge with the
# reviews, '_merge' equals 'both' for pairs that occur in the reviews.
test_all = test.merge(reviews[['user_id', 'business_id']], on=['user_id', 'business_id'], how='left', indicator=True)
seen = test_all[test_all['_merge'] == 'both']['business_id']

# Remove all businesses the user has already reviewed in a single step.
test = test[~test['business_id'].isin(seen)]

# Separate copies so each recommender gets its own 'predicted rating' column.
test_cbf = test.copy()
test_cf = test.copy()

In [None]:
# Score every candidate restaurant for the chosen user with the
# category-based similarities and the ratings from all reviews.
test_cbf['predicted rating'] = test.apply(lambda row: predict_ids(similarity_categories, total_utility, row['user_id'], row['business_id']), axis=1)
# Keep the 20 candidates with the highest predicted rating.
results_cbf = test_cbf.sort_values(by= 'predicted rating', ascending=False).head(20)

print('Best 20 results for CBF')
display(results_cbf)

In [None]:
# Score every candidate restaurant for the chosen user with the rating-based
# similarities and the ratings from all reviews.
test_cf['predicted rating'] = test.apply(lambda row: predict_ids(similarity_cf, total_utility, row['user_id'], row['business_id']), axis=1)
# Keep the 20 candidates with the highest predicted rating.
results_cf = test_cf.sort_values(by= 'predicted rating', ascending=False).head(20)

print('Best 20 results for CF')
display(results_cf)

In [None]:
# Category/attribute rows of the 20 content-based recommendations.
tests = businesses_cats_matrix.loc[results_cbf['business_id'].values]
# Pairwise Jaccard similarity within the recommendation list.
similair = create_similarity_matrix_jaccard(tests)
# Mean similarity of every recommended business to the businesses in the list
# (its similarity to itself is part of the mean), highest first.
lijst_similair_values_cbf = similair.mean().sort_values(ascending=False)
display(lijst_similair_values_cbf)
print('Mean similarity for CbF: ', lijst_similair_values_cbf.mean())

In [None]:
# Category/attribute rows of the 20 collaborative-filtering recommendations.
tests = businesses_cats_matrix.loc[results_cf['business_id'].values]
# Pairwise Jaccard similarity within the recommendation list.
similair = create_similarity_matrix_jaccard(tests)
# Mean similarity of every recommended business to the businesses in the list
# (its similarity to itself is part of the mean), highest first.
lijst_similair_values_cf = similair.mean().sort_values(ascending=False)
display(lijst_similair_values_cf)
print('Mean similarity for CF: ', lijst_similair_values_cf.mean())