In [1]:
import pandas as pd
import json
import numpy as np
import random
import copy
from math import sqrt

def load_jsons(data_path, file):
    """Load a line-delimited '.json' file into a DataFrame.

    The files are not proper JSON documents: every non-blank line holds one
    standalone JSON object, which becomes one row of the result.

    Args:
        data_path: directory prefix. It is concatenated with `file` as-is, so
            it has to end with a path separator.
        file: name of the file inside `data_path`.

    Returns:
        pd.DataFrame with one row per JSON line.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    file_path = data_path + file
    # Decode as UTF-8 explicitly; relying on the platform default encoding
    # breaks on non-ASCII review text on systems whose default is not UTF-8.
    with open(file_path, encoding='utf-8') as jsons:
        # Blank lines (e.g. a trailing empty line) carry no record; skip them
        # instead of letting json.loads fail on them.
        lines = [json.loads(json_line) for json_line in jsons if json_line.strip()]
    return pd.DataFrame(lines)

### Loading data

    Load data for town called Ambridge
    data_path = './data/ambridge/'
    review_file = 'review.json'
    business_file = 'business.json'
    user_file = 'user.json'
    tip_file = 'tip.json'
    checkin_file = 'checkin.json'

    reviews = load_jsons(data_path, review_file)
    businesses = load_jsons(data_path, business_file)
    users = load_jsons(data_path, user_file)
    tips = load_jsons(data_path, tip_file)
    checkins = load_jsons(data_path, checkin_file)

In [26]:
# City whose data export is analysed; it selects the input directory.
city = 'Cleveland'
data_path = f'../data/{city}/'

# Names of the line-delimited JSON tables that live inside `data_path`.
review_file, business_file, user_file, tip_file, checkin_file = (
    'review.json',
    'business.json',
    'user.json',
    'tip.json',
    'checkin.json',
)

# Load every table into its own DataFrame, in the same order as listed above.
reviews, businesses, users, tips, checkins = (
    load_jsons(data_path, table_file)
    for table_file in (review_file, business_file, user_file, tip_file, checkin_file)
)

In [45]:
def number_of_businesses(reviews):
    """Count the distinct business ids that occur in the review table."""
    distinct_businesses = reviews['business_id'].unique()
    return len(distinct_businesses)

def number_of_users(reviews):
    """Count the distinct user ids that occur in the review table."""
    distinct_users = reviews['user_id'].unique()
    return len(distinct_users)

def number_of_reviews(reviews):
    """Return how many ratings (rows) the review table contains."""
    row_count, *_ = reviews.shape
    return row_count

def rating_density(reviews):
    """Compute the density of the review table.

    The density is the number of reviews divided by the number of possible
    (business, user) combinations, i.e. distinct businesses times distinct
    users.
    """
    possible_pairs = number_of_businesses(reviews) * number_of_users(reviews)
    return number_of_reviews(reviews) / possible_pairs


### Utility Matrix

In [71]:
def pivot_ratings(reviews):
    """Build the utility matrix (businesses x users) from a review table.

    Args:
        reviews: DataFrame with the columns 'business_id', 'user_id' and
            'stars'.

    Returns:
        DataFrame indexed by business_id with one column per user_id. A cell
        holds the mean of the 'stars' values for that (business, user) pair
        and is NaN when the pair has no review.
    """
    # Pairs without a review are already NaN after pivoting, so no fill step
    # is needed (the former `.fillna(np.nan)` only copied the matrix).
    return reviews.pivot_table(
        index='business_id',
        columns='user_id',
        values='stars',
        aggfunc='mean',
    )

### Mean Center Columns

In [5]:
def mean_center_columns(matrix):
    """Subtract from every column its own mean.

    Missing values are ignored when the column means are computed and stay
    missing in the result. The input frame is left untouched.
    """
    column_means = matrix.mean()
    return matrix.sub(column_means, axis='columns')

### Cosine Similarity

In [6]:
def cosine_distance(matrix, id1, id2):
    """Cosine similarity between two rows of `matrix`.

    Despite its name this returns a similarity (1.0 for identical rows), not
    a distance. Only the columns in which both rows have a value contribute,
    both to the dot product and to the two vector norms.

    Args:
        matrix: DataFrame whose rows are the vectors to compare.
        id1: index label of the first row.
        id2: index label of the second row.

    Returns:
        1.0 when both rows are identical (missing values in the same places
        count as equal); otherwise the cosine of the two rows restricted to
        their shared columns. NaN when the rows share no column, when either
        full row contains no non-zero value, or when one of the restricted
        vectors has zero length.

    Raises:
        KeyError: if `id1` or `id2` is not a row label of `matrix`.
    """
    features1 = matrix.loc[id1]
    features2 = matrix.loc[id2]
    if features1.equals(features2):
        return 1.0

    # Columns for which both rows carry a value.
    selected_features = features1.notna() & features2.notna()

    if not selected_features.any() or not (features1.any() and features2.any()):
        return np.nan

    shared1 = features1[selected_features].to_numpy(dtype=float)
    shared2 = features2[selected_features].to_numpy(dtype=float)

    numerator = float(np.dot(shared1, shared2))
    denominator = sqrt(float(np.dot(shared1, shared1))) * sqrt(float(np.dot(shared2, shared2)))
    # A zero-length vector has no direction; report "undefined" explicitly
    # instead of dividing by zero and swallowing whatever error results.
    if denominator == 0:
        return np.nan
    return numerator / denominator

def create_similarity_matrix_cosine(matrix):
    """Compute the pairwise similarity matrix between all rows of `matrix`.

    Every pair of row labels is scored with `cosine_distance`. That score is
    symmetric in its two ids, so each unordered pair is evaluated only once
    and the value is written to both (x, y) and (y, x); this halves the
    number of similarity computations.

    Args:
        matrix: DataFrame whose rows are compared with each other.

    Returns:
        Square float DataFrame whose index and columns are `matrix.index`.
        Entries are NaN wherever `cosine_distance` returns NaN.
    """
    ids = list(matrix.index)
    cosine_matrix = pd.DataFrame(0, index=matrix.index, columns=matrix.index, dtype=float)
    for position, x in enumerate(ids):
        # Start at `position` so the diagonal is included and no pair repeats.
        for y in ids[position:]:
            similarity = cosine_distance(matrix, x, y)
            cosine_matrix.at[x, y] = similarity
            cosine_matrix.at[y, x] = similarity
    return cosine_matrix

### Neighborhood

In [46]:
def select_neighborhood(similarities, ratings, k):
    """Pick the `k` most similar entries that have a rating.

    Only entries for which `ratings` is not missing and whose similarity is
    strictly positive are considered; they are returned sorted by similarity,
    highest first, truncated to the first `k`.
    """
    rated_mask = ratings.notna()
    candidates = similarities[rated_mask]
    positive = candidates[candidates > 0]
    ranked = positive.sort_values(ascending=False)
    return ranked.iloc[:k]

### Weighted mean

In [47]:
def weighted_mean(neighborhood, ratings):
    """Average the ratings of the neighbours, weighted by their similarity.

    `neighborhood` maps labels to weights and `ratings` is looked up with the
    same labels. Returns NaN when the neighbourhood holds no non-zero weight
    (which includes the empty neighbourhood).
    """
    if not neighborhood.any():
        return np.nan
    weighted_total = sum([weight * ratings[label] for label, weight in neighborhood.items()])
    weight_total = sum(neighborhood)
    return weighted_total / weight_total


### Test Data 

In [68]:
def split_data(data, d=0.75, seed=5):
    """Randomly split `data` into a training and a test set.

    Each row is assigned independently: it lands in the training set with
    probability `d`, otherwise in the test set, so the realised training
    fraction is only approximately `d`.

    Args:
        data: DataFrame (or other object supporting boolean-mask indexing
            and `.shape`) to split row-wise.
        d: expected fraction of rows placed in the training set.
        seed: value used to seed NumPy's global random generator, which
            makes the split reproducible. The default reproduces the split
            of the earlier hard-coded seed.

    Returns:
        Tuple `(training, test)`.

    Note:
        Seeding happens on NumPy's *global* generator, so later
        `np.random.*` calls are affected by it as well.
    """
    np.random.seed(seed=seed)
    mask_training = np.random.rand(data.shape[0]) < d
    return data[mask_training], data[~mask_training]

# Restrict the experiment to the first 9100 reviews of the already loaded
# review table (re-reading the review file here would be discarded at once).
# NOTE(review): 9100 is presumably chosen to keep the similarity computation
# tractable - confirm and consider moving it to a named constant.
reviews_t80 = reviews.head(9100)
# NOTE(review): the names say "t80", but split_data's default puts an
# expected 75% of the rows in the training set.
reviews_t80_training, reviews_t80_test = split_data(reviews_t80)
# Persist both sets so later cells can reload them without re-splitting.
# The `./{city}/` directory is expected to exist already.
reviews_t80_training.to_pickle(f'./{city}/training_set.pkl')
reviews_t80_test.to_pickle(f'./{city}/test_set.pkl')

### Matrix

In [73]:
# Utility matrix (businesses x users) built from the training reviews ...
utility_training = pivot_ratings(reviews_t80_training)
# ... and a version of it in which each column (user) has its own mean
# rating subtracted; this centered matrix feeds the similarity computation.
utility_training_centered = mean_center_columns(utility_training)

In [13]:
# Pairwise business-business similarities on the mean-centered training
# matrix. The computation loops over all pairs of businesses in Python, so
# the result is pickled for reuse by later cells.
similarity_training = create_similarity_matrix_cosine(utility_training_centered)
similarity_training.to_pickle(f'./{city}/similarity_training.pkl')

### Quick-load cached data

In [74]:
# Reload the artefacts pickled by the cells above instead of recomputing them.
# Only unpickle files this notebook wrote itself: unpickling untrusted files
# can execute arbitrary code.
s = pd.read_pickle(f'./{city}/similarity_training.pkl')
# `u` is the raw (not mean-centered) training utility matrix; it is taken
# from the kernel's memory, not from disk, so the utility-matrix cell must
# have been run in this session.
u = utility_training
training = pd.read_pickle(f'./{city}/training_set.pkl')
test = pd.read_pickle(f'./{city}/test_set.pkl')
display(u)

user_id,-0452IkNZtHQbzSVf5uTwA,-0S_XaK3Q_Mesal2Unta2w,-0YN7DR3V0ynR9ureBeOKA,-0cu1eLTYG-TJCer9VVF9A,-1gsm9r8FAvnbbio8mz0sw,-2p_A5675Eh6gcZIGkf2Yw,-4UYrm18j2vQ27iR5JOX2g,-52vWj_NKngEeTlflRFzKA,-5kQB0IQelaB2FDBJFpDMw,-6TGxmTFJBWVx9E7oLcerw,...,zneHs9nkYY2MogPFGree-Q,zpMvXReWlU4sKjY8mNW_QQ,zpWv_-s5AKgdLiGXhc7FiQ,zqXvcP1bvdV7gYstlzt2Ow,zskUxXh4q3om5ECvkhJJ5w,zuQM01AUldHVO9jP4TGFWA,zvoKQvcCsnJeC7ZRKrsWPA,zxA6KTqMirq7EsaIH93LSA,zxGejXvesnPiiMVmuvZjYQ,zxRHyxQm-32j5Z7Pi7bHCA
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-1vulUmD1pYf0uvXBY1Fzg,,,,,,,,,,,...,,,,,,,,,,
-H1YgsXYBjH-va7cLIqXxg,,,,,,,,,,,...,,,,,,,,,,
-SNEYS8erOwt9SkPFO3sTA,,,,,,,,,,,...,,,,,,,,,,
-YRnaVhJSwIgaCIIvOIoew,,,,,,,,,,,...,,,,,,,,,,
-prmBNT8sD8V2o2kXV6Ywg,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yhOvThxVOqRRavZ_qPcAWw,,,,,,,,,,,...,,,,,,,,,,
ymn4YVFgXy_rLlnoatEqzQ,,,,,,,,,,,...,,,,,,,,,,
z82CyKgRSX2KyUWgphygtg,,,,,,,,,,,...,,,,,,,,,,
zKYFtV6sutpPTanxYpZJEA,,,,,,,,,,,...,,,,,,,,,,


### Helper functions

In [48]:
def select_neighborhood(similarities, ratings, k):
    """Return the `k` highest positive similarities among rated entries.

    Entries whose rating is missing or whose similarity is not strictly
    positive are dropped; the rest is sorted in descending order and cut off
    after `k` entries.
    """
    has_rating = ratings.notna()
    rated_similarities = similarities[has_rating]
    keep_positive = rated_similarities > 0
    best_first = rated_similarities[keep_positive].sort_values(ascending=False)
    return best_first.iloc[:k]


In [49]:
def weighted_mean(neighborhood, ratings):
    """Similarity-weighted average of the neighbours' ratings.

    Each label in `neighborhood` contributes `weight * ratings[label]`; the
    total is divided by the sum of the weights. NaN is returned when
    `neighborhood` contains no non-zero weight.
    """
    if neighborhood.any():
        numerator = sum([neighborhood[label] * ratings[label] for label in neighborhood.index])
        denominator = sum([weight for weight in neighborhood])
        return numerator / denominator
    return np.nan


In [50]:
def rate_user(similarity, utility, k):
    """Predict one rating from the `k` best neighbours.

    `similarity` holds the similarities used to choose the neighbourhood and
    `utility` the known ratings; the prediction is the similarity-weighted
    mean of the ratings of the selected neighbours (NaN if there are none).
    """
    return weighted_mean(select_neighborhood(similarity, utility, k), utility)

def predict_ratings_user_based(similarity, utility, user_item_pairs, k=150):
    """Predict a rating for every (user, business) pair in `user_item_pairs`.

    For each pair the similarities of the business (`similarity[business]`)
    and the ratings of the user (`utility[user]`) are combined by
    `rate_user`. Note that, despite the function name, the similarity matrix
    is therefore indexed by business.

    Args:
        similarity: square similarity DataFrame with business ids as columns.
        utility: utility matrix with business ids as index and user ids as
            columns.
        user_item_pairs: DataFrame with at least the columns 'business_id'
            and 'user_id'; it is not modified.
        k: maximum neighbourhood size used for each prediction.

    Returns:
        Copy of `user_item_pairs` with an extra float column
        'predicted rating'; the value is NaN where no prediction could be
        made (e.g. the business or user does not occur in the training
        data).
    """
    ratings_test_c = user_item_pairs.copy()
    predictions = []
    for business, user in zip(ratings_test_c.business_id, ratings_test_c.user_id):
        try:
            predictions.append(rate_user(similarity[business], utility[user], k))
        except Exception:
            # Best effort: a pair that cannot be rated yields NaN. Catching
            # Exception (not everything) keeps kernel interrupts working
            # during this potentially long loop.
            predictions.append(np.nan)
    # Explicit float dtype so an empty input still yields a numeric column.
    ratings_test_c['predicted rating'] = pd.Series(predictions, index=ratings_test_c.index, dtype=float)
    return ratings_test_c

### Making predictions

In [75]:
# Predictions of the neighbourhood model for every review in the test set.
predicted_user_based = predict_ratings_user_based(s, u, test[['user_id', 'business_id', 'stars']])

# Baseline for comparison: a uniformly random rating in [1, 5) per review.
# Select the needed columns first and copy only those, so the new column is
# added to an independent frame rather than to a slice of a full-frame copy.
# NOTE(review): the draw uses NumPy's global generator; it is only
# reproducible as far as that generator's state is (split_data seeds it).
predict_random = test[['user_id', 'business_id', 'stars']].copy()
predict_random['predicted rating'] = np.random.uniform(1, 5.0, len(predict_random))

In [76]:
# Show the model predictions first, then the random baseline.
display(predicted_user_based, predict_random)

Unnamed: 0,user_id,business_id,stars,predicted rating
1,liUNZm0tOJJ-d-pFCdqu9A,ZNB91myFoOYgyXoG5LQeGQ,1.0,
3,wUAQq8gZ9tpWpMDPNTB6TA,CtYWpX_cy1YdZgoKtS0Tqg,1.0,
6,tcFmCEOQj8BqZ4VZheKCrw,m3DVIvPsuLuA9OMSgWNLcQ,4.0,5.000000
14,iCG9Z_4TDOxMG_aLWM6LHw,OOXUDZVOREgjwrLiwr3Spg,4.0,4.312885
27,49j4mN81moEbq3oyi_oB4g,XUA7xA7myMaCuN6G3xYdIA,4.0,
...,...,...,...,...
9088,kKnHnx4JzpmNw516ekHyrg,anzCdD5eIEHwydTpXhLlUA,5.0,
9089,BO1A62kTQk4MfwZPOD9sKg,TnO9SmZoRNfJlbNV1wBKYw,3.0,4.377500
9097,umgyhCR8sLbHsSxnmpK0Rg,XjC89x8uGa3DiRB-k9Gn3w,5.0,
9098,EQUwFnrRUnCNA0tflO4zWQ,vdloJPM3bSpQTMlwdFek2A,4.0,


Unnamed: 0,user_id,business_id,stars,predicted rating
1,liUNZm0tOJJ-d-pFCdqu9A,ZNB91myFoOYgyXoG5LQeGQ,1.0,1.408389
3,wUAQq8gZ9tpWpMDPNTB6TA,CtYWpX_cy1YdZgoKtS0Tqg,1.0,2.783946
6,tcFmCEOQj8BqZ4VZheKCrw,m3DVIvPsuLuA9OMSgWNLcQ,4.0,3.721883
14,iCG9Z_4TDOxMG_aLWM6LHw,OOXUDZVOREgjwrLiwr3Spg,4.0,2.703268
27,49j4mN81moEbq3oyi_oB4g,XUA7xA7myMaCuN6G3xYdIA,4.0,4.114075
...,...,...,...,...
9088,kKnHnx4JzpmNw516ekHyrg,anzCdD5eIEHwydTpXhLlUA,5.0,4.263571
9089,BO1A62kTQk4MfwZPOD9sKg,TnO9SmZoRNfJlbNV1wBKYw,3.0,1.787564
9097,umgyhCR8sLbHsSxnmpK0Rg,XjC89x8uGa3DiRB-k9Gn3w,5.0,3.606260
9098,EQUwFnrRUnCNA0tflO4zWQ,vdloJPM3bSpQTMlwdFek2A,4.0,4.343122


### Mean Squared Error

In [77]:
def mse(predicted_ratings):
    """Mean squared error between actual and predicted ratings.

    Args:
        predicted_ratings: DataFrame with the columns 'stars' (actual
            rating) and 'predicted rating'.

    Returns:
        Mean of the squared differences over all rows that have a
        prediction; rows whose prediction is NaN are ignored. NaN when no
        row has a prediction.
    """
    squared_errors = [
        (actual - predicted) ** 2
        for actual, predicted in zip(predicted_ratings['stars'], predicted_ratings['predicted rating'])
        if not np.isnan(predicted)
    ]
    # Without any prediction the error is undefined; avoid dividing by zero.
    if not squared_errors:
        return np.nan
    return sum(squared_errors) / len(squared_errors)

In [78]:
# Mean squared error of the neighbourhood model and of the random baseline
# on the test set (rows without a prediction are left out by `mse`).
mse_user_based = mse(predicted_user_based)
mse_random = mse(predict_random)
print(f"MSE for user based: {mse_user_based}")
print(f"MSE for random: {mse_random}")

MSE for user based: 1.4163953480376854
MSE for random: 3.8130985024612847


### Root Mean Squared Error

In [34]:
# Root mean squared error: the square root of the MSE values computed in the
# MSE cell, which expresses the error in the same unit as the ratings.
# NOTE(review): the stored output of this cell carries execution count 34,
# older than the MSE cell (78), and its values are not the square roots of
# the MSE values printed there - re-run this cell to refresh the output.
rmse_user_based = sqrt(mse_user_based)
rmse_random = sqrt(mse_random)
print(f"RMSE for user based: {rmse_user_based}")
print(f"RMSE for random: {rmse_random}")

RMSE for user based: 1.3518824678560455
RMSE for random: 2.0446012481166127


### Mean Absolute Error

In [79]:
def mae(predicted_ratings):
    """Mean absolute error between actual and predicted ratings.

    Args:
        predicted_ratings: DataFrame with the columns 'stars' (actual
            rating) and 'predicted rating'.

    Returns:
        Mean of the absolute differences over all rows that have a
        prediction; rows whose prediction is NaN are ignored. NaN when no
        row has a prediction.
    """
    absolute_errors = [
        abs(actual - predicted)
        for actual, predicted in zip(predicted_ratings['stars'], predicted_ratings['predicted rating'])
        if not np.isnan(predicted)
    ]
    # Without any prediction the error is undefined; avoid dividing by zero.
    if not absolute_errors:
        return np.nan
    return sum(absolute_errors) / len(absolute_errors)

In [80]:
# Mean absolute error of the neighbourhood model and of the random baseline
# on the test set (rows without a prediction are left out by `mae`).
mae_user_based = mae(predicted_user_based)
mae_random = mae(predict_random)
print(f"MAE for user based: {mae_user_based}")
print(f"MAE for random: {mae_random}")

MAE for user based: 0.8136250345760488
MAE for random: 1.625743588902187
