In [13]:
import pandas as pd
import json
import numpy as np
from math import sqrt
def load_jsons(data_path, file):
    """Load a newline-delimited JSON file into a DataFrame.

    The '.json' files are not single JSON documents: every line holds one
    standalone JSON object, and each object becomes one row.

    Args:
        data_path: Directory prefix. It is joined to `file` by plain string
            concatenation, so it must already end with a path separator.
        file: Name of the file inside `data_path`.

    Returns:
        pandas.DataFrame with one row per non-blank line of the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    file_path = data_path + file
    # JSON text is UTF-8 by specification; being explicit keeps the result
    # independent of the platform's default encoding.
    with open(file_path, encoding='utf-8') as jsons:
        # Blank lines (e.g. a trailing newline at the end of the file) are
        # skipped because json.loads rejects empty input.
        lines = [json.loads(json_line) for json_line in jsons if json_line.strip()]
    return pd.DataFrame(lines)

### Load data

    Load data for town called Ambridge
    data_path = './data/ambridge/'
    review_file = 'review.json'
    business_file = 'business.json'
    user_file = 'user.json'
    tip_file = 'tip.json'
    checkin_file = 'checkin.json'

    reviews = load_jsons(data_path, review_file)
    businesses = load_jsons(data_path, business_file)
    users = load_jsons(data_path, user_file)
    tips = load_jsons(data_path, tip_file)
    checkins = load_jsons(data_path, checkin_file)

In [2]:
# Directory of the Akron dataset and the file name of each table in it.
data_path = './data/akron/'
review_file, business_file, user_file, tip_file, checkin_file = (
    'review.json',
    'business.json',
    'user.json',
    'tip.json',
    'checkin.json',
)

# Read every table into its own DataFrame, in the order listed above.
reviews, businesses, users, tips, checkins = (
    load_jsons(data_path, table_file)
    for table_file in (review_file, business_file, user_file, tip_file, checkin_file)
)

In [38]:
def number_of_businesses(reviews):
    """Return how many distinct values the 'business_id' column of `reviews` holds."""
    distinct_business_ids = reviews['business_id'].unique()
    return len(distinct_business_ids)

def number_of_users(reviews):
    """Return how many distinct values the 'user_id' column of `reviews` holds."""
    distinct_user_ids = reviews['user_id'].unique()
    return len(distinct_user_ids)

def number_of_reviews(reviews):
    """Return the number of rows (individual reviews) in `reviews`."""
    return len(reviews)

def rating_density(reviews):
    """Return the density of the ratings in `reviews`.

    The density is the number of reviews divided by the number of possible
    (business, user) combinations, i.e. unique businesses times unique users.
    """
    review_count = number_of_reviews(reviews)
    possible_pairs = number_of_businesses(reviews) * number_of_users(reviews)
    return review_count / possible_pairs


### Utility Matrix

In [3]:
def pivot_ratings(ratings):
    """Build the business-by-user utility matrix from a table of ratings.

    Args:
        ratings: DataFrame with 'business_id', 'user_id' and 'stars' columns.

    Returns:
        DataFrame indexed by business_id with one column per user_id. Each
        cell holds the mean 'stars' value for that (business, user) pair, so
        repeated reviews of the same pair are averaged. Pairs without a
        review are NaN.
    """
    # pivot_table already leaves missing pairs as NaN, so the former
    # `.fillna(np.nan)` was a no-op and has been dropped. 'mean' is the
    # default aggregation; it is spelled out to make the averaging visible.
    return ratings.pivot_table(
        index='business_id',
        columns='user_id',
        values='stars',
        aggfunc='mean',
    )

In [4]:
# Build the business-by-user utility matrix from all loaded reviews and show it.
utility_matrix = pivot_ratings(reviews)
display(utility_matrix)

user_id,--9Et6koJ2Apqk4nxuv3TQ,--tuEsv6L10jqcwBANrSnw,-01kVTKImAOZTyAGAn-YZg,-0x2ov-qcCopv32Imm-TYg,-1eUcYsPzr0K_nw67oohdw,-1sHorO23302dx_aJxUb4g,-1tofHIb9lX6SElFgzmOkw,-1y0FYmHNT6WG9wHzyZLiQ,-2OYbrCvwt-titaTvNRhIA,-2uUrtgM5fiOaCpQEnPv0g,...,zv89WfVUErm-nGNP0EYaPA,zvK2h4kMtzmGDbho_3mv4w,zvoKQvcCsnJeC7ZRKrsWPA,zw6YMjaWYgpe0PQQfJbOkg,zwBUSCN1BmevqsM-YdEYwQ,zxAHQcVGVP_eriXRfiwAFQ,zxfjM9MxvnwRbZZnZudWGg,zyantlaY2HgWKDnXzvMZXw,zz8qDpBF_Qup2WIfwbYPOQ,zzdezZFBRxPo4LBQHx6REg
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-0LPtgJC31FWMrMv317p0Q,,,,,,,,,,4.0,...,,,,,,,,,,
-6Lc2gIhLkggEdCMpVidlQ,,,,,,,,,,,...,,,,,,,,,,
-CBW4yvallpWtfWBbZCHqg,,,,,,,,,,,...,,,,,,,,,,
-SNFOrPHya_I4m6vj491UQ,,,,,,,,,,,...,,,,,,,,,,
-YnOFIP-xZeWEwbatrDO9g,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zNN5HBJ8w_ahxu-UyXtVLg,,,,,,,,,,,...,,,,,,,,,,
zOHQdGK6D3S0rvyNsrQi0Q,,,,,,,,,,,...,,,,,,,,,,
zgGCFFW9fmMve9iyeCJc0g,,,,,,,,,,,...,,,,,,,,,,
zxxwy18lW8uviih0xzeuig,,,,,,,,,,,...,,,,,,,,,,


### Mean Center Columns

In [7]:
def mean_center_columns(matrix):
    """Subtract each column's mean from every value in that column.

    Args:
        matrix: DataFrame of numeric values; NaN marks a missing entry.

    Returns:
        A new DataFrame with the same index and columns as `matrix`, in which
        every column has had its own mean (computed over its non-NaN values)
        subtracted. Missing entries stay NaN; `matrix` itself is not modified.
    """
    # Subtracting a Series from a DataFrame aligns the Series index with the
    # column labels, so each column gets its own mean subtracted in one
    # vectorised step instead of one Python-level call per column.
    return matrix - matrix.mean()

# Centre every column of the utility matrix on that column's mean.
centered_utility_matrix = mean_center_columns(utility_matrix)

### Cosine Similarity

In [22]:
def cosine_distance(matrix, id1, id2):
    """Return the cosine similarity between two rows of `matrix`.

    Despite its name, this returns a similarity (1.0 for identical
    direction), not a distance. Only the columns in which both rows have a
    value are used.

    Args:
        matrix: DataFrame whose rows are the feature vectors; NaN marks a
            missing value.
        id1: Index label of the first row.
        id2: Index label of the second row.

    Returns:
        1.0 if the two rows are identical (NaN positions included);
        NaN if the rows share no column, if either row has no non-zero
        value, or if either row is all zeros on the shared columns;
        otherwise the cosine of the angle between the shared parts.
    """
    features1 = matrix.loc[id1]
    features2 = matrix.loc[id2]
    # Identical rows (this covers comparing a row with itself) are defined
    # to have similarity 1.0.
    if features1.equals(features2):
        return 1.0

    selected_features = features1.notna() & features2.notna()

    if not selected_features.any() or not (features1.any() and features2.any()):
        return np.nan

    shared1 = features1[selected_features]
    shared2 = features2[selected_features]
    numerator = (shared1 * shared2).sum()
    denominator = sqrt((shared1 ** 2).sum()) * sqrt((shared2 ** 2).sum())
    # A zero norm on the shared columns leaves the cosine undefined. With
    # numpy scalars 0/0 does not raise, so it must be checked explicitly
    # rather than caught as an exception.
    if denominator == 0:
        return np.nan
    return numerator / denominator

def create_similarity_matrix_cosine(matrix):
    """Compute the pairwise cosine similarity between all rows of `matrix`.

    Args:
        matrix: DataFrame whose rows are the vectors to compare; its index
            labels must be unique.

    Returns:
        Square float DataFrame with `matrix.index` as both index and columns;
        cell (x, y) holds cosine_distance(matrix, x, y).
    """
    cosine_matrix = pd.DataFrame(0, index=matrix.index, columns=matrix.index, dtype=float)
    row_ids = list(matrix.index)
    # cosine_distance is symmetric in its two ids, so each unordered pair is
    # computed once and written to both (x, y) and (y, x).
    for position, x in enumerate(row_ids):
        for y in row_ids[position:]:
            similarity = cosine_distance(matrix, x, y)
            cosine_matrix.at[x, y] = similarity
            cosine_matrix.at[y, x] = similarity
    return cosine_matrix

In [23]:
# Pairwise cosine similarity between the rows of the mean-centred utility matrix.
sim_matrix_cosine_centered = create_similarity_matrix_cosine(centered_utility_matrix)

### Save matrix as pickle 

In [12]:
# No visible cell defines `sim_matrix_cosine`, so on a fresh kernel this cell
# failed with a NameError. It is built here before being saved.
# NOTE(review): assumed to be the similarity of the uncentred utility matrix
# (as opposed to sim_matrix_cosine_centered) — confirm.
sim_matrix_cosine = create_similarity_matrix_cosine(utility_matrix)
sim_matrix_cosine.to_pickle('./similarity_matrix.pkl')

In [24]:
# Persist the centred similarity matrix as a pickle file.
sim_matrix_cosine_centered.to_pickle('./similarity_matrix_centered.pkl')

In [26]:
# Show the centred similarity matrix; NaN marks pairs without a defined similarity.
display(sim_matrix_cosine_centered)

business_id,-0LPtgJC31FWMrMv317p0Q,-6Lc2gIhLkggEdCMpVidlQ,-CBW4yvallpWtfWBbZCHqg,-SNFOrPHya_I4m6vj491UQ,-YnOFIP-xZeWEwbatrDO9g,-bHpvkmiFzm_swHLYsGiqw,-tSTLaafhkQ7iB5Bl5zgPg,05XgO_G-BlkisVpmUyBUJw,0CEUzhFu5IQdZCjiNj29YA,0OlaAWc97auYJuV4-HreZA,...,yb3kBSowKTTF5cFN_OwBNg,yfG3wHI0HgIhorjP5k6TcQ,yjrEeOjtu_e35SW6zcwrCg,z0hCSbO4vgxk6wAya6MCSg,z6kCzBQt5Vv1KFkQgrhVnA,zNN5HBJ8w_ahxu-UyXtVLg,zOHQdGK6D3S0rvyNsrQi0Q,zgGCFFW9fmMve9iyeCJc0g,zxxwy18lW8uviih0xzeuig,zy9lwMhCU6vwC8Uff60iZg
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-0LPtgJC31FWMrMv317p0Q,1.0,,,,,,,,,,...,,,,,,,1.0,,1.0,
-6Lc2gIhLkggEdCMpVidlQ,,1.0,,,,-1.0,1.000000,,,,...,,,,,,,,,,
-CBW4yvallpWtfWBbZCHqg,,,1.000000,-0.723951,,-1.0,0.857534,,,-1.0,...,,,,,-0.433255,1.0,,,,
-SNFOrPHya_I4m6vj491UQ,,,-0.723951,1.000000,,,-0.723394,,,1.0,...,,,,,1.000000,,1.0,,,
-YnOFIP-xZeWEwbatrDO9g,,,,,1.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zNN5HBJ8w_ahxu-UyXtVLg,,,1.000000,,,,,,,,...,,,,,,1.0,,,,
zOHQdGK6D3S0rvyNsrQi0Q,1.0,,,1.000000,,,0.721338,,,,...,-1.0,,,,,,1.0,,,
zgGCFFW9fmMve9iyeCJc0g,,,,,,,,,,,...,,,,,,,,1.0,,
zxxwy18lW8uviih0xzeuig,1.0,,,,,,,,,,...,,,,,,,,,1.0,


### Neighborhood

In [27]:
def select_neighborhood(similarities, ratings, k):
    """Pick the k most similar entries that have a rating.

    Args:
        similarities: Series of similarity scores.
        ratings: Series of ratings sharing the index of `similarities`;
            NaN marks a missing rating.
        k: Maximum number of entries to keep.

    Returns:
        Series with at most k entries of `similarities`, restricted to
        labels whose rating is present and whose similarity is greater
        than 0, ordered from most to least similar.
    """
    has_rating = ratings.notna()
    rated_similarities = similarities[has_rating]
    positive_similarities = rated_similarities[rated_similarities > 0]
    ranked = positive_similarities.sort_values(ascending=False)
    return ranked[:k]

### Weighted mean

In [28]:
def weighted_mean(neighborhood, ratings):
    """Return the mean of `ratings` weighted by the values in `neighborhood`.

    Args:
        neighborhood: Series of weights, indexed by the labels to include.
        ratings: Series of ratings that can be looked up by those labels.

    Returns:
        Sum of weight * rating over the neighborhood divided by the sum of
        the weights, or NaN when the neighborhood has no non-zero weight
        (which includes an empty neighborhood).
    """
    if not neighborhood.any():
        return np.nan
    weighted_total = sum(neighborhood[label] * ratings[label] for label in neighborhood.index)
    weight_total = sum(weight for weight in neighborhood)
    return weighted_total / weight_total


### Test Data 

In [36]:
def split_data(data, d=0.75, seed=5):
    """Randomly split `data` into a training set and a test set.

    Every row is assigned independently: it goes to the training set with
    probability `d`, otherwise to the test set. The realised sizes are
    therefore only approximately `d` and `1 - d` of the data.

    Args:
        data: DataFrame to split by rows.
        d: Fraction of the data intended for the training set.
        seed: Seed of the random generator; the default reproduces the split
            that was previously hard-coded.

    Returns:
        Tuple `(training, test)` of row subsets of `data`.
    """
    # A private generator produces the same numbers as np.random.seed(seed)
    # followed by np.random.rand, but leaves NumPy's global random state
    # untouched for the rest of the notebook.
    rng = np.random.RandomState(seed)
    mask_training = rng.rand(data.shape[0]) < d
    return data[mask_training], data[~mask_training]

# Reload the reviews, split them with split_data's default fraction (d = 0.75)
# and pickle both parts.
# NOTE(review): the "t80" in these names suggests an 80% training share, but
# the default d is 0.75 — confirm which one is intended.
reviews_t80 = load_jsons(data_path, review_file)
reviews_t80_training, reviews_t80_test = split_data(reviews_t80)
reviews_t80_training.to_pickle('./training_set.pkl')
reviews_t80_test.to_pickle('./test_set.pkl')

### Training-set utility and similarity matrices

In [40]:
# Repeat the pipeline on the training reviews only: pivot to a utility matrix,
# mean-centre its columns, compute the cosine similarities and pickle them.
utility_training = mean_center_columns(pivot_ratings(reviews_t80_training))
similarity_training = create_similarity_matrix_cosine(utility_training)
similarity_training.to_pickle('./similarity_training.pkl')

In [41]:
# Show the similarity matrix computed from the training reviews.
display(similarity_training)

business_id,-0LPtgJC31FWMrMv317p0Q,-6Lc2gIhLkggEdCMpVidlQ,-CBW4yvallpWtfWBbZCHqg,-SNFOrPHya_I4m6vj491UQ,-YnOFIP-xZeWEwbatrDO9g,-bHpvkmiFzm_swHLYsGiqw,-tSTLaafhkQ7iB5Bl5zgPg,05XgO_G-BlkisVpmUyBUJw,0CEUzhFu5IQdZCjiNj29YA,0OlaAWc97auYJuV4-HreZA,...,yb3kBSowKTTF5cFN_OwBNg,yfG3wHI0HgIhorjP5k6TcQ,yjrEeOjtu_e35SW6zcwrCg,z0hCSbO4vgxk6wAya6MCSg,z6kCzBQt5Vv1KFkQgrhVnA,zNN5HBJ8w_ahxu-UyXtVLg,zOHQdGK6D3S0rvyNsrQi0Q,zgGCFFW9fmMve9iyeCJc0g,zxxwy18lW8uviih0xzeuig,zy9lwMhCU6vwC8Uff60iZg
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-0LPtgJC31FWMrMv317p0Q,1.0,,,,,,,,,,...,,,,,,,,,,
-6Lc2gIhLkggEdCMpVidlQ,,1.0,,,,-1.0,1.000000,,,,...,,,,,,,,,,
-CBW4yvallpWtfWBbZCHqg,,,1.000000,-0.244041,,-1.0,0.931508,,,-1.0,...,,,,,1.0,,,,,
-SNFOrPHya_I4m6vj491UQ,,,-0.244041,1.000000,,,-0.918456,,,1.0,...,,,,,1.0,,-1.0,,,
-YnOFIP-xZeWEwbatrDO9g,,,,,1.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zNN5HBJ8w_ahxu-UyXtVLg,,,,,,,,,,,...,,,,,,1.0,,,,
zOHQdGK6D3S0rvyNsrQi0Q,,,,-1.000000,,,0.711094,,,,...,-1.0,,,,,,1.0,,,
zgGCFFW9fmMve9iyeCJc0g,,,,,,,,,,,...,,,,,,,,1.0,,
zxxwy18lW8uviih0xzeuig,,,,,,,,,,,...,,,,,,,,,1.0,


In [48]:
# Spot-check one business pair (0OlaAWc97auYJuV4-HreZA and
# -SNFOrPHya_I4m6vj491UQ): show the raw reviews of each business so the
# users they have in common can be compared by hand.
display(reviews[reviews.business_id == '0OlaAWc97auYJuV4-HreZA'])
display(reviews[reviews.business_id == '-SNFOrPHya_I4m6vj491UQ'])

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
2245,0nKHFfJGsAgWp3qcI4JPDA,OhYlDlhbmfr-tn3upJMH7Q,0OlaAWc97auYJuV4-HreZA,5.0,0,0,0,Doing some xmas shopping and Melonie helped me...,2017-12-11 21:51:23
2292,p3II7sAyhUvZ0jJs3ugSEQ,cjczEKTE0CG57lRlajBS6w,0OlaAWc97auYJuV4-HreZA,1.0,8,2,0,Tried to return an item I bought in Sept. as a...,2011-12-31 14:16:14
2400,ikPMhwRmHCbmbXuy6vhpQg,iHe2FR5fOTDG-AEvxfUCNw,0OlaAWc97auYJuV4-HreZA,4.0,2,0,1,Great department store experience. Dillard's s...,2011-08-22 21:04:28


Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
1748,cDYXu5w-fWGB8yjI-Z_XJg,y7Nno2Oc9j5svBDpdw1seQ,-SNFOrPHya_I4m6vj491UQ,1.0,0,0,0,Our food took 2 hours to arrive! We were told ...,2017-12-03 01:20:06
1809,KfmcOlRa_1Zlmruc5UuEkQ,rAusO94KkwXDDKVhtY2qRQ,-SNFOrPHya_I4m6vj491UQ,5.0,0,0,0,I could not believe this place doesn't have a ...,2018-07-18 13:14:10
1816,MdWatvx88k8d2m1shYyZfw,43Hn0CqNSkCLIq4wcbHb5Q,-SNFOrPHya_I4m6vj491UQ,3.0,1,0,0,Do alot of carry out here.\n\n Never an issue\...,2018-09-15 00:51:14
1923,EkOHTzTLGs0q0xnVv6YS5A,iHe2FR5fOTDG-AEvxfUCNw,-SNFOrPHya_I4m6vj491UQ,4.0,1,0,0,It's been a few years (maybe 10) since my last...,2014-02-09 22:31:57
1964,826ak8eo59AJ9x1Pfw3f6Q,o8vSBwAYCiMkYaVPnSDeaQ,-SNFOrPHya_I4m6vj491UQ,1.0,1,0,2,Moved into the area a year ago and was super s...,2015-10-20 03:27:41
2070,fK9ghwjIXQ63dXhr8j2sGw,0KfjL5gBg21DI8vlzZrDtg,-SNFOrPHya_I4m6vj491UQ,5.0,1,0,0,I've only ordered here a couple times over the...,2015-07-23 02:15:33
2133,sKgyUHH5k5v5-e5jLNKMHg,CNS9ROQHS-QnaNmMWs1Q9g,-SNFOrPHya_I4m6vj491UQ,1.0,0,0,0,"One star because no star, as far as I can tell...",2015-10-17 00:16:11
2244,SZ4xf3LzogsY0FZBPA__Pg,7LCG3o2KW2jgKgbKN0DQOg,-SNFOrPHya_I4m6vj491UQ,3.0,0,1,1,Their standard pizza is good. I'm writing thes...,2015-10-03 21:56:47
2274,CaOBnyJmpcGV4TxqYgyIHQ,uKQViHn1KbdW-ibqhSfxFw,-SNFOrPHya_I4m6vj491UQ,3.0,0,0,1,I've had pizza here over the past few years an...,2017-09-17 02:41:23
