In [2]:
import os
import pandas as pd
import numpy as np

### Loading training and testing review data

In [3]:
data_path = './data/'

# define file path
train_file = os.path.join(data_path, 'review_train.csv')

# read datasets
train_data = pd.read_csv(train_file)


### Generating business pairs for the Pearson correlation

In [4]:
# notice that the similarity of (item1, item2) is the same as (item2, item1)
# reduce the number of pairs in order to reduce the computation

businesses = list(set(train_data['bid']))
business_pairs = []

for i in range(len(businesses)):
    for j in range((i + 1), len(businesses)):
        business_pairs.append(sorted([businesses[i], businesses[j]]))
    

### Computing the weight matrix (Pearson similarity)

In [5]:
# convert the review data into a dictionary {(uid, bid): rating}
review_dict = {(row[0], row[1]): row[2] for row in train_data.values.tolist()}

# map each business id to a list of users rated on that {business_id: list[user_ids]}
business_groups = train_data.groupby('bid')
business_user_dict = {bid: list(business_groups.get_group(bid)['uid']) for bid in business_groups.groups}

print(business_user_dict)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [6]:
# define a function to find common (co-rated) user list for two given businesses

def find_co_rated_users(bid_1, bid_2):
    
    # Hint: utilize the "business_user_dict" to query the user lists
    
    ### pseudocode ###
    # user_list_1 = the users who rated on bid_1
    # user_list_2 = the users who rated on bid_2
    # co_rated_users = the intersection (user_list_1, user_list_2)
    # return co_rated_users
    
    user_list_1 = business_user_dict.get(bid_1, [])
    user_list_2 = business_user_dict.get(bid_2, [])
    
    return set(user_list_1).intersection(set(user_list_2))


In [7]:
# define a function to compute Pearson correlation

import math

def compute_pearson_correlation(rating_list_1, rating_list_2):

    ### pseudocode ###
    # avg_rating_1 = the average rating of rating_list_1 
    # avg_rating_2 = the average rating of rating_list_2
    # e.g., rating_list_1 = [2, 5], rating_list = [1, 3]
    
    #avg_rating_1 = mean(rating_list)
    avg_rating_1 = float(sum(rating_list_1))/float(len(rating_list_1))
    avg_rating_2 = float(sum(rating_list_2))/float(len(rating_list_2))

    weight_sum, weight_1, weight_2 = 0.0, 0.0, 0.0
    # calculate the value for weight_sum (numerator) and weight_1, weight_2 (denominator)
    
    for i in range(len(rating_list_1)):
        ele1 = rating_list_1[i]
        ele2 = rating_list_2[i]
                                                   
        ele1 = ele1 - avg_rating_1
        ele2 = ele2 - avg_rating_2
                                                   
        weight_sum += ele1 * ele2
        weight_1 = ele1 * ele1
        weight_2 = ele2 * ele2
        
    if weight_1 == 0.0 or weight_2 == 0.0:
        return 0.0
    
    else:
        return weight_sum / math.sqrt(weight_1) / math.sqrt(weight_2)


In [8]:
len(business_pairs)

2318781

In [12]:
# compute the Pearson correlation for each business pair

PEARSON_THRED = 0.1 
weight_matrix = []

for pair in business_pairs:
    
    bid_1, bid_2 = pair[0], pair[1]
    
    ### pseudocode ###
    
    # call find_co_rated_users()
    co_rated_users = find_co_rated_users(bid_1, bid_2)
    # if the number of co_rated_users <= 1, weight should be 0.0
    if len(co_rated_users) <= 1:
        continue
        
    rating_list_1 = [review_dict[(u, bid_1)] for u in co_rated_users]
    rating_list_2 = [review_dict[(u, bid_2)] for u in co_rated_users]
    
    
    # get the rating list of the co-rated users on bid_1 and bid_2
    # call compute_pearson_correlation()
    weight = compute_pearson_correlation(rating_list_1, rating_list_2)

    if weight > PEARSON_THRED:  # you can set some threshold to filter the low correlated values
        weight_matrix.append([bid_1, bid_2, weight])  # (bid_1, bid_2) is in alphabetical order


In [14]:
weight_matrix[:5]

[['Wfl_ch21ojyHsPnkvvnstw', 'pFQaji0idkrzv5J-qTQ82Q', 2.0],
 ['pFQaji0idkrzv5J-qTQ82Q', 'zpoZ6WyQUYff18-z4ZU1mA', 3.0],
 ['Rd11Bosr8JkfoUhZLzpuxg', 'pFQaji0idkrzv5J-qTQ82Q', 2.0],
 ['GI-CAiZ_Gg3h21PwrANB4Q', 'pFQaji0idkrzv5J-qTQ82Q', 2.9999999999999987],
 ['CiYLq33nAyghFkUR15pP-Q', 'pFQaji0idkrzv5J-qTQ82Q', 130.49999999999991]]

In [15]:
# write the results to a CSV file

weight_matrix_df = pd.DataFrame(weight_matrix, columns=['bid1', 'bid2', 'corr'])
weight_matrix_file_path = os.path.join(data_path, 'weight_matrix.csv')
weight_matrix_df.to_csv(weight_matrix_file_path, index=False)
