In [0]:
# Import some libraries we'll need
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math as math
import seaborn as sns
from sklearn.model_selection import train_test_split
from numpy import linalg as LA

In [0]:
def read_review_data(file_name):
    """Load the review CSV at ``file_name`` and print a 5-row random sample.

    Parameters
    ----------
    file_name : str
        Path handed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file, unmodified.

    Raises
    ------
    ValueError
        From ``DataFrame.sample`` when the file has fewer than 5 rows.
    """
    review_data = pd.read_csv(file_name)
    # Single-argument print(...) behaves the same on Python 2 and Python 3,
    # whereas the bare `print x` statement is a SyntaxError on Python 3.
    print("Sample Data")
    print("-----------")
    print(review_data.sample(5))
    return review_data

In [0]:
# Load the business table; the path is relative to the notebook's directory.
business_data = pd.read_csv(filepath_or_buffer='../business.csv')
# Show just the first row to confirm which columns are present.
business_data.head(n=1)

In [0]:
# Index the business table by `business_id` so rows can be looked up by id.
# set_index(..., inplace=True) consumes the column, so running this cell a
# second time would raise KeyError; the guard makes the cell safe to re-run.
if business_data.index.name != 'business_id':
    business_data.set_index('business_id', inplace=True)

In [0]:
def get_restaurant_data(review_data, business_data):
    """Attach each review's city and categories, then keep restaurant reviews.

    Parameters
    ----------
    review_data : pandas.DataFrame
        Reviews with a ``business_id`` column. NOTE: two columns, ``city`` and
        ``cat``, are added to this frame in place (other code in this notebook
        reads ``review_data['city']`` afterwards, so the mutation is kept).
    business_data : pandas.DataFrame
        Business table indexed by business id, with ``city`` and
        ``categories`` columns. The index is assumed to be unique.

    Returns
    -------
    pandas.DataFrame
        The rows of ``review_data`` whose ``cat`` value contains the text
        "Restaurants". Reviews whose business id is missing from
        ``business_data`` get NaN for both new columns and are therefore
        excluded from the result (previously they raised ``KeyError``).
    """
    # Vectorised lookup: one hash join per column instead of one Python-level
    # `.loc` call per review row.
    business_ids = review_data['business_id']
    review_data['city'] = business_ids.map(business_data['city'])
    review_data['cat'] = business_ids.map(business_data['categories'])

    # na=False treats a missing category string as "not a restaurant".
    is_restaurant = review_data['cat'].str.contains("Restaurants", na=False)
    restaurant_reviews = review_data[is_restaurant]
    return restaurant_reviews

def get_city_restaurant_data(city, restaurant_reviews):
    """Return the rows of ``restaurant_reviews`` whose ``city`` equals ``city``.

    Parameters
    ----------
    city : str
        City name to match exactly.
    restaurant_reviews : pandas.DataFrame
        Reviews carrying a ``city`` column.

    Returns
    -------
    pandas.DataFrame
        The matching subset, with the original index labels.
    """
    # Build the mask from the frame that was passed in. The previous version
    # read the global `review_data`, which only worked when that global
    # happened to exist and share index labels with `restaurant_reviews`.
    in_city = restaurant_reviews['city'] == city
    city_rest_data = restaurant_reviews.loc[in_city]
    return city_rest_data

In [0]:
# NOTE(review): `eider` is not imported anywhere in this notebook; presumably
# the hosting notebook environment injects it. Confirm before running elsewhere.
eider.env.getUploaded("40core.csv", "/tmp/40core.csv")
review_data = read_review_data('/tmp/40core.csv')

# Single-argument print(...) runs unchanged on Python 2 and Python 3.
print('-----------------------------')
print('Filtering the Restaurant data')
print('-----------------------------')
restaurant_reviews = get_restaurant_data(review_data, business_data)
print(restaurant_reviews.head(3))
print('-----------------------------')
print('restaurant reviews top 10 cities')
print('-----------------------------')
print(restaurant_reviews['city'].value_counts().head(10))
print '-----------------------------'

In [0]:
# NOTE(review): this function is also defined in an earlier cell of this
# notebook; this later definition is the one in effect from here on.
def get_city_restaurant_data(city, restaurant_reviews):
    """Return the rows of ``restaurant_reviews`` whose ``city`` equals ``city``.

    Parameters
    ----------
    city : str
        City name to match exactly.
    restaurant_reviews : pandas.DataFrame
        Reviews carrying a ``city`` column.

    Returns
    -------
    pandas.DataFrame
        The matching subset, with the original index labels.
    """
    # Build the mask from the argument itself rather than from the global
    # `review_data`, so the function no longer depends on hidden notebook state.
    in_city = restaurant_reviews['city'] == city
    city_rest_data = restaurant_reviews.loc[in_city]
    return city_rest_data

# Restrict the restaurant reviews to Las Vegas and preview the first rows.
vegas_df = get_city_restaurant_data(
    city='Las Vegas',
    restaurant_reviews=restaurant_reviews,
)
vegas_df.head()

In [0]:
# Count number of unique users and number of unique restaurants in our dataset
unique_users = vegas_df['user_id'].unique().tolist()
unique_rests = vegas_df['business_id'].unique().tolist()
num_user = len(unique_users)
num_rest = len(unique_rests)

# Split to train and test. Stratifying on business_id spreads each
# restaurant's reviews across both splits; the fixed random_state makes the
# split reproducible.
train, test = train_test_split(vegas_df, random_state = 8675309, stratify = vegas_df['business_id'])
num_train = train.shape[0]
num_test = test.shape[0]

# Single-argument print(...) runs unchanged on Python 2 and Python 3.
print("Number of Users: {}".format(num_user))
print("Number of restaurants: {}".format(num_rest))

In [0]:
# Per-user and per-restaurant review count and mean star rating.
# Only `stars` is aggregated: taking the mean of every column fails on text
# columns in recent pandas, and the rest of this notebook reads only
# ['stars']['count'] and ['stars']['mean'] from these tables.
user_ratings = vegas_df.groupby('user_id')[['stars']].agg(['count','mean']).reset_index()
restaurant_ratings = vegas_df.groupby('business_id')[['stars']].agg(['count','mean']).reset_index()

user_ratings.head()



In [0]:
# On the training dataset, what fraction of restaurants have more than 32, 64, and 128 reviews respectively? How many users have more than 32, 64, and 128 reviews respectively?

def GetUserCountWithMoreReviews(review_num, ratings=None):
    """Count the users whose review count is strictly greater than ``review_num``.

    Parameters
    ----------
    review_num : int
        Exclusive lower bound on the number of reviews.
    ratings : pandas.DataFrame, optional
        Per-user table exposing ``ratings['stars']['count']``. Defaults to the
        notebook-level ``user_ratings`` so existing one-argument calls keep
        working.

    Returns
    -------
    int
        Number of rows of ``ratings`` above the threshold.
    """
    if ratings is None:
        ratings = user_ratings
    above_threshold = ratings['stars']['count'] > review_num
    return int(above_threshold.sum())

def GetRestaurantPercentWithMoreReviews(review_num, ratings=None):
    """Percentage of restaurants with strictly more than ``review_num`` reviews.

    Parameters
    ----------
    review_num : int
        Exclusive lower bound on the number of reviews.
    ratings : pandas.DataFrame, optional
        Per-restaurant table exposing ``ratings['stars']['count']``. Defaults
        to the notebook-level ``restaurant_ratings`` so existing one-argument
        calls keep working.

    Returns
    -------
    float
        Value in ``[0, 100]``; ``0.0`` when ``ratings`` has no rows.
    """
    if ratings is None:
        ratings = restaurant_ratings
    total = ratings.shape[0]
    if total == 0:
        # Avoid ZeroDivisionError on an empty table.
        return 0.0
    matching = int((ratings['stars']['count'] > review_num).sum())
    # 100.0 forces true division; on Python 2 the integer form `100 * a / b`
    # silently truncates the percentage to a whole number.
    return 100.0 * matching / total
    
# Report both statistics for each threshold, largest first. The loops emit the
# same six lines, in the same order, as six hand-written print statements.
# Single-argument print(...) runs unchanged on Python 2 and Python 3.
for review_threshold in (128, 64, 32):
    print('{}% restaurants have more than {} reviews'.format(GetRestaurantPercentWithMoreReviews(review_threshold), review_threshold))

for review_threshold in (128, 64, 32):
    print('{} Users have more than {} reviews'.format(GetUserCountWithMoreReviews(review_threshold), review_threshold))

In [0]:
def _scatter_count_vs_mean(stats, x_label, y_label, title, fmt):
    """Draw one count-vs-mean scatter of ``stats['stars']`` and show it."""
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    star_stats = stats['stars']
    plt.plot(star_stats['count'], star_stats['mean'], fmt)
    plt.show()


# Users in red, restaurants in green; one figure each.
_scatter_count_vs_mean(
    user_ratings,
    'User Rating count',
    'User Rating mean',
    'User rating count  - mean',
    'r.',
)
_scatter_count_vs_mean(
    restaurant_ratings,
    'Restaurant Rating count',
    'Restaurant Rating mean',
    'Restaurant rating count  - mean',
    'g.',
)

In [0]:
def _ordered_ids(frame, label):
    """Return the sorted distinct values stored under ``label`` in ``frame``."""
    # ravel() flattens the result whether ``frame[label]`` comes back as a
    # Series or as a one-column DataFrame.
    return np.unique(np.asarray(frame[label]).ravel()).tolist()


def construct_matrix(data_frame, total_set, row_label, col_label, col_set=None, value_label='stars'):
    """Build a dense rating matrix and its observation mask.

    Rows cover every id found in ``total_set[row_label]`` and columns every id
    found in ``col_set[col_label]``, both in sorted order. Matrices built from
    different subsets (e.g. train and test) therefore share the same shape and
    the same row/column meaning. Previously ``total_set`` was ignored and only
    ids present in ``data_frame`` received a row.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Reviews with ``row_label``, ``col_label`` and ``value_label`` columns.
    total_set : pandas.DataFrame
        Frame whose ``row_label`` column lists every row id to include.
    row_label, col_label : str
        Column names identifying the row entity and the column entity.
    col_set : pandas.DataFrame, optional
        Frame whose ``col_label`` column lists every column id to include.
        Defaults to the notebook-level ``restaurant_ratings``.
    value_label : str, optional
        Column holding the rating; defaults to ``'stars'``.

    Returns
    -------
    indexes : list
        Row ids, in the row order of the matrices.
    matrix : numpy.ndarray
        Ratings with 0 where no review exists. Several reviews for the same
        (row, column) pair are averaged.
    mask_matrix : numpy.ndarray
        Integer array, 1 where ``matrix`` is non-zero and 0 elsewhere.

    Notes
    -----
    Reviews whose ids are absent from ``total_set`` / ``col_set`` are dropped.
    """
    if col_set is None:
        col_set = restaurant_ratings

    row_ids = _ordered_ids(total_set, row_label)
    col_ids = _ordered_ids(col_set, col_label)

    # One pivot over the whole frame replaces a separate merge per row entity.
    ratings = (
        data_frame.groupby([row_label, col_label])[value_label]
        .mean()
        .unstack(col_label)
        .reindex(index=row_ids, columns=col_ids)
        .fillna(0)
    )

    matrix = ratings.values
    # A rating of exactly 0 counts as "not observed", as it did before.
    mask_matrix = (matrix != 0).astype(int)
    return row_ids, matrix, mask_matrix
    
# Build the user-by-restaurant rating matrix and mask for each split.
train_user_index, train_user_ratings, train_user_ratings_mask = construct_matrix(
    data_frame=train,
    total_set=user_ratings,
    row_label='user_id',
    col_label='business_id',
)
test_user_index, test_user_ratings, test_user_ratings_mask = construct_matrix(
    data_frame=test,
    total_set=user_ratings,
    row_label='user_id',
    col_label='business_id',
)

In [0]:
# Single-argument print(...) runs unchanged on Python 2 and Python 3.
print(restaurant_ratings.sample())

# Baseline prediction: every user is predicted to give each restaurant that
# restaurant's mean star rating.
# NOTE(review): the mean is taken over all of vegas_df, which includes the
# test rows; consider computing it from `train` only to avoid leakage.
rest_avg_rating = vegas_df.groupby('business_id')['stars'].mean().reset_index(name='mean')
a = np.ones((num_user, 1))
f = np.array([rest_avg_rating['mean'].tolist()])
# (num_user x 1) . (1 x num_rest) repeats the row of means once per user.
baseline_p = np.dot(a, f)

# Histogram of restaurant means. Values outside the bin range (i.e. below 3)
# are not counted by np.histogram.
f_hist, bin_edges = np.histogram(f, bins=[3,3.5,4,4.5,5])
# Draw each bar across its own bin: the default bar width of 0.8 centred on the
# left edge overlaps neighbouring bars when bins are only 0.5 wide.
plt.bar(bin_edges[:-1], f_hist, width=np.diff(bin_edges), align='edge')
plt.show()

In [0]:
def get_RMS(input_matrix, baseline, mask, K):
    """Root-mean-square difference between two matrices over masked entries.

    The element-wise difference ``input_matrix - baseline`` is multiplied by
    ``mask``, its norm (``LA.norm`` default) is taken, and the result is
    divided by ``sqrt(K)``.
    """
    residual = input_matrix - baseline
    observed_residual = np.multiply(mask, residual)
    residual_norm = LA.norm(observed_residual)
    return residual_norm / math.sqrt(K)

# RMS error of the per-restaurant-mean baseline on the observed training
# ratings. Single-argument print(...) runs unchanged on Python 2 and Python 3.
print('train data RMS = {0}'.format(get_RMS(baseline_p, train_user_ratings, train_user_ratings_mask, num_train)))

In [0]:
def calculate(R, S, A, F):
    """Return the masked residual ``R * (S - A.F)``.

    ``A.F`` is the matrix product of the two factors; the difference from
    ``S`` is multiplied element-wise by ``R``.
    """
    predicted = np.dot(A, F)
    residual = S - predicted
    return np.multiply(R, residual)
    
def calculate_LP(K, R, S, A, F):
    """Loss L(p): squared norm of the masked residual, divided by ``K``."""
    masked_residual = calculate(R, S, A, F)
    residual_norm = LA.norm(masked_residual)
    squared_norm = residual_norm * residual_norm
    return squared_norm / K

def _plot_learning_curve(losses, title):
    """Plot ``losses`` against the step number (0 = initial point) and show it."""
    plt.xlabel('step')
    plt.ylabel('L(p)')
    plt.title(title)
    plt.plot(range(len(losses)), losses)
    plt.show()


def learn_AF(K, R, S, learning_rate, step, K_test, R_test, S_test, rank=2, seed=None):
    """Fit the factorisation ``S ~ A.F`` by gradient descent on the masked loss.

    Parameters
    ----------
    K : int
        Normaliser for the training loss and the gradient step.
    R, S : array-like
        Training mask and training rating matrix, with the same 2-D shape.
    learning_rate : float
        Step size.
    step : int
        Number of gradient updates.
    K_test, R_test, S_test
        Same as ``K``, ``R``, ``S`` but for the held-out data; used only to
        track the test loss. Must have the same shape as ``S``.
    rank : int, optional
        Number of latent factors (columns of ``A`` / rows of ``F``). Default 2.
    seed : int, optional
        When given, the initial factors are drawn from a dedicated
        ``RandomState(seed)`` so the run is reproducible. When ``None`` the
        global NumPy random state is used.

    Returns
    -------
    seq_A, seq_F : list of numpy.ndarray
        The factors after every step, including the random initial point
        (length ``step + 1``).
    train_seq_LP, test_seq_LP : list of float
        Training and test loss at each of those points.

    Side effects
    ------------
    Shows two matplotlib figures: the training and the test learning curve.
    """
    # Size the factors from the data itself instead of the notebook globals
    # num_user / num_rest, so the function works for any rating matrix.
    n_rows, n_cols = np.shape(S)
    rng = np.random if seed is None else np.random.RandomState(seed)

    seq_A = [rng.rand(n_rows, rank)]
    seq_F = [rng.rand(rank, n_cols)]
    train_seq_LP = [calculate_LP(K, R, S, seq_A[-1], seq_F[-1])]
    test_seq_LP = [calculate_LP(K_test, R_test, S_test, seq_A[-1], seq_F[-1])]

    for _ in range(step):
        prev_A = seq_A[-1]
        prev_F = seq_F[-1]
        F_T = np.transpose(prev_F)
        A_T = np.transpose(prev_A)
        # Masked residual R * (S - A.F) at the current point.
        temp = calculate(R, S, prev_A, prev_F)
        # The -2.0 factor is the derivative of the squared residual, so
        # "prev - (-2.0) * ..." moves against the gradient.
        seq_A.append(prev_A - (-2.0) * np.dot(temp, F_T) * learning_rate / K)
        seq_F.append(prev_F - (-2.0) * np.dot(A_T, temp) * learning_rate / K)
        train_seq_LP.append(calculate_LP(K, R, S, seq_A[-1], seq_F[-1]))
        test_seq_LP.append(calculate_LP(K_test, R_test, S_test, seq_A[-1], seq_F[-1]))

    _plot_learning_curve(train_seq_LP, 'Training data Learning graph, rate = ' + str(learning_rate))
    _plot_learning_curve(test_seq_LP, 'Test data Learning graph, rate = ' + str(learning_rate))
    return seq_A, seq_F, train_seq_LP, test_seq_LP

# Train the factor model: learning rate 50.0 for 50 steps, tracking the loss on
# both splits.
a_params_seq, f_params_seq, train_lp_seq, test_lp_seq = learn_AF(train.shape[0], train_user_ratings_mask, train_user_ratings, 50.0, 50, test.shape[0], test_user_ratings_mask, test_user_ratings)

# Predicted rating matrix from the factors after the final step.
prediction =  np.dot(a_params_seq[-1], f_params_seq[-1])
# Single-argument print(...) runs unchanged on Python 2 and Python 3.
print("learning result, prediction = {}".format(prediction))
print("test prediction RMS = {}".format(get_RMS(prediction, test_user_ratings, test_user_ratings_mask, test.shape[0])))