In [0]:
# Import some libraries we'll need
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math as math
import scipy.sparse as sps
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split

In [0]:
def read_review_data(file_name):
    '''Load the review CSV and print a small random sample of it.

    Args:
        file_name: path of the CSV file, passed straight to pandas.read_csv.

    Returns:
        pandas.DataFrame holding the full contents of the file.
    '''
    review_data = pd.read_csv(file_name)
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("Sample Data")
    print("-----------")
    # DataFrame.sample raises when asked for more rows than exist, so cap the
    # preview at the number of rows actually read.
    print(review_data.sample(min(5, len(review_data))))
    return review_data

In [0]:
# `eider` is provided by the notebook environment, not defined here;
# presumably this call copies the uploaded business.csv to /tmp -- TODO confirm.
eider.env.getUploaded("business.csv", "/tmp/business.csv")
business_data = pd.read_csv('/tmp/business.csv')
# Display the first row as a quick look at the available columns.
business_data.head(1)

In [0]:
# Index the business table by business_id so business_data.loc[<id>, ...]
# lookups work. The guard keeps the cell re-runnable: once the column has
# become the index, a second set_index('business_id') would raise KeyError.
if 'business_id' in business_data.columns:
    business_data = business_data.set_index('business_id')

In [0]:
def get_restaurant_data(review_data, business_data):
    '''Attach city and categories to every review and keep restaurant reviews.

    Args:
        review_data: DataFrame of reviews with a 'business_id' column. It is
            modified in place: the columns 'city' and 'cat' are added.
        business_data: DataFrame indexed by (unique) business_id, with
            'city' and 'categories' columns.

    Returns:
        The rows of review_data whose categories contain "Restaurants".
    '''
    # Series.map with a Series argument looks every business_id up in that
    # Series' index in one vectorised pass, instead of one .loc call per
    # review. Ids that are absent from business_data become NaN rather than
    # raising KeyError, and are dropped by the na=False filter below.
    business_ids = review_data['business_id']
    review_data['city'] = business_ids.map(business_data['city'])
    review_data['cat'] = business_ids.map(business_data['categories'])
    # na=False: reviews without category information are not restaurants.
    is_restaurant = review_data['cat'].str.contains("Restaurants", na=False)
    return review_data[is_restaurant]

In [0]:
def get_city_restaurant_data(city, restaurant_reviews):
    '''Return the reviews of restaurants located in the given city.

    Args:
        city: city name, compared for exact equality with the 'city' column.
        restaurant_reviews: DataFrame of reviews with a 'city' column.

    Returns:
        The rows of restaurant_reviews whose 'city' equals city.
    '''
    # Build the mask from the frame that was passed in. Using the
    # notebook-global review_data here only worked while that global existed
    # and happened to share its index with restaurant_reviews.
    in_city = restaurant_reviews['city'] == city
    return restaurant_reviews.loc[in_city]

In [0]:
def build_sparse_matrix(df):
    '''Build the users x restaurants rating matrix from a review table.

    Args:
        df: DataFrame with 'user_id', 'business_id' and 'stars' columns.

    Returns:
        scipy.sparse.csr_matrix with one row per distinct user_id and one
        column per distinct business_id, both numbered in order of first
        appearance in df. Cell (u, r) holds the star rating.

    NOTE(review): csr_matrix sums entries that share a (row, col) pair, so
    a user who reviewed the same restaurant twice gets the sum of both
    ratings -- confirm the input has no such duplicates.
    '''
    # pd.factorize returns integer codes plus the distinct values in order of
    # first appearance. It replaces astype('category', categories=...), whose
    # `categories` keyword no longer exists in current pandas.
    row, users = pd.factorize(df['user_id'])
    col, rests = pd.factorize(df['business_id'])
    data = df['stars'].tolist()
    rating_matrix = csr_matrix((data, (row, col)), shape=(len(users), len(rests)))
    return rating_matrix

In [0]:
def split_train_test(df):
    '''Build the rating matrix for df and split it into train and test parts.

    Args:
        df: review DataFrame accepted by build_sparse_matrix.

    Returns:
        (train, test): the two sparse matrices produced by
        split_rating_matrix, both with the shape of the full rating matrix.
    '''
    rating_matrix = build_sparse_matrix(df)
    train, test = split_rating_matrix(rating_matrix)
    # Both matrices keep the full users x restaurants shape, so shape[0] is
    # the user count for each of them. The number of samples is the number of
    # stored ratings.
    num_train = train.nnz
    num_test = test.nnz

    print("Number of training samples: {}".format(num_train))
    print("Number of test samples: {}".format(num_test))

    return train, test

In [0]:
def get_sparse_matrix(df):
    '''Converts the df into a sparse ratings matrix

    Args:
        df: DataFrame with 'user_id', 'business_id' and 'stars' columns.

    Returns:
        scipy.sparse.csr_matrix of shape (distinct users, distinct
        restaurants); rows and columns are numbered in order of first
        appearance in df and each cell holds the star rating.
    '''
    # The original referenced an undefined name (unique_bus) for the column
    # categories and relied on astype('category', categories=...), which
    # current pandas no longer accepts. pd.factorize yields the same
    # first-appearance codes together with the distinct values.
    row, unique_users = pd.factorize(df['user_id'])
    col, unique_rests = pd.factorize(df['business_id'])
    data = df['stars'].tolist()
    sparse_matrix = csr_matrix((data, (row, col)), shape=(len(unique_users), len(unique_rests)))
    return sparse_matrix

In [0]:
def split_rating_matrix(rating_matrix, samples = 2, seed = None):
    '''Hold out up to `samples` randomly chosen ratings per user as a test set.

    Args:
        rating_matrix: scipy sparse users x restaurants matrix of ratings.
        samples: number of ratings moved from train to test for each user.
            A user with no more than `samples` ratings has all of them held
            out; values <= 0 hold out nothing.
        seed: optional integer seed for a private random generator. When
            None, NumPy's global random state is used.

    Returns:
        (train, test): two csr_matrix objects with the shape of
        rating_matrix that share no stored position. test holds the
        held-out ratings (as floats), train holds the rest.
    '''
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Work on flat (user, restaurant, rating) triples of the non-zero cells
    # rather than assigning into a CSR matrix element by element.
    entries = rating_matrix.tocoo()
    stored = entries.data != 0
    users = entries.row[stored]
    restaurants = entries.col[stored]
    ratings = entries.data[stored]

    # A stable sort groups the triples by user; bounds[u]:bounds[u + 1] is
    # then the slice of user u, found without rescanning every triple.
    order = np.argsort(users, kind='mergesort')
    users, restaurants, ratings = users[order], restaurants[order], ratings[order]
    num_users = rating_matrix.shape[0]
    bounds = np.searchsorted(users, np.arange(num_users + 1))

    held_out = np.zeros(users.shape[0], dtype=bool)
    for u in range(num_users):
        positions = np.arange(bounds[u], bounds[u + 1])
        rng.shuffle(positions)
        take = max(0, min(samples, positions.size))
        # Slice from an explicit start: positions[-0:] would select the whole
        # array and move every rating of the user into the test set.
        held_out[positions[positions.size - take:]] = True

    size = rating_matrix.shape
    kept = ~held_out
    train = csr_matrix((ratings[kept], (users[kept], restaurants[kept])), shape = size)
    test = csr_matrix((ratings[held_out].astype(float),
                       (users[held_out], restaurants[held_out])), shape = size)

    # Sanity check: no rating may appear in both parts.
    mult = train.multiply(test)
    assert(mult.nnz == 0)

    return train, test

In [0]:
def get_rating_matrix(city_review_data):
    '''Report the size of a city's review set and split it into train/test.

    Args:
        city_review_data: review DataFrame with 'user_id' and 'business_id'
            columns, as accepted by split_train_test.

    Returns:
        (train, test): the sparse rating matrices returned by
        split_train_test.
    '''
    num_user = len(city_review_data['user_id'].unique())
    num_rest = len(city_review_data['business_id'].unique())

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("Number of Users: {}".format(num_user))
    # Label spelling corrected (was "Restaurent").
    print("Number of Restaurants: {}".format(num_rest))

    train, test = split_train_test(city_review_data)
    return train, test

In [0]:
# Sparsity of a rating matrix
def get_sparsity(rating_matrix):
    '''Return the share of cells of rating_matrix that hold no stored value.'''
    n_rows, n_cols = rating_matrix.shape[0], rating_matrix.shape[1]
    capacity = float(n_rows * n_cols)
    density = float(rating_matrix.nnz) / capacity
    return 1 - density

In [0]:
# Read the review data, keep restaurant reviews, and build the train/test
# rating matrices for Las Vegas.
# Prints use the single-argument print(...) form, which produces the same
# output on Python 2 and Python 3.
eider.env.getUploaded("15core.csv", "/tmp/15core.csv")
review_data = read_review_data('/tmp/15core.csv')
print('-----------------------------')
print('Filtering the Restaurant data')
print('-----------------------------')
restaurant_reviews = get_restaurant_data(review_data, business_data)
print(restaurant_reviews.head(3))
print('-----------------------------')
print('restaurant reviews top 10 cities')
print('-----------------------------')
print(restaurant_reviews['city'].value_counts().head(10))
print('-----------------------------')
vegas_40 = get_city_restaurant_data('Las Vegas', restaurant_reviews)
vegas_40_train, vegas_40_test = get_rating_matrix(vegas_40)

In [0]:
# Shapes of the two rating matrices.
# print(...) with one argument works on both Python 2 and Python 3.
print(vegas_40_train.shape)
print(vegas_40_test.shape)

In [0]:
# Fraction of empty cells in the training matrix.
# print(...) with one argument works on both Python 2 and Python 3.
print(get_sparsity(vegas_40_train))

In [0]:
def fill_missing_with_mean(matrix):
    '''Densify a sparse rating matrix, filling missing ratings with column means.

    A cell whose value is 0 counts as "no rating". Each such cell is replaced
    by the mean of the ratings present in its column. A column without any
    rating falls back to the mean of all ratings in the matrix (0.0 when the
    matrix holds no rating at all), so the result never contains NaN.

    Args:
        matrix: scipy sparse matrix of ratings.

    Returns:
        2-D float numpy array with the shape of matrix.
    '''
    dense = matrix.toarray().astype(float)
    observed = dense != 0

    # Missing cells are stored as 0, so plain sums already ignore them.
    total_count = observed.sum()
    overall_mean = dense.sum() / total_count if total_count else 0.0

    col_counts = observed.sum(axis=0)
    # np.maximum(..., 1) only avoids a 0/0 for empty columns; those columns
    # take overall_mean through the np.where below.
    col_means = np.where(col_counts > 0,
                         dense.sum(axis=0) / np.maximum(col_counts, 1),
                         overall_mean)
    return np.where(observed, dense, col_means)

In [0]:
# NOTE(review): this import sits mid-notebook rather than in the import cell.
from sklearn.preprocessing import scale
# Dense training ratings with the unrated cells filled in by
# fill_missing_with_mean.
vegas_40_mean_filled = fill_missing_with_mean(vegas_40_train)
# axis=1 with with_std=False: centre each row on its own mean, no rescaling
# of the variance.
mean_centered = scale(vegas_40_mean_filled, axis = 1, with_std=False)

In [0]:
# item_similarity = mean_centered^T * mean_centered: a square matrix with one
# row and one column per column of mean_centered.
transpose = mean_centered.transpose()
item_similarity = np.matmul(transpose, mean_centered)
# Display the shape as a quick check.
item_similarity.shape

In [0]:
# Singular value decomposition of the item-item matrix. np.linalg.svd returns
# the singular values as a 1-D array (S) and the right factor already
# transposed, so V here is V^H.
U, S, V = np.linalg.svd(item_similarity)
# Format into one string: print("text", value) prints a tuple repr on
# Python 2, whereas a single-argument print(...) reads the same on 2 and 3.
print("smallest singular value = {}".format(min(S)))
print("largest singular value = {}".format(max(S)))
# Diagonal matrix form of the singular values.
S_diag = np.diag(S)

In [0]:
# Shapes of the three SVD factors.
# print(...) with one argument works on both Python 2 and Python 3.
print(U.shape)
print(S.shape)
print(V.shape)

In [0]:
# Elbow plot: the first 200 singular values on a logarithmic y axis.
fig, ax = plt.subplots(figsize=(8,5))
ax.semilogy(S[:200], '-', linewidth=1)
ax.set_title('Elbow Plot')
ax.set_xlabel('Principal Component')
ax.set_ylabel('Singular Values')
plt.show()

In [0]:
# Keep the first 50 columns of U and express every row of mean_centered in
# terms of them.
# NOTE(review): 50 was presumably read off the elbow plot -- confirm.
projection = U[:,:50]
low_dim_rating_projection = np.matmul(mean_centered, projection)
# Display the shape as a quick check.
low_dim_rating_projection.shape

In [0]:
# This cell no longer calls pearsonr; the import is kept in case other cells
# rely on the name.
from scipy.stats import pearsonr
N = low_dim_rating_projection.shape[0]
# np.corrcoef treats every row as one variable and returns the N x N matrix
# of Pearson correlations between rows: the values the pairwise pearsonr loop
# produced, computed in one vectorised call instead of N*N Python-level ones.
# np.atleast_2d keeps the result 2-D when there is only a single row.
out = np.atleast_2d(np.corrcoef(low_dim_rating_projection))
# Show only a corner of the matrix; printing all N*N values floods the output.
print(out[:5, :5])

In [0]:
# For every user, the indices of the 100 users with the highest similarity,
# most similar first.
top_100_similar_users = [
    sorted(range(len(out[j])), key=out[j].__getitem__, reverse=True)[:100]
    for j in range(len(out))
]

In [0]:
# Dense copy of the training ratings in which every 0 cell becomes NaN, so
# that unrated cells can be told apart with isnull / isnan.
dense =vegas_40_train.toarray()
ndf = pd.DataFrame(dense)
ndf = ndf.replace(to_replace = 0, value = np.nan)
# Display the first rows as a quick check.
ndf.head()

In [0]:
# `train` holds the observed ratings (NaN = unrated) and is only read.
# `arr` is the buffer predictions are written into. It must be a copy:
# binding both names to ndf.values can make them share memory, in which case
# every written prediction would show up in `train` as if it were an observed
# rating and feed into later predictions.
train = ndf.values
arr = train.copy()

In [0]:
def predict_rating(row, col):
    '''Predict the rating of user `row` for restaurant `col` from similar users.

    Reads the notebook globals top_100_similar_users (neighbour indices per
    user), out (user-user similarity) and train (observed ratings, NaN where
    unrated).

    Args:
        row: index of the user.
        col: index of the restaurant.

    Returns:
        The similarity-weighted average of the neighbours' observed ratings,
        or 0 when no neighbour with a usable similarity rated the restaurant.
    '''
    weighted_sum = 0.0
    weight_total = 0.0
    for n in top_100_similar_users[row]:
        rating = train[n, col]
        if np.isnan(rating):
            continue
        weight = float(out[row][n])
        if np.isnan(weight):
            # Undefined correlation carries no information.
            continue
        weighted_sum += float(rating) * weight
        weight_total += abs(weight)
    if weight_total == 0:
        return 0
    # Normalise by the total weight, not by the neighbour count: dividing a
    # similarity-weighted sum by the count shrinks every prediction toward 0
    # whenever the similarities are below 1.
    return weighted_sum / weight_total

In [0]:
# Fill every unrated cell of arr with a neighbourhood prediction.
# The missing cells are located once with a vectorised isnull() instead of
# visiting every cell through Series.iteritems(), which current pandas no
# longer provides. Transposing the mask keeps the original column-by-column
# visiting order. ndf has default integer labels, so they double as array
# positions.
missing_cols, missing_rows = np.where(ndf.isnull().values.T)
for col, row in zip(missing_cols, missing_rows):
    arr[row, col] = predict_rating(row, col)

In [0]:
# Wrap the filled rating array in a DataFrame and display its first rows.
result = pd.DataFrame(arr)
result.head()

In [0]:
# Diagnostic cell: list the notebook's global names with their sizes,
# largest first.
import sys

# These are the usual ipython objects, including this one you are creating
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# Get a sorted list of the objects and their sizes
# (names starting with '_', imported modules and the IPython names above are
# skipped; sizes are whatever sys.getsizeof reports for each object)
sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)

In [0]:
# Drop the large source frames and ask the garbage collector to run.
# Running this cell a second time raises NameError, because the names are
# already gone.
del business_data
del review_data
del restaurant_reviews

import gc
gc.collect()

In [0]:
# Display the first rows of the filled rating table again.
result.head()

In [0]:
# Dense view of the held-out ratings, with 0 cells turned into NaN.
test_df = pd.DataFrame(vegas_40_test.todense())
test_df = test_df.replace(to_replace = 0, value = np.nan)
# Integer mask: 1 where test_df holds a value, 0 elsewhere.
test_df_mask = ((test_df.notnull()).astype('int'))
# Display the first rows as a quick check.
test_df_mask.head()

In [0]:
from sklearn.metrics import mean_absolute_error
def get_mse(pred, actual):
    '''Mean absolute error between pred and actual over the rated cells.

    NOTE: despite its name, this function returns the mean ABSOLUTE error
    (sklearn's mean_absolute_error), not the mean squared error. The name is
    kept so existing callers keep working.

    Args:
        pred: 2-D numpy array of predicted ratings.
        actual: 2-D numpy array of true ratings with the same shape; cells
            equal to 0 are treated as unrated and ignored.

    Returns:
        The mean absolute error over the cells where actual is non-zero.
        Also prints the number of compared predictions and true values.
    '''
    # Ignore zero terms.
    rated = actual.nonzero()
    pred = pred[rated].flatten()
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print(len(pred))
    actual = actual[rated].flatten()
    print(len(actual))
    # sklearn's signature is (y_true, y_pred). The value is the same either
    # way for this metric, but the conventional order avoids confusion.
    return mean_absolute_error(actual, pred)

In [0]:
# Keep the predictions only at positions that have a held-out rating; every
# other cell becomes 0.
mask_result = np.multiply(test_df_mask.values, result.values)
# Held-out ratings with NaN turned back into 0, masked in the same way.
actual = test_df.replace(to_replace = np.nan, value = 0).values
actual = np.multiply(actual, test_df_mask.values)
# get_mse compares the two only where `actual` is non-zero; the value it
# returns comes from mean_absolute_error.
get_mse(mask_result, actual)