In [1]:
from pandas import Series, Timestamp, DataFrame
import pandas as pd
import numpy as np
import simplejson as json
import matplotlib.pyplot as plt
import spacy

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


In [2]:
def load_jsons(data_path, file):
    """Load a line-delimited '.json' file into a DataFrame.

    The files are not single JSON documents: each line holds one standalone
    JSON object, so the file is parsed line by line.

    Parameters
    ----------
    data_path : str
        Directory prefix. It is concatenated with ``file`` as-is, so it must
        already end with a path separator.
    file : str
        Name of the file inside ``data_path``.

    Returns
    -------
    pandas.DataFrame
        One row per JSON object, in file order.
    """
    file_path = data_path + file
    # JSON text is UTF-8; be explicit so parsing does not depend on the
    # platform's default encoding.
    with open(file_path, encoding="utf-8") as jsons:
        # Blank lines (e.g. a trailing empty line) are not valid JSON: skip them.
        lines = [json.loads(json_line) for json_line in jsons if json_line.strip()]
    return pd.DataFrame(lines)

In [3]:
# Data for the town of Ambridge lives in this directory
data_path = '../data/ambridge/'

# One file per table
review_file, business_file = 'review.json', 'business.json'
user_file, tip_file, checkin_file = 'user.json', 'tip.json', 'checkin.json'

# Read every file into its own DataFrame
reviews = load_jsons(data_path=data_path, file=review_file)
businesses = load_jsons(data_path=data_path, file=business_file)
users = load_jsons(data_path=data_path, file=user_file)
tips = load_jsons(data_path=data_path, file=tip_file)
checkins = load_jsons(data_path=data_path, file=checkin_file)

In [4]:
# Show the loaded reviews table (long frames are rendered truncated, with a "..." row).
display(reviews)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,qb2EVdmVNvw3D0kBMN6Xrg,hXydWH25S92HjI5hmWRSyA,dJ0R-XT78LUQeNHQkD-G9g,5.0,0,0,0,Best place to get ice cream. They have only tw...,2018-08-14 04:03:02
1,Ecr_pKR7786kmcLVXLp5NA,vo6vLeHoPl_h-Vt-YHs9_A,Q_0eGl-aElqHKukHvmLdwA,1.0,0,0,0,"Sorry to say, Nelias did not live up to the ot...",2015-12-05 23:21:39
2,Ro6-JL0KCS5JULXUNRST-w,i1qyYL4fpAel8Ljt4WaZ3g,Q_0eGl-aElqHKukHvmLdwA,1.0,0,0,0,Wouldn't give it one star if I could. This pla...,2015-08-07 19:33:13
3,tuDIfqFjtj5zTLjtY9W1Hg,dUOg3fS3RTYDZUpu0CqqiA,729grSa1Wsn-hfv7D5uOxg,5.0,2,0,1,Police Station Pizza has always been my favori...,2014-10-28 18:29:48
4,zw3oXGWKQdVjST5UI9kM3g,k2Bsnh6CV0HFX8RgGR1p9A,Q_0eGl-aElqHKukHvmLdwA,5.0,0,0,0,There is nothing fancy about this place - it i...,2013-04-13 15:58:06
...,...,...,...,...,...,...,...,...,...
374,_mw0xf5enQSEUMUAoIiLZg,voyChD-aoglG2mWMySA6Ow,xM8dVGLkYaL94EuAIkjMEA,5.0,0,0,0,I have to recommend spending the afternoon at ...,2015-11-07 22:25:16
375,iKcapQ8yQ1PyVj2aZ7qEuQ,QPPgrlRtln6lbf9TRkzxLA,xM8dVGLkYaL94EuAIkjMEA,5.0,0,0,0,We held our wedding in the garden at Old Econo...,2017-08-23 23:19:12
376,1rYRDNCQMue4ptTfVw1NRA,8lCl40obN7k8v-wc4dAaiA,XJfvPt-8f-6d5Foaz_HYLQ,5.0,0,0,0,For years and years there was no place in Ambr...,2018-03-11 17:17:48
377,G6HnqzjaG3Jcb_NlfyNcbg,mLoENT0k1il695DIcPCdGA,XJfvPt-8f-6d5Foaz_HYLQ,5.0,2,0,2,Good service. Low Prices. Food was really good...,2018-05-14 12:50:38


# All functions below were taken and modified from bit.ly/2T8vnLe

In [10]:
# Replace missing review texts with empty strings before vectorizing.
reviews["text"] = reviews["text"].fillna("")

# Fit a TF-IDF model (English stop words removed) on the review texts and
# turn every review into one row of the resulting matrix.
tfidf = TfidfVectorizer(stop_words="english")
overview_matrix = tfidf.fit_transform(reviews["text"])

In [11]:
# Pairwise dot products between all review vectors. TfidfVectorizer
# L2-normalises rows by default, so these are cosine similarities
# (hence the 1.0 diagonal in the output).
similarity_matrix = linear_kernel(X=overview_matrix, Y=overview_matrix)

display(similarity_matrix)

array([[1.        , 0.01297296, 0.01069283, ..., 0.04352182, 0.05870026,
        0.        ],
       [0.01297296, 1.        , 0.08029609, ..., 0.        , 0.        ,
        0.        ],
       [0.01069283, 0.08029609, 1.        , ..., 0.02906881, 0.0212619 ,
        0.01750167],
       ...,
       [0.04352182, 0.        , 0.02906881, ..., 1.        , 0.28176846,
        0.04385523],
       [0.05870026, 0.        , 0.0212619 , ..., 0.28176846, 1.        ,
        0.        ],
       [0.        , 0.        , 0.01750167, ..., 0.04385523, 0.        ,
        1.        ]])

In [12]:
# Lookup table: review text -> index label of that review in `reviews`.
mapping = pd.Series(data=reviews.index, index=reviews["text"])

In [23]:
"""
Function returns a Series of 15 business ids based on similar reviews.
The same business id can be returned multiple times.
"""

def recommended_business(review, n=15):
    """Return the businesses behind the reviews most similar to ``review``.

    Parameters
    ----------
    review : int or str
        Row position of the query review in ``reviews`` / ``similarity_matrix``,
        or the exact review text (resolved through ``mapping``).
    n : int, default 15
        Number of similar reviews to use.

    Returns
    -------
    pandas.Series
        Business ids of the ``n`` most similar reviews, most similar first.
        The same business id can appear several times.

    Raises
    ------
    KeyError
        If ``review`` is a text that is not present in ``mapping``.
    IndexError
        If ``review`` is a position outside ``similarity_matrix``.
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError("n must be non-negative")

    if isinstance(review, str):
        review_index = mapping[review]
        # The same text can occur more than once, in which case the lookup
        # yields several hits; use the first one.
        if isinstance(review_index, Series):
            review_index = review_index.iloc[0]
        # NOTE: mapping stores index labels of `reviews`; they are used as row
        # positions below, which assumes the default 0..len-1 index.
    else:
        review_index = review
    review_index = int(review_index)

    # Similarity of the query review to every review (itself included).
    scores = np.asarray(similarity_matrix[review_index])
    if review_index < 0:
        # Normalise negative positions so the self-exclusion below still matches.
        review_index += scores.shape[0]

    # Descending by score; the stable sort keeps ties in row order.
    ranked = np.argsort(-scores, kind="stable")

    # Drop the query review explicitly instead of assuming it sorts first:
    # duplicate reviews tie with it at the top, and an all-zero vector has
    # no similarity to itself either.
    ranked = ranked[ranked != review_index][:n]

    return Series(reviews["business_id"].iloc[ranked].tolist())

In [24]:
# Businesses behind the reviews most similar to the review in row 1.
recommended_business(1)

0     Q_0eGl-aElqHKukHvmLdwA
1     -InU2nAbC9AuS-Um2Cowgw
2     Q_0eGl-aElqHKukHvmLdwA
3     Q_0eGl-aElqHKukHvmLdwA
4     Q_0eGl-aElqHKukHvmLdwA
5     4mpSNvmyG89Uqy2ahP4JMQ
6     -InU2nAbC9AuS-Um2Cowgw
7     GI1WxFbY9tJ9-ChRrwnrzg
8     N92Pbr2ygKDLkjmr-4BAPw
9     Q_0eGl-aElqHKukHvmLdwA
10    Q_0eGl-aElqHKukHvmLdwA
11    muFJIZKZwbAfy_pEFKF_pw
12    Eu_zPTrNVAXkpdSxf7CJ2w
13    Q_0eGl-aElqHKukHvmLdwA
14    729grSa1Wsn-hfv7D5uOxg
dtype: object