In [1]:
from pprint import pprint
from py2neo import Graph

In [2]:
graph = Graph(host="localhost://7474", auth=("neo4j", "123456"))

In [5]:
query = """MATCH (p:Reviewer)-[w:Wrote]->(r:Review)-[a:About]->(re:Restaurant)
RETURN p.name AS reviewer, COUNT(w) AS num_of_reviews
ORDER BY num_of_reviews DESC LIMIT 20"""


In [7]:
#Find out reviewer order by number of wrote reviews
a = graph.run(query).to_data_frame()
a

Unnamed: 0,num_of_reviews,reviewer
0,677,U0044
1,657,U0492
2,498,U0008
3,353,U1418
4,336,U0088
5,312,U0241
6,301,U0255
7,295,U0057
8,293,U0158
9,274,U0143


In [9]:
reviewer = a['reviewer'].tolist()
print(reviewer)

['U0044', 'U0492', 'U0008', 'U1418', 'U0088', 'U0241', 'U0255', 'U0057', 'U0158', 'U0143', 'U0028', 'U0073', 'U0082', 'U0257', 'U0014', 'U0034', 'U0686', 'U0353', 'U0273', 'U0087']


# Build up recommendation engine

Our recommendation engine will recommend restaurant to target user based on the preference of the usesr that will have a similar taste on restaurant. For each user, we will identify the most similar users to them and preferences of these simialr users are then used to generate recommendations for the target user.

This algorithm is known as Collaborative Filtering.

Collaborative filtering (CF) is a technique commonly used to build personalized recommendation systems. Some popular websites that use CF technology include Amazon, Netflix, and IMDB. In CF, predictions about a user's interests are made by compiling preferences from similar users.


The similarity metric we will use here is Jaccard Similarity Coefficient or Jaccard Index also known as Intersection over Union(IoU).

<img src="reco_data\IoU.jpg">

Jaccard Index between two sets A and B is the ratio of the number of elements in the intersection of A and B over the number of elements in the union of A and B.

Reference: https://deepai.org/machine-learning-glossary-and-terms/jaccard-index

## Get the most similar users to our target user, the Jaccard Index, number of Restaurants in common (Intersection), number of Restaurants in total (Union)

In [6]:
query = """
        // get target user and their neighbors pairs and count 
        // of distinct restaurant that they have reviewed in common
        MATCH (p1:Reviewer)-[:Wrote]->(r1:Review)-[:About]->(re:Restaurant)<-[:About]-(r2:Review)<-[:Wrote]-(p2:Reviewer)
        WHERE p1 <> p2 AND p1.name = {p_name}
        WITH p1, p2, COUNT(DISTINCT re) as intersection
        
        // get count of all the distinct restaurants that they have reviewed in total (Union)
        MATCH (p:Reviewer)-[:Wrote]->(r:Review)-[:About]->(re:Restaurant)
        WHERE p in [p1, p2]
        WITH p1, p2, intersection, COUNT(DISTINCT re) as union
        
        // compute Jaccard index
        WITH p1, p2, intersection, union, (intersection * 1.0 / union) as jaccard_index
        
        // get top k nearest neighbors based on Jaccard index
        ORDER BY jaccard_index DESC, p2.name
        WITH p1, COLLECT([p2.name, jaccard_index, intersection, union])[0..{k}] as neighbors
        
        WHERE SIZE(neighbors) = {k}   // return users with enough neighbors
        RETURN p1.name as reviewer, neighbors

        """

In [7]:
neighbors = {}
for i in graph.run(query, p_name = "U0273", k = 10):
    neighbors[i[0]] = i[1]

print("# User Justin Teo's 10 nearest neighbors: user_name, jaccard_index, intersection, union")
pprint(neighbors)


Failed to write data to connection ('localhost', 7687) (Address(host='127.0.0.1', port=7687)); ("10054; '远程主机强迫关闭了一个现有的连接。'; None; 10054; None")


# User Justin Teo's 10 nearest neighbors: user_name, jaccard_index, intersection, union
{'U0273': [['U0082', 0.23509933774834438, 71, 302],
           ['U0492', 0.2, 57, 285],
           ['U0241', 0.18345323741007194, 51, 278],
           ['U0008', 0.17077464788732394, 97, 568],
           ['U0012', 0.16906474820143885, 47, 278],
           ['U0088', 0.16412213740458015, 43, 262],
           ['U0057', 0.16333333333333333, 49, 300],
           ['U0003', 0.16304347826086957, 45, 276],
           ['U0028', 0.15827338129496402, 44, 278],
           ['U0034', 0.1554054054054054, 46, 296]]}


## Find out the top 5 restaurants ranked by number of reviewed times from the nearest neighbors's reviewed restaurants 

In [8]:
# get the list of the nearest neighbors names
nearest_neighbors = [neighbors["U0273"][i][0] for i in range(len(neighbors["U0273"]))]

query = """
        // get top n recommendations for user 'Justin Teo' from his nearest neighbors
        MATCH (p1:Reviewer),
              (neighbor:Reviewer)-[:Wrote]->(r:Review)-[:About]->(re:Restaurant)    
        WHERE p1.name = {p_name}
          AND neighbor.name in {nearest_neighbors}
          AND not (p1)-[:Wrote]->(r:Review)-[:About]->(re:Restaurant)         // filter for movies that our user hasn't reviewed
        
        WITH p1, re, COUNT(DISTINCT neighbor) as countnns // times reviewed by nearest neighbors
        ORDER BY p1.name, countnns DESC               
        RETURN p1.name as user, COLLECT([re.name, countnns])[0..{n}] as recommendations  
        """

recommendations = {}
for i in graph.run(query, p_name = "U0273", nearest_neighbors = nearest_neighbors, n = 5):
    recommendations[i[0]] = i[1]
    
print("User U0273's recommendations: Restaurant, number of reviewed by neighbors")
pprint(recommendations)


User U0273's recommendations: Restaurant, number of reviewed by neighbors
{'U0273': [['Birds of a Feather', 10],
           ['Alter Ego', 9],
           ['The Coconut Club', 9],
           ['Sunday Folks', 9],
           ['Park Bench Deli', 9]]}


## Unify the two parts above as a whole function. And using before_2019 graph to prepare for validation list

In [10]:
import sys
from pprint import pprint
from py2neo import Graph

# p_name = sys.argv[1:]
p_name = reviewer

graph = Graph(host="localhost://7474", auth=("neo4j", "123456"))

def r_recommender(graph, cid, num_nearest_neighbors, num_recommendations):

    query = """
           MATCH (p1:Reviewer)-[:Wrote]->(r1:Review)-[:About]->(re:Restaurant)<-[:About]-(r2:Review)<-[:Wrote]-(p2:Reviewer)
           WHERE p1 <> p2 AND p1.name = {p_name}
           WITH p1, p2, COUNT(DISTINCT re) as intersection
           
           MATCH (p:Reviewer)-[:Wrote]->(r:Review)-[:About]->(re:Restaurant)
           WHERE p in [p1, p2]
           WITH p1, p2, intersection, COUNT(DISTINCT re) as union

           WITH p1, p2, intersection, union, 
              (intersection * 1.0 / union) as jaccard_index

           ORDER BY jaccard_index DESC, p2.name
           WITH p1, COLLECT(p2)[0..{k}] as neighbors
           WHERE SIZE(neighbors) = {k}                                              
           UNWIND neighbors as neighbor
           WITH p1, neighbor

           MATCH (neighbor)-[:Wrote]->(r:Review)-[:About]->(re:Restaurant)         
           WHERE not (p1)-[:Wrote]->(r:Review)-[:About]->(re:Restaurant)                        
           WITH p1, re, COUNT(DISTINCT neighbor) as countnns
           ORDER BY p1.name, countnns DESC                            
           RETURN p1.name as user, 
              COLLECT(re.name)[0..{n}] as recommendations  
           """

    recommendations = {}
    for p in p_name:
        for i in graph.run(query, p_name = p, k = num_nearest_neighbors, n = num_recommendations):
            recommendations[i[0]] = i[1]
    return recommendations

#query on burpple_before_2019
recommendation = r_recommender(graph, p_name, 10, 5)
pprint(recommendation)

{'U0008': ['Park Bench Deli',
           'Birds of a Feather',
           'Wildseed (The Summerhouse)',
           'Clinton St. Baking Co. & Restaurant Singapore',
           'KEK Keng Eng Kee Seafood (Alexandra)'],
 'U0014': ['The Coconut Club',
           'Clinton St. Baking Co. & Restaurant Singapore',
           'Birds of a Feather',
           'The Masses',
           'Sarnies (Telok Ayer)'],
 'U0028': ['Hvala ',
           'Matchaya (The Cathay)',
           'Burnt Ends',
           'Sunday Folks',
           'RONIN'],
 'U0034': ['Sunday Folks',
           'Park Bench Deli',
           'Wildseed (The Summerhouse)',
           'Clinton St. Baking Co. & Restaurant Singapore',
           'Alter Ego'],
 'U0044': ['Clinton St. Baking Co. & Restaurant Singapore',
           'KEK Keng Eng Kee Seafood (Alexandra)',
           'Sunday Folks',
           'Park Bench Deli',
           'Nesuto'],
 'U0057': ['Clinton St. Baking Co. & Restaurant Singapore',
           'Sunday Folks',
         

## Use all data graph to check if the user actually go to the recommended restaurant at 2019

In [21]:
graph2 = Graph(host="localhost://7474", auth=("neo4j", "123456"))

In [33]:
def validation(graph, dic):
    query = """
            MATCH (p:Reviewer)-[:Wrote]->(r:Review)-[:About]->(re:Restaurant)
            WHERE p.name = {p_name} AND re.name IN {re_name}
            RETURN p.name, re.name
    
            """
    result = []
    for item in dic:
        p = item
        temp = []
        temp.append(p)
        ans = graph.run(query, p_name=p, re_name=dic[item]).data()
        for val in ans:
            if val['re.name'] not in temp:
                temp.append(val['re.name'])          
        if len(temp) > 1:
            result.append(temp)

    
    return result

#validation on all burpple
result = validation(graph2, recommendation)

In [38]:
#print out the matched list
result_dic = {}
for i in range(len(result)):
    name = result[i][0]
    result_dic[name] = result[i][1:]
    
pprint(result_dic)

{'U0008': ['Birds of a Feather',
           'Wildseed (The Summerhouse)',
           'Clinton St. Baking Co. & Restaurant Singapore',
           'KEK Keng Eng Kee Seafood (Alexandra)',
           'Park Bench Deli'],
 'U0014': ['Birds of a Feather',
           'Clinton St. Baking Co. & Restaurant Singapore',
           'Sarnies (Telok Ayer)',
           'The Masses'],
 'U0028': ['Hvala ', 'Matchaya (The Cathay)', 'Sunday Folks'],
 'U0034': ['Wildseed (The Summerhouse)',
           'Clinton St. Baking Co. & Restaurant Singapore',
           'Alter Ego',
           'Sunday Folks',
           'Park Bench Deli'],
 'U0044': ['Clinton St. Baking Co. & Restaurant Singapore',
           'KEK Keng Eng Kee Seafood (Alexandra)',
           'Sunday Folks',
           'Park Bench Deli',
           'Nesuto'],
 'U0057': ['Clinton St. Baking Co. & Restaurant Singapore',
           'Sunday Folks',
           'Park Bench Deli',
           'RONIN'],
 'U0073': ['Birds of a Feather',
           'Clinton St.

## Get accuracy for each user by comparing the length of matched in 2019 and the length of recommendation.

In [43]:
accuracy = {}
for item in result_dic:

    accuracy[item] = len(result_dic[item])/len(recommendation[item])
    
print(accuracy)

{'U0044': 1.0, 'U0492': 1.0, 'U0008': 1.0, 'U1418': 1.0, 'U0088': 0.6, 'U0241': 1.0, 'U0255': 1.0, 'U0057': 0.8, 'U0158': 0.8, 'U0143': 0.8, 'U0028': 0.6, 'U0073': 0.8, 'U0082': 1.0, 'U0257': 1.0, 'U0014': 0.8, 'U0034': 1.0, 'U0686': 0.8, 'U0353': 0.6, 'U0273': 0.6, 'U0087': 0.6}


# Generate graph to visualize results

In [None]:
#Query to generate target user, neighbors and restaurant relation graph
"""
MATCH (p1:Reviewer)-[:VISITED]->(re:Restaurant)<-[:VISITED]-(p2:Reviewer)
WHERE p1 <> p2 AND p1.name = "U0273"
WITH p1, p2, COUNT(DISTINCT re) as intersection

MATCH (p:Reviewer)-[:VISITED]->(re:Restaurant)
WHERE p in [p1, p2]
WITH p1, p2, intersection, COUNT(DISTINCT re) as union

WITH p1, p2, intersection, union, (intersection * 1.0 / union) as jaccard_index

ORDER BY jaccard_index DESC, p2.name
WITH p1, COLLECT(p2)[0..5] as neighbors
WHERE SIZE(neighbors) = 5
UNWIND neighbors as neighbor
WITH p1, neighbor

MATCH (neighbor)-[:VISITED]->(re:Restaurant)<-[:VISITED]-(p1)
RETURN neighbor, p1,re
"""

In [None]:
#Query to generate neighbors and restaurants target user hasn't been yet
"""
MATCH (p1:Reviewer)-[:VISITED]->(re:Restaurant)<-[:VISITED]-(p2:Reviewer)
WHERE p1 <> p2 AND p1.name = "U0273"
WITH p1, p2, COUNT(DISTINCT re) as intersection

MATCH (p:Reviewer)-[:VISITED]->(re:Restaurant)
WHERE p in [p1, p2]
WITH p1, p2, intersection, COUNT(DISTINCT re) as union

WITH p1, p2, intersection, union, (intersection * 1.0 / union) as jaccard_index

ORDER BY jaccard_index DESC, p2.name
WITH p1, COLLECT(p2)[0..5] as neighbors
WHERE SIZE(neighbors) = 5
UNWIND neighbors as neighbor
WITH p1, neighbor

MATCH (neighbor)-[:VISITED]->(re:Restaurant)
WHERE not (p1)-[:VISITED]->(re:Restaurant)
RETURN neighbor, re
"""
