### This notebook builds a [Neo4j](https://neo4j.com/) graph from the Amazon Beauty dataset.
### Note: The process may take some time to complete.

In [2]:
!pip install -q py2neo==2021.2.4

In [3]:
import ast
import gzip
import json
import os
import pickle
import shutil
import tempfile
import urllib.request
from pathlib import Path

import pandas as pd
from py2neo import Graph, Node, Relationship

In [4]:
# Neo4j connection details.
# Each value is read from an environment variable when it is set, so real
# credentials do not have to be written into the notebook; the original
# placeholders remain as fallbacks and can still be edited in place.
uri = os.environ.get("NEO4J_URI", "YOUR_URI")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "YOUR_PASSWORD")

# Shared py2neo handle used by every cell that reads from or writes to the graph.
graph = Graph(uri, auth=(username, password))

In [9]:
# Download "beauty"
urls = ['http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/ratings_Beauty.csv',
        'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_Beauty.json.gz']

folder_path = "./beauty/"
os.makedirs(folder_path, exist_ok=True)

for url in urls:
    target = Path(folder_path) / url.split("/")[-1]

    # A file only reaches `target` after its download finished (see the move
    # below), so an existing file is complete and need not be fetched again.
    if target.exists():
        continue

    # Download into a scratch directory that is removed when the block exits,
    # then move the finished file into place so an interrupted download never
    # leaves a truncated file under folder_path.
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpfile = Path(tmpdir) / 'file'
        urllib.request.urlretrieve(url, tmpfile)
        shutil.move(str(tmpfile), str(target))

In [10]:
# Load the preprocessed data
# NOTE: pickle.load can execute code embedded in the file, so only point this
# at a dataset.pkl that you produced yourself or otherwise trust.
dataset_path = "./LlamaRec/data/preprocessed/beauty_min_rating0-min_uc5-min_sc5/dataset.pkl"
with open(dataset_path, "rb") as pkl_file:
    preprocessed_data = pickle.load(pkl_file)

# Per the original author's note the pickle holds:
# dict_keys(['train', 'val', 'test', 'meta', 'umap', 'smap'])
# Print the sizes of the two mappings that the rest of the notebook relies on.
for mapping_name in ("umap", "smap"):
    print(len(preprocessed_data[mapping_name]))

22332
12086


In [11]:
# Load the ratings (the CSV has no header row, so name the columns explicitly)
ratings_df = pd.read_csv(folder_path + "ratings_Beauty.csv", header=None)
ratings_df.columns = ['uid', 'sid', 'rating', 'timestamp']

# Keep only the rows whose user and item both appear in the preprocessed
# umap / smap. reset_index() keeps the old row labels as an extra 'index'
# column, which is why the filtered frame has five columns.
user_is_known = ratings_df['uid'].isin(preprocessed_data['umap'])
item_is_known = ratings_df['sid'].isin(preprocessed_data['smap'])
filtered_df = ratings_df[user_is_known & item_is_known].reset_index()

# Same four statistics for the raw and the filtered frame:
# shape, distinct users, distinct items, number of ratings.
for frame in (ratings_df, filtered_df):
    print(frame.shape)
    print(len(set(frame['uid'])))
    print(len(set(frame['sid'])))
    print(len(frame['rating']))

(2023070, 4)
1210271
249274
2023070
(198215, 5)
22332
12086
198215


In [12]:
# Verification
query = """MATCH (n:User)
RETURN COUNT(n) AS node_count"""
print(graph.run(query))

query = """MATCH (n:Item)
RETURN COUNT(n) AS node_count"""
print(graph.run(query))

query = """MATCH (u:User)-[r:RATED]->(i:Item)
RETURN COUNT(r) AS relationship_count;"""
print(graph.run(query))

 node_count 
------------
      22332 

 node_count 
------------
      12086 

 relationship_count 
--------------------
             198215 



In [18]:
# Add rating relationships between users and items to the graph.
#
# The rows are sent to Neo4j in batches: one UNWIND statement per batch merges
# the User and Item nodes and creates the RATED relationship for every row in
# it, instead of issuing three separate statements per rating.
# NOTE: the relationships are written with CREATE, so running this cell again
# adds a second copy of every RATED relationship.
RATING_BATCH_SIZE = 1000

umap = preprocessed_data["umap"]
smap = preprocessed_data["smap"]

# .tolist() yields plain Python values (str / float / int), which can be
# passed to the driver as query parameters. Node ids are stored as strings.
rating_rows = [
    {
        "user_id": str(umap[uid]),
        "item_id": str(smap[sid]),
        "rating": rating,
        "timestamp": timestamp,
    }
    for uid, sid, rating, timestamp in zip(
        filtered_df["uid"].tolist(),
        filtered_df["sid"].tolist(),
        filtered_df["rating"].tolist(),
        filtered_df["timestamp"].tolist(),
    )
    if uid in umap and sid in smap
]

query = """UNWIND $rows AS row
MERGE (u:User {id: row.user_id})
MERGE (i:Item {id: row.item_id})
CREATE (u)-[:RATED { rating: row.rating, timestamp: row.timestamp } ]->(i)"""

for start in range(0, len(rating_rows), RATING_BATCH_SIZE):
    graph.run(query, rows=rating_rows[start:start + RATING_BATCH_SIZE])
    print(start)  # progress: offset of the batch that was just written

# Verification: count the User nodes, the Item nodes and the RATED relationships
for query in (
    """MATCH (n:User)
RETURN COUNT(n) AS node_count""",
    """MATCH (n:Item)
RETURN COUNT(n) AS node_count""",
    """MATCH (u:User)-[r:RATED]->(i:Item)
RETURN COUNT(r) AS relationship_count;""",
):
    print(graph.run(query))

In [13]:
# Add bidirectional relationships between items and brands to the graph
def connectBrand(item_id, brand_name):
    """Link an item and its brand in both directions.

    Merges a ``Brand`` node whose ``id`` is ``str(brand_name)``, then creates
    ``(Item)-[:BRAND_IS]->(Brand)`` and ``(Brand)-[:BRAND_INCLUDES]->(Item)``.
    Both relationships are written by a single statement, so the two
    directions are always created together. If no ``Item`` node matches,
    no relationship is created (the ``Brand`` node is still merged).

    Args:
        item_id: Item identifier; matched as ``str(item_id)`` against ``Item.id``.
        brand_name: Brand name; ``str(brand_name)`` becomes the ``Brand`` node id.
    """
    brand_id = str(brand_name)
    graph.merge(Node("Brand", id=brand_id), "Brand", "id")

    query = (
        'MATCH (i:Item {id: $item_id}), (b:Brand {id: $brand_id}) '
        'CREATE (i)-[:BRAND_IS]->(b), (b)-[:BRAND_INCLUDES]->(i)'
    )
    graph.run(query, item_id=str(item_id), brand_id=brand_id)


# Add bidirectional relationships between items and categories to the graph
def connectCats(item_id, cat_name):
    """Link an item and its category in both directions.

    Merges a ``Category`` node whose ``id`` is ``str(cat_name)``, then creates
    ``(Item)-[:CATEGORY_IS]->(Category)`` and
    ``(Category)-[:CATEGORY_INCLUDES]->(Item)``. Both relationships are
    written by a single statement, so the two directions are always created
    together. If no ``Item`` node matches, no relationship is created (the
    ``Category`` node is still merged).

    Args:
        item_id: Item identifier; matched as ``str(item_id)`` against ``Item.id``.
        cat_name: Category name; ``str(cat_name)`` becomes the ``Category`` node id.
    """
    cat_id = str(cat_name)
    graph.merge(Node("Category", id=cat_id), "Category", "id")

    query = (
        'MATCH (i:Item {id: $item_id}), (c:Category {id: $cat_id}) '
        'CREATE (i)-[:CATEGORY_IS]->(c), (c)-[:CATEGORY_INCLUDES]->(i)'
    )
    graph.run(query, item_id=str(item_id), cat_id=cat_id)


# Shared implementation of the four item -> item connectors below
def _connect_items(item_id, related_asins, rel_type):
    """Create ``(item)-[:<rel_type>]->(other)`` for every known related item.

    Entries of ``related_asins`` that are not keys of
    ``preprocessed_data["smap"]`` are skipped; the remaining ones are
    translated through that mapping and sent in a single ``UNWIND``
    statement, i.e. one round trip per source item instead of one per
    related item. One relationship is created per remaining entry, so a
    repeated entry yields a repeated relationship.

    Args:
        item_id: Id of the source item; matched as ``str(item_id)``.
        related_asins: Iterable of raw item keys; ``None`` is treated as empty.
        rel_type: Relationship type. It is spliced into the query text because
            Cypher cannot take a relationship type as a parameter, so it must
            only ever be one of the fixed names used by the wrappers below.
    """
    smap = preprocessed_data["smap"]
    target_ids = [str(smap[asin]) for asin in (related_asins or ()) if asin in smap]
    if not target_ids:
        return  # nothing to link, skip the round trip

    query = (
        'MATCH (i1:Item {id: $item1_id}) '
        'UNWIND $item2_ids AS item2_id '
        'MATCH (i2:Item {id: item2_id}) '
        'CREATE (i1)-[:' + rel_type + ']->(i2)'
    )
    graph.run(query, item1_id=str(item_id), item2_ids=target_ids)


# Add unidirectional relationships between an item and its also_bought items to the graph
def connectAlsoBought(item_id, also_bought):
    """Create ALSO_BOUGHT relationships from ``item_id`` to each known item in ``also_bought``."""
    _connect_items(item_id, also_bought, "ALSO_BOUGHT")


# Add unidirectional relationships between an item and its also_viewed items to the graph
def connectAlsoViewed(item_id, also_viewed):
    """Create ALSO_VIEWED relationships from ``item_id`` to each known item in ``also_viewed``."""
    _connect_items(item_id, also_viewed, "ALSO_VIEWED")


# Add unidirectional relationships between an item and its bought_together items to the graph
def connectBoughtTogether(item_id, bought_together):
    """Create BOUGHT_TOGETHER relationships from ``item_id`` to each known item in ``bought_together``."""
    _connect_items(item_id, bought_together, "BOUGHT_TOGETHER")


# Add unidirectional relationships between an item and its buy_after_viewing items to the graph
def connectBuyAfterViweing(item_id, buy_after_viewing):
    """Create BUY_AFTER_VIEWING relationships from ``item_id`` to each known item in ``buy_after_viewing``.

    The misspelled name is kept so existing callers keep working.
    """
    _connect_items(item_id, buy_after_viewing, "BUY_AFTER_VIEWING")


# Correctly spelled alias for the function above
connectBuyAfterViewing = connectBuyAfterViweing

In [None]:
# Add all other relationships between items to the graph
metadata_path = folder_path + "meta_Beauty.json.gz"

# Connector to call for each key of a record's 'related' dict, in call order.
# 'buy_after_viewing' is left out on purpose: the call to
# connectBuyAfterViweing was commented out in the original version of this cell.
related_connectors = (
    ("also_bought", connectAlsoBought),
    ("also_viewed", connectAlsoViewed),
    ("bought_together", connectBoughtTogether),
)

count = 0  # number of metadata records that belong to a known item

with gzip.open(metadata_path, 'rb') as f:
    for raw_line in f:
        # The file is downloaded data, so it is parsed with ast.literal_eval,
        # which only accepts literals, rather than with eval, which would run
        # any expression found in the file.
        # NOTE(review): assumes every line is a plain Python literal - confirm.
        record = ast.literal_eval(raw_line.decode('utf-8'))

        asin = record['asin'].strip()
        if asin not in preprocessed_data["smap"]:
            continue  # item is not part of the preprocessed dataset
        item_id = preprocessed_data["smap"][asin]

        if 'brand' in record and len(record['brand']) > 0:
            connectBrand(item_id, record['brand'].strip())

        if 'categories' in record and len(record['categories']) > 0:
            # Only the first category list is used, joined into one "a|b|c" id
            connectCats(item_id, "|".join(record['categories'][0]))

        if 'related' in record and len(record['related']) > 0:
            related = record['related']
            for related_key, connect in related_connectors:
                if related_key in related:
                    connect(item_id, related[related_key])

        if count % 1000 == 0:
            print(count)
        count += 1

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000


In [None]:
# Verification: count the relationships of every type created from the item metadata
count_template = "MATCH (%s)-[r:%s]->(%s)\nRETURN COUNT(r) AS relationship_count;"

# (start node pattern, relationship type, end node pattern), one entry per count
relationship_patterns = (
    ("i:Item", "BRAND_IS", "b:Brand"),
    ("b:Brand", "BRAND_INCLUDES", "i:Item"),
    ("i:Item", "CATEGORY_IS", "c:Category"),
    ("c:Category", "CATEGORY_INCLUDES", "i:Item"),
    ("i1:Item", "ALSO_BOUGHT", "i2:Item"),
    ("i1:Item", "ALSO_VIEWED", "i2:Item"),
    ("i1:Item", "BOUGHT_TOGETHER", "i2:Item"),
    ("i1:Item", "BUY_AFTER_VIEWING", "i2:Item"),
)

for pattern in relationship_patterns:
    query = count_template % pattern
    print(graph.run(query))

 relationship_count 
--------------------
               9995 

 relationship_count 
--------------------
               9995 

 relationship_count 
--------------------
              12086 

 relationship_count 
--------------------
              12086 

 relationship_count 
--------------------
             239078 

 relationship_count 
--------------------
             155103 

 relationship_count 
--------------------
               9009 

 relationship_count 
--------------------
                  0 



In [5]:
# Retaining only the most recent relationship between nodes based on timestamp
#
# For every ordered node pair (a, b) the query sorts the relationships a -> b
# by r.timestamp in descending order, collects them per pair, and for pairs
# holding more than one relationship deletes everything after the first entry.
#
# NOTE(review): the MATCH pattern names no relationship type, so relationships
# of different types between the same pair (for example ALSO_BOUGHT and
# ALSO_VIEWED from one item to another) land in the same collection and only
# one of them is kept. RATED is the only relationship this notebook creates
# with a timestamp property - confirm whether the pattern was meant to be
# limited to RATED (or grouped by relationship type).
query = """MATCH (a)-[r]->(b)
WITH a, b, r
ORDER BY r.timestamp DESC
WITH a, b, COLLECT(r) AS rels
WHERE SIZE(rels) > 1
FOREACH (rel IN rels[1..] | DELETE rel)
"""

# The statement has no RETURN clause; the recorded run printed "(No data)".
print(graph.run(query))

(No data)
