# Heterogeneous Graph Generator
Restaurants + Users

In [1]:
res_graph_path = "../graphs/restaurants_MA.gpickle"
user_graph_path = "../graphs/2017-2018_user_network.gpickle"
user_per_res_path = "../datasets/2017-2018_visited_users.csv"

In [2]:
import pandas as pd
import numpy as np
import networkx as nx
import torch
import json 

## Restaurant Graph

In [3]:
restaurant_G = nx.read_gpickle(res_graph_path)

In [4]:
print(f"Number of nodes: {restaurant_G.number_of_nodes()}")
print(f"Number of edges: {restaurant_G.number_of_edges()}")
print(f"Number of features: {len(list(restaurant_G.nodes.data())[0][1]['node_feature'])}")

Number of nodes: 6192
Number of edges: 130527
Number of features: 537


In [5]:
list(restaurant_G.edges.data())[0]

('jGennaZUr2MsJyRhijNBfA', 'az0DEo7NfL1Y8IKxLD1L_A', {})

## Visited Users for Restaurants

In [6]:
visited_user_df = pd.read_csv(user_per_res_path)

In [7]:
visited_user_df.head(3)

Unnamed: 0,business_id,user_ids
0,--6COJIAjkQwSUZci_4PJQ,"['kkSI0sYOzMXBzofb17U8Qw', 'mRyIfVHIJN6wwR3hnT..."
1,--UNNdnHRhsyFUbDgumdtQ,"['386nVS_BRsZBG5k3tO1LeQ', 'DEjRvKAJWCrjCaEP-F..."
2,--bbZa1KPYSmW0X4o3TUQw,"['6tJZrZYLmXLBx7HrpFmN-Q', 'tBRlSyCXalfAxLH2j9..."


## Creating Dictionnary of Restaurants visited by a user

In [8]:
restaurants_visited_by_user = {}

for index, row in visited_user_df.iterrows():
    restaurant_id = row["business_id"]
    user_ids = json.loads(row["user_ids"].replace("'", "\""))
    
    for user_id in user_ids:
        if user_id in restaurants_visited_by_user:
            restaurants_visited_by_user[user_id].append(restaurant_id)
        else:
            restaurants_visited_by_user[user_id] = [restaurant_id]

## Creating New Graph

In [9]:
G = nx.Graph()

In [10]:
# Adding Restaurants Nodes
restaurant_nodes = list(restaurant_G.nodes.data())
all_restaurant_ids = []

for node in restaurant_nodes:
    restaurant_id = node[0]
    restaurant_node_label = node[1]["node_label"]
    restaurant_node_feature = node[1]["node_feature"]
    
    G.add_node(restaurant_id,
               node_label=restaurant_node_label,
               node_feature=restaurant_node_feature,
               node_type="restaurant")
    all_restaurant_ids.append(restaurant_id)

In [11]:
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

Number of nodes: 6192
Number of edges: 0


In [12]:
all_restaurant_edges = []

for user_id, restaurant_ids in restaurants_visited_by_user.items():
    for i in range(len(restaurant_ids)):
        for j in range(i+1, len(restaurant_ids)):
            G.add_edge(restaurant_ids[i], restaurant_ids[j])
            all_restaurant_edges.append(restaurant_ids[i])
            all_restaurant_edges.append(restaurant_ids[j])

In [13]:
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

Number of nodes: 29991
Number of edges: 6228685


In [14]:
a = set(all_restaurant_ids)
b = set(all_restaurant_edges)

diff = b - a
print(len(diff))

23799


In [15]:
G.remove_nodes_from(list(diff))

In [16]:
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

Number of nodes: 6192
Number of edges: 1134956


## Write Graph to disk

In [17]:
nx.write_gpickle(G, "graph_visited_edges_MA.gpickle")