# Heterogeneous Graph Generator
Restaurants + Users

In [1]:
res_graph_path = "../graphs/restaurants_skew.gpickle"
user_graph_path = "../graphs/2017-2018_user_network.gpickle"
user_per_res_path = "../datasets/2017-2018_visited_users.csv"

In [2]:
import pandas as pd
import numpy as np
import networkx as nx
import torch
import json 

## User Graph

In [3]:
user_G = nx.read_gpickle(user_graph_path)

In [4]:
print(f"Number of nodes: {user_G.number_of_nodes()}")
print(f"Number of edges: {user_G.number_of_edges()}")

Number of nodes: 579604
Number of edges: 1560849


In [5]:
list(user_G.nodes.data())[0]

('21v8vUQKyTw7KCzXU6gI3g',
 {'node_features': tensor([4.7813e-03, 6.7032e-04, 5.7544e-04, 3.4265e-04, 3.3014e-04, 5.5500e-01,
          0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 2.6095e-05,
          4.4025e-05, 0.0000e+00, 0.0000e+00, 6.4742e-05, 0.0000e+00]),
  'node_type': 'user'})

In [6]:
list(user_G.edges.data())[0]

('21v8vUQKyTw7KCzXU6gI3g', 'Tf1EmLLUZ2tlpOhaU2kvpg', {'edge_type': 'uu'})

## Restaurant Graph

In [7]:
restaurant_G = nx.read_gpickle(res_graph_path)

In [8]:
print(f"Number of nodes: {restaurant_G.number_of_nodes()}")
print(f"Number of edges: {restaurant_G.number_of_edges()}")
print(f"Number of features: {len(list(restaurant_G.nodes.data())[0][1]['node_feature'])}")

Number of nodes: 29963
Number of edges: 491371
Number of features: 537


In [9]:
list(restaurant_G.nodes.data())[0]

('6iYb2HFDywm3zjuRg0shjw',
 {'node_label': 2,
  'node_feature': tensor([1.0000, 1.0000, 1.0000, 1.0000, 2.0000, 0.0000, 1.0000, 1.0000, 1.0000,
          1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 1.0000,
          1.0000, 1.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.

In [10]:
list(restaurant_G.edges.data())[0]

('6iYb2HFDywm3zjuRg0shjw', 'DX6G8Vdu9wUx95Tzh6gEwA', {})

## Visited Users for Restaurants

In [11]:
visited_user_df = pd.read_csv(user_per_res_path)

In [12]:
visited_user_df.head(3)

Unnamed: 0,business_id,user_ids
0,--6COJIAjkQwSUZci_4PJQ,"['kkSI0sYOzMXBzofb17U8Qw', 'mRyIfVHIJN6wwR3hnT..."
1,--UNNdnHRhsyFUbDgumdtQ,"['386nVS_BRsZBG5k3tO1LeQ', 'DEjRvKAJWCrjCaEP-F..."
2,--bbZa1KPYSmW0X4o3TUQw,"['6tJZrZYLmXLBx7HrpFmN-Q', 'tBRlSyCXalfAxLH2j9..."


## Creating New Graph

In [13]:
G = nx.Graph()

In [14]:
# Adding Restaurants Nodes
restaurant_nodes = list(restaurant_G.nodes.data())
all_restaurant_ids = []

for node in restaurant_nodes:
    restaurant_id = node[0]
    restaurant_node_label = node[1]["node_label"]
    restaurant_node_feature = node[1]["node_feature"]
    
    G.add_node(restaurant_id,
               node_label=restaurant_node_label,
               node_feature=restaurant_node_feature,
               node_type="restaurant")
    all_restaurant_ids.append(restaurant_id)

In [15]:
# Adding User Nodes
user_nodes = list(user_G.nodes.data())
all_user_ids = []

for node in user_nodes:
    user_id = node[0]
    user_node_feature = node[1]["node_features"]
    
    G.add_node(user_id,
               node_label=0,
               node_feature=user_node_feature,
               node_type="user")
    all_user_ids.append(user_id)

In [16]:
# Adding Restaurant-Restaurant Edges
restaurant_edges = list(restaurant_G.edges.data())

for edge in restaurant_edges:
    node1 = edge[0]
    node2 = edge[1]
    
    G.add_edge(node1, node2, edge_type="rr") # rr means restaurant to restaurant

In [17]:
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

Number of nodes: 609371
Number of edges: 491371


In [18]:
# Adding User-User Edges
#user_edges = list(user_G.edges.data())

#for edge in user_edges:
#    node1 = edge[0]
#    node2 = edge[1]
#    
#    G.add_edge(node1, node2, edge_type="uu") # uu means user to user

In [19]:
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

Number of nodes: 609371
Number of edges: 491371


In [20]:
# Adding Restaurant-User Edges
all_user_ids2 = []

for index, row in visited_user_df.iterrows():
    restaurant_id = row["business_id"]
    user_ids = json.loads(row["user_ids"].replace("'", "\""))
    
    if restaurant_id in all_restaurant_ids:
        for user_id in user_ids:
            G.add_edge(user_id, restaurant_id, edge_type="ur") # ur means user to restaurant
            all_user_ids2.append(user_id)

In [21]:
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

Number of nodes: 609371
Number of edges: 1880887


In [22]:
a = set(all_user_ids)
b = set(all_restaurant_ids)
c = set(all_user_ids2)
print(len(a), len(b), len(c))

inter = c & b
print(len(inter))

579604 29963 579593
196


In [23]:
G.remove_nodes_from(list(inter))

In [24]:
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

Number of nodes: 609175
Number of edges: 1864204


## Write Graph to disk

In [25]:
nx.write_gpickle(G, "hetero_graph3.gpickle")