<a href="https://colab.research.google.com/github/Sarztak/WindyGraph-A-graph-based-recommender-for-restaurants-in-Chicago/blob/main/WindyGraphs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:


# ## ==== DO NOT RUN ===== ##
# API_KEY = ""  # Replace with your actual key
# HEADERS = {"Authorization": f"Bearer {API_KEY}"}

# # Step 1: Search for businesses
# def search_businesses(term="restaurants", location="Chicago", limit=5):
#     url = "https://api.yelp.com/v3/businesses/search"
#     params = {"term": term, "location": location, "limit": limit}
#     response = requests.get(url, headers=HEADERS, params=params)
#     response.raise_for_status()
#     return response.json().get("businesses", [])

# # Step 2: Get reviews for each business
# def get_reviews(business_id):
#     url = f"https://api.yelp.com/v3/businesses/{business_id}/reviews"
#     response = requests.get(url, headers=HEADERS)
#     response.raise_for_status()
#     return response.json().get("reviews", [])

# # Step 3: Query and show results
# def fetch_data(term="restaurants", location="Chicago", limit=200):
#     businesses = search_businesses(term, location, limit)
#     return businesses

# def save_to_json(data, filename="yelp_data.json"):
#     output_path = Path("data") / filename
#     output_path.parent.mkdir(parents=True, exist_ok=True)
#     with open(output_path, "w", encoding="utf-8") as f:
#         json.dump(data, f, indent=2, ensure_ascii=False)
#     print(f"✅ Data saved to {output_path}")

# # Test
# if __name__ == "__main__":
#     data = fetch_data(limit=20)
#     save_to_json(data, filename="/content/drive/MyDrive/yelp_data.json")
#     # print(json.dumps(data, indent=2))


In [24]:
# with open('/content/drive/MyDrive/yelp_data.json', 'r') as f:
#     data = json.load(f)


In [25]:
import requests
import json
import time
from pathlib import Path
import pandas as pd


# Load the JSON data
with open('/content/drive/MyDrive/yelp_data.json', 'r') as f:
    restaurants = json.load(f)


# # Extract relevant features from the data
processed_data = []
for restaurant in restaurants:
    name = restaurant.get('name', '')
    coordinates = restaurant.get('coordinates', {})
    location = restaurant.get('location', {})
    address = location.get('display_address', [])
    latitude = coordinates.get('latitude', None)
    longitude = coordinates.get('longitude', None)
    categories = restaurant.get('categories', [])
    rating = restaurant.get('rating', None)
    review_count = restaurant.get('review_count', None)

    # Flatten categories into a list of strings
    categories = [category['title'] for category in categories]


    processed_data.append({
        'name': name,
        'address': address,
        'latitude': latitude,
        'longitude': longitude,
        'categories': categories,
        'rating': rating,
        'review_count': review_count
    })

df = pd.DataFrame(processed_data)
df.head()

Unnamed: 0,name,address,latitude,longitude,categories,rating,review_count
0,Girl & The Goat,"[809 W Randolph, Chicago, IL 60607]",41.884193,-87.647946,"[New American, Bars, Bakeries]",4.4,10428
1,The Purple Pig,"[444 N Michigan Ave, Chicago, IL 60611]",41.890694,-87.624782,"[Tapas/Small Plates, Mediterranean, New American]",4.3,8787
2,Cindy's Rooftop,"[12 S Michigan Ave, Chicago, IL 60603]",41.881689,-87.625006,"[New American, Seafood, Breakfast & Brunch]",4.1,2659
3,Penumbra,"[3309 W Fullerton Ave, Chicago, IL 60647]",41.924426,-87.710898,"[Wine Bars, Seafood, Steakhouses]",4.8,981
4,The Dearborn,"[145 N Dearborn St, Chicago, IL 60602]",41.884253,-87.629315,"[New American, Breakfast & Brunch, Beer, Wine ...",4.4,2532


In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          20 non-null     object 
 1   address       20 non-null     object 
 2   latitude      20 non-null     float64
 3   longitude     20 non-null     float64
 4   categories    20 non-null     object 
 5   rating        20 non-null     float64
 6   review_count  20 non-null     int64  
dtypes: float64(3), int64(1), object(3)
memory usage: 1.2+ KB


In [27]:
!pip install torch-geometric > info # redirect the output to something else to avoid std out


# Making the graph

In [28]:
import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/result_df_allzip.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6457 entries, 0 to 6456
Data columns (total 27 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        6457 non-null   object 
 1   alias                     6457 non-null   object 
 2   name                      6457 non-null   object 
 3   image_url                 5323 non-null   object 
 4   is_closed                 6457 non-null   bool   
 5   url                       6457 non-null   object 
 6   review_count              6457 non-null   int64  
 7   categories                6457 non-null   object 
 8   rating                    6457 non-null   float64
 9   transactions              6457 non-null   object 
 10  price                     3540 non-null   object 
 11  phone                     6122 non-null   float64
 12  display_phone             6122 non-null   object 
 13  distance                  6457 non-null   float64
 14  query_zi

In [29]:
import re
df.transactions = df.transactions.apply(lambda x: re.findall('\w+', x))
transaction_types = set()
for lt in df.transactions:
    if lt:
        transaction_types = transaction_types | set(lt)
transaction_types

{'delivery', 'pickup', 'restaurant_reservation'}

In [30]:
import torch
from torch_geometric.data import HeteroData
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import numpy as np

data = HeteroData()

restaurant_id_map = {name: i for i, name in enumerate(df['name'])}


def encode_transactions(trans_list):
    return {
        'pickup': int('pickup' in trans_list),
        'delivery': int('delivery' in trans_list),
        'restaurant_reservation': int('restaurant_reservation' in trans_list)  # if present
    }

df_txn = df['transactions'].apply(lambda x: encode_transactions(x if isinstance(x, list) else []))
df_txn = pd.DataFrame(df_txn.tolist())
df = pd.concat([df, df_txn], axis=1)

continuous_cols = ['coordinates.latitude', 'coordinates.longitude',
                   'rating', 'review_count']
binary_cols = ['pickup', 'delivery', 'restaurant_reservation']  # no scaling needed

# Scale only continuous columns
scaled_continuous = MinMaxScaler().fit_transform(df[continuous_cols])

# Concatenate scaled + unscaled features
restaurant_features = np.concatenate([
    scaled_continuous,
    df[binary_cols].values  # keep as 0/1
], axis=1)

# Convert to tensor
data['restaurant'].x = torch.tensor(restaurant_features, dtype=torch.float)


In [31]:
import ast  # safely evaluates string representations of Python objects

# Step 1: Convert string to list of dicts using ast.literal_eval
df['categories'] = df['categories'].apply(ast.literal_eval)

# Step 2: Extract list of 'title' strings from each dict
df['category_titles'] = df['categories'].apply(lambda cat_list: [d['title'] for d in cat_list])

# Step 3: Explode the list into multiple rows
df_exploded = df.explode('category_titles').reset_index(drop=True)

# Optional: Rename for clarity
df_exploded.rename(columns={'category_titles': 'category'}, inplace=True)

df_exploded[['name', 'category']].head()


Unnamed: 0,name,category
0,Jamaica Jerk Villa,Caribbean
1,Italian Fiesta Pizzeria,Pizza
2,Italian Fiesta Pizzeria,Italian
3,Jamison's Soul Food,Southern
4,Jamison's Soul Food,Soul Food


In [32]:
all_categories = set(df_exploded.category.values)
category_id_map = {cat: i for i, cat in enumerate(all_categories)}
data['category'].x = torch.eye(len(all_categories))  # identity features

# === Add Edges: restaurant -> category ===
edge_index_rest_cat = [[], []]
edge_index_cat_rest = [[], []]

for _, row in df_exploded.iterrows():
    rest_name = row['name']
    cat_name = row['category']

    # Skip if not found (sanity check)
    if rest_name not in restaurant_id_map or cat_name not in category_id_map:
        continue

    rest_id = restaurant_id_map[rest_name]
    cat_id = category_id_map[cat_name]

    # Add edge from restaurant to category
    edge_index_rest_cat[0].append(rest_id)
    edge_index_rest_cat[1].append(cat_id)

    # Add edge from category to restaurant
    edge_index_cat_rest[0].append(cat_id)
    edge_index_cat_rest[1].append(rest_id)


edge_tensor = torch.tensor(edge_index_rest_cat, dtype=torch.long)
reverse_edge_tensor = torch.tensor(edge_index_cat_rest, dtype=torch.long)

# Register in PyG HeteroData object
data['restaurant', 'has_category', 'category'].edge_index = edge_tensor
data['category', 'has_reverse_category', 'restaurant'].edge_index = reverse_edge_tensor


In [33]:

# === Simulate Users ===
num_users = 5
user_features = torch.eye(num_users)  # dummy features
data['user'].x = user_features

# === Add Edges: user -> restaurant === (simulate a few preferences)
import random
user_rest_edges = [[], []]
for u in range(num_users):
    liked_restaurants = random.sample(range(len(df)), 3)
    for r in liked_restaurants:
        user_rest_edges[0].append(u)
        user_rest_edges[1].append(r)

data['user', 'likes', 'restaurant'].edge_index = torch.tensor(user_rest_edges, dtype=torch.long)

print(data)


HeteroData(
  restaurant={ x=[6457, 7] },
  category={ x=[417, 417] },
  user={ x=[5, 5] },
  (restaurant, has_category, category)={ edge_index=[2, 11856] },
  (category, has_reverse_category, restaurant)={ edge_index=[2, 11856] },
  (user, likes, restaurant)={ edge_index=[2, 15] }
)


In [34]:
print(data['restaurant', 'has_category', 'category'].edge_index[:, :5])


tensor([[   0, 1486, 1486,    2,    2],
        [  64,  341,   56,  369,   19]])


In [35]:
data['category', 'has_reverse_category', 'restaurant'].edge_index[:, :5]


tensor([[  64,  341,   56,  369,   19],
        [   0, 1486, 1486,    2,    2]])

In [36]:
import torch
import torch.nn.functional as F
from torch.nn import Linear, ModuleDict
from torch_geometric.nn import HeteroConv, SAGEConv

class WindyGraphGNN(torch.nn.Module):
    def __init__(self, metadata, hidden_channels=64):
        super().__init__()

        # Define per-relation GNN layers
        self.convs = torch.nn.ModuleList()
        self.convs.append(
            HeteroConv({
                ('restaurant', 'has_category', 'category'): SAGEConv((-1, -1), hidden_channels),
                ('category', 'has_reverse_category', 'restaurant'): SAGEConv((-1, -1), hidden_channels),
                ('user', 'likes', 'restaurant'): SAGEConv((-1, -1), hidden_channels),
            }, aggr='sum')
        )

        self.lin_dict = ModuleDict()
        for node_type in metadata[0]:
            self.lin_dict[node_type] = Linear(hidden_channels, hidden_channels)

    def forward(self, x_dict, edge_index_dict):
        x_dict = self.convs[0](x_dict, edge_index_dict)

        # Apply linear layers to each node type
        for node_type in x_dict:
            x_dict[node_type] = self.lin_dict[node_type](x_dict[node_type])
            x_dict[node_type] = F.relu(x_dict[node_type])

        return x_dict


class LinkPredictor(torch.nn.Module):
    def __init__(self, in_channels):
        super().__init__()
        self.lin = torch.nn.Sequential(
            torch.nn.Linear(in_channels, in_channels),
            torch.nn.ReLU(),
            torch.nn.Linear(in_channels, 1)
        )

    def forward(self, x_user, x_restaurant):
        x = x_user * x_restaurant  # element-wise interaction
        return self.lin(x).squeeze(-1)  # (batch,) instead of (batch, 1)

def get_positive_and_negative_edges(edge_index, num_users, num_restaurants, num_samples):
    pos_edges = edge_index.T  # shape: [num_pos, 2]

    # Generate random negative edges
    neg_edges = []
    while len(neg_edges) < num_samples:
        u = torch.randint(0, num_users, (1,))
        r = torch.randint(0, num_restaurants, (1,))
        if not ((edge_index[0] == u) & (edge_index[1] == r)).any():
            neg_edges.append([u.item(), r.item()])

    neg_edges = torch.tensor(neg_edges).T  # shape: [2, num_samples]
    return pos_edges.T, neg_edges

In [42]:
model = WindyGraphGNN(metadata=data.metadata(), hidden_channels=64)
predictor = LinkPredictor(in_channels=64)
optimizer = torch.optim.Adam(list(model.parameters()) + list(predictor.parameters()), lr=0.005)
model.train()
out = model(data.x_dict, data.edge_index_dict)

In [44]:
out.keys()

dict_keys(['category', 'restaurant'])

In [37]:
model = WindyGraphGNN(metadata=data.metadata(), hidden_channels=64)
predictor = LinkPredictor(in_channels=64)
optimizer = torch.optim.Adam(list(model.parameters()) + list(predictor.parameters()), lr=0.005)

for epoch in range(3):
    model.train()
    predictor.train()

    # 1. Forward pass: get node embeddings
    out = model(data.x_dict, data.edge_index_dict)
    print(out)
    user_emb = out['user']
    rest_emb = out['restaurant']

    # 2. Get positive and negative samples
    edge_index = data['user', 'likes', 'restaurant'].edge_index
    pos_edge, neg_edge = get_positive_and_negative_edges(edge_index, user_emb.size(0), rest_emb.size(0), edge_index.size(1))

    # 3. Compute scores
    pos_scores = predictor(user_emb[pos_edge[0]], rest_emb[pos_edge[1]])
    neg_scores = predictor(user_emb[neg_edge[0]], rest_emb[neg_edge[1]])

    # 4. Compute loss
    pos_labels = torch.ones_like(pos_scores)
    neg_labels = torch.zeros_like(neg_scores)
    loss = F.binary_cross_entropy_with_logits(torch.cat([pos_scores, neg_scores]), torch.cat([pos_labels, neg_labels]))

    # 5. Backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f'Epoch {epoch:03d} | Loss: {loss.item():.4f}')


{'category': tensor([[0.0000, 0.0890, 0.0000,  ..., 0.3044, 0.0000, 0.0000],
        [0.0738, 0.0000, 0.0000,  ..., 0.1903, 0.0016, 0.0000],
        [0.0000, 0.0315, 0.0000,  ..., 0.2556, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0688, 0.0290,  ..., 0.2782, 0.0000, 0.0000],
        [0.0437, 0.0000, 0.0314,  ..., 0.2764, 0.0000, 0.0463],
        [0.0022, 0.0000, 0.0089,  ..., 0.2839, 0.0000, 0.0000]],
       grad_fn=<ReluBackward0>), 'restaurant': tensor([[0.1000, 0.0000, 0.3603,  ..., 0.4019, 0.0723, 0.1732],
        [0.1485, 0.0000, 0.3152,  ..., 0.3659, 0.0887, 0.2139],
        [0.0000, 0.0000, 0.2380,  ..., 0.0000, 0.2444, 0.0000],
        ...,
        [0.1271, 0.0000, 0.0959,  ..., 0.0000, 0.3027, 0.0000],
        [0.1337, 0.0000, 0.1026,  ..., 0.0000, 0.2914, 0.0000],
        [0.2848, 0.0000, 0.1870,  ..., 0.3207, 0.1871, 0.2459]],
       grad_fn=<ReluBackward0>)}




KeyError: 'user'