Gephi expects its node list and edge list in a specific format.

CloutBites is modelled as an undirected network, which is described by two tables:
- nodes and edges

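nodes -> restaurants (sized by rating, colored by New York borough) and reviewers (intended to be sized by review score; the code below currently gives every reviewer a fixed size of 1 and a grey colour)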

edges -> connections between reviewers and restaurants

In [1]:
# necessary imports

import pandas as pd
import json
from difflib import get_close_matches

In [4]:
# Inputs for the graph export.
#
# 1) The merged restaurant table: later cells read its name, rating and
#    BOROUGH columns to build the restaurant nodes.
restaurant_data = pd.read_csv('../outputs/merged_restaurant_data.csv')

# 2) The filtered reviews, stored as JSON. The file is read as UTF-8 text and
#    then parsed in one go.
with open('../outputs/filtered_reviews.json', 'r', encoding='utf-8') as f:
    review_data = json.loads(f.read())
    


In [7]:
# create restaurant nodes (left side of gephi bipartite graph)

# Titles of every reviewed restaurant. .get() is used so that a review entry
# without a 'title' key is skipped instead of raising KeyError.
restaurant_names = {entry.get('title') for entry in review_data if entry.get('title')}


def _is_reviewed(name):
    """Return True when *name* fuzzily matches at least one reviewed title.

    Non-string values (e.g. NaN from an empty CSV cell) never match; passing
    them to get_close_matches would raise a TypeError.
    """
    if not isinstance(name, str):
        return False
    # An exact hit always clears the 0.6 cutoff, so skip the fuzzy scan for it.
    if name in restaurant_names:
        return True
    return bool(get_close_matches(name, restaurant_names, n=1, cutoff=0.6))


# take the ones from the larger merged_restaurants file
filtered_restaurants = restaurant_data[restaurant_data['name'].apply(_is_reviewed)]

# create reviewer nodes
reviewers = {entry.get('name') for entry in review_data if entry.get('name')}

# gephi requires a CSV of nodes
restaurant_nodes = filtered_restaurants.copy()
restaurant_nodes['id'] = "R_" + restaurant_nodes['name']
restaurant_nodes['size'] = restaurant_nodes['rating']  # size is the rating
restaurant_nodes['colour'] = restaurant_nodes["BOROUGH"]
restaurant_nodes = restaurant_nodes[['id', "name", 'size', 'colour']]
# The id is derived from the name alone, so rows sharing a name would produce
# the same node id more than once; keep the first row for each id.
restaurant_nodes = restaurant_nodes.drop_duplicates(subset='id', keep='first')

# Sort the reviewers so the exported rows come out in the same order on every
# run (set iteration order for strings changes between interpreter sessions).
reviewer_names = sorted(reviewers)
reviewer_nodes = pd.DataFrame({
    "id": ["U_" + reviewer for reviewer in reviewer_names],
    "name": reviewer_names,
    "size": 1,
    "colour": "grey",
})

nodes = pd.concat([restaurant_nodes, reviewer_nodes], ignore_index=True)

nodes.to_csv('../outputs/gephi_nodes.csv', index=False)

# create edges (one row per review: reviewer -> restaurant, weighted by score)

# Edge targets have to use exactly the ids given to the restaurant nodes,
# i.e. "R_" + the name from the restaurant table. Review titles are therefore
# matched against those node names. Matching a title against the set of review
# titles would simply return the title itself and produce targets that do not
# exist in the node list whenever the two spellings differ.
restaurant_node_names = [
    name for name in restaurant_nodes['name'].unique() if isinstance(name, str)
]

# Many reviews share a title, so remember the match for each distinct title
# instead of repeating the fuzzy search.
title_to_node_name = {}

edges = []
for entry in review_data:
    title = entry.get('title')
    reviewer = entry.get('name')
    if not (title and reviewer):
        continue  # a review needs both a restaurant and a reviewer
    if title not in title_to_node_name:
        match = get_close_matches(title, restaurant_node_names, n=1, cutoff=0.6)
        title_to_node_name[title] = match[0] if match else None
    node_name = title_to_node_name[title]
    if node_name is None:
        continue  # no restaurant node for this title, so no edge
    edges.append(["U_" + reviewer, "R_" + node_name, entry['score']])
edges_df = pd.DataFrame(edges, columns=['source', 'target', 'weight'])
edges_df.to_csv('../outputs/gephi_edges.csv', index=False)

