In [None]:
import random

import cairo
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from igraph import Graph, plot, RainbowPalette

In [None]:
def load_data(file_path, chunksize=100000):
    """Load a JSON Lines file into one DataFrame, reading it in chunks.

    Parameters
    ----------
    file_path : str or file-like
        Passed straight to ``pd.read_json`` (called with ``lines=True``).
    chunksize : int, default 100000
        Number of lines parsed per chunk.

    Returns
    -------
    pd.DataFrame
        The rows of every chunk, in file order. The index is not reset, so
        row labels are whatever the chunked reader assigned. An empty
        DataFrame is returned when the reader yields no chunks.
    """
    reader = pd.read_json(file_path, lines=True, chunksize=chunksize)
    # Collect the chunks and concatenate once. Concatenating onto the
    # accumulated frame inside the loop re-copies all earlier rows on every
    # iteration, which is quadratic in the number of chunks.
    chunks = list(reader)
    if not chunks:
        # pd.concat raises on an empty list; keep the "empty frame" result.
        return pd.DataFrame()
    return pd.concat(chunks)

### Attention

To load both `datasets/yelp_academic_dataset_user.json` and `datasets/yelp_academic_dataset_review.json`, use the `load_data` function defined above; the other files can be read directly with `pd.read_json`.

In [None]:
# The business file is read in a single pd.read_json call (no chunking).
business = pd.read_json(
    'datasets/yelp_academic_dataset_business.json',
    lines=True,
)

In [None]:
# Read the user file through the chunked loader.
users = load_data(file_path='datasets/yelp_academic_dataset_user.json')

In [None]:
# Read the review file through the chunked loader.
reviews = load_data(file_path='datasets/yelp_academic_dataset_review.json')

In [None]:
# Report (rows, columns) for each of the three raw frames, one per line.
print(
    f"user shape: {users.shape}",
    f"review shape: {reviews.shape}",
    f"business shape: {business.shape}",
    sep="\n",
)

In [None]:
# Keep only the businesses whose city is exactly 'Tucson'; the copy makes the
# subset independent of the full `business` frame.
tucson_df = business.loc[business['city'].eq('Tucson')].copy(deep=True)
print(f"tucson shape: {tucson_df.shape}")

In [None]:
# Preview the first five Tucson businesses.
tucson_df.head(n=5)

In [None]:
# Reviews written about any of the Tucson businesses.
reviews_df = reviews.loc[
    reviews['business_id'].isin(tucson_df['business_id'])
].copy(deep=True)
print(f"reviews shape: {reviews_df.shape}")

In [None]:
# Users who authored at least one of the Tucson reviews.
users_df = users.loc[
    users['user_id'].isin(reviews_df['user_id'])
].copy(deep=True)
print(f"users shape: {users_df.shape}")

In [None]:
# Save the data
# Uncomment the lines below to write the Tucson subsets to disk. The cells
# under "Start here" read data/tucson_reviews.csv and
# data/tucson_business.csv, so those files must exist before running them.
# tucson_df.to_csv('data/tucson_business.csv', index=False)
# reviews_df.to_csv('data/tucson_reviews.csv', index=False)
# users_df.to_csv('data/tucson_users.csv', index=False)

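# Start here

The cells below only read the filtered Tucson CSV files from `data/` (see the commented-out save cell above), so they do not need the full Yelp JSON files to be loaded first.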

In [None]:
# Load the Tucson review subset from its CSV file and preview five rows.
review_tucs = pd.read_csv(
    'data/tucson_reviews.csv',
)
review_tucs.head(n=5)

In [None]:
# Load the Tucson business subset from its CSV file and preview five rows.
# Note: this rebinds `tucson_df`, replacing any frame built earlier in the
# session with the on-disk version.
tucson_df = pd.read_csv(
    'data/tucson_business.csv',
)
tucson_df.head(n=5)

In [None]:
# Select the Tucson businesses whose `categories` string mentions restaurants.
# The mask is computed once and reused; `na=False` maps rows with a missing
# `categories` value to False (the job the old `== True` comparison did).
is_restaurant = tucson_df['categories'].str.contains(
    'Restaurants|Restaurant|restaurants|restaurant', regex=True, na=False
)
restaurants_df = tucson_df[is_restaurant].reset_index(drop=True).copy(deep=True)
print(restaurants_df.shape)

In [None]:
# Reviews that belong to one of the selected restaurants, re-indexed from 0.
reviews_rest_df = (
    review_tucs.loc[review_tucs['business_id'].isin(restaurants_df['business_id'])]
    .reset_index(drop=True)
    .copy(deep=True)
)
print(f"reviews shape: {reviews_rest_df.shape}")
reviews_rest_df.head(n=5)

In [None]:
# Empty undirected graph; the following cell adds the vertices and edges.
g = Graph(directed=False)

In [None]:
# Build the user / review / business graph from scratch. The graph is
# re-created here so that re-running this cell does not append a second copy
# of every vertex and edge to an already populated `g`.
g = Graph(directed=False)

unique_users = reviews_rest_df['user_id'].unique()
unique_business = reviews_rest_df['business_id'].unique()
unique_reviews = reviews_rest_df['review_id'].unique()

# Vertices go in as three contiguous batches (users, businesses, reviews);
# the 'type' list below relies on exactly this insertion order.
g.add_vertices(unique_users)
g.add_vertices(unique_business)
g.add_vertices(unique_reviews)

g.vs['type'] = (
    ['user'] * len(unique_users)
    + ['business'] * len(unique_business)
    + ['review'] * len(unique_reviews)
)

# Every review row contributes two edges: user - review and review - business.
g.add_edges(list(zip(reviews_rest_df['user_id'], reviews_rest_df['review_id'])))
g.add_edges(list(zip(reviews_rest_df['review_id'], reviews_rest_df['business_id'])))

In [None]:
# Multilevel community detection is randomized, so without a seed the number
# of communities can differ between runs. python-igraph draws its random
# numbers from Python's `random` module by default, so seeding it here makes
# the result repeatable.
random.seed(42)
communities = g.community_multilevel()

print(f"Number of communities: {len(communities)}")

In [None]:
# Compute an "fr" layout once; `layout` is reused by the community plot.
layout = g.layout("fr")

# Draw the whole graph on an 800x800 canvas.
plot(
    g,
    layout=layout,
    vertex_size=10,
    vertex_label_size=8,
    vertex_label_dist=1,
    edge_arrow_size=0.5,
    bbox=(800, 800),
)



In [None]:
# Draw the clustering with the existing `layout`, using a rainbow palette
# sized to the number of communities.
plot(
    communities,
    layout=layout,
    vertex_size=5,
    vertex_label_size=5,
    bbox=(1000, 1000),
    margin=100,
    palette=RainbowPalette(n=len(communities)),
)

In [None]:
# Recompute the "fr" layout (rebinding `layout`) and redraw the full graph
# with the same drawing options as the earlier whole-graph plot.
layout = g.layout("fr")
plot(
    g,
    layout=layout,
    vertex_size=10,
    vertex_label_size=8,
    vertex_label_dist=1,
    edge_arrow_size=0.5,
    bbox=(800, 800),
)
