In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import dask
import dask.dataframe as dd
import time
import os
from IPython.display import display
import ast
import warnings
warnings.filterwarnings('ignore')
# import sweetviz as sv


# print(dask.__version__)

In [2]:
# Convert JSON to parquet files

# Paths to save the Parquet files
# parquet_dir = "../../data/processed/"

# # 1. 
# # Convert pandas DataFrames to Parquet
# data_tip = pd.read_json(f"../../data/raw/yelp_academic_dataset_tip.json", lines=True)
# data_tip.to_parquet(os.path.join(parquet_dir, "yelp_academic_dataset_tip.parquet"))

# # Convert Dask DataFrames to Parquet
# data_businesses = dd.read_json(f"../../data/raw/yelp_academic_dataset_business.json", lines=True)
# data_businesses.to_parquet(os.path.join(parquet_dir, "yelp_academic_dataset_business.parquet"))

# data_checkin = dd.read_json(f"../../data/raw/yelp_academic_dataset_checkin.json", lines=True)
# data_checkin.to_parquet(os.path.join(parquet_dir, "yelp_academic_dataset_checkin.parquet"))

# data_user = dd.read_json(f"../../data/raw/yelp_academic_dataset_user.json", lines=True)
# data_user.to_parquet(os.path.join(parquet_dir, "yelp_academic_dataset_user.parquet"))

# # Read the large JSON file with Dask in chunks and convert to Parquet
# data_review = dd.read_json(f"../../data/raw/yelp_academic_dataset_review.json", lines=True, blocksize="64MB")
# data_review.to_parquet(os.path.join(parquet_dir, "yelp_academic_dataset_review.parquet"))



# Read and save sample data for larger files
# data_user = dd.read_json('../../data/raw/yelp_academic_dataset_user.json', lines=True, blocksize=40000000)  # ~40 MB blocksize
# data_user_sample = data_user.head(5000)  # Adjust the number of rows to read
# data_user_sample.to_parquet(os.path.join(parquet_dir, "yelp_academic_dataset_user_sample.parquet"))

# # Review Data
# data_review = dd.read_json(f"../../data/raw/yelp_academic_dataset_review.json", lines=True, blocksize=40000000)
# data_review = data_review.head(5000)
# data_review.to_parquet(os.path.join(parquet_dir, "yelp_academic_dataset_review_sample.parquet"))



In [3]:

# Open the processed parquet datasets lazily with Dask. Every frame except the
# tips is re-indexed on its identifier column right after being read.
data_businesses = dd.read_parquet(
    "../../data/processed/yelp_academic_dataset_business.parquet"
).set_index('business_id')

data_checkin = dd.read_parquet(
    "../../data/processed/yelp_academic_dataset_checkin.parquet"
).set_index('business_id')

# Tips keep the index they were stored with.
data_tip = dd.read_parquet("../../data/processed/yelp_academic_dataset_tip.parquet")

data_review_sample = dd.read_parquet(
    "../../data/processed/yelp_academic_dataset_review_sample.parquet"
).set_index('review_id')

data_user_sample = dd.read_parquet(
    "../../data/processed/yelp_academic_dataset_user_sample.parquet"
).set_index('user_id')

# Short key -> frame lookup; later cells iterate over it in this order.
df_dict = dict(
    bus_df=data_businesses,
    checkin_df=data_checkin,
    tip_df=data_tip,
    review_df=data_review_sample,
    user_df=data_user_sample,
)


In [4]:

# Cleaning plan: frame key -> {conversion kind -> columns that receive it}.
# The frame keys are the ones used in df_dict; the conversion kinds are the
# ones the processing loop dispatches on.
processes = {
    "bus_df": {"bool": ["is_open"], "dict": ["attributes", "hours"], "lists": ["categories"]},
    "checkin_df": {"date_time_list": ["date"]},
    "tip_df": {"date_time": ["date"], "string": ["text"]},
    # NOTE: yelping_since already displays as a datetime; it stays in the plan anyway.
    "user_df": {"date_time": ["yelping_since"], "lists": ["elite", "friends"]},
    "review_df": {"date_time": ["date"], "string": ["text"]},
}

def _parse_dict(value):
    """Turn one cell of a "dict" column into a Python object.

    * ``str``  -> parsed with ``ast.literal_eval`` (a malformed literal still
      raises, exactly as before).
    * ``dict`` -> returned unchanged. Previously such values were replaced by
      ``{}``, which discarded the data whenever the column already held
      mappings (for instance after this cell had been run once).
    * anything else, missing values included -> ``{}``.
    """
    if isinstance(value, dict):
        return value
    if isinstance(value, str):
        return ast.literal_eval(value)
    return {}


def _parse_list(value):
    """Turn one cell of a "lists" column into a list.

    * ``str`` -> ``value.split(', ')``.
    * list-like (list, tuple, array, ...) -> shallow copy as a list, so values
      that are already split survive a re-run of the cell.
    * anything else, missing values included -> ``[]``.

    The type is checked before anything else on purpose: ``pd.notna`` returns
    an array for list-like input, and using that array in a boolean context
    raises "truth value is ambiguous" for more than one element.
    """
    if isinstance(value, str):
        return value.split(', ')
    if pd.api.types.is_list_like(value) and not isinstance(value, dict):
        return list(value)
    return []


def _parse_datetime_list(value):
    """Turn one cell of a "date_time_list" column into parsed timestamps.

    * ``str`` -> the ', '-separated pieces parsed with ``pd.to_datetime``
      (fixed format, unparsable pieces coerced to ``NaT``); the result is a
      ``DatetimeIndex``, as before.
    * ``DatetimeIndex`` -> returned unchanged (already converted).
    * anything else, missing values included -> ``[]``.
    """
    if isinstance(value, pd.DatetimeIndex):
        return value
    if isinstance(value, str):
        return pd.to_datetime(
            value.split(', '), format="%Y-%m-%d %H:%M:%S", errors='coerce'
        )
    return []


# Conversion kinds that are applied element by element through Series.map.
_ELEMENT_PARSERS = {
    "date_time_list": _parse_datetime_list,
    "dict": _parse_dict,
    "lists": _parse_list,
}

# Conversion kinds handled with a whole-column operation in the loop below.
_COLUMN_KINDS = ("bool", "date_time", "string")

for df_name, df in df_dict.items():

    instructions = processes[df_name]

    for dtype, cols in instructions.items():
        # Fail loudly on a misspelt kind instead of silently skipping the columns.
        if dtype not in _COLUMN_KINDS and dtype not in _ELEMENT_PARSERS:
            raise ValueError(
                f"Unknown conversion kind {dtype!r} for {df_name!r}"
            )

        for col in cols:
            if dtype == "bool":
                df[col] = df[col].astype(bool)

            elif dtype == "date_time":
                df[col] = dd.to_datetime(df[col], errors='coerce')

            elif dtype == "string":
                df[col] = df[col].str.lower()

            else:
                # meta carries the real column name rather than a placeholder.
                df[col] = df[col].map(_ELEMENT_PARSERS[dtype], meta=(col, 'object'))

    # Column assignment updates the frame object that df_dict already holds,
    # so re-binding is not required:
    # df_dict[df_name] = df

# If you need to compute the results, you can do so here:
# for df_name in df_dict:
#     df_dict[df_name] = df_dict[df_name].compute()

In [5]:
# Save dataset

# # Assuming df_dict contains Dask DataFrames
# for name, ddf in df_dict.items():
#     ddf.to_parquet(f'../../data/processed/{name}.parquet')  # or engine='fastparquet'


In [6]:
# Print each frame's key followed by its materialised describe() table.
# checkin_df is left out: data_checkin.compute().describe() was taking too long.
for df_name, df in df_dict.items():
    if df_name == "checkin_df":
        continue
    print(df_name)
    display(df.compute().describe())
        

bus_df


Unnamed: 0,latitude,longitude,stars,review_count
count,150346.0,150346.0,150346.0,150346.0
mean,36.67115,-89.357339,3.596724,44.866561
std,5.872759,14.918502,0.974421,121.120136
min,27.555127,-120.095137,1.0,5.0
25%,32.187293,-90.35781,3.0,8.0
50%,38.777413,-86.121179,3.5,15.0
75%,39.954036,-75.421542,4.5,37.0
max,53.679197,-73.200457,5.0,7568.0


tip_df


Unnamed: 0,date,compliment_count
count,908915,908915.0
mean,2015-06-14 10:13:53.302817280,0.012525
min,2009-04-16 13:11:49,0.0
25%,2013-01-26 01:18:02.500000,0.0
50%,2015-03-15 01:10:25,0.0
75%,2017-08-02 06:19:55.500000,0.0
max,2022-01-19 20:38:55,6.0
std,,0.120763


review_df


Unnamed: 0,stars,useful,funny,cool,date
count,5000.0,5000.0,5000.0,5000.0,5000
mean,3.8386,0.8688,0.2524,0.34,2015-04-13 15:23:19.728400128
min,1.0,0.0,0.0,0.0,2005-03-12 03:47:06
25%,3.0,0.0,0.0,0.0,2013-10-25 00:11:22
50%,4.0,0.0,0.0,0.0,2015-09-23 23:04:05.500000
75%,5.0,1.0,0.0,0.0,2017-03-15 19:22:29
max,5.0,34.0,19.0,13.0,2018-10-04 18:10:01
std,1.357983,1.71947,0.904244,0.968601,


user_df


Unnamed: 0,review_count,yelping_since,useful,funny,cool,fans,average_stars,compliment_hot,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
count,4211.0,4211,4211.0,4211.0,4211.0,4211.0,4211.0,4211.0,4211.0,4211.0,4211.0,4211.0,4211.0,4211.0,4211.0,4211.0,4211.0,4211.0
mean,341.816671,2009-05-08 13:15:50.810496512,1275.876276,684.31489,883.991926,44.503681,3.830869,93.666588,11.84873,9.772738,8.834244,5.336737,56.616481,144.450724,128.450962,128.450962,48.126098,37.050344
min,1.0,2005-01-03 18:01:35,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,42.0,2008-03-15 06:30:46,60.0,15.0,19.0,2.0,3.6,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
50%,145.0,2009-06-18 21:29:13,244.0,79.0,101.0,10.0,3.83,5.0,2.0,1.0,0.0,0.0,7.0,9.0,10.0,10.0,4.0,1.0
75%,399.0,2010-08-03 00:38:08.500000,868.0,344.5,436.5,36.0,4.06,33.0,7.0,3.0,3.0,2.0,28.0,45.0,51.0,51.0,22.5,6.0
max,9941.0,2013-07-30 17:49:45,124311.0,91842.0,113069.0,3138.0,5.0,12391.0,3575.0,5662.0,1744.0,2261.0,4901.0,16674.0,13262.0,13262.0,6574.0,14045.0
std,582.808313,,4488.603479,2908.094407,3755.493255,142.928365,0.431512,433.07469,69.657934,98.640596,51.187551,44.008681,218.024305,714.616047,562.896788,562.896788,223.647099,342.897864
