In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import dask
import dask.dataframe as dd
import time
import os
from IPython.display import display
import ast

# import sweetviz as sv


# print(dask.__version__)

In [37]:
# Convert JSON to parquet files

# Paths to save the Parquet files
# parquet_dir = "../../data/processed/"

# # 1. 
# # Convert pandas DataFrames to Parquet
# data_tip = pd.read_json(f"../../data/raw/yelp_academic_dataset_tip.json", lines=True)
# data_tip.to_parquet(os.path.join(parquet_dir, "yelp_academic_dataset_tip.parquet"))

# # Convert Dask DataFrames to Parquet
# data_businesses = dd.read_json(f"../../data/raw/yelp_academic_dataset_business.json", lines=True)
# data_businesses.to_parquet(os.path.join(parquet_dir, "yelp_academic_dataset_business.parquet"))

# data_checkin = dd.read_json(f"../../data/raw/yelp_academic_dataset_checkin.json", lines=True)
# data_checkin.to_parquet(os.path.join(parquet_dir, "yelp_academic_dataset_checkin.parquet"))

# data_user = dd.read_json(f"../../data/raw/yelp_academic_dataset_user.json", lines=True)
# data_user.to_parquet(os.path.join(parquet_dir, "yelp_academic_dataset_user.parquet"))

# # Read the large JSON file with Dask in chunks and convert to Parquet
# data_review = dd.read_json(f"../../data/raw/yelp_academic_dataset_review.json", lines=True, blocksize="64MB")
# data_review.to_parquet(os.path.join(parquet_dir, "yelp_academic_dataset_review.parquet"))



# Read and save sample data for larger files
# data_user = dd.read_json('../../data/raw/yelp_academic_dataset_user.json', lines=True, blocksize=40000000)  # ~40 MB blocksize
# data_user_sample = data_user.head(5000)  # Adjust the number of rows to read
# data_user_sample.to_parquet(os.path.join(parquet_dir, "yelp_academic_dataset_user_sample.parquet"))

# # Review Data
# data_review = dd.read_json(f"../../data/raw/yelp_academic_dataset_review.json", lines=True, blocksize=40000000)
# data_review = data_review.head(5000)
# data_review.to_parquet(os.path.join(parquet_dir, "yelp_academic_dataset_review_sample.parquet"))



In [36]:

# Read each processed parquet dataset lazily with Dask and, where the table has
# a natural key column, make that column the index in the same expression.
data_businesses = dd.read_parquet(
    "../../data/processed/yelp_academic_dataset_business.parquet"
).set_index('business_id')

data_checkin = dd.read_parquet(
    "../../data/processed/yelp_academic_dataset_checkin.parquet"
).set_index('business_id')

# The tip table is loaded as-is; no index column is set for it.
data_tip = dd.read_parquet(
    "../../data/processed/yelp_academic_dataset_tip.parquet"
)

data_review_sample = dd.read_parquet(
    "../../data/processed/yelp_academic_dataset_review_sample.parquet"
).set_index('review_id')

data_user_sample = dd.read_parquet(
    "../../data/processed/yelp_academic_dataset_user_sample.parquet"
).set_index('user_id')

# Registry of the loaded frames; the keys are the short names used by the
# per-frame processing instructions in the next cell.
df_dict = dict(
    bus_df=data_businesses,
    checkin_df=data_checkin,
    tip_df=data_tip,
    review_df=data_review_sample,
    user_df=data_user_sample,
)


In [37]:

# Processing instructions: for every key of `df_dict`, map a conversion kind
# to the list of columns that conversion is applied to.
processes = {
    "bus_df": {
        "bool": ["is_open"],
        "dict": ["attributes", "hours"],
        "lists": ["categories"]
    },
    "checkin_df": {
        "date_time_list": ["date"]
    },
    "tip_df": {
        "date_time": ["date"],
        "string": ["text"]
    },
    "user_df": {
        "date_time": ["yelping_since"],  # may already be datetime; the conversion is harmless then
        "lists": ["elite", "friends"]
    },
    "review_df": {
        "date_time": ["date"],
        "string": ["text"]
    }
}


def _parse_dict(value):
    """Return `value` as a dict.

    Strings are parsed with `ast.literal_eval`; values that are already dicts
    are passed through unchanged (previously they were replaced by `{}`);
    anything else (None, NaN, ...) becomes an empty dict.
    """
    if isinstance(value, dict):
        return value
    # Test the type directly instead of calling `pd.notna` first: `pd.notna`
    # returns an array for list-like input, whose truth value is ambiguous.
    if isinstance(value, str):
        return ast.literal_eval(value)
    return {}


def _split_list(value):
    """Return `value` as a list.

    Strings are split on ', '; lists/tuples are passed through as a list (so
    re-running the cell on already converted data neither fails nor loses
    data); anything else becomes an empty list.
    """
    if isinstance(value, str):
        return value.split(', ')
    if isinstance(value, (list, tuple)):
        return list(value)
    return []


def _parse_datetime_list(parts):
    """Convert a list of timestamp strings with `pd.to_datetime`.

    Unparseable entries are coerced to NaT. Non-list input (e.g. a missing
    value produced by `.str.split`) yields an empty list.
    """
    if isinstance(parts, list):
        return pd.to_datetime(parts, format="%Y-%m-%d %H:%M:%S", errors='coerce')
    return []


def _convert_column(series, kind, col):
    """Apply the conversion named `kind` to `series` and return the new series.

    `col` is only used to label the Dask `meta` of element-wise maps so the
    lazy result carries the real column name.

    Raises:
        ValueError: if `kind` is not one of the supported conversion kinds.
    """
    meta = (col, 'object')
    if kind == "bool":
        return series.astype(bool)
    if kind == "date_time":
        return dd.to_datetime(series, errors='coerce')
    if kind == "date_time_list":
        return series.str.split(', ').map(_parse_datetime_list, meta=meta)
    if kind == "dict":
        return series.map(_parse_dict, meta=meta)
    if kind == "lists":
        return series.map(_split_list, meta=meta)
    if kind == "string":
        return series.str.lower()
    # A misspelled kind used to be skipped silently; fail loudly instead.
    raise ValueError(f"Unknown processing kind {kind!r} for column {col!r}")


for df_name, df in df_dict.items():
    for kind, cols in processes[df_name].items():
        for col in cols:
            # Item assignment updates the frame object itself, so the
            # module-level names (e.g. `data_tip`) see the converted columns
            # without rebinding `df_dict[df_name]`.
            df[col] = _convert_column(df[col], kind, col)

# If you need to compute the results, you can do so here:
# for df_name in df_dict:
#     df_dict[df_name] = df_dict[df_name].compute()

In [38]:
# Preview the first five tips; `text` is lower-cased by the processing loop above.
data_tip.head(n=5)

Unnamed: 0,user_id,business_id,text,date,compliment_count
0,AGNUgVwnZUey3gcPCJ76iw,3uLgwr0qeCNMjKenHJwPGQ,avengers time with the ladies.,2012-05-18 02:17:21,0
1,NBN4MgHP9D3cw--SnauTkA,QoezRbYQncpRqyrLH6Iqjg,they have lots of good deserts and tasty cuban...,2013-02-05 18:35:10,0
2,-copOvldyKh1qr-vzkDEvw,MYoRNLb5chwjQe3c_k37Gg,it's open even when you think it isn't,2013-08-18 00:56:08,0
3,FjMQVZjSqY8syIO-53KFKw,hV-bABTK-glh5wj31ps_Jw,very decent fried chicken,2017-06-27 23:05:38,0
4,ld0AperBXk1h6UbqmM80zw,_uN0OudeJ3Zl_tf6nxg5ww,appetizers.. platter special for lunch,2012-10-06 19:43:09,0
