# Yelp

## 1. Introduction

The purpose of this notebook is to implement an initial load and an incremental load for the BRONZE stage using JSON data.

## 2. Environment setup

In [0]:
%sql
-- Make training_catalog the current catalog for the statements that follow.
USE CATALOG training_catalog;

In [0]:
%sql
-- Make yelp_db the current schema so tables can be referenced unqualified.
USE SCHEMA yelp_db;

In [0]:
%sql
-- Confirm which catalog and schema are currently active.
SELECT
    current_catalog() AS current_catalog,
    current_schema() AS current_schema;

## 3. Batch data ingestion with CTAS

This method ingests the data as an initial load.

Steps:

1. Create table
2. Ingest data

### 3.1. Exploring data

There are five source data files.

* business.
* checkin.
* review.
* tip.
* user.

Additionally, initial files have the number "1" in their name.

Exploring files

In [0]:
%sql
-- List the contents of the training_files volume directory.
LIST "/Volumes/training_catalog/yelp_db/training_files"

#### Business

In [0]:
# Show five raw lines from the business files, read as plain text (no JSON parsing).
(
    spark
    .sql("SELECT * FROM text.`/Volumes/training_catalog/yelp_db/training_files/*business*` LIMIT 5")
    .display()
)

In [0]:
%sql
-- Sample five business records parsed as JSON; no explicit schema is supplied here.
SELECT
    *
FROM
    read_files("/Volumes/training_catalog/yelp_db/training_files/*business*", FORMAT => "JSON")
LIMIT 5;

#### Checkin

In [0]:
# Show five raw lines from the checkin files, read as plain text (no JSON parsing).
(
    spark
    .sql("SELECT * FROM text.`/Volumes/training_catalog/yelp_db/training_files/*checkin*` LIMIT 5")
    .display()
)

In [0]:
%sql
-- Sample five checkin records parsed as JSON; no explicit schema is supplied here.
SELECT
    *
FROM
    read_files("/Volumes/training_catalog/yelp_db/training_files/*checkin*", FORMAT => "JSON")
LIMIT 5;

#### Tip

In [0]:
# Show five raw lines from the tip files, read as plain text (no JSON parsing).
(
    spark
    .sql("SELECT * FROM text.`/Volumes/training_catalog/yelp_db/training_files/*tip*` LIMIT 5")
    .display()
)

In [0]:
%sql
-- Sample five tip records parsed as JSON; no explicit schema is supplied here.
SELECT
    *
FROM
    read_files("/Volumes/training_catalog/yelp_db/training_files/*tip*", FORMAT => "JSON")
LIMIT 5;

#### Review

In [0]:
# Show five raw lines from the review files, read as plain text (no JSON parsing).
(
    spark
    .sql("SELECT * FROM text.`/Volumes/training_catalog/yelp_db/training_files/*review*` LIMIT 5")
    .display()
)

In [0]:
%sql
-- Sample five review records parsed as JSON; no explicit schema is supplied here.
SELECT
    *
FROM
    read_files("/Volumes/training_catalog/yelp_db/training_files/*review*", FORMAT => "JSON")
LIMIT 5;

#### User

In [0]:
# Show five raw lines from the user files, read as plain text (no JSON parsing).
(
    spark
    .sql("SELECT * FROM text.`/Volumes/training_catalog/yelp_db/training_files/*user*` LIMIT 5")
    .display()
)

In [0]:
%sql
-- Sample five user records parsed as JSON; no explicit schema is supplied here.
SELECT
    *
FROM
    read_files("/Volumes/training_catalog/yelp_db/training_files/*user*", FORMAT => "JSON")
LIMIT 5;

### 3.2. Creating managed delta table

#### Store data into table

##### Business

Load JSON data as string

In [0]:
%sql
-- Initial load of the business files (names matching *business*1*) into a
-- staging table in which `attributes` and `hours` are kept as raw JSON strings.
--
-- CREATE OR REPLACE rebuilds the table in a single statement (the same pattern
-- used for yelp_business_bronze) instead of a separate DROP followed by CREATE.
-- The HEADER option was removed: it is a CSV reader option and has no meaning
-- for JSON input.
CREATE OR REPLACE TABLE yelp_business_json_string_bronze
AS
SELECT
    *,
    -- Source-file lineage; file_modification_time is later used as the
    -- watermark for the incremental load.
    _metadata.file_modification_time,
    _metadata.file_name
FROM read_files(
    "/Volumes/training_catalog/yelp_db/training_files/*business*1*",
    FORMAT => "JSON",
    SCHEMA => '''
        business_id STRING,
        name STRING,
        address STRING,
        city STRING,
        state STRING,
        postal_code STRING,
        latitude DOUBLE,
        longitude DOUBLE,
        stars DOUBLE,
        review_count INT,
        is_open TINYINT,
        attributes STRING,
        categories STRING,
        hours STRING
    ''',
    -- Data that does not fit the declared schema is captured in _rescued_data.
    rescueddatacolumn => "_rescued_data"
);

-- Preview data
SELECT * FROM yelp_business_json_string_bronze LIMIT 5;

Get JSON structure

In [0]:
%sql
-- Derive the DDL schema string for a sample JSON object keyed by weekday
-- (the same shape that is applied to the `hours` column with from_json).
SELECT
    schema_of_json(
        '{"Monday":"8:0-22:0","Tuesday":"8:0-22:0","Wednesday":"8:0-22:0","Thursday":"8:0-22:0","Friday":"8:0-23:0","Saturday":"8:0-23:0","Sunday":"8:0-22:0"}'
    );

Load JSON data as struct

In [0]:
%sql
-- Rebuild yelp_business_bronze from the staging table: every staging column
-- except the raw `hours` string is carried over, and `hours` is parsed into a
-- struct column named hours_json.
CREATE OR REPLACE TABLE yelp_business_bronze AS
SELECT
    * EXCEPT (hours),
    from_json(
        hours,
        'STRUCT<Friday: STRING, Monday: STRING, Saturday: STRING, Sunday: STRING, Thursday: STRING, Tuesday: STRING, Wednesday: STRING>'
    ) AS hours_json
FROM
    yelp_business_json_string_bronze;

-- Preview data
SELECT *
FROM yelp_business_bronze
LIMIT 5;

Remove string json delta table

In [0]:
%sql
-- Drop the JSON-string staging table; IF EXISTS makes this a no-op when it is absent.
DROP TABLE IF EXISTS yelp_business_json_string_bronze;

##### Checkin

In [0]:
%sql
-- Initial load of the checkin files (names matching *checkin*1*) into
-- yelp_checkin_bronze.
--
-- CREATE OR REPLACE rebuilds the table in a single statement instead of a
-- separate DROP followed by CREATE. The HEADER option was removed: it is a CSV
-- reader option and has no meaning for JSON input.
CREATE OR REPLACE TABLE yelp_checkin_bronze
AS
SELECT
    *,
    -- Source-file lineage; file_modification_time is later used as the
    -- watermark for the incremental load.
    _metadata.file_modification_time,
    _metadata.file_name
FROM read_files(
    "/Volumes/training_catalog/yelp_db/training_files/*checkin*1*",
    FORMAT => "JSON",
    SCHEMA => '''
        business_id STRING,
        date STRING
    ''',
    -- Data that does not fit the declared schema is captured in _rescued_data.
    rescueddatacolumn => "_rescued_data"
);

-- Preview data
SELECT * FROM yelp_checkin_bronze LIMIT 5;

##### Tip

In [0]:
%sql
-- Initial load of the tip files (names matching *tip*1*) into yelp_tip_bronze.
--
-- CREATE OR REPLACE rebuilds the table in a single statement instead of a
-- separate DROP followed by CREATE. The HEADER option was removed: it is a CSV
-- reader option and has no meaning for JSON input.
CREATE OR REPLACE TABLE yelp_tip_bronze
AS
SELECT
    *,
    -- Source-file lineage; file_modification_time is later used as the
    -- watermark for the incremental load.
    _metadata.file_modification_time,
    _metadata.file_name
FROM read_files(
    "/Volumes/training_catalog/yelp_db/training_files/*tip*1*",
    FORMAT => "JSON",
    SCHEMA => '''
        user_id STRING,
        business_id STRING,
        text STRING,
        date TIMESTAMP,
        compliment_count INT
    ''',
    -- Data that does not fit the declared schema is captured in _rescued_data.
    rescueddatacolumn => "_rescued_data"
);

-- Preview data
SELECT * FROM yelp_tip_bronze LIMIT 5;

##### Review

In [0]:
%sql
-- Initial load of the review files (names matching *review*1*) into
-- yelp_review_bronze.
--
-- CREATE OR REPLACE rebuilds the table in a single statement instead of a
-- separate DROP followed by CREATE. The HEADER option was removed: it is a CSV
-- reader option and has no meaning for JSON input.
CREATE OR REPLACE TABLE yelp_review_bronze
AS
SELECT
    *,
    -- Source-file lineage; file_modification_time is later used as the
    -- watermark for the incremental load.
    _metadata.file_modification_time,
    _metadata.file_name
FROM read_files(
    "/Volumes/training_catalog/yelp_db/training_files/*review*1*",
    FORMAT => "JSON",
    SCHEMA => '''
        review_id STRING,
        user_id STRING,
        business_id STRING,
        stars DOUBLE,
        useful INT,
        funny INT,
        cool INT,
        text STRING,
        date TIMESTAMP
    ''',
    -- Data that does not fit the declared schema is captured in _rescued_data.
    rescueddatacolumn => "_rescued_data"
);

-- Preview data
SELECT * FROM yelp_review_bronze LIMIT 5;

##### User

In [0]:
%sql
-- Initial load of the user files (names matching *user*1*) into
-- yelp_user_bronze.
--
-- CREATE OR REPLACE rebuilds the table in a single statement instead of a
-- separate DROP followed by CREATE. The HEADER option was removed: it is a CSV
-- reader option and has no meaning for JSON input.
CREATE OR REPLACE TABLE yelp_user_bronze
AS
SELECT
    *,
    -- Source-file lineage; file_modification_time is later used as the
    -- watermark for the incremental load.
    _metadata.file_modification_time,
    _metadata.file_name
FROM read_files(
    "/Volumes/training_catalog/yelp_db/training_files/*user*1*",
    FORMAT => "JSON",
    SCHEMA => '''
        user_id STRING,
        name STRING,
        review_count INT,
        yelping_since TIMESTAMP,
        useful INT,
        funny INT,
        cool INT,
        elite STRING,
        friends STRING,
        fans INT,
        average_stars DOUBLE,
        compliment_hot INT,
        compliment_more INT,
        compliment_profile INT,
        compliment_cute INT,
        compliment_list INT,
        compliment_note INT,
        compliment_plain INT,
        compliment_cool INT,
        compliment_funny INT,
        compliment_writer INT,
        compliment_photos INT
    ''',
    -- Data that does not fit the declared schema is captured in _rescued_data.
    rescueddatacolumn => "_rescued_data"
);

-- Preview data
SELECT * FROM yelp_user_bronze LIMIT 5;

## 4. Incremental ingestion

This stage ingests new data into the Delta tables. To avoid duplicates, the column **file_modification_time** is used as a filter, so that only files newer than the latest one already loaded are taken.

Finally, this method is safer than **COPY INTO**, which relies on the previously ingested files not having changed; otherwise, they will be ingested again.

##### Business

In [0]:
%sql
-- Rows per source file in yelp_business_bronze (check before the incremental load).
SELECT
    file_name,
    COUNT(*) AS amount_per_file
FROM yelp_business_bronze
GROUP BY file_name
ORDER BY file_name;

Load JSON data as string

In [0]:
%sql
-- Incremental staging load: read all business files, but keep only rows from
-- files modified after the newest file already present in yelp_business_bronze.
--
-- CREATE OR REPLACE (rather than CREATE ... IF NOT EXISTS) guarantees the
-- staging table reflects this run; a staging table left behind by an earlier,
-- aborted run would otherwise be reused unchanged and its stale rows inserted.
-- The HEADER option was removed: it is a CSV reader option and has no meaning
-- for JSON input.
CREATE OR REPLACE TABLE yelp_business_json_string_bronze
AS
SELECT
    *,
    _metadata.file_modification_time,
    _metadata.file_name
FROM read_files(
    "/Volumes/training_catalog/yelp_db/training_files/*business*",
    FORMAT => "JSON",
    SCHEMA => '''
        business_id STRING,
        name STRING,
        address STRING,
        city STRING,
        state STRING,
        postal_code STRING,
        latitude DOUBLE,
        longitude DOUBLE,
        stars DOUBLE,
        review_count INT,
        is_open TINYINT,
        attributes STRING,
        categories STRING,
        hours STRING
    ''',
    -- Data that does not fit the declared schema is captured in _rescued_data.
    rescueddatacolumn => "_rescued_data"
)
-- MAX() over an empty target is NULL, and `> NULL` would filter out every row;
-- COALESCE falls back to the epoch so an empty target receives all files.
WHERE _metadata.file_modification_time > COALESCE(
    (SELECT MAX(file_modification_time) FROM yelp_business_bronze),
    TIMESTAMP '1970-01-01 00:00:00'
);

-- Number of new rows staged by this run
SELECT COUNT(*) FROM yelp_business_json_string_bronze;

Load JSON data as struct.

In [0]:
%sql
-- Append the staged business rows to yelp_business_bronze, parsing the raw
-- `hours` JSON string into the hours_json struct.
--
-- The SELECT names every column in the same order as the INSERT column list,
-- so the mapping no longer depends on the physical column order of the
-- staging table (as it did with `* EXCEPT(hours)`).
INSERT INTO yelp_business_bronze (
    business_id,
    name,
    address,
    city,
    state,
    postal_code,
    latitude,
    longitude,
    stars,
    review_count,
    is_open,
    attributes,
    categories,
    _rescued_data,
    file_modification_time,
    file_name,
    hours_json
)
SELECT
    business_id,
    name,
    address,
    city,
    state,
    postal_code,
    latitude,
    longitude,
    stars,
    review_count,
    is_open,
    attributes,
    categories,
    _rescued_data,
    file_modification_time,
    file_name,
    from_json(hours, 'STRUCT<Friday: STRING, Monday: STRING, Saturday: STRING, Sunday: STRING, Thursday: STRING, Tuesday: STRING, Wednesday: STRING>') AS hours_json
FROM yelp_business_json_string_bronze;

In [0]:
%sql
-- Rows per source file in yelp_business_bronze (check after the incremental load).
SELECT
    file_name,
    COUNT(*) AS amount_per_file
FROM yelp_business_bronze
GROUP BY file_name
ORDER BY file_name;

Remove string json delta table

In [0]:
%sql
-- Drop the JSON-string staging table; IF EXISTS makes this a no-op when it is absent.
DROP TABLE IF EXISTS yelp_business_json_string_bronze;

##### Checkin

In [0]:
%sql
-- Rows per source file in yelp_checkin_bronze (check before the incremental load).
SELECT
    file_name,
    COUNT(*) AS amount_per_file
FROM yelp_checkin_bronze
GROUP BY file_name
ORDER BY file_name;

In [0]:
%sql
-- Incremental load: append rows from checkin files modified after the newest
-- file already present in yelp_checkin_bronze.
-- The HEADER option was removed: it is a CSV reader option and has no meaning
-- for JSON input.
INSERT INTO yelp_checkin_bronze (business_id, date, _rescued_data, file_modification_time, file_name)
SELECT
    *,
    _metadata.file_modification_time,
    _metadata.file_name
FROM read_files(
    "/Volumes/training_catalog/yelp_db/training_files/*checkin*",
    FORMAT => "JSON",
    SCHEMA => '''
        business_id STRING,
        date STRING
    ''',
    -- Data that does not fit the declared schema is captured in _rescued_data.
    rescueddatacolumn => "_rescued_data"
)
-- MAX() over an empty target is NULL, and `> NULL` would filter out every row;
-- COALESCE falls back to the epoch so an empty target receives all files.
WHERE _metadata.file_modification_time > COALESCE(
    (SELECT MAX(file_modification_time) FROM yelp_checkin_bronze),
    TIMESTAMP '1970-01-01 00:00:00'
);

In [0]:
%sql
-- Rows per source file in yelp_checkin_bronze (check after the incremental load).
SELECT
    file_name,
    COUNT(*) AS amount_per_file
FROM yelp_checkin_bronze
GROUP BY file_name
ORDER BY file_name;

##### Tip

In [0]:
%sql
-- Rows per source file in yelp_tip_bronze (check before the incremental load).
SELECT
    file_name,
    COUNT(*) AS amount_per_file
FROM yelp_tip_bronze
GROUP BY file_name
ORDER BY file_name;

In [0]:
%sql
-- Incremental load: append rows from tip files modified after the newest file
-- already present in yelp_tip_bronze.
-- The HEADER option was removed: it is a CSV reader option and has no meaning
-- for JSON input.
INSERT INTO yelp_tip_bronze (user_id, business_id, text, date, compliment_count, _rescued_data, file_modification_time, file_name)
SELECT
    *,
    _metadata.file_modification_time,
    _metadata.file_name
FROM read_files(
    "/Volumes/training_catalog/yelp_db/training_files/*tip*",
    FORMAT => "JSON",
    SCHEMA => '''
        user_id STRING,
        business_id STRING,
        text STRING,
        date TIMESTAMP,
        compliment_count INT
    ''',
    -- Data that does not fit the declared schema is captured in _rescued_data.
    rescueddatacolumn => "_rescued_data"
)
-- MAX() over an empty target is NULL, and `> NULL` would filter out every row;
-- COALESCE falls back to the epoch so an empty target receives all files.
WHERE _metadata.file_modification_time > COALESCE(
    (SELECT MAX(file_modification_time) FROM yelp_tip_bronze),
    TIMESTAMP '1970-01-01 00:00:00'
);

In [0]:
%sql
-- Rows per source file in yelp_tip_bronze (check after the incremental load).
SELECT
    file_name,
    COUNT(*) AS amount_per_file
FROM yelp_tip_bronze
GROUP BY file_name
ORDER BY file_name;

##### Review

In [0]:
%sql
-- Rows per source file in yelp_review_bronze (check before the incremental load).
SELECT
    file_name,
    COUNT(*) AS amount_per_file
FROM yelp_review_bronze
GROUP BY file_name
ORDER BY file_name;

In [0]:
%sql
-- Incremental load: append rows from review files modified after the newest
-- file already present in yelp_review_bronze.
-- The HEADER option was removed: it is a CSV reader option and has no meaning
-- for JSON input.
INSERT INTO yelp_review_bronze (review_id, user_id, business_id, stars, useful, funny, cool, text, date, _rescued_data, file_modification_time, file_name)
SELECT
    *,
    _metadata.file_modification_time,
    _metadata.file_name
FROM read_files(
    "/Volumes/training_catalog/yelp_db/training_files/*review*",
    FORMAT => "JSON",
    SCHEMA => '''
        review_id STRING,
        user_id STRING,
        business_id STRING,
        stars DOUBLE,
        useful INT,
        funny INT,
        cool INT,
        text STRING,
        date TIMESTAMP
    ''',
    -- Data that does not fit the declared schema is captured in _rescued_data.
    rescueddatacolumn => "_rescued_data"
)
-- MAX() over an empty target is NULL, and `> NULL` would filter out every row;
-- COALESCE falls back to the epoch so an empty target receives all files.
WHERE _metadata.file_modification_time > COALESCE(
    (SELECT MAX(file_modification_time) FROM yelp_review_bronze),
    TIMESTAMP '1970-01-01 00:00:00'
);

In [0]:
%sql
-- Rows per source file in yelp_review_bronze (check after the incremental load).
SELECT
    file_name,
    COUNT(*) AS amount_per_file
FROM yelp_review_bronze
GROUP BY file_name
ORDER BY file_name;

##### User

In [0]:
%sql
-- Rows per source file in yelp_user_bronze (check before the incremental load).
SELECT
    file_name,
    COUNT(*) AS amount_per_file
FROM yelp_user_bronze
GROUP BY file_name
ORDER BY file_name;

In [0]:
%sql
-- Incremental load: append rows from user files modified after the newest file
-- already present in yelp_user_bronze.
-- The HEADER option was removed: it is a CSV reader option and has no meaning
-- for JSON input.
INSERT INTO yelp_user_bronze (user_id, name, review_count, yelping_since, useful, funny, cool, elite, friends, fans, average_stars, compliment_hot, compliment_more, compliment_profile, compliment_cute, compliment_list, compliment_note, compliment_plain, compliment_cool, compliment_funny, compliment_writer, compliment_photos, _rescued_data, file_modification_time, file_name)
SELECT
    *,
    _metadata.file_modification_time,
    _metadata.file_name
FROM read_files(
    "/Volumes/training_catalog/yelp_db/training_files/*user*",
    FORMAT => "JSON",
    SCHEMA => '''
        user_id STRING,
        name STRING,
        review_count INT,
        yelping_since TIMESTAMP,
        useful INT,
        funny INT,
        cool INT,
        elite STRING,
        friends STRING,
        fans INT,
        average_stars DOUBLE,
        compliment_hot INT,
        compliment_more INT,
        compliment_profile INT,
        compliment_cute INT,
        compliment_list INT,
        compliment_note INT,
        compliment_plain INT,
        compliment_cool INT,
        compliment_funny INT,
        compliment_writer INT,
        compliment_photos INT
    ''',
    -- Data that does not fit the declared schema is captured in _rescued_data.
    rescueddatacolumn => "_rescued_data"
)
-- MAX() over an empty target is NULL, and `> NULL` would filter out every row;
-- COALESCE falls back to the epoch so an empty target receives all files.
WHERE _metadata.file_modification_time > COALESCE(
    (SELECT MAX(file_modification_time) FROM yelp_user_bronze),
    TIMESTAMP '1970-01-01 00:00:00'
);

In [0]:
%sql
-- Rows per source file in yelp_user_bronze (check after the incremental load).
SELECT
    file_name,
    COUNT(*) AS amount_per_file
FROM yelp_user_bronze
GROUP BY file_name
ORDER BY file_name;