# Yelp

## 1. Introduction

The purpose of this notebook is to implement an initial and incremental load for the SILVER stage.

## 2. Environment setup

In [0]:
%sql
-- Make training_catalog the session's current catalog.
USE CATALOG training_catalog;

In [0]:
%sql
-- Make yelp_db the current schema; the tables and views in this notebook are referenced without a schema qualifier.
USE SCHEMA yelp_db;

In [0]:
%sql
-- Sanity check: show which catalog and schema the session currently points at.
SELECT
  current_catalog() AS current_catalog,
  current_schema() AS current_schema;

## 3. Transform

### 3.1. Data profiling

#### 3.1.1. Identify missing values

##### Business

In [0]:
%sql
-- Missing-value profile of yelp_business_bronze, one output row per profiled column.
-- Columns tested with TRIM count as missing when NULL or blank after trimming;
-- the remaining columns count as missing only when NULL.
CREATE OR REPLACE TEMP VIEW vw_business_missing_values AS
WITH business_missing_counts AS (
  -- One wide row holding a counter per profiled column.
  SELECT
    COUNT_IF(address IS NULL OR TRIM(address) = '') AS missing_address,
    COUNT_IF(city IS NULL OR TRIM(city) = '') AS missing_city,
    COUNT_IF(state IS NULL OR TRIM(state) = '') AS missing_state,
    COUNT_IF(postal_code IS NULL OR TRIM(postal_code) = '') AS missing_postal_code,
    COUNT_IF(latitude IS NULL) AS missing_latitude,
    COUNT_IF(longitude IS NULL) AS missing_longitude,
    COUNT_IF(stars IS NULL) AS missing_stars,
    COUNT_IF(review_count IS NULL) AS missing_review_count,
    COUNT_IF(is_open IS NULL) AS missing_is_open,
    COUNT_IF(attributes IS NULL OR TRIM(attributes) = '') AS missing_attributes,
    COUNT_IF(categories IS NULL OR TRIM(categories) = '') AS missing_categories,
    COUNT_IF(hours_json IS NULL) AS missing_hours_json
  FROM yelp_business_bronze
)
-- Unpivot the wide row into (column_name, missing_count) pairs.
SELECT
  column_name,
  missing_count
FROM business_missing_counts
LATERAL VIEW STACK(
  12,
  'address', missing_address,
  'city', missing_city,
  'state', missing_state,
  'postal_code', missing_postal_code,
  'latitude', missing_latitude,
  'longitude', missing_longitude,
  'stars', missing_stars,
  'review_count', missing_review_count,
  'is_open', missing_is_open,
  'attributes', missing_attributes,
  'categories', missing_categories,
  'hours_json', missing_hours_json
) stacked AS column_name, missing_count;

-- Display the profile.
SELECT
  column_name,
  missing_count
FROM vw_business_missing_values;

##### Checkin

In [0]:
%sql
-- Missing-value profile of yelp_checkin_bronze, one output row per profiled column.
-- Both columns count as missing when NULL or blank after trimming.
CREATE OR REPLACE TEMP VIEW vw_checkin_missing_values AS
WITH checkin_missing_counts AS (
  -- One wide row holding a counter per profiled column.
  SELECT
    COUNT_IF(business_id IS NULL OR TRIM(business_id) = '') AS missing_business_id,
    COUNT_IF(date IS NULL OR TRIM(date) = '') AS missing_date
  FROM yelp_checkin_bronze
)
-- Unpivot the wide row into (column_name, missing_count) pairs.
SELECT
  column_name,
  missing_count
FROM checkin_missing_counts
LATERAL VIEW STACK(
  2,
  'business_id', missing_business_id,
  'date', missing_date
) stacked AS column_name, missing_count;

-- Display the profile.
SELECT
  column_name,
  missing_count
FROM vw_checkin_missing_values;

##### Review

In [0]:
%sql
-- Missing-value profile of yelp_review_bronze, one output row per profiled column.
-- Columns tested with TRIM count as missing when NULL or blank after trimming;
-- the remaining columns count as missing only when NULL.
CREATE OR REPLACE TEMP VIEW vw_review_missing_values AS
SELECT column_name, missing_count
FROM (
  -- One wide row holding a counter per profiled column.
  SELECT
    COUNT(CASE WHEN review_id IS NULL OR TRIM(review_id) = '' THEN 1 END) AS missing_review_id,
    COUNT(CASE WHEN user_id IS NULL OR TRIM(user_id) = '' THEN 1 END) AS missing_user_id,
    COUNT(CASE WHEN business_id IS NULL OR TRIM(business_id) = '' THEN 1 END) AS missing_business_id,
    COUNT(CASE WHEN stars IS NULL THEN 1 END) AS missing_stars,
    COUNT(CASE WHEN useful IS NULL THEN 1 END) AS missing_useful,
    COUNT(CASE WHEN funny IS NULL THEN 1 END) AS missing_funny,
    COUNT(CASE WHEN cool IS NULL THEN 1 END) AS missing_cool,
    COUNT(CASE WHEN text IS NULL OR TRIM(text) = '' THEN 1 END) AS missing_text,
    COUNT(CASE WHEN date IS NULL THEN 1 END) AS missing_date
  FROM yelp_review_bronze
) t
-- Unpivot into (column_name, missing_count) pairs. Each label must name the
-- review column its counter was computed from.
LATERAL VIEW STACK(9,
  'review_id', missing_review_id,
  'user_id', missing_user_id,
  'business_id', missing_business_id,
  'stars', missing_stars,
  'useful', missing_useful,
  'funny', missing_funny,
  'cool', missing_cool,
  'text', missing_text,
  'date', missing_date
) AS column_name, missing_count;

-- Display the profile.
SELECT * FROM vw_review_missing_values;

##### Tip

In [0]:
%sql
-- Missing-value profile of yelp_tip_bronze, one output row per profiled column.
-- Columns tested with TRIM count as missing when NULL or blank after trimming;
-- the remaining columns count as missing only when NULL.
CREATE OR REPLACE TEMP VIEW vw_tip_missing_values AS
WITH tip_missing_counts AS (
  -- One wide row holding a counter per profiled column.
  SELECT
    COUNT_IF(user_id IS NULL OR TRIM(user_id) = '') AS missing_user_id,
    COUNT_IF(business_id IS NULL OR TRIM(business_id) = '') AS missing_business_id,
    COUNT_IF(text IS NULL OR TRIM(text) = '') AS missing_text,
    COUNT_IF(date IS NULL) AS missing_date,
    COUNT_IF(compliment_count IS NULL) AS missing_compliment_count
  FROM yelp_tip_bronze
)
-- Unpivot the wide row into (column_name, missing_count) pairs.
SELECT
  column_name,
  missing_count
FROM tip_missing_counts
LATERAL VIEW STACK(
  5,
  'user_id', missing_user_id,
  'business_id', missing_business_id,
  'text', missing_text,
  'date', missing_date,
  'compliment_count', missing_compliment_count
) stacked AS column_name, missing_count;

-- Display the profile.
SELECT
  column_name,
  missing_count
FROM vw_tip_missing_values;

##### User

In [0]:
%sql
-- Missing-value profile of yelp_user_bronze, one output row per profiled column.
-- Columns tested with TRIM count as missing when NULL or blank after trimming;
-- the remaining columns count as missing only when NULL.
CREATE OR REPLACE TEMP VIEW vw_user_missing_values AS
WITH user_missing_counts AS (
  -- One wide row holding a counter per profiled column.
  SELECT
    COUNT_IF(user_id IS NULL OR TRIM(user_id) = '') AS missing_user_id,
    COUNT_IF(name IS NULL OR TRIM(name) = '') AS missing_name,
    COUNT_IF(review_count IS NULL) AS missing_review_count,
    COUNT_IF(yelping_since IS NULL) AS missing_yelping_since,
    COUNT_IF(useful IS NULL) AS missing_useful,
    COUNT_IF(funny IS NULL) AS missing_funny,
    COUNT_IF(cool IS NULL) AS missing_cool,
    COUNT_IF(elite IS NULL OR TRIM(elite) = '') AS missing_elite,
    COUNT_IF(friends IS NULL OR TRIM(friends) = '') AS missing_friends,
    COUNT_IF(fans IS NULL) AS missing_fans,
    COUNT_IF(average_stars IS NULL) AS missing_average_stars,
    COUNT_IF(compliment_hot IS NULL) AS missing_compliment_hot,
    COUNT_IF(compliment_more IS NULL) AS missing_compliment_more,
    COUNT_IF(compliment_profile IS NULL) AS missing_compliment_profile,
    COUNT_IF(compliment_cute IS NULL) AS missing_compliment_cute,
    COUNT_IF(compliment_list IS NULL) AS missing_compliment_list,
    COUNT_IF(compliment_note IS NULL) AS missing_compliment_note,
    COUNT_IF(compliment_plain IS NULL) AS missing_compliment_plain,
    COUNT_IF(compliment_cool IS NULL) AS missing_compliment_cool,
    COUNT_IF(compliment_funny IS NULL) AS missing_compliment_funny,
    COUNT_IF(compliment_writer IS NULL) AS missing_compliment_writer,
    COUNT_IF(compliment_photos IS NULL) AS missing_compliment_photos
  FROM yelp_user_bronze
)
-- Unpivot the wide row into (column_name, missing_count) pairs.
SELECT
  column_name,
  missing_count
FROM user_missing_counts
LATERAL VIEW STACK(
  22,
  'user_id', missing_user_id,
  'name', missing_name,
  'review_count', missing_review_count,
  'yelping_since', missing_yelping_since,
  'useful', missing_useful,
  'funny', missing_funny,
  'cool', missing_cool,
  'elite', missing_elite,
  'friends', missing_friends,
  'fans', missing_fans,
  'average_stars', missing_average_stars,
  'compliment_hot', missing_compliment_hot,
  'compliment_more', missing_compliment_more,
  'compliment_profile', missing_compliment_profile,
  'compliment_cute', missing_compliment_cute,
  'compliment_list', missing_compliment_list,
  'compliment_note', missing_compliment_note,
  'compliment_plain', missing_compliment_plain,
  'compliment_cool', missing_compliment_cool,
  'compliment_funny', missing_compliment_funny,
  'compliment_writer', missing_compliment_writer,
  'compliment_photos', missing_compliment_photos
) stacked AS column_name, missing_count;

-- Display the profile.
SELECT
  column_name,
  missing_count
FROM vw_user_missing_values;

#### 3.1.2. Identify duplicate values

##### Business

In [0]:
%sql
-- Exact-duplicate check for yelp_business_bronze: count how often each full
-- combination of the listed columns occurs and report combinations seen more than once.
-- file_modification_time, file_name and _rescued_data are not part of the comparison.
WITH business_row_counts AS (
  SELECT
    business_id,
    name,
    address,
    city,
    state,
    postal_code,
    latitude,
    longitude,
    stars,
    review_count,
    is_open,
    attributes,
    categories,
    hours_json,
    COUNT(*) AS count_duplicated
  FROM yelp_business_bronze
  GROUP BY
    business_id,
    name,
    address,
    city,
    state,
    postal_code,
    latitude,
    longitude,
    stars,
    review_count,
    is_open,
    attributes,
    categories,
    hours_json
)
SELECT
  business_id,
  name,
  address,
  city,
  state,
  postal_code,
  latitude,
  longitude,
  stars,
  review_count,
  is_open,
  attributes,
  categories,
  hours_json,
  count_duplicated
FROM business_row_counts
WHERE count_duplicated > 1;

##### Checkin

In [0]:
%sql
-- Exact-duplicate check for yelp_checkin_bronze: report every
-- (business_id, date) combination that occurs more than once.
WITH checkin_row_counts AS (
  SELECT
    business_id,
    date,
    COUNT(*) AS count_duplicated
  FROM yelp_checkin_bronze
  GROUP BY
    business_id,
    date
)
SELECT
  business_id,
  date,
  count_duplicated
FROM checkin_row_counts
WHERE count_duplicated > 1;

##### Review

In [0]:
%sql
-- Exact-duplicate check for yelp_review_bronze: count how often each full
-- combination of the listed columns occurs and report combinations seen more than once.
WITH review_row_counts AS (
  SELECT
    review_id,
    user_id,
    business_id,
    stars,
    useful,
    funny,
    cool,
    text,
    date,
    COUNT(*) AS count_duplicated
  FROM yelp_review_bronze
  GROUP BY
    review_id,
    user_id,
    business_id,
    stars,
    useful,
    funny,
    cool,
    text,
    date
)
SELECT
  review_id,
  user_id,
  business_id,
  stars,
  useful,
  funny,
  cool,
  text,
  date,
  count_duplicated
FROM review_row_counts
WHERE count_duplicated > 1;

##### Tip

In [0]:
%sql
-- Exact-duplicate check for yelp_tip_bronze: count how often each full
-- combination of the listed columns occurs and report combinations seen more than once.
WITH tip_row_counts AS (
  SELECT
    user_id,
    business_id,
    text,
    date,
    compliment_count,
    COUNT(*) AS count_duplicated
  FROM yelp_tip_bronze
  GROUP BY
    user_id,
    business_id,
    text,
    date,
    compliment_count
)
SELECT
  user_id,
  business_id,
  text,
  date,
  compliment_count,
  count_duplicated
FROM tip_row_counts
WHERE count_duplicated > 1;

##### User

In [0]:
%sql
-- Exact-duplicate check for yelp_user_bronze: count how often each full
-- combination of the listed columns occurs and report combinations seen more than once.
WITH user_row_counts AS (
  SELECT
    user_id,
    name,
    review_count,
    yelping_since,
    useful,
    funny,
    cool,
    elite,
    friends,
    fans,
    average_stars,
    compliment_hot,
    compliment_more,
    compliment_profile,
    compliment_cute,
    compliment_list,
    compliment_note,
    compliment_plain,
    compliment_cool,
    compliment_funny,
    compliment_writer,
    compliment_photos,
    COUNT(*) AS count_duplicated
  FROM yelp_user_bronze
  GROUP BY
    user_id,
    name,
    review_count,
    yelping_since,
    useful,
    funny,
    cool,
    elite,
    friends,
    fans,
    average_stars,
    compliment_hot,
    compliment_more,
    compliment_profile,
    compliment_cute,
    compliment_list,
    compliment_note,
    compliment_plain,
    compliment_cool,
    compliment_funny,
    compliment_writer,
    compliment_photos
)
SELECT
  user_id,
  name,
  review_count,
  yelping_since,
  useful,
  funny,
  cool,
  elite,
  friends,
  fans,
  average_stars,
  compliment_hot,
  compliment_more,
  compliment_profile,
  compliment_cute,
  compliment_list,
  compliment_note,
  compliment_plain,
  compliment_cool,
  compliment_funny,
  compliment_writer,
  compliment_photos,
  count_duplicated
FROM user_row_counts
WHERE count_duplicated > 1;

#### 3.1.3. Validate data consistency

To carry out this step I am using the column **_rescued_data**, which stores values that do not match the schema.

##### Business

In [0]:
%sql
-- List the business rows that have something captured in _rescued_data.
SELECT *
FROM yelp_business_bronze
WHERE _rescued_data IS NOT NULL;

##### Checkin

In [0]:
%sql
-- List the check-in rows that have something captured in _rescued_data.
SELECT *
FROM yelp_checkin_bronze
WHERE _rescued_data IS NOT NULL;

##### Review

In [0]:
%sql
-- List the review rows that have something captured in _rescued_data.
SELECT *
FROM yelp_review_bronze
WHERE _rescued_data IS NOT NULL;

##### Tip

In [0]:
%sql
-- List the tip rows that have something captured in _rescued_data.
SELECT *
FROM yelp_tip_bronze
WHERE _rescued_data IS NOT NULL;

##### User

In [0]:
%sql
-- List the user rows that have something captured in _rescued_data.
SELECT *
FROM yelp_user_bronze
WHERE _rescued_data IS NOT NULL;

### 3.2. Cleaning

#### 3.2.1. Dealing with missing values

There are several strategies to deal with missing values, but for the purpose of this project I have the following options:

* Remove missing values if there are few.
* Remove columns if there are a lot or it is too complex to fill.
* Fill with median for numeric features.

##### Business

Strategy:

* **address**: remove rows which do not have this feature.
* **attributes**: remove this feature, due to its complexity.
* **hours_json**: remove rows which do not have this feature.

In [0]:
%sql
-- Show only the business columns that actually have missing values.
SELECT
  column_name,
  missing_count
FROM vw_business_missing_values
WHERE missing_count > 0;

In [0]:
%sql
-- Cleaned business set: every bronze column except attributes, restricted to rows
-- that have a non-blank address and a non-NULL hours_json.
CREATE OR REPLACE TEMP VIEW vw_business_no_missing_values AS
SELECT * EXCEPT (attributes)
FROM yelp_business_bronze
WHERE hours_json IS NOT NULL
  -- NULLIF maps a blank address to NULL, so one test rejects both NULL and blank.
  AND NULLIF(TRIM(address), '') IS NOT NULL;

-- Number of business rows that survive the filter.
SELECT COUNT(*)
FROM vw_business_no_missing_values;

##### User

In [0]:
%sql
-- Show only the user columns that actually have missing values.
SELECT
  column_name,
  missing_count
FROM vw_user_missing_values
WHERE missing_count > 0;

In [0]:
%sql
-- Cleaned user set: every bronze column except elite; no rows are filtered out.
CREATE OR REPLACE TEMP VIEW vw_user_no_missing_values AS
SELECT * EXCEPT (elite)
FROM yelp_user_bronze;

-- Number of user rows exposed by the view.
SELECT COUNT(*)
FROM vw_user_no_missing_values;

In [0]:
%sql
-- Inspect the cleaned user view.
SELECT *
FROM vw_user_no_missing_values;

### 3.3. Load

##### Business

In [0]:
%sql
-- Full refresh of yelp_business_silver from vw_business_no_missing_values.
-- CREATE OR REPLACE TABLE rebuilds the table in one statement. The previous
-- DROP TABLE + CREATE TABLE IF NOT EXISTS pair left a window with no table and
-- would silently skip the reload if the table existed again at create time.
-- name, address, city, state, postal_code and categories are trimmed and
-- upper-cased; hours_json is not carried into silver.
CREATE OR REPLACE TABLE yelp_business_silver
AS
SELECT
  business_id,
  UPPER(TRIM(name)) AS name,
  UPPER(TRIM(address)) AS address,
  UPPER(TRIM(city)) AS city,
  UPPER(TRIM(state)) AS state,
  UPPER(TRIM(postal_code)) AS postal_code,
  latitude,
  longitude,
  stars,
  review_count,
  is_open,
  UPPER(TRIM(categories)) AS categories,
  file_modification_time,
  file_name
FROM vw_business_no_missing_values;

-- Preview data
SELECT * FROM yelp_business_silver LIMIT 5;

##### Checkin

In [0]:
%sql
-- Full refresh of yelp_checkin_silver straight from yelp_checkin_bronze.
-- CREATE OR REPLACE TABLE rebuilds the table in one statement. The previous
-- DROP TABLE + CREATE TABLE IF NOT EXISTS pair left a window with no table and
-- would silently skip the reload if the table existed again at create time.
CREATE OR REPLACE TABLE yelp_checkin_silver
AS
SELECT
  business_id,
  date,
  file_modification_time,
  file_name
FROM yelp_checkin_bronze;

-- Preview data
SELECT * FROM yelp_checkin_silver LIMIT 5;

##### Review

In [0]:
%sql
-- Full refresh of yelp_review_silver straight from yelp_review_bronze.
-- CREATE OR REPLACE TABLE rebuilds the table in one statement. The previous
-- DROP TABLE + CREATE TABLE IF NOT EXISTS pair left a window with no table and
-- would silently skip the reload if the table existed again at create time.
-- The review text is trimmed and upper-cased.
CREATE OR REPLACE TABLE yelp_review_silver
AS
SELECT
  review_id,
  user_id,
  business_id,
  stars,
  useful,
  funny,
  cool,
  UPPER(TRIM(text)) AS text,
  date,
  file_modification_time,
  file_name
FROM yelp_review_bronze;

-- Preview data
SELECT * FROM yelp_review_silver LIMIT 5;

##### Tip

In [0]:
%sql
-- Full refresh of yelp_tip_silver straight from yelp_tip_bronze.
-- CREATE OR REPLACE TABLE rebuilds the table in one statement. The previous
-- DROP TABLE + CREATE TABLE IF NOT EXISTS pair left a window with no table and
-- would silently skip the reload if the table existed again at create time.
-- The tip text is trimmed and upper-cased.
CREATE OR REPLACE TABLE yelp_tip_silver
AS
SELECT
  user_id,
  business_id,
  UPPER(TRIM(text)) AS text,
  date,
  compliment_count,
  file_modification_time,
  file_name
FROM yelp_tip_bronze;

-- Preview data
SELECT * FROM yelp_tip_silver LIMIT 5;

##### User

In [0]:
%sql
-- Full refresh of yelp_user_silver from vw_user_no_missing_values.
-- CREATE OR REPLACE TABLE rebuilds the table in one statement. The previous
-- DROP TABLE + CREATE TABLE IF NOT EXISTS pair left a window with no table and
-- would silently skip the reload if the table existed again at create time.
-- name is trimmed and upper-cased; friends is not carried into silver.
CREATE OR REPLACE TABLE yelp_user_silver
AS
SELECT
  user_id,
  UPPER(TRIM(name)) AS name,
  review_count,
  yelping_since,
  useful,
  funny,
  cool,
  fans,
  average_stars,
  compliment_hot,
  compliment_more,
  compliment_profile,
  compliment_cute,
  compliment_list,
  compliment_note,
  compliment_plain,
  compliment_cool,
  compliment_funny,
  compliment_writer,
  compliment_photos,
  file_modification_time,
  file_name
FROM vw_user_no_missing_values;

-- Preview data
SELECT * FROM yelp_user_silver LIMIT 5;

## 4. Remove temp objects

In [0]:
%sql
-- Remove every temp view this notebook creates. IF EXISTS keeps the cell
-- re-runnable and lets it succeed when an earlier cell was skipped.
-- Profiling views:
DROP VIEW IF EXISTS vw_business_missing_values;
DROP VIEW IF EXISTS vw_checkin_missing_values;
DROP VIEW IF EXISTS vw_review_missing_values;
DROP VIEW IF EXISTS vw_tip_missing_values;
DROP VIEW IF EXISTS vw_user_missing_values;
-- Cleaning views: the silver tables are built with CREATE TABLE ... AS SELECT,
-- so they hold their own copy of the data and do not depend on these views.
DROP VIEW IF EXISTS vw_business_no_missing_values;
DROP VIEW IF EXISTS vw_user_no_missing_values;