
## STEP 1: Connect to local DB where pagila data is located

In [2]:
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [None]:
import os

# Connection settings for the local pagila database.
# Every value can be overridden with an environment variable so that real
# credentials do not have to be stored in the notebook; the literals below are
# only fallbacks used when the variable is not set.
DB_ENDPOINT = os.environ.get('PAGILA_DB_ENDPOINT', '127.0.0.1')
DB = os.environ.get('PAGILA_DB_NAME', 'pagila')
DB_USER = os.environ.get('PAGILA_DB_USER', '***')
DB_PASSWORD = os.environ.get('PAGILA_DB_PASSWORD', '***')
DB_PORT = os.environ.get('PAGILA_DB_PORT', '5432')

# URL template shared by the real and the masked connection string.
_URL_TEMPLATE = "postgresql://{}:{}@{}:{}/{}"

# Full connection URL, including the password; this is the value handed to the
# %sql magic.
conn_string = _URL_TEMPLATE.format(DB_USER, DB_PASSWORD, DB_ENDPOINT, DB_PORT, DB)

# Same URL with the password replaced, safe to show in saved cell output.
masked_conn_string = _URL_TEMPLATE.format(DB_USER, '***', DB_ENDPOINT, DB_PORT, DB)

# Print only the masked form so the password never lands in the notebook file.
print(masked_conn_string)

In [4]:
# Connect the sql extension to the database using the URL held in conn_string.
%sql $conn_string

'Connected: postgres@pagila'

## Step 4: Creating Facts & Dimensions 

In [5]:
%%sql 

CREATE TABLE dimDate
(
    -- Calendar dimension. date_key is the key that factSales references.
    date_key   INTEGER  NOT NULL,
    date       DATE     NOT NULL,
    year       SMALLINT NOT NULL,
    quarter    SMALLINT NOT NULL,
    month      SMALLINT NOT NULL,
    day        SMALLINT NOT NULL,
    week       SMALLINT NOT NULL,
    -- The only attribute of this table that may be left NULL.
    is_weekend BOOLEAN,
    PRIMARY KEY (date_key)
);


CREATE TABLE dimCustomer
(
    -- Customer dimension. customer_key is the key of the dimension itself,
    -- customer_id carries the identifier of the source customer row.
    customer_key SERIAL,
    customer_id  SMALLINT    NOT NULL,
    first_name   VARCHAR(45) NOT NULL,
    last_name    VARCHAR(45) NOT NULL,
    email        VARCHAR(50),
    -- Address attributes are stored flat on the customer row.
    address      VARCHAR(50) NOT NULL,
    address2     VARCHAR(50),
    district     VARCHAR(20) NOT NULL,
    city         VARCHAR(50) NOT NULL,
    country      VARCHAR(50) NOT NULL,
    postal_code  VARCHAR(10),
    phone        VARCHAR(20) NOT NULL,
    active       SMALLINT    NOT NULL,
    create_date  TIMESTAMP   NOT NULL,
    -- Presumably the validity window of this version of the row; confirm
    -- against the intended slowly-changing-dimension handling.
    start_date   DATE        NOT NULL,
    end_date     DATE        NOT NULL,
    PRIMARY KEY (customer_key)
);



CREATE TABLE dimMovie
(
    -- Movie dimension. movie_key is the key of the dimension itself, film_id
    -- carries the identifier of the source film row.
    movie_key         SERIAL,
    film_id           SMALLINT     NOT NULL,
    title             VARCHAR(255) NOT NULL,
    description       TEXT,
    -- year is not a built-in PostgreSQL type; presumably it resolves to a
    -- domain that already exists in the pagila database. Confirm before
    -- running this against another database.
    release_year      year,
    language          VARCHAR(20)  NOT NULL,
    original_language VARCHAR(20),
    rental_duration   SMALLINT     NOT NULL,
    length            SMALLINT     NOT NULL,
    rating            VARCHAR(5)   NOT NULL,
    special_features  VARCHAR(400) NOT NULL,
    PRIMARY KEY (movie_key)
);


CREATE TABLE dimStore
(
    -- Store dimension. store_key is the key of the dimension itself, store_id
    -- carries the identifier of the source store row.
    store_key          SERIAL,
    store_id           SMALLINT    NOT NULL,
    -- Address attributes are stored flat on the store row.
    address            VARCHAR(50) NOT NULL,
    address2           VARCHAR(50),
    district           VARCHAR(20) NOT NULL,
    city               VARCHAR(50) NOT NULL,
    country            VARCHAR(50) NOT NULL,
    postal_code        VARCHAR(10),
    manager_first_name VARCHAR(45) NOT NULL,
    manager_last_name  VARCHAR(45) NOT NULL,
    -- Presumably the validity window of this version of the row; confirm
    -- against the intended slowly-changing-dimension handling.
    start_date         DATE        NOT NULL,
    end_date           DATE        NOT NULL,
    PRIMARY KEY (store_key)
);


CREATE TABLE factSales
(
    -- Sales fact table. Every row points at one row of each dimension and
    -- carries a single measure, sales_amount.
    sales_key    SERIAL,
    date_key     INTEGER       NOT NULL,
    customer_key INTEGER       NOT NULL,
    movie_key    INTEGER       NOT NULL,
    store_key    INTEGER       NOT NULL,
    sales_amount NUMERIC(5, 2) NOT NULL,
    PRIMARY KEY (sales_key),
    -- One foreign key per dimension table.
    FOREIGN KEY (date_key)     REFERENCES dimDate (date_key),
    FOREIGN KEY (customer_key) REFERENCES dimCustomer (customer_key),
    FOREIGN KEY (movie_key)    REFERENCES dimMovie (movie_key),
    FOREIGN KEY (store_key)    REFERENCES dimStore (store_key)
);

 * postgresql://postgres:***@127.0.0.1:5432/pagila
Done.
Done.
Done.
Done.
Done.


[]

## STEP 5: ETL the data from 3NF to Facts & Dimensions

`ISODOW` is the ISO day-of-the-week field of `EXTRACT` (Monday = 1 through Sunday = 7), so the values 6 and 7 identify Saturday and Sunday.

In [6]:
%%sql 
INSERT INTO dimDate (date_key, date, year, quarter, month, day, week, is_weekend)
-- One row per distinct calendar day on which a payment was made. The distinct
-- days are collected first and every attribute is then derived from the day.
SELECT CAST(TO_CHAR(pd.pay_day, 'yyyyMMDD') AS integer) AS date_key,
       pd.pay_day                                       AS date,
       EXTRACT(year    FROM pd.pay_day)                 AS year,
       EXTRACT(quarter FROM pd.pay_day)                 AS quarter,
       EXTRACT(month   FROM pd.pay_day)                 AS month,
       EXTRACT(day     FROM pd.pay_day)                 AS day,
       EXTRACT(week    FROM pd.pay_day)                 AS week,
       -- ISODOW values 6 and 7 are Saturday and Sunday.
       CASE
           WHEN EXTRACT(ISODOW FROM pd.pay_day) IN (6, 7) THEN true
           ELSE false
       END                                              AS is_weekend
FROM (
    SELECT DISTINCT CAST(payment_date AS date) AS pay_day
    FROM payment
) AS pd;
        
        

        
INSERT INTO dimCustomer (
    customer_key, customer_id, first_name, last_name, email,
    address, address2, district, city, country, postal_code, phone,
    active, create_date, start_date, end_date
)
-- Flatten each customer together with its address, city and country into a
-- single dimension row. The source customer_id is reused as customer_key.
SELECT cust.customer_id   AS customer_key,
       cust.customer_id,
       cust.first_name,
       cust.last_name,
       cust.email,
       addr.address,
       addr.address2,
       addr.district,
       cty.city,
       ctry.country,
       addr.postal_code,
       addr.phone,
       cust.active,
       cust.create_date,
       -- Both dates receive the load timestamp, which the date columns of
       -- dimCustomer reduce to a calendar day.
       CURRENT_TIMESTAMP  AS start_date,
       CURRENT_TIMESTAMP  AS end_date
FROM customer AS cust
INNER JOIN address AS addr ON addr.address_id = cust.address_id
INNER JOIN city    AS cty  ON cty.city_id = addr.city_id
INNER JOIN country AS ctry ON ctry.country_id = cty.country_id;
        
        
        
        
        
        
INSERT INTO dimMovie (movie_key, film_id, title, description, release_year, language, original_language, rental_duration, length, rating, special_features)
-- Load one dimension row per film, resolving both language references of the
-- film to language names. The source film_id is reused as movie_key.
SELECT f.film_id AS movie_key,
       f.film_id,
       f.title,
       f.description,
       f.release_year,
       spoken.name   AS language,
       -- Taken from the language referenced by original_language_id, not from
       -- the spoken language, so the two columns are no longer forced to be
       -- identical.
       original.name AS original_language,
       f.rental_duration,
       f.length,
       f.rating,
       f.special_features
FROM film f
JOIN language spoken ON (f.language_id = spoken.language_id)
-- LEFT JOIN keeps films that have no original language recorded; the target
-- column dimMovie.original_language accepts NULL for them. Presumably
-- film.original_language_id is nullable in pagila, confirm against the schema.
LEFT JOIN language original ON (f.original_language_id = original.language_id);

           
        
    
    
    
INSERT INTO dimStore (
    store_key, store_id, address, address2, district, city, country,
    postal_code, manager_first_name, manager_last_name, start_date, end_date
)
-- Flatten each store together with its manager and its address, city and
-- country into a single dimension row. The source store_id is reused as
-- store_key.
SELECT sto.store_id       AS store_key,
       sto.store_id,
       addr.address,
       addr.address2,
       addr.district,
       cty.city,
       ctry.country,
       addr.postal_code,
       mgr.first_name     AS manager_first_name,
       mgr.last_name      AS manager_last_name,
       -- Both dates receive the load timestamp, which the date columns of
       -- dimStore reduce to a calendar day.
       CURRENT_TIMESTAMP  AS start_date,
       CURRENT_TIMESTAMP  AS end_date
FROM store AS sto
INNER JOIN staff   AS mgr  ON mgr.staff_id = sto.manager_staff_id
INNER JOIN address AS addr ON addr.address_id = sto.address_id
INNER JOIN city    AS cty  ON cty.city_id = addr.city_id
INNER JOIN country AS ctry ON ctry.country_id = cty.country_id;


  
    
    
    
INSERT INTO factSales (date_key, customer_key, movie_key, store_key, sales_amount)
-- One fact row per payment that can be followed through its rental to an
-- inventory item; the inventory item supplies both the film and the store.
-- The source ids are used directly as dimension keys, which matches how the
-- dimension tables are loaded in this notebook.
SELECT CAST(TO_CHAR(CAST(pay.payment_date AS date), 'yyyyMMDD') AS integer) AS date_key,
       pay.customer_id AS customer_key,
       inv.film_id     AS movie_key,
       inv.store_id    AS store_key,
       pay.amount      AS sales_amount
FROM payment AS pay
INNER JOIN rental    AS rent ON rent.rental_id = pay.rental_id
INNER JOIN inventory AS inv  ON inv.inventory_id = rent.inventory_id;




 * postgresql://postgres:***@127.0.0.1:5432/pagila
32 rows affected.
599 rows affected.
1000 rows affected.
2 rows affected.
14596 rows affected.


[]

## Step 6: Repeat the computation using the fact and dimension tables

## 6.1: The fact table holds all the needed dimension keys, so no deep joins are required

In [7]:
%%time
%%sql
SELECT
    fs.movie_key,
    fs.date_key,
    fs.customer_key,
    fs.sales_amount
FROM factSales AS fs
-- Sample of five rows; without an ORDER BY the database is free to return any
-- five.
LIMIT 5;

 * postgresql://postgres:***@127.0.0.1:5432/pagila
5 rows affected.
Wall time: 1.56 ms


movie_key,date_key,customer_key,sales_amount
749,20070215,341,7.99
552,20070216,341,1.99
551,20070216,341,7.99
445,20070219,341,2.99
563,20070220,341,7.99


## 6.2: Join the fact table with the dimensions to replace keys with attributes

In [8]:
%%time
%%sql
SELECT
    mov.title,
    dt.month,
    cust.city,
    fs.sales_amount
FROM factSales AS fs
-- Each dimension is exactly one join away from the fact table.
INNER JOIN dimMovie    AS mov  ON mov.movie_key = fs.movie_key
INNER JOIN dimDate     AS dt   ON dt.date_key = fs.date_key
INNER JOIN dimCustomer AS cust ON cust.customer_key = fs.customer_key
LIMIT 5;

 * postgresql://postgres:***@127.0.0.1:5432/pagila
5 rows affected.
Wall time: 1.97 ms


title,month,city,sales_amount
Rules Human,2,Ede,7.99
Majestic Floats,2,Ede,1.99
Maiden Home,2,Ede,7.99
Hyde Doctor,2,Ede,2.99
Massacre Usual,2,Ede,7.99


In [9]:
%%time
%%sql
SELECT
    mov.title,
    dt.month,
    cust.city,
    SUM(fs.sales_amount) AS revenue
FROM factSales AS fs
-- Revenue per movie, month and customer city, computed on the star schema.
INNER JOIN dimMovie    AS mov  ON mov.movie_key = fs.movie_key
INNER JOIN dimDate     AS dt   ON dt.date_key = fs.date_key
INNER JOIN dimCustomer AS cust ON cust.customer_key = fs.customer_key
GROUP BY mov.title, dt.month, cust.city
ORDER BY mov.title, dt.month, cust.city, revenue DESC;

 * postgresql://postgres:***@127.0.0.1:5432/pagila
14540 rows affected.
Wall time: 73.3 ms


title,month,city,revenue
Academy Dinosaur,2,San Lorenzo,0.99
Academy Dinosaur,2,Sullana,1.99
Academy Dinosaur,2,Udaipur,0.99
Academy Dinosaur,3,Almirante Brown,1.99
Academy Dinosaur,3,Goinia,0.99
Academy Dinosaur,3,Kaliningrad,0.99
Academy Dinosaur,3,Kurashiki,0.99
Academy Dinosaur,3,Livorno,0.99
Academy Dinosaur,3,Nukualofa,0.99
Academy Dinosaur,3,Rajkot,3.99


In [10]:
%%time
%%sql
SELECT
    flm.title,
    EXTRACT(month FROM pay.payment_date) AS month,
    cty.city,
    SUM(pay.amount) AS revenue
FROM payment AS pay
-- Same revenue report computed on the normalised source tables, for
-- comparison with the star-schema version. Two join chains start at payment,
-- one reaching the film and one reaching the customer city.
INNER JOIN rental    AS rent ON rent.rental_id = pay.rental_id
INNER JOIN inventory AS inv  ON inv.inventory_id = rent.inventory_id
INNER JOIN film      AS flm  ON flm.film_id = inv.film_id
INNER JOIN customer  AS cust ON cust.customer_id = pay.customer_id
INNER JOIN address   AS addr ON addr.address_id = cust.address_id
INNER JOIN city      AS cty  ON cty.city_id = addr.city_id
GROUP BY flm.title, month, cty.city
ORDER BY flm.title, month, cty.city, revenue DESC;

 * postgresql://postgres:***@127.0.0.1:5432/pagila
14540 rows affected.
Wall time: 95.5 ms


title,month,city,revenue
Academy Dinosaur,2.0,San Lorenzo,0.99
Academy Dinosaur,2.0,Sullana,1.99
Academy Dinosaur,2.0,Udaipur,0.99
Academy Dinosaur,3.0,Almirante Brown,1.99
Academy Dinosaur,3.0,Goinia,0.99
Academy Dinosaur,3.0,Kaliningrad,0.99
Academy Dinosaur,3.0,Kurashiki,0.99
Academy Dinosaur,3.0,Livorno,0.99
Academy Dinosaur,3.0,Nukualofa,0.99
Academy Dinosaur,3.0,Rajkot,3.99


In [11]:
%%sql 


/*
 * Clean-up of the star schema created earlier in this notebook.
 * IF EXISTS lets the cell finish even when some of the tables were never
 * created, for example after a run that stopped part-way, so that whatever
 * does exist is still removed.
 * CASCADE also removes dependent objects such as the foreign keys that
 * factSales holds on the dimension tables.
 */
DROP TABLE IF EXISTS dimDate CASCADE;
DROP TABLE IF EXISTS dimCustomer CASCADE;
DROP TABLE IF EXISTS dimMovie CASCADE;
DROP TABLE IF EXISTS dimStore CASCADE;
DROP TABLE IF EXISTS factSales CASCADE;


 * postgresql://postgres:***@127.0.0.1:5432/pagila
Done.
Done.
Done.
Done.
Done.


[]