In [1]:
from typing import Optional

import jinja2 as j2
import pandas as pd

# import libraries for sql
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

from secrets_config import source_db_user, source_db_password, source_db_server_name, source_db_database_name

In [2]:
# Describe the source PostgreSQL database as a SQLAlchemy URL (pg8000 driver);
# the credentials and host details are the values imported from secrets_config.
source_connection_url = URL.create(
    drivername="postgresql+pg8000",
    username=source_db_user,
    password=source_db_password,
    host=source_db_server_name,
    port=5432,
    database=source_db_database_name,
)

# Engine bound to the source database
source_engine = create_engine(source_connection_url)

In [3]:
import os 
import logging 


def extract_from_database(table_name, engine, path="extract_queries")->Optional[pd.DataFrame]:
    """
    Runs the extract query stored in `<path>/<table_name>.sql` and returns the result as a dataframe.
    - `table_name`: the name of the table to extract (the sql file name without .sql)
    - `engine`: connection engine the query is executed against; also passed to the jinja template
    - `path`: the path to the extract queries directory containing the sql files. defaults to `extract_queries`

    The sql file is rendered as a jinja template with `source_table` and `engine` available.
    Returns the extracted dataframe, or `None` (after logging an error) when no matching sql file exists.
    """
    logging.basicConfig(level=logging.INFO, format="[%(levelname)s][%(asctime)s]: %(message)s")

    query_file = f"{table_name}.sql"
    if query_file not in os.listdir(path):
        logging.error(f"Could not find table: {table_name}")
        # make the "nothing extracted" outcome explicit so callers can test for it
        return None

    logging.info(f"Extracting table: {table_name}")

    # read sql contents into a variable
    with open(os.path.join(path, query_file)) as f:
        raw_sql = f.read()

    # parse sql using jinja
    parsed_sql = j2.Template(raw_sql).render(source_table=table_name, engine=engine)

    # execute parsed sql
    df = pd.read_sql(sql=parsed_sql, con=engine)

    logging.info(f"Successfully extracted table: {table_name}, rows extracted: {len(df)}")
    return df

In [4]:
def upsert_to_database(df: pd.DataFrame, table_name: str, engine)->bool: 
    """
    Write a dataframe to a database table.
    - `df`: pandas dataframe holding the rows to write
    - `table_name`: name of the target table
    - `engine`: connection engine to the database

    The table is written with `if_exists="replace"` and without the dataframe index.
    Returns `True` once the write has completed.
    """
    log_format = "[%(levelname)s][%(asctime)s]: %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)

    row_count = len(df)
    logging.info(f"Writing to table: {table_name}")
    df.to_sql(
        name=table_name,
        con=engine,
        if_exists="replace",
        index=False,
    )
    logging.info(f"Successful write to table: {table_name}, rows inserted/updated: {row_count}")
    return True

In [5]:
from secrets_config import target_db_user, target_db_password, target_db_server_name, target_db_database_name
# Describe the target PostgreSQL database as a SQLAlchemy URL (pg8000 driver);
# the credentials and host details are the values imported from secrets_config.
target_connection_url = URL.create(
    drivername="postgresql+pg8000",
    username=target_db_user,
    password=target_db_password,
    host=target_db_server_name,
    port=5432,
    database=target_db_database_name,
)

# Engine bound to the target database
target_engine = create_engine(target_connection_url)

In [6]:
def extract_load_pipeline(source_engine, target_engine, path="extract_queries"): 
    """
    Extracts every `.sql` query found in `path` and writes each result to the target database.
    - `source_engine`: connection engine passed to `extract_from_database`
    - `target_engine`: connection engine passed to `upsert_to_database`
    - `path`: the directory containing the extract queries. defaults to `extract_queries`

    The table name used for both the extract and the write is the file name without `.sql`.
    """
    # sorted() gives the same processing order on every run; os.listdir order is arbitrary
    for file in sorted(os.listdir(path)):
        table_name, extension = os.path.splitext(file)
        if extension != ".sql":
            # not a query file (e.g. a checkpoint folder or readme) - nothing to extract
            continue

        df = extract_from_database(table_name=table_name, engine=source_engine, path=path)
        if df is None:
            # the extract step reported a failure; do not pass a missing dataframe to the writer
            continue

        upsert_to_database(df=df, table_name=table_name, engine=target_engine)

In [7]:
# Run the extract and load step for the query files in the extract_queries folder
extract_load_pipeline(source_engine, target_engine, path="extract_queries")

[INFO][2022-07-24 00:27:38,833]: Extracting table: customer
[INFO][2022-07-24 00:27:38,938]: Successfully extracted table: customer, rows extracted: 599
[INFO][2022-07-24 00:27:38,939]: Writing to table: customer
[INFO][2022-07-24 00:27:39,488]: Successful write to table: customer, rows inserted/updated: 599
[INFO][2022-07-24 00:27:39,489]: Extracting table: film_category
[INFO][2022-07-24 00:27:39,509]: Successfully extracted table: film_category, rows extracted: 1000
[INFO][2022-07-24 00:27:39,509]: Writing to table: film_category
[INFO][2022-07-24 00:27:40,060]: Successful write to table: film_category, rows inserted/updated: 1000
[INFO][2022-07-24 00:27:40,060]: Extracting table: film
[INFO][2022-07-24 00:27:40,149]: Successfully extracted table: film, rows extracted: 1000
[INFO][2022-07-24 00:27:40,150]: Writing to table: film
[INFO][2022-07-24 00:27:40,760]: Successful write to table: film, rows inserted/updated: 1000
[INFO][2022-07-24 00:27:40,760]: Extracting table: staff
[INFO

In [8]:
import os 
import logging 

def build_model(model, engine, models_path="models")->bool:
    """
    Builds the model with a matching file name in the models_path folder. 
    - `model`: the name of the model (without .sql)
    - `engine`: connection engine the model sql is executed against; also passed to the jinja template
    - `models_path`: the path to the models directory containing the sql files. defaults to `models`

    The sql file is rendered as a jinja template with `target_table` and `engine` available.
    Returns `True` when the model was built, or `False` (after logging an error) when no matching sql file exists.
    """
    logging.basicConfig(level=logging.INFO, format="[%(levelname)s][%(asctime)s]: %(message)s")

    model_file = f"{model}.sql"
    if model_file not in os.listdir(models_path):
        logging.error(f"Could not find model: {model}")
        # explicit failure value so the result matches the declared bool return type
        return False

    logging.info(f"Building model: {model}")

    # read sql contents into a variable
    with open(os.path.join(models_path, model_file)) as f:
        raw_sql = f.read()

    # parse sql using jinja
    parsed_sql = j2.Template(raw_sql).render(target_table=model, engine=engine)

    # execute parsed sql
    # Engine.execute() was removed in SQLAlchemy 2.0, so run the statement on an explicit
    # connection. engine.begin() commits on success and rolls back on error, and
    # exec_driver_sql() hands the rendered sql string to the driver unchanged.
    with engine.begin() as connection:
        result = connection.exec_driver_sql(parsed_sql)
        # read the count while the connection is still open
        rows_affected = result.rowcount

    logging.info(f"Successfully built model: {model}, rows inserted/updated: {rows_affected}")
    return True

In [9]:
# import TopologicalSorter
from graphlib import TopologicalSorter

In [10]:
# describe the DAG as {model: [models it depends on]} and let
# TopologicalSorter work out a valid build order
ts = TopologicalSorter({
    "staging_films": [],
    "serving_sales_film": ["staging_films"],
    "serving_films_popular": ["staging_films"],
    "serving_sales_customer": [],
    "serving_sales_cumulative": [],
})
dag = tuple(ts.static_order())
print(dag)

('staging_films', 'serving_sales_customer', 'serving_sales_cumulative', 'serving_sales_film', 'serving_films_popular')


In [16]:
# build the models one by one, following the order stored in the dag tuple
for node in dag:
    build_model(
        model=node,
        engine=target_engine,
        models_path="models/",
    )

[INFO][2022-07-24 00:34:28,551]: Building model: staging_films
[INFO][2022-07-24 00:34:28,612]: Successfully built model: staging_films, rows inserted/updated: 958
[INFO][2022-07-24 00:34:28,613]: Building model: serving_sales_customer
[INFO][2022-07-24 00:34:28,659]: Successfully built model: serving_sales_customer, rows inserted/updated: 599
[INFO][2022-07-24 00:34:28,660]: Building model: serving_sales_cumulative
[INFO][2022-07-24 00:34:28,720]: Successfully built model: serving_sales_cumulative, rows inserted/updated: 14596
[INFO][2022-07-24 00:34:28,721]: Building model: serving_sales_film
[INFO][2022-07-24 00:34:28,749]: Successfully built model: serving_sales_film, rows inserted/updated: 958
[INFO][2022-07-24 00:34:28,750]: Building model: serving_films_popular
[INFO][2022-07-24 00:34:28,757]: Successfully built model: serving_films_popular, rows inserted/updated: 958
