In [25]:
import pandas as pd
from secrets_config import source_db_user, source_db_password, source_db_server_name, source_db_database_name
import jinja2 as j2 

# import libraries for sql 
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

In [26]:
# Build the SQLAlchemy URL for the source PostgreSQL database (pg8000 driver);
# the credentials and host details are imported from secrets_config.
source_connection_url = URL.create(
    "postgresql+pg8000",
    username=source_db_user,
    password=source_db_password,
    host=source_db_server_name,
    port=5432,
    database=source_db_database_name,
)

# engine used by the extract step to read from the source database
source_engine = create_engine(source_connection_url)

In [27]:
import os 
import logging 
import datetime as dt 
import numpy as np

def get_incremental_value(table_name, path="extract_log"):
    """
    Looks up the most recently logged incremental value for a table.
    - `table_name`: the name of the table; the log is read from `{path}/{table_name}.csv`
    - `path`: the directory holding the incremental log csv files. defaults to `extract_log`

    Returns the `incremental_value` of the first row whose `log_date` equals
    the largest `log_date` in the log file.
    """
    log_df = pd.read_csv(f"{path}/{table_name}.csv")
    latest_log_date = log_df["log_date"].max()
    latest_rows = log_df[log_df["log_date"] == latest_log_date]
    return latest_rows["incremental_value"].values[0]

def upsert_incremental_log(log_path, table_name, incremental_value)->bool:
    """
    Records a new incremental value in the csv log of a table.
    - `log_path`: the directory holding the incremental log csv files (must already exist)
    - `table_name`: the name of the table; the log is written to `{log_path}/{table_name}.csv`
    - `incremental_value`: the value to store alongside the current timestamp

    If the log file already exists the new row is added after the existing rows,
    otherwise a new file containing only this row is created. Always returns True.
    """
    log_file = f"{log_path}/{table_name}.csv"
    log_exists = f"{table_name}.csv" in os.listdir(log_path)

    # single-row frame stamped with the current local time
    new_entry = pd.DataFrame(data={
        "log_date": [dt.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")],
        "incremental_value": [incremental_value],
    })

    if not log_exists:
        new_entry.to_csv(log_file, index=False)
        return True

    history = pd.read_csv(log_file)
    pd.concat([history, new_entry]).to_csv(log_file, index=False)
    return True

def extract_from_database(table_name, engine, path="extract_queries")->pd.DataFrame:
    """
    Extracts a table using the sql file with a matching file name in the `path` folder.
    - `table_name`: the name of the table (without .sql)
    - `engine`: connection engine to the database to extract from
    - `path`: the path to the extract queries directory containing the sql files. defaults to `extract_queries`

    The sql file is rendered as a jinja template and must export a `config`
    variable with an `extract_type` key. When `extract_type` is `incremental`,
    `config` must also provide `incremental_column`, and the latest value of that
    column is tracked in `extract_log/{table_name}.csv`:
    - with a usable logged value, the template is rendered with
      `is_incremental=True` and `incremental_value=<logged value>`
    - without one, a full extract is performed
    Any other `extract_type` always performs a full extract.

    Returns the extracted dataframe, or `None` (after logging an error) when
    `{table_name}.sql` does not exist in `path`.
    """
    logging.basicConfig(level=logging.INFO, format="[%(levelname)s][%(asctime)s]: %(message)s")

    logging.info(f"Extracting table: {table_name}")
    if f"{table_name}.sql" not in os.listdir(path):
        logging.error(f"Could not find table: {table_name}")
        return None

    # read sql contents into a variable
    with open(f"{path}/{table_name}.sql") as f:
        raw_sql = f.read()

    # get config exported by the template
    template = j2.Template(raw_sql)
    config = template.make_module().config
    is_incremental = config["extract_type"].lower() == "incremental"

    render_args = {"source_table": table_name, "engine": engine}
    incremental_path = "extract_log"
    previous_value = None  # None means "no usable watermark yet" -> full extract
    if is_incremental:
        os.makedirs(incremental_path, exist_ok=True)
        if f"{table_name}.csv" in os.listdir(incremental_path):
            logged_value = get_incremental_value(table_name, path=incremental_path)
            # a missing/NaN logged value cannot be used as a filter; fall back to a full extract
            if not pd.isna(logged_value):
                previous_value = logged_value
                render_args.update(is_incremental=True, incremental_value=previous_value)

    # parse sql using jinja and execute the extract
    parsed_sql = template.render(**render_args)
    df = pd.read_sql(sql=parsed_sql, con=engine)

    if is_incremental:
        # only advance the watermark when this batch has a real maximum;
        # never write NaN to the log, it would be rendered into the next query
        new_value = previous_value
        if len(df) > 0:
            batch_max = df[config["incremental_column"]].max()
            if not pd.isna(batch_max):
                new_value = batch_max
        if new_value is not None:
            upsert_incremental_log(log_path=incremental_path, table_name=table_name, incremental_value=new_value)

    logging.info(f"Successfully extracted table: {table_name}, rows extracted: {len(df)}")
    return df

In [28]:
def upsert_to_database(df: pd.DataFrame, table_name: str, engine, if_exists: str="replace")->bool: 
    """
    Write a dataframe to a database table 
    - `df`: pandas dataframe 
    - `table_name`: name of the target table 
    - `engine`: connection engine to database 
    - `if_exists`: what to do when the table already exists, passed straight to
      `DataFrame.to_sql`: `"replace"` (default) drops and recreates the table,
      `"append"` adds the rows to it, `"fail"` raises an error

    NOTE: despite its name this function does not perform a key-based upsert.
    With the default `"replace"` the target table ends up containing exactly
    the rows of `df`, so loading an incremental extract this way discards
    previously loaded rows; pass `if_exists="append"` to keep them.

    Returns True once the write has completed.
    """
    logging.basicConfig(level=logging.INFO, format="[%(levelname)s][%(asctime)s]: %(message)s")
    logging.info(f"Writing to table: {table_name}")
    df.to_sql(name=table_name, con=engine, if_exists=if_exists, index=False)
    logging.info(f"Successful write to table: {table_name}, rows inserted/updated: {len(df)}")
    return True 

In [29]:
from secrets_config import target_db_user, target_db_password, target_db_server_name, target_db_database_name
# Build the SQLAlchemy URL for the target PostgreSQL database (pg8000 driver);
# the credentials and host details are imported from secrets_config.
target_connection_url = URL.create(
    "postgresql+pg8000",
    username=target_db_user,
    password=target_db_password,
    host=target_db_server_name,
    port=5432,
    database=target_db_database_name,
)

# engine used by the load and model-building steps to write to the target database
target_engine = create_engine(target_connection_url)

In [30]:
def extract_load_pipeline(source_engine, target_engine, path="extract_queries"): 
    """
    Extracts every table that has a sql file in `path` and loads it into the target database.
    - `source_engine`: connection engine to the database to extract from
    - `target_engine`: connection engine to the database to load into
    - `path`: the path to the extract queries directory containing the sql files. defaults to `extract_queries`

    Only files ending in `.sql` are processed; the file name without the
    extension is used as the table name. Tables for which the extract returns
    `None` are skipped instead of being passed on to the load step.
    """
    for file in os.listdir(path):
        # strip only the trailing extension and ignore anything that is not a sql file
        # (e.g. .DS_Store, editor backups), which would otherwise be treated as a table
        table_name, extension = os.path.splitext(file)
        if extension != ".sql":
            continue
        df = extract_from_database(table_name=table_name, engine=source_engine, path=path)
        if df is None:
            # the extract step has already logged why nothing was returned
            continue
        upsert_to_database(df=df, table_name=table_name, engine=target_engine)

In [31]:
# Run the extract and load step for the query files in the extract_queries
# folder, reading from source_engine and writing to target_engine.
extract_load_pipeline(
    source_engine=source_engine, 
    target_engine=target_engine, 
    path="extract_queries"
)

[INFO][2022-07-24 16:32:43,300]: Extracting table: customer
[INFO][2022-07-24 16:32:43,378]: Successfully extracted table: customer, rows extracted: 599
[INFO][2022-07-24 16:32:43,379]: Writing to table: customer
[INFO][2022-07-24 16:32:43,815]: Successful write to table: customer, rows inserted/updated: 599
[INFO][2022-07-24 16:32:43,815]: Extracting table: film_category
[INFO][2022-07-24 16:32:43,851]: Successfully extracted table: film_category, rows extracted: 1000
[INFO][2022-07-24 16:32:43,852]: Writing to table: film_category
[INFO][2022-07-24 16:32:44,277]: Successful write to table: film_category, rows inserted/updated: 1000
[INFO][2022-07-24 16:32:44,277]: Extracting table: film
[INFO][2022-07-24 16:32:44,352]: Successfully extracted table: film, rows extracted: 1000
[INFO][2022-07-24 16:32:44,353]: Writing to table: film
[INFO][2022-07-24 16:32:45,004]: Successful write to table: film, rows inserted/updated: 1000
[INFO][2022-07-24 16:32:45,005]: Extracting table: staff
[INFO

In [32]:
import os 
import logging 

def build_model(model, engine, models_path="models")->bool:
    """
    Builds models with a matching file name in the models_path folder. 
    - `model`: the name of the model (without .sql)
    - `engine`: connection engine to the database the model is built in; it is
      also passed to the jinja template as `engine`
    - `models_path`: the path to the models directory containing the sql files. defaults to `models`

    The sql file is rendered as a jinja template with `target_table` set to the
    model name and the resulting sql is executed inside a single transaction.

    Returns True when the model was built, False (after logging an error) when
    `{model}.sql` does not exist in `models_path`.
    """
    logging.basicConfig(level=logging.INFO, format="[%(levelname)s][%(asctime)s]: %(message)s")

    if f"{model}.sql" not in os.listdir(models_path):
        logging.error(f"Could not find model: {model}")
        return False

    logging.info(f"Building model: {model}")

    # read sql contents into a variable 
    with open(f"{models_path}/{model}.sql") as f: 
        raw_sql = f.read()

    # parse sql using jinja 
    parsed_sql = j2.Template(raw_sql).render(target_table = model, engine=engine)

    # execute parsed sql; Engine.execute() no longer exists in SQLAlchemy 2.0, so run the
    # raw string on an explicit connection. engine.begin() commits on success and rolls
    # back if the statement raises.
    with engine.begin() as connection:
        result = connection.exec_driver_sql(parsed_sql)
        # read the count while the connection is still open
        rows_affected = result.rowcount
    logging.info(f"Successfully built model: {model}, rows inserted/updated: {rows_affected}")
    return True 

In [33]:
# import TopologicalSorter
from graphlib import TopologicalSorter

In [34]:
# Describe the model DAG as a mapping of each model to the models it depends on,
# then flatten it into a build order in which dependencies always come first.
ts = TopologicalSorter({
    "staging_films": (),
    "serving_sales_film": ("staging_films",),
    "serving_films_popular": ("staging_films",),
    "serving_sales_customer": (),
    "serving_sales_cumulative": (),
})
dag = tuple(ts.static_order())
print(dag)

('staging_films', 'serving_sales_customer', 'serving_sales_cumulative', 'serving_sales_film', 'serving_films_popular')


In [35]:
# Build every model in the target database, following the order of `dag`
# so that a model is only built after the models it depends on.
for node in dag: 
    build_model(model=node, engine=target_engine, models_path="models/")

[INFO][2022-07-24 16:33:03,978]: Building model: staging_films
[INFO][2022-07-24 16:33:04,059]: Successfully built model: staging_films, rows inserted/updated: 958
[INFO][2022-07-24 16:33:04,060]: Building model: serving_sales_customer
[INFO][2022-07-24 16:33:04,110]: Successfully built model: serving_sales_customer, rows inserted/updated: 599
[INFO][2022-07-24 16:33:04,111]: Building model: serving_sales_cumulative
[INFO][2022-07-24 16:33:04,170]: Successfully built model: serving_sales_cumulative, rows inserted/updated: 14596
[INFO][2022-07-24 16:33:04,171]: Building model: serving_sales_film
[INFO][2022-07-24 16:33:04,200]: Successfully built model: serving_sales_film, rows inserted/updated: 958
[INFO][2022-07-24 16:33:04,200]: Building model: serving_films_popular
[INFO][2022-07-24 16:33:04,208]: Successfully built model: serving_films_popular, rows inserted/updated: 958
