In [1]:
import pandas as pd
from secrets_config import source_db_user, source_db_password, source_db_server_name, source_db_database_name
import jinja2 as j2 

# import libraries for sql 
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

In [2]:
# Connection details for the source PostgreSQL database (pg8000 driver);
# user, password, host and database name are imported from secrets_config.
source_connection_url = URL.create(
    "postgresql+pg8000",
    username=source_db_user,
    password=source_db_password,
    host=source_db_server_name,
    port=5432,
    database=source_db_database_name,
)

# Engine handed to the extract function to run queries against the source.
source_engine = create_engine(source_connection_url)

In [92]:
import os 
import logging 
import datetime as dt 
import numpy as np

def get_incremental_value(table_name, path="extract_log"):
    """
    Returns the most recently logged incremental value for a table.
    - `table_name`: the name of the table (without .csv)
    - `path`: the directory containing the incremental log csv files. defaults to `extract_log`

    The log csv must have the columns `log_date` and `incremental_value`.
    Raises `FileNotFoundError` if the log file does not exist and `IndexError`
    if the log file contains no rows.
    """
    df = pd.read_csv(f"{path}/{table_name}.csv")
    df_latest = df[df["log_date"] == df["log_date"].max()]
    # log_date only has one-second resolution, so several entries can share the
    # latest timestamp. Rows are appended in chronological order, so the last
    # of the tied rows is the newest one.
    return df_latest["incremental_value"].values[-1]

def upsert_incremental_log(log_path, table_name, incremental_value)->bool:
    """
    Appends a timestamped incremental value to the log csv of a table.
    - `log_path`: the directory containing the incremental log csv files. created if it does not exist
    - `table_name`: the name of the table (without .csv)
    - `incremental_value`: the value to store in the `incremental_value` column

    The log file is created on the first call and extended on later calls.
    Each row holds the current local time (`log_date`, second resolution) and
    the supplied value. Returns `True` once the file has been written.
    """
    log_file = os.path.join(log_path, f"{table_name}.csv")
    # create the log directory on demand so a first-time call does not fail
    os.makedirs(log_path, exist_ok=True)

    df_incremental_log = pd.DataFrame(data={
        "log_date": [dt.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")],
        "incremental_value": [incremental_value]
    })
    if os.path.isfile(log_file):
        # keep the existing history and add the new entry at the end
        df_existing_incremental_log = pd.read_csv(log_file)
        df_incremental_log = pd.concat(
            [df_existing_incremental_log, df_incremental_log], ignore_index=True
        )
    df_incremental_log.to_csv(log_file, index=False)
    return True

def extract_from_database(table_name, engine, path="extract_queries", log_path="extract_log")->pd.DataFrame:
    """
    Extracts a table from the database using the sql file with a matching name in the `path` folder.
    - `table_name`: the name of the table (without .sql)
    - `engine`: the database connection passed to `pd.read_sql` and to the sql template
    - `path`: the path to the extract queries directory containing the sql files. defaults to `extract_queries`
    - `log_path`: the directory holding the incremental log csv files. defaults to `extract_log`

    The sql file is a jinja template that must set a `config` variable with an
    `extract_type` key and, for incremental extracts, an `incremental_column` key.
    The template is rendered with `source_table` and `engine`; when a previous
    incremental value has been logged, `is_incremental=True` and `incremental_value`
    are passed as well. After an incremental extract the new max value of the
    incremental column is appended to the log.

    Returns the extracted DataFrame, or `None` (after logging an error) when no
    sql file exists for `table_name`.
    """
    # basicConfig only has an effect the first time the root logger is configured
    logging.basicConfig(level=logging.INFO, format="[%(levelname)s][%(asctime)s]: %(message)s")

    logging.info(f"Extracting table: {table_name}")
    if f"{table_name}.sql" not in os.listdir(path):
        logging.error(f"Could not find table: {table_name}")
        return None

    # read sql contents into a variable
    with open(f"{path}/{table_name}.sql") as f:
        raw_sql = f.read()

    # parse the template once; it serves both the config lookup and the render
    template = j2.Template(raw_sql)
    config = template.make_module().config

    render_args = {"source_table": table_name, "engine": engine}
    is_incremental = config["extract_type"].lower() == "incremental"
    current_max_incremental_value = None

    if is_incremental:
        os.makedirs(log_path, exist_ok=True)
        if f"{table_name}.csv" in os.listdir(log_path):
            # a previous run was logged: only extract rows beyond the logged value
            current_max_incremental_value = get_incremental_value(table_name, path=log_path)
            render_args.update(is_incremental=True, incremental_value=current_max_incremental_value)
        # otherwise no log exists yet and the first run is a full extract

    parsed_sql = template.render(**render_args)
    df = pd.read_sql(sql=parsed_sql, con=engine)

    if is_incremental:
        if len(df) > 0:
            max_incremental_value = df[config["incremental_column"]].max()
        else:
            # nothing new: carry the previously logged value forward
            max_incremental_value = current_max_incremental_value
        if max_incremental_value is None or pd.isna(max_incremental_value):
            # an empty first extract has no max value; logging NaN would break
            # the incremental filter of the next run
            logging.warning(f"No incremental value to log for table: {table_name}")
        else:
            upsert_incremental_log(log_path=log_path, table_name=table_name, incremental_value=max_incremental_value)

    logging.info(f"Successfully extracted table: {table_name}, rows extracted: {len(df)}")
    return df

In [95]:
# Run the extract for the "orders" table using the sql files in the extract_queries folder.
df = extract_from_database(table_name="orders", engine=source_engine, path="extract_queries")
# Bare last expression so the notebook renders the extracted frame.
df

[INFO][2022-07-24 16:46:57,170]: Extracting table: orders
[INFO][2022-07-24 16:46:57,996]: Successfully extracted table: orders, rows extracted: 12003


Unnamed: 0,orderid,orderdate,customerid,netamount,tax,totalamount
0,1,2004-01-27,7888,313.24,25.84,339.08
1,2,2004-01-01,4858,54.90,4.53,59.43
2,3,2004-01-17,15399,160.10,13.21,173.31
3,4,2004-01-28,17019,106.67,8.80,115.47
4,5,2004-01-09,14771,256.00,21.12,277.12
...,...,...,...,...,...,...
11998,11999,2004-12-25,1485,175.34,14.47,189.81
11999,12000,2004-12-15,7393,205.09,16.92,222.01
12000,12001,2005-01-01,10205,50.10,5.02,55.12
12001,12002,2005-01-02,10205,50.10,5.02,55.12


In [103]:
# Inspect the column dtypes of the extracted frame.
df.dtypes

orderid          int64
orderdate       object
customerid       int64
netamount      float64
tax            float64
totalamount    float64
dtype: object

In [108]:
# Preview orderdate cast to str; the result is not assigned, so df itself is left unchanged.
df["orderdate"].astype(str)

0        2004-01-27
1        2004-01-01
2        2004-01-17
3        2004-01-28
4        2004-01-09
            ...    
11998    2004-12-25
11999    2004-12-15
12000    2005-01-01
12001    2005-01-02
12002    2005-01-03
Name: orderdate, Length: 12003, dtype: object