# Orderlines - Data Quality

## Setup: imports, Pandas display format, and pipeline steps

In [9]:
import pandas as pd
from pandas_profiling import ProfileReport
# Raise the display.max_rows setting to 1000 for printed frames.
pd.set_option('display.max_rows', 1000)
# Format every displayed float with exactly two decimal places.
pd.set_option('display.float_format', lambda value: '%.2f' % value)

In [10]:
from functools import wraps
import datetime as dt

def log_step(func):
    """Decorate a pipeline step so each call prints its result shape and duration.

    After ``func`` returns, a single message is printed with the step's name,
    the ``shape`` attribute of the returned object, and the elapsed time
    between the start and the end of the call. The return value of ``func``
    is passed through untouched.

    A result that has no ``shape`` attribute is reported as ``shape=None``
    instead of raising ``AttributeError`` after the step has already run.

    Parameters
    ----------
    func : callable
        The step to wrap; called with whatever arguments the wrapper receives.

    Returns
    -------
    callable
        A wrapper that keeps ``func``'s metadata (via ``functools.wraps``).
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        tic = dt.datetime.now()
        result = func(*args, **kwargs)
        # str(timedelta) renders as H:MM:SS.ffffff; the literal "s" that
        # follows it in the message is kept so the log format is unchanged.
        time_taken = str(dt.datetime.now() - tic)
        # Defensive lookup: logging must never discard a result the step has
        # already produced just because that result is not array-like.
        shape = getattr(result, "shape", None)
        print(f"{func.__name__}:\n shape={shape} took {time_taken}s\n")
        return result

    return wrapper

In [11]:
# Read the three cleaned CSV files; they are loaded in the same order as the
# names on the left-hand side (orders, orderlines, products).
init_orders, init_orderlines, init_products = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../data/clean_data/clean_orders.csv",
        "../../data/clean_data/clean_orderlines.csv",
        "../../data/clean_data/clean_products.csv",
    )
)

In [12]:
@log_step
def start_pipeline(df):
    """Open the pipeline by returning a copy of ``df``.

    Later steps receive the object produced by ``df.copy()`` rather than the
    frame that was passed in.
    """
    snapshot = df.copy()
    return snapshot

In [13]:
@log_step
def merge_orders(df):
    """Inner-merge ``df`` with the ``order_id`` column of ``init_orders``.

    No join key is passed, so pandas joins on the column name(s) the two
    sides have in common. Only the ``order_id`` column of ``init_orders``
    takes part in the merge.
    """
    order_ids = init_orders["order_id"]
    return df.merge(order_ids, how="inner")

In [14]:
@log_step
def merge_products(df):
    """Inner-merge ``df`` with the ``sku`` column of ``init_products``.

    No join key is passed, so pandas joins on the column name(s) the two
    sides have in common. Only the ``sku`` column of ``init_products`` takes
    part in the merge.
    """
    product_skus = init_products["sku"]
    # "inner" is the pandas default for ``how``; spelled out for readability.
    return df.merge(product_skus, how="inner")


In [15]:
@log_step
def create_csv(df, name):
    """Write ``df`` to ``../../data/clean_data/clean_<name>.csv`` and return it.

    The index is not written to the file. ``df`` itself is returned so the
    step can be used inside a ``.pipe`` chain.
    """
    target_path = f"../../data/clean_data/clean_{name}.csv"
    # NOTE(review): with name="orderlines" this is the same path that
    # init_orderlines is read from, so the input file gets overwritten —
    # confirm this is intended.
    df.to_csv(target_path, index=False)
    return df

In [16]:
@log_step
def create_report(df, name):
    """Build a ``ProfileReport`` for ``df`` and save it as an HTML file.

    The report is titled ``"<Name> - Report"`` (``name`` passed through
    ``str.title``), is created with ``dark_mode=True`` and ``minimal=True``,
    and is written to ``../../data/profile_reports/<name>_report.html``.
    ``df`` itself is returned so the step can be used inside a ``.pipe``
    chain.
    """
    report_title = f"{name.title()} - Report"
    profile = ProfileReport(df, title=report_title, dark_mode=True, minimal=True)
    output_file = f"../../data/profile_reports/{name}_report.html"
    profile.to_file(output_file)
    return df

## Pipeline

In [18]:
# Run the steps in order on init_orderlines. Each step is wrapped by
# log_step, so it prints its result shape and duration as it finishes.
orderlines = (
init_orderlines
    .pipe(start_pipeline)  # continue with a copy of init_orderlines
    .pipe(merge_orders)  # inner merge with init_orders["order_id"]
    .pipe(merge_products)  # merge with init_products["sku"]
    .pipe(create_csv, name="orderlines")  # writes clean_orderlines.csv, returns the frame
    .pipe(create_report, name="orderlines" )  # writes orderlines_report.html, returns the frame
)

start_pipeline:
 shape=(293982, 6) took 0:00:00.013644s

merge_orders:
 shape=(61947, 6) took 0:00:00.068961s

merge_products:
 shape=(61742, 6) took 0:00:00.022465s

create_csv:
 shape=(61742, 6) took 0:00:00.248586s



Summarize dataset: 100%|██████████| 12/12 [00:00<00:00, 22.82it/s, Completed]                      
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.80s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  2.89it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 335.65it/s]

create_report:
 shape=(61742, 6) took 0:00:02.756870s




