# Data Quality

In [99]:
import pandas as pd
from pandas_profiling import ProfileReport
# Show floats with two decimals and allow up to 1000 rows in rendered frames.
pd.options.display.float_format = lambda value: '%.2f' % value
pd.options.display.max_rows = 1000

In [100]:
from functools import wraps
import datetime as dt

def log_step(func):
    """Decorate a pipeline step so every call reports its result shape and duration.

    The wrapped function is called with the arguments it was given. Afterwards
    one message is printed containing the function's name, ``result.shape``
    and the elapsed wall-clock time, and the result is returned unchanged.
    The result must therefore expose a ``shape`` attribute.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        started_at = dt.datetime.now()
        result = func(*args, **kwargs)
        elapsed = dt.datetime.now() - started_at
        # A timedelta renders as H:MM:SS.ffffff, which is what precedes the "s".
        time_taken = str(elapsed)
        print(f"{func.__name__}:\n shape={result.shape} took {time_taken}s\n")
        return result

    return wrapper

## Exclude unwanted orders

In [101]:
# Read the cleaned orders, orderlines and products tables, in that order.
init_orders, init_orderlines, init_products = map(
    pd.read_csv,
    (
        "../../data/clean_data/clean_orders.csv",
        "../../data/clean_data/clean_orderlines.csv",
        "../../data/clean_data/clean_products.csv",
    ),
)

In [102]:
@log_step
def start_pipeline(df):
    """Return a copy of ``df`` so later pipeline steps never touch the original frame."""
    working_copy = df.copy(deep=True)
    return working_copy

In [103]:
@log_step
def drop_incomplete_orders(df):
    """Keep only the rows whose ``order_status`` is ``'Completed'``."""
    is_completed = df.eval("order_status == 'Completed'")
    return df.loc[is_completed]
    

In [104]:
@log_step
def merge_orderlines(df):
    """Inner-join ``df`` with the ``order_id`` and ``sku`` columns of the orderlines table.

    No join key is passed, so pandas joins on the column names the two frames
    have in common (presumably just ``order_id`` -- TODO confirm against the
    orders schema). Rows of ``df`` without a matching orderline are dropped,
    and an order with several orderlines appears once per line.

    Parameters
    ----------
    df : pandas.DataFrame
        Orders frame to enrich.

    Returns
    -------
    pandas.DataFrame
        The merged frame.
    """
    # `init_orderlines` is the frame this notebook loads from clean_orderlines.csv;
    # no variable called `orderlines` is defined anywhere in the notebook, so
    # referring to it only works with leftover kernel state.
    return df.merge(init_orderlines[["order_id", "sku"]], how="inner")

In [105]:
@log_step
def merge_products(df):
    """Inner-join ``df`` with the ``sku`` column of ``init_products``.

    Only rows whose ``sku`` also appears in the products table survive.
    """
    product_skus = init_products["sku"]
    return df.merge(product_skus, how="inner")


In [106]:
@log_step
def get_unique_orders(df):
    """Collapse ``df`` to one row per ``order_id``, keeping the first occurrence."""
    return df.drop_duplicates(subset=["order_id"], keep="first")
    

In [107]:
@log_step
def drop_columns(df):
    """Return ``df`` without its ``sku`` column."""
    unwanted = ["sku"]
    return df.drop(columns=unwanted)
    

In [108]:
@log_step
def create_csv(df, path="../../data/clean_data/clean_orders.csv"):
    """Write ``df`` to ``path`` as CSV without the index and return ``df`` unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist.
    path : str, optional
        Destination file. Defaults to the clean orders CSV.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` that was passed in, so the step can sit inside a
        ``.pipe`` chain.
    """
    # NOTE(review): the default destination is the very file this notebook
    # reads as its orders input, so a run overwrites its own input and later
    # runs start from already-filtered data. Pass a different `path` if the
    # input must stay untouched.
    df.to_csv(path, index=False)
    return df

In [109]:
@log_step
def create_report(df):
    """Write a minimal, dark-mode profiling report for ``df`` to HTML and return ``df``.

    The report title and the output file name are both derived from the
    fixed name ``"orders"``.
    """
    df_name = "orders"
    report = ProfileReport(
        df,
        title=f"{df_name.title()} - Report",
        dark_mode=True,
        minimal=True,
    )
    report.to_file(f"../../data/profile_reports/{df_name}_report.html")
    return df

## Explore the revenue from different tables

## Become confident about your dataset

## Pipeline

In [111]:
# Run the cleaning steps in order on a copy of the loaded orders table.
# Every step is wrapped by log_step, so each one prints the shape of its
# result and how long it took. create_csv and create_report write files as a
# side effect and pass the frame through unchanged.
orders = (
init_orders
    .pipe(start_pipeline)
    .pipe(drop_incomplete_orders)
    .pipe(merge_orderlines)
    .pipe(merge_products)
    .pipe(get_unique_orders)
    .pipe(drop_columns)
    .pipe(create_csv)
    .pipe(create_report)
)

start_pipeline:
 shape=(46407, 4) took 0:00:00.001846s

drop_incomplete_orders:
 shape=(46407, 4) took 0:00:00.010045s

merge_orderlines:
 shape=(61748, 5) took 0:00:00.064027s

merge_products:
 shape=(61748, 5) took 0:00:00.023798s

get_unique_orders:
 shape=(46407, 5) took 0:00:00.014494s

drop_columns:
 shape=(46407, 4) took 0:00:00.002269s

create_csv:
 shape=(46407, 4) took 0:00:00.154613s



Summarize dataset: 100%|██████████| 11/11 [00:00<00:00, 32.88it/s, Completed]                        
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.56s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  3.22it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 335.89it/s]

create_report:
 shape=(46407, 4) took 0:00:02.283815s




