# Clean Orderlines

## Initialize Cleaning

### Import Packages and Raw_data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport

# Global variables
df_name = "orderlines" 
data_path = '../../data/raw_data/' 

init_df = pd.read_csv(f"{data_path }{df_name}.csv")


### Logging

In [2]:
from functools import wraps
import datetime as dt

def log_step(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        tic = dt.datetime.now()
        result = func(*args, **kwargs)
        time_taken = str(dt.datetime.now() - tic)
        print(f"{func.__name__}:\n shape={result.shape} took {time_taken}s\n")
        return result

    return wrapper

## Pipeline - Functions

### Start Pipeline

In [3]:
@log_step
def start_pipeline(df):
    return df.copy()

### Rename Columns

In [4]:
init_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 293983 entries, 0 to 293982
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   id                293983 non-null  int64 
 1   id_order          293983 non-null  int64 
 2   product_id        293983 non-null  int64 
 3   product_quantity  293983 non-null  int64 
 4   sku               293983 non-null  object
 5   unit_price        293983 non-null  object
 6   date              293983 non-null  object
dtypes: int64(4), object(3)
memory usage: 15.7+ MB


In [5]:
def rename_columns(df):
    return (df
        .rename(columns={
            "id": "o_lines_id",
            "id_order":  "order_id",      
            "product_quantity":"o_lines_product_qty",        
            "unit_price": "o_lines_unit_price", 
            "date":"o_lines_date"
            },
        )
    )

### Remove duplicates

In [6]:
@log_step
def remove_duplicates(df):
    return df.drop_duplicates()

### Adjust price

In [7]:
@log_step
def adjust_price(df):
    df["o_lines_unit_price"] = df["o_lines_unit_price"].str.replace("\.","", regex=True)
    df["o_lines_unit_price"] = pd.to_numeric(df["o_lines_unit_price"]) / 100
    return df

### Missing Values

In [8]:
@log_step
def missing_values(df):
    return df

### Adjust data types

In [9]:
@log_step
def adjust_data_types(df):
    df["o_lines_id"] = df["o_lines_id"].astype("string")
    df["order_id"] = df["order_id"].astype("string")
    df["o_lines_date"] = pd.to_datetime(df["o_lines_date"], format='%Y-%m-%d %H:%M:%S')
    return df

### Drop Columns

In [10]:
@log_step
def drop_columns(df, cols):
    return df.drop(columns=cols)


### Add columns

In [11]:
@log_step
def add_columns(df):
    df = df.assign(o_lines_year = df["o_lines_date"].dt.strftime("%Y")).astype("category")
    df = df.assign(o_lines_month = df["o_lines_date"].dt.strftime("%h")).astype("category")
    df = df.assign(o_lines_day = df["o_lines_date"].dt.strftime("%A")).astype("category")
    df = df.assign(o_lines_hour = df["o_lines_date"].dt.strftime("%H")).astype("category")
    return df

### CSV

In [12]:
@log_step
def create_csv(df):
    df.to_csv(f"../../data/clean_data/clean_{df_name}.csv", index = False)
    return df

### Report

In [13]:
@log_step
def create_report(df):
    ProfileReport(df, title=f"{df_name.title()} - Report", dark_mode=True, minimal = True).to_file(f"../../data/profile_reports/{df_name}_report.html")
    return df

## Run Cleaning

In [14]:
o_lines = (
init_df
    .pipe(start_pipeline)       # copy df
    .pipe(rename_columns)       # naming conventions
    #.pipe(remove_duplicates)    # no duplicates
    #.pipe(missing_values)       # no missing values
    .pipe(adjust_price)         # removed second dot: 1.000.99 -> 1000.99  + to_numeric 
    .pipe(adjust_data_types)    # o_lines_id -> STRING, order_id --> STRING, o_lines_date --> DATETIME
    .pipe(add_columns)          # o_lines_year, o_lines_month, o_lines_day, o_lines_hour
    .pipe(drop_columns, cols=["product_id"])
    .pipe(create_csv)           # save clean data as .csv 
    .pipe(create_report)        # save report as .html 
)

start_pipeline:
 shape=(293983, 7) took 0:00:00.015077s

adjust_price:
 shape=(293983, 7) took 0:00:00.559948s

adjust_data_types:
 shape=(293983, 7) took 0:00:00.730657s

add_columns:
 shape=(293983, 11) took 0:00:09.448121s

drop_columns:
 shape=(293983, 10) took 0:00:00.000750s

create_csv:
 shape=(293983, 10) took 0:00:02.569210s



Summarize dataset: 100%|██████████| 16/16 [00:06<00:00,  2.59it/s, Completed]                           
Generate report structure: 100%|██████████| 1/1 [00:00<00:00, 29.69it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  4.09it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 412.01it/s]

create_report:
 shape=(293983, 10) took 0:00:06.588670s






## Profile Report

### Types

#### Before

In [15]:
init_df.dtypes

id                   int64
id_order             int64
product_id           int64
product_quantity     int64
sku                 object
unit_price          object
date                object
dtype: object

#### After

In [16]:
o_lines.dtypes

o_lines_id             category
order_id               category
o_lines_product_qty    category
sku                    category
o_lines_unit_price     category
o_lines_date           category
o_lines_year           category
o_lines_month          category
o_lines_day            category
o_lines_hour           category
dtype: object

### Samples

#### Before

In [17]:
init_df.head()

Unnamed: 0,id,id_order,product_id,product_quantity,sku,unit_price,date
0,1119109,299539,0,1,OTT0133,18.99,2017-01-01 00:07:19
1,1119110,299540,0,1,LGE0043,399.0,2017-01-01 00:19:45
2,1119111,299541,0,1,PAR0071,474.05,2017-01-01 00:20:57
3,1119112,299542,0,1,WDT0315,68.39,2017-01-01 00:51:40
4,1119113,299543,0,1,JBL0104,23.74,2017-01-01 01:06:38


#### After

In [18]:
o_lines.head()

Unnamed: 0,o_lines_id,order_id,o_lines_product_qty,sku,o_lines_unit_price,o_lines_date,o_lines_year,o_lines_month,o_lines_day,o_lines_hour
0,1119109,299539,1,OTT0133,18.99,2017-01-01 00:07:19,2017,Jan,Sunday,0
1,1119110,299540,1,LGE0043,399.0,2017-01-01 00:19:45,2017,Jan,Sunday,0
2,1119111,299541,1,PAR0071,474.05,2017-01-01 00:20:57,2017,Jan,Sunday,0
3,1119112,299542,1,WDT0315,68.39,2017-01-01 00:51:40,2017,Jan,Sunday,0
4,1119113,299543,1,JBL0104,23.74,2017-01-01 01:06:38,2017,Jan,Sunday,1


### Info - Types Range, Non Null Count, Dtype

#### Before

In [19]:
init_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 293983 entries, 0 to 293982
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   id                293983 non-null  int64 
 1   id_order          293983 non-null  int64 
 2   product_id        293983 non-null  int64 
 3   product_quantity  293983 non-null  int64 
 4   sku               293983 non-null  object
 5   unit_price        293983 non-null  object
 6   date              293983 non-null  object
dtypes: int64(4), object(3)
memory usage: 15.7+ MB


#### After

In [20]:
o_lines.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 293983 entries, 0 to 293982
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype   
---  ------               --------------   -----   
 0   o_lines_id           293983 non-null  category
 1   order_id             293983 non-null  category
 2   o_lines_product_qty  293983 non-null  category
 3   sku                  293983 non-null  category
 4   o_lines_unit_price   293983 non-null  category
 5   o_lines_date         293983 non-null  category
 6   o_lines_year         293983 non-null  category
 7   o_lines_month        293983 non-null  category
 8   o_lines_day          293983 non-null  category
 9   o_lines_hour         293983 non-null  category
dtypes: category(10)
memory usage: 36.5 MB
