# Clean Orderlines

## Initialize Cleaning

### Import Packages and Raw_data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport

# Global variables
df_name = "orderlines" 
data_path = '../../data/raw_data/' 

init_df = pd.read_csv(f"{data_path }{df_name}.csv")


### Logging

In [7]:
from functools import wraps
import datetime as dt

def log_step(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        tic = dt.datetime.now()
        result = func(*args, **kwargs)
        time_taken = str(dt.datetime.now() - tic)
        print(f"{func.__name__}:\n shape={result.shape} took {time_taken}s\n")
        return result

    return wrapper

## Pipeline - Functions

### Start Pipeline

In [8]:
@log_step
def start_pipeline(df):
    return df.copy()

### Rename Columns

In [9]:
init_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 293983 entries, 0 to 293982
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   id                293983 non-null  int64 
 1   id_order          293983 non-null  int64 
 2   product_id        293983 non-null  int64 
 3   product_quantity  293983 non-null  int64 
 4   sku               293983 non-null  object
 5   unit_price        293983 non-null  object
 6   date              293983 non-null  object
dtypes: int64(4), object(3)
memory usage: 15.7+ MB


In [10]:
def rename_columns(df):
    return (df
        .rename(columns={
            "id": "o_lines_id",
            "id_order":  "order_id",      
            "product_quantity":"o_lines_product_qty",        
            "unit_price": "o_lines_unit_price", 
            "date":"o_lines_date"
            },
        )
    )

### Remove duplicates

In [11]:
@log_step
def remove_duplicates(df):
    return df.drop_duplicates()

### Adjust price

In [12]:
@log_step
def adjust_price(df):
    df["o_lines_unit_price"] = df["o_lines_unit_price"].str.replace("\.","", regex=True)
    df["o_lines_unit_price"] = pd.to_numeric(df["o_lines_unit_price"]) / 100
    return df

### Missing Values

In [13]:
@log_step
def missing_values(df):
    return df

### Adjust data types

In [14]:
@log_step
def adjust_data_types(df):
    df["o_lines_id"] = df["o_lines_id"].astype("string")
    df["order_id"] = df["order_id"].astype("string")
    df["o_lines_date"] = pd.to_datetime(df["o_lines_date"], format='%Y-%m-%d %H:%M:%S')
    return df

### Drop Columns

In [15]:
@log_step
def drop_columns(df, cols):
    return df.drop(columns=cols)


### Add columns

In [16]:
@log_step
def add_columns(df):
    df = df.assign(o_lines_year = df["o_lines_date"].dt.strftime("%Y"))
    df = df.assign(o_lines_month = df["o_lines_date"].dt.strftime("%M"))
    df = df.assign(o_lines_day = df["o_lines_date"].dt.strftime("%A"))
    df = df.assign(o_lines_hour = df["o_lines_date"].dt.strftime("%H"))
    return df

### End Pipeline

In [17]:
@log_step
def end_pipeline(df):
    df.to_csv(f"../../data/clean_data/clean_{df_name}.csv", index = False)
    return df

## Run Cleaning

In [25]:
o_lines = (
init_df
    .pipe(start_pipeline)       # copy df
    .pipe(rename_columns)       # naming conventions
    .pipe(remove_duplicates)    # no duplicates
    .pipe(missing_values)       # no missing values
    .pipe(adjust_price)         # removed second dot: 1.000.99 -> 1000.99  + to_numeric 
    .pipe(adjust_data_types)    # o_lines_id -> STRING, order_id --> STRING, o_lines_date --> DATETIME
    .pipe(add_columns)          # o_lines_year, o_lines_month, o_lines_day, o_lines_hour
    .pipe(drop_columns, cols=["product_id"])
    .pipe(end_pipeline)         # save clean data as .csv 
)

start_pipeline:
 shape=(293983, 7) took 0:00:00.016508s

remove_duplicates:
 shape=(293983, 7) took 0:00:00.193220s

missing_values:
 shape=(293983, 7) took 0:00:00.000008s

adjust_price:
 shape=(293983, 7) took 0:00:00.453441s

adjust_data_types:
 shape=(293983, 7) took 0:00:00.559255s

add_columns:
 shape=(293983, 11) took 0:00:08.699243s

drop_columns:
 shape=(293983, 10) took 0:00:00.058681s

end_pipeline:
 shape=(293983, 10) took 0:00:01.800246s



## Profile Report

### Types

#### Before

In [29]:
init_df.dtypes

id                   int64
id_order             int64
product_id           int64
product_quantity     int64
sku                 object
unit_price          object
date                object
dtype: object

#### After

In [30]:
o_lines.dtypes

o_lines_id                     string
order_id                       string
o_lines_product_qty             int64
sku                            object
o_lines_unit_price            float64
o_lines_date           datetime64[ns]
o_lines_year                   object
o_lines_month                  object
o_lines_day                    object
o_lines_hour                   object
dtype: object

### Samples

#### Before

In [20]:
init_df_head = (
init_df.head()
)

init_df_head

Unnamed: 0,id,id_order,product_id,product_quantity,sku,unit_price,date
0,1119109,299539,0,1,OTT0133,18.99,2017-01-01 00:07:19
1,1119110,299540,0,1,LGE0043,399.0,2017-01-01 00:19:45
2,1119111,299541,0,1,PAR0071,474.05,2017-01-01 00:20:57
3,1119112,299542,0,1,WDT0315,68.39,2017-01-01 00:51:40
4,1119113,299543,0,1,JBL0104,23.74,2017-01-01 01:06:38


#### After

In [21]:
o_lines_cleaned_head = (
o_lines.head()
)

o_lines_cleaned_head

Unnamed: 0,o_lines_id,order_id,o_lines_product_qty,sku,o_lines_unit_price,o_lines_year,o_lines_month,o_lines_day,o_lines_hour
0,1119109,299539,1,OTT0133,18.99,2017,7,Sunday,0
1,1119110,299540,1,LGE0043,399.0,2017,19,Sunday,0
2,1119111,299541,1,PAR0071,474.05,2017,20,Sunday,0
3,1119112,299542,1,WDT0315,68.39,2017,51,Sunday,0
4,1119113,299543,1,JBL0104,23.74,2017,6,Sunday,1


### Info - Types Range, Non Null Count, Dtype

#### Before

In [22]:
init_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 293983 entries, 0 to 293982
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   id                293983 non-null  int64 
 1   id_order          293983 non-null  int64 
 2   product_id        293983 non-null  int64 
 3   product_quantity  293983 non-null  int64 
 4   sku               293983 non-null  object
 5   unit_price        293983 non-null  object
 6   date              293983 non-null  object
dtypes: int64(4), object(3)
memory usage: 15.7+ MB


#### After

In [23]:
o_lines.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 293983 entries, 0 to 293982
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   o_lines_id           293983 non-null  string 
 1   order_id             293983 non-null  string 
 2   o_lines_product_qty  293983 non-null  int64  
 3   sku                  293983 non-null  object 
 4   o_lines_unit_price   293983 non-null  float64
 5   o_lines_year         293983 non-null  object 
 6   o_lines_month        293983 non-null  object 
 7   o_lines_day          293983 non-null  object 
 8   o_lines_hour         293983 non-null  object 
dtypes: float64(1), int64(1), object(5), string(2)
memory usage: 22.4+ MB


### Pandas - Profile Report

In [24]:
# ProfileReport(o_lines)