# Clean orders

## Initialize Cleaning

### Import Packages and Raw Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport

# Notebook-wide settings: the dataset's base name and the folder with the raw CSV files
df_name = "orders"
data_path = "../../data/raw_data/"

# Raw table, read from <data_path><df_name>.csv
init_df = pd.read_csv(data_path + df_name + ".csv")


### Logging

In [2]:
from functools import wraps
import datetime as dt

def log_step(func):
    """Decorator that prints the result shape and elapsed time of a pipeline step.

    Parameters
    ----------
    func : callable
        The step to wrap. It is called with the arguments passed to the wrapper.

    Returns
    -------
    callable
        A wrapper with the same name/docstring as ``func`` (via ``functools.wraps``)
        that runs ``func``, prints one log entry and returns ``func``'s result
        unchanged.

    Notes
    -----
    The elapsed time is the string form of a ``datetime.timedelta``
    (``H:MM:SS.ffffff``) followed by a literal ``s``.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        tic = dt.datetime.now()
        result = func(*args, **kwargs)
        time_taken = str(dt.datetime.now() - tic)
        # The step has already run at this point; logging must not turn a
        # successful call into an AttributeError when the result has no `.shape`.
        shape = getattr(result, "shape", None)
        print(f"{func.__name__}:\n shape={shape} took {time_taken}s\n")
        return result

    return wrapper

## Pipeline - Functions

### Start Pipeline

In [3]:
@log_step
def start_pipeline(df):
    """Return a copy of ``df`` to serve as the starting point of the pipeline."""
    working_copy = df.copy()
    return working_copy

### Rename Columns

In [4]:
# Summarize the raw frame: column names, non-null counts and dtypes.
init_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226909 entries, 0 to 226908
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   order_id      226909 non-null  int64  
 1   created_date  226909 non-null  object 
 2   total_paid    226904 non-null  float64
 3   state         226909 non-null  object 
dtypes: float64(1), int64(1), object(2)
memory usage: 6.9+ MB


In [5]:
def rename_columns(df):
    """Return a new frame in which the raw column names are replaced by ``order_*`` names.

    Columns that are not listed in the mapping keep their name.
    """
    new_names = {
        "created_date": "order_created_at",
        "total_paid": "order_total",
        "state": "order_status",
    }
    return df.rename(columns=new_names)

### Adjust data types

In [6]:
@log_step
def adjust_data_types(df):
    """Cast the order columns to their final dtypes and return the result as a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``order_id``, ``order_status``, ``order_year``,
        ``order_month``, ``order_day`` and ``order_hour``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``order_id`` as ``string``, ``order_status``,
        ``order_year``, ``order_month`` and ``order_day`` as ``category`` and
        ``order_hour`` as ``int``. The frame passed in is left unmodified.

    Raises
    ------
    KeyError
        If one of the columns listed above is missing from ``df``.
    """
    # A single astype call with a mapping builds a new frame; assigning the
    # columns one by one would write the converted values into the caller's object.
    return df.astype(
        {
            "order_id": "string",
            "order_status": "category",
            "order_year": "category",
            "order_month": "category",
            "order_day": "category",
            "order_hour": "int",
        }
    )

### Add columns

In [7]:
@log_step
def add_columns(df):
    """Parse ``order_created_at`` and derive year, month, weekday and hour columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``order_created_at`` with timestamps formatted as
        ``YYYY-MM-DD HH:MM:SS``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``order_created_at`` is a datetime column and the
        string columns ``order_year`` (``%Y``), ``order_month`` (``%b``),
        ``order_day`` (``%A``) and ``order_hour`` (``%H``) are appended in that
        order. The frame passed in is left unmodified.

    Raises
    ------
    KeyError
        If ``order_created_at`` is missing.
    ValueError
        If a value does not match the expected timestamp format.
    """
    # Parse once into a local Series instead of overwriting the caller's column.
    created_at = pd.to_datetime(df["order_created_at"], format="%Y-%m-%d %H:%M:%S")
    return df.assign(
        order_created_at=created_at,
        order_year=created_at.dt.strftime("%Y"),
        # "%b" is the standard abbreviated-month directive; "%h" is a
        # platform-specific alias that is not available everywhere.
        order_month=created_at.dt.strftime("%b"),
        order_day=created_at.dt.strftime("%A"),
        order_hour=created_at.dt.strftime("%H"),
    )

### End Pipeline

In [8]:
@log_step
def end_pipeline(df):
    """Persist the cleaned frame and its profiling report, then return ``df`` unchanged.

    Writes ``df`` as CSV (without the index) to the clean-data folder and a
    minimal, dark-mode ``ProfileReport`` as HTML to the profile-reports folder.
    Both file names are derived from the module-level ``df_name``.
    """
    csv_target = f"../../data/clean_data/clean_{df_name}.csv"
    report_target = f"../../data/profile_reports/{df_name}_report.html"

    # The CSV is written first, then the report is built and exported.
    df.to_csv(csv_target, index=False)

    report = ProfileReport(
        df,
        title=f"{df_name.title()} - Report",
        dark_mode=True,
        minimal=True,
    )
    report.to_file(report_target)
    return df

## Run Cleaning

In [9]:
# Run the cleaning steps in order on the raw frame; the result is bound to `order`.
order = (
init_df
    .pipe(start_pipeline)       # copy df
    .pipe(rename_columns)       # naming conventions
    #.pipe(remove_duplicates)    # step disabled
    #.pipe(missing_values)       # step disabled; NOTE: total_paid has 226904 non-null of 226909 rows, so 5 values are missing
    .pipe(add_columns)          # order_created_at -> DATETIME; adds order_year, order_month, order_day, order_hour
    .pipe(adjust_data_types)    # order_id -> STRING; order_status, order_year, order_month, order_day -> CATEGORY; order_hour -> INT
    #.pipe(drop_columns, cols=["col_name"])
    .pipe(end_pipeline)         # save clean data as .csv and write the HTML profile report
)

start_pipeline:
 shape=(226909, 4) took 0:00:00.005243s

add_columns:
 shape=(226909, 8) took 0:00:06.697344s

adjust_data_types:
 shape=(226909, 8) took 0:00:00.319118s



Summarize dataset: 100%|██████████| 14/14 [00:02<00:00,  6.96it/s, Completed]                        
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.35s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  2.69it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 350.11it/s]

end_pipeline:
 shape=(226909, 8) took 0:00:05.632883s






## Profile Report

### Types

#### Before

In [10]:
# Column dtypes of the raw frame.
init_df.dtypes

order_id          int64
created_date     object
total_paid      float64
state            object
dtype: object

#### After

In [11]:
# Column dtypes of the pipeline result.
order.dtypes

order_id                    string
order_created_at    datetime64[ns]
order_total                float64
order_status              category
order_year                category
order_month               category
order_day                 category
order_hour                   int64
dtype: object

### Samples

#### Before

In [12]:
# First five rows of the raw frame.
init_df.head()

Unnamed: 0,order_id,created_date,total_paid,state
0,241319,2017-01-02 13:35:40,44.99,Cancelled
1,241423,2017-11-06 13:10:02,136.15,Completed
2,242832,2017-12-31 17:40:03,15.76,Completed
3,243330,2017-02-16 10:59:38,84.98,Completed
4,243784,2017-11-24 13:35:19,157.86,Cancelled


#### After

In [13]:
# First five rows of the pipeline result, including the derived order_* columns.
order.head()

Unnamed: 0,order_id,order_created_at,order_total,order_status,order_year,order_month,order_day,order_hour
0,241319,2017-01-02 13:35:40,44.99,Cancelled,2017,Jan,Monday,13
1,241423,2017-11-06 13:10:02,136.15,Completed,2017,Nov,Monday,13
2,242832,2017-12-31 17:40:03,15.76,Completed,2017,Dec,Sunday,17
3,243330,2017-02-16 10:59:38,84.98,Completed,2017,Feb,Thursday,10
4,243784,2017-11-24 13:35:19,157.86,Cancelled,2017,Nov,Friday,13


### Info - Index Range, Non-Null Count, Dtype

#### Before

In [14]:
# Index range, non-null counts and dtypes of the raw frame.
init_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226909 entries, 0 to 226908
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   order_id      226909 non-null  int64  
 1   created_date  226909 non-null  object 
 2   total_paid    226904 non-null  float64
 3   state         226909 non-null  object 
dtypes: float64(1), int64(1), object(2)
memory usage: 6.9+ MB


#### After

In [15]:
# Index range, non-null counts and dtypes of the pipeline result.
order.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226909 entries, 0 to 226908
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   order_id          226909 non-null  string        
 1   order_created_at  226909 non-null  datetime64[ns]
 2   order_total       226904 non-null  float64       
 3   order_status      226909 non-null  category      
 4   order_year        226909 non-null  category      
 5   order_month       226909 non-null  category      
 6   order_day         226909 non-null  category      
 7   order_hour        226909 non-null  int64         
dtypes: category(4), datetime64[ns](1), float64(1), int64(1), string(1)
memory usage: 7.8 MB
