# Clean Orderlines

## Import Packages and Raw Data

In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Notebook-wide settings: the name of the raw table and the folder it is read from.
df_name = 'orderlines'
data_path = "../../data/raw_data/"

# Read <data_path><df_name>.csv into the frame the cleaning pipeline starts from.
init_df = pd.read_csv(data_path + df_name + ".csv")

# Print column names, non-null counts and dtypes of the raw frame.
init_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 293983 entries, 0 to 293982
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   id                293983 non-null  int64 
 1   id_order          293983 non-null  int64 
 2   product_id        293983 non-null  int64 
 3   product_quantity  293983 non-null  int64 
 4   sku               293983 non-null  object
 5   unit_price        293983 non-null  object
 6   date              293983 non-null  object
dtypes: int64(4), object(3)
memory usage: 15.7+ MB


## Logging

In [2]:
from functools import wraps
import datetime as dt

def log_step(func):
    """Decorate a pipeline step so every call reports its result shape and run time.

    After the wrapped function returns, one message is printed containing the
    function's name, the ``shape`` attribute of the returned object and the
    time the call took. The return value is passed through unchanged, so a
    decorated step can still be chained with ``DataFrame.pipe``.

    Parameters
    ----------
    func : callable
        The step to wrap. Positional and keyword arguments are forwarded as-is.

    Returns
    -------
    callable
        A wrapper that keeps ``func``'s name and docstring (``functools.wraps``).
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        started = dt.datetime.now()
        result = func(*args, **kwargs)
        # str(timedelta) renders as H:MM:SS.ffffff; the trailing "s" in the
        # message below is a literal character.
        time_taken = str(dt.datetime.now() - started)
        # A step may return something without a ``shape`` (e.g. None). Report
        # None in that case instead of raising AttributeError after the step
        # has already done its work.
        shape = getattr(result, "shape", None)
        print(f"{func.__name__}:\n shape={shape} took {time_taken}s\n")
        return result

    return wrapper

## Plot Function

In [3]:
def plot_lines(
    lines,
    x,
    title="",
    legend="",
    x_label="",
    y_label="",
    x_ticks="",
    width=22,
    height=5,
):
    """Draw two series against a shared x axis on one figure and show it.

    Parameters
    ----------
    lines : indexable
        ``lines[0]`` is drawn with the default style and ``lines[1]`` in red.
        Only these two entries are read.
    x : sequence
        x values shared by both series.
    title, x_label, y_label : str, optional
        Figure title and axis labels.
    legend : sequence of str, optional
        Legend labels, handed to ``Axes.legend``. No legend is drawn when empty.
    x_ticks : sequence, optional
        Explicit x tick positions. Tick placement is left to matplotlib when
        empty.
    width, height : number, optional
        Passed to matplotlib as ``figsize=(width, height)``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # plt.subplots returns (figure, axes); unpack it and draw on the axes.
    _, ax = plt.subplots(figsize=(width, height))
    ax.plot(x, lines[0], "")
    ax.plot(x, lines[1], "r")
    ax.set_title(title)
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)

    # The "" defaults mean "not supplied": skip the call instead of handing
    # matplotlib an empty string as legend labels / tick positions.
    if legend is not None and len(legend) > 0:
        ax.legend(legend)
    if x_ticks is not None and len(x_ticks) > 0:
        ax.set_xticks(x_ticks)
    # Tick labels are vertical whether or not explicit positions were given.
    ax.tick_params(axis="x", labelrotation=90)
    plt.show()
    

## Pipeline - Functions

- [x] Start
- [x] Remove duplicates
- [ ] Dots in price
- [ ] Other weird numbers
- [ ] Missing values
- [ ] Prices to numeric

### Start Pipeline

In [4]:
@log_step
def start_pipeline(df):
    """Return a deep copy of ``df`` to serve as the pipeline's working frame."""
    working_copy = df.copy(deep=True)
    return working_copy

### Rename Columns

In [5]:
# Show the raw frame's column names and dtypes; these names are the keys of the
# rename mapping defined in the next cell.
init_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 293983 entries, 0 to 293982
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   id                293983 non-null  int64 
 1   id_order          293983 non-null  int64 
 2   product_id        293983 non-null  int64 
 3   product_quantity  293983 non-null  int64 
 4   sku               293983 non-null  object
 5   unit_price        293983 non-null  object
 6   date              293983 non-null  object
dtypes: int64(4), object(3)
memory usage: 15.7+ MB


In [6]:
def rename_columns(df):
    """Return a new frame with the raw orderlines headers renamed.

    Columns that are not keys of the mapping (``sku`` is not listed) keep
    their original names. The input frame is not modified.
    """
    new_names = {
        "id": "o_line_id",
        "id_order": "order_id",
        "product_id": "o_line_product_id",
        "product_quantity": "o_line_product_qty",
        "unit_price": "o_line_unit_price",
        "date": "o_line_date",
    }
    renamed = df.rename(columns=new_names)
    return renamed

### Remove duplicates

In [7]:
@log_step
def remove_duplicates(df):
    """Return ``df`` without fully duplicated rows, keeping the first occurrence."""
    deduplicated = df.drop_duplicates(subset=None, keep="first")
    return deduplicated

### Adjust price

In [8]:
@log_step
def adjust_price(df):
    """Convert ``o_line_unit_price`` from dotted text to a number.

    Every "." is stripped from the string, the remaining text is parsed with
    ``pd.to_numeric`` and the result is divided by 100, so "3.299.00" becomes
    3299.0 and "14.99" becomes 14.99.

    NOTE(review): dividing by 100 is only right when every raw price ends in
    exactly two decimal digits; a value such as "3.5" or "12.999" would be
    scaled wrongly. Confirm against the raw data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string-valued ``o_line_unit_price`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the price column replaced by its numeric value. The
        input frame is left untouched.

    Raises
    ------
    ValueError
        From ``pd.to_numeric`` if a value is not numeric once its dots are
        removed.
    """
    # regex=False treats the dot as a literal character, which also removes
    # the invalid "\." escape of the former non-raw pattern string.
    digits_only = df["o_line_unit_price"].str.replace(".", "", regex=False)
    return df.assign(o_line_unit_price=pd.to_numeric(digits_only) / 100)

### Missing Values

In [9]:
@log_step
def missing_values(df):
    """Placeholder step: return ``df`` unchanged (no missing-value handling is implemented)."""
    return df

### Adjust data types

In [14]:
@log_step
def adjust_data_types(df):
    """Cast the identifier columns to pandas ``string`` and the price to float.

    ``o_line_id``, ``order_id`` and ``o_line_product_id`` become ``string``;
    ``o_line_unit_price`` becomes ``float``. All other columns keep their
    dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the converted dtypes. The input frame is left
        untouched.

    Raises
    ------
    KeyError
        If one of the listed columns is missing from ``df``.
    """
    target_dtypes = {
        "o_line_id": "string",
        "order_id": "string",
        "o_line_product_id": "string",
        "o_line_unit_price": "float",
    }
    # One astype call with a mapping returns a converted copy instead of
    # assigning the columns back onto the caller's frame one by one.
    return df.astype(target_dtypes)

### Drop Columns

In [11]:
@log_step
def drop_columns(df, columns):
    """Return ``df`` without the column label(s) given in ``columns``."""
    trimmed = df.drop(labels=columns, axis="columns")
    return trimmed


### End Pipeline

In [12]:
@log_step
def end_pipeline(df):
    """Write the cleaned frame to CSV and return it unchanged.

    The file goes to ``../../data/clean_data/clean_<df_name>.csv`` (relative to
    the working directory), where ``df_name`` is the notebook-level global.
    The index is not written.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to persist.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the step can close a ``pipe`` chain.
    """
    out_path = Path(f"../../data/clean_data/clean_{df_name}.csv")
    # to_csv does not create folders; make sure the target directory exists so
    # the step also works when clean_data/ has not been created yet.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_path, index=False)
    return df

## Run Cleaning

In [15]:
# Run the cleaning steps in order on the raw frame. Each step receives the
# previous step's return value through DataFrame.pipe; the result is `olin`.
olin = (
init_df
    .pipe(start_pipeline)  # work on a copy of init_df
    .pipe(rename_columns)
    .pipe(remove_duplicates)
    .pipe(missing_values)  # currently returns the frame unchanged
    .pipe(adjust_price)  
    .pipe(adjust_data_types) 
    # Disabled step: no columns are dropped at the moment.
    #.pipe(drop_columns, columns=[""]) 
    .pipe(end_pipeline)  # writes the cleaned CSV
)

start_pipeline:
 shape=(293983, 7) took 0:00:00.011521s

remove_duplicates:
 shape=(293983, 7) took 0:00:00.195958s

missing_values:
 shape=(293983, 7) took 0:00:00.000006s

adjust_price:
 shape=(293983, 7) took 0:00:00.450084s

adjust_data_types:
 shape=(293983, 7) took 0:00:00.712468s

end_pipeline:
 shape=(293983, 7) took 0:00:01.366833s



In [None]:
# Summarise the cleaned frame: column names, non-null counts and dtypes.
olin.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 293983 entries, 0 to 293982
Data columns (total 7 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   o_line_id           293983 non-null  string 
 1   o_line_order_id     293983 non-null  string 
 2   o_line_product_id   293983 non-null  string 
 3   o_line_product_qty  293983 non-null  int64  
 4   product_code        293983 non-null  object 
 5   o_line_unit_price   293983 non-null  float64
 6   o_line_date         293983 non-null  object 
dtypes: float64(1), int64(1), object(2), string(3)
memory usage: 17.9+ MB


In [None]:
# Peek at ten rows of the cleaned frame. The fixed random_state makes the same
# rows come back every time the notebook is re-run on the same data.
sample = olin.sample(10, random_state=42)

sample

Unnamed: 0,index,o_line_id,o_line_order_id,o_line_product_id,o_line_product_qty,product_code,o_line_unit_price,o_line_date
116863,116863,1352626,400142,0,1,APP1697,49.00,2017-09-14 18:23:45
96242,96242,1303192,382385,0,1,PAC2119,3.299.00,2017-07-26 16:15:52
84619,84619,1281788,372328,0,1,BEL0264,14.99,2017-07-04 09:15:51
170382,170382,1451231,445222,0,1,TRA0036,172.13,2017-11-26 13:44:27
76031,76031,1266884,365035,0,1,TUC0279,49.99,2017-06-12 22:01:43
278956,278956,1626695,517233,0,1,DLL0012,215.99,2018-02-25 17:48:36
202022,202022,1495832,462606,0,1,APP2288,2.960.59,2017-12-14 07:12:15
81102,81102,1275686,369355,0,2,PHI0080,54.99,2017-06-26 22:43:38
35502,35502,1194726,330533,0,1,SYN0163,241.29,2017-02-28 18:39:27
141108,141108,1396294,421334,0,1,AP20117,26.99,2017-11-03 08:49:21


In [None]:
# Same summary of the cleaned frame as two cells above; `olin` has not been
# reassigned in between.
olin.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 293983 entries, 0 to 293982
Data columns (total 7 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   o_line_id           293983 non-null  string 
 1   o_line_order_id     293983 non-null  string 
 2   o_line_product_id   293983 non-null  string 
 3   o_line_product_qty  293983 non-null  int64  
 4   product_code        293983 non-null  object 
 5   o_line_unit_price   293983 non-null  float64
 6   o_line_date         293983 non-null  object 
dtypes: float64(1), int64(1), object(2), string(3)
memory usage: 17.9+ MB


In [None]:
# Summary of the raw input frame. The pipeline starts from a copy
# (start_pipeline), so this call is still made on the frame read from the raw CSV.
init_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 293983 entries, 0 to 293982
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   id                293983 non-null  int64 
 1   id_order          293983 non-null  int64 
 2   product_id        293983 non-null  int64 
 3   product_quantity  293983 non-null  int64 
 4   sku               293983 non-null  object
 5   unit_price        293983 non-null  object
 6   date              293983 non-null  object
dtypes: int64(4), object(3)
memory usage: 15.7+ MB
