# Clean products

## Initialize Cleaning

### Import Packages and Raw_data

In [193]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport

# Notebook-wide settings: dataset name and the folder that holds the raw CSV files
df_name = "products"
data_path = '../../data/raw_data/'

# Raw table exactly as stored on disk; the cleaning pipeline works on a copy of it
init_df = pd.read_csv(data_path + df_name + ".csv")


### Logging

In [194]:
from functools import wraps
import datetime as dt

def log_step(func):
    """Decorator that reports the result shape and run time of a pipeline step.

    The decorated callable is invoked with its arguments unchanged. Afterwards
    one message containing the function name, ``result.shape`` and the elapsed
    time is printed, and the result is returned to the caller as-is.
    The result must expose a ``shape`` attribute.
    """

    @wraps(func)
    def timed_call(*args, **kwargs):
        started_at = dt.datetime.now()
        result = func(*args, **kwargs)
        # str() of a timedelta, e.g. "0:00:00.001625"
        time_taken = str(dt.datetime.now() - started_at)
        message = f"{func.__name__}:\n shape={result.shape} took {time_taken}s\n"
        print(message)
        return result

    return timed_call

## Pipeline - Functions

### Start Pipeline

In [195]:
@log_step
def start_pipeline(df):
    """Open the pipeline by handing back a copy of ``df``.

    Later steps operate on the copy, so the frame passed in is not modified.
    """
    working_copy = df.copy()
    return working_copy

### Rename Columns

In [196]:
init_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19326 entries, 0 to 19325
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   sku          19326 non-null  object
 1   name         19326 non-null  object
 2   desc         19319 non-null  object
 3   price        19280 non-null  object
 4   promo_price  19326 non-null  object
 5   in_stock     19326 non-null  int64 
 6   type         19276 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.0+ MB


In [197]:
def rename_columns(df):
    """Return a new frame whose generic column names carry a ``product_`` prefix.

    Only ``name``, ``desc``, ``price`` and ``in_stock`` are renamed; every
    other column keeps its name.
    """
    new_names = {
        "name": "product_name",
        "desc": "product_description",
        "price": "product_price",
        "in_stock": "product_in_stock",
    }
    return df.rename(columns=new_names)

### Remove duplicates

In [198]:
@log_step
def remove_duplicates(df):
    """Drop rows that are exact duplicates of an earlier row, then renumber.

    ``reset_index()`` is called without ``drop=True``, so the previous row
    labels are kept as an additional leading column in the result.
    """
    unique_rows = df.drop_duplicates()
    return unique_rows.reset_index()

### Adjust price

In [199]:
@log_step
def adjust_price(df, add_price_check=False):
    """Cast ``product_price`` to the pandas ``string`` dtype.

    The previous version also built a ``price_check`` column, but the result
    of ``df.assign`` was discarded and ``Series.str.find`` performs a literal
    substring search rather than a regex match, so that line never had any
    effect. The check is now available on request and actually uses a regex.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``product_price`` column.
    add_price_check : bool, default False
        When True, append a boolean ``price_check`` column that is True for
        prices ending in a dot followed by three digits (pattern
        ``\\.\\d{3}$``). The default keeps the output columns unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is not modified.

    Raises
    ------
    KeyError
        If ``product_price`` is missing from ``df``.
    """
    # make sure everything is a string (on a new frame, not on the caller's)
    adjusted = df.assign(product_price=df["product_price"].astype("string"))

    if add_price_check:
        # raw string keeps the regex escapes intact; str.contains does a regex search
        has_three_decimals = adjusted["product_price"].str.contains(r"\.\d{3}$", regex=True)
        adjusted = adjusted.assign(price_check=has_three_decimals)

    return adjusted

### Missing Values

In [200]:
@log_step
def missing_values(df):
    """Fill missing ``product_price`` entries with the placeholder ``"000.000"``.

    Missing values in other columns are left as they are.
    """
    placeholders = {"product_price": "000.000"}
    return df.fillna(value=placeholders)

### Adjust data types

In [201]:
@log_step
def adjust_data_types(df):
    """Cast the text columns to the pandas ``string`` dtype.

    Converts ``sku``, ``product_name`` and ``product_description`` in a single
    ``astype`` call and returns a new frame, so the frame passed in is not
    modified (the previous version assigned the columns in place).

    Raises
    ------
    KeyError
        If one of the three columns is missing from ``df``.
    """
    text_columns = ["sku", "product_name", "product_description"]
    return df.astype({column: "string" for column in text_columns})

### Drop Columns

In [202]:
@log_step
def drop_columns(df, cols):
    """Return ``df`` without the columns named in ``cols``."""
    trimmed = df.drop(labels=cols, axis=1)
    return trimmed


### Add columns

In [203]:
@log_step
def add_columns(df):
    """Placeholder step: no columns are added yet, ``df`` is returned as received."""
    return df

### CSV

In [204]:
@log_step
def create_csv(df):
    """Write ``df`` to the clean-data folder as ``clean_<df_name>.csv`` and pass it on.

    The row index is not written. ``df`` is returned unchanged so the step can
    sit in the middle of a ``.pipe`` chain.
    """
    target_file = f"../../data/clean_data/clean_{df_name}.csv"
    df.to_csv(target_file, index=False)
    return df

### Report

In [205]:
@log_step
def create_report(df):
    """Build a minimal, dark-mode profile report of ``df`` and save it as HTML.

    ``df`` is returned unchanged so the step can be used inside a ``.pipe`` chain.
    """
    report_title = f"{df_name.title()} - Report"
    output_file = f"../../data/profile_reports/{df_name}_report.html"
    profile = ProfileReport(df, title=report_title, dark_mode=True, minimal=True)
    profile.to_file(output_file)
    return df

## Run Cleaning

In [206]:
# Run every cleaning step in order on the raw table; the order of the steps matters
# (columns are renamed first because the later steps use the product_* names).
products = (
init_df
    .pipe(start_pipeline)       # work on a copy so init_df stays untouched
    .pipe(rename_columns)       # name/desc/price/in_stock -> product_* names
    .pipe(missing_values)       # missing product_price --> replaced with: "000.000"
    .pipe(remove_duplicates)    # drop fully identical rows; reset_index() keeps the old index as a column
    .pipe(adjust_price)         # product_price -> STRING (no dot removal / numeric conversion is done here)
    .pipe(adjust_data_types)    # sku, product_name, product_description -> STRING
    .pipe(add_columns)          # placeholder, currently adds nothing
    .pipe(drop_columns, cols=["promo_price", "type"])  # columns not kept in the clean table
    .pipe(create_csv)           # save clean data as .csv
    .pipe(create_report)        # save report as .html
)


start_pipeline:
 shape=(19326, 7) took 0:00:00.001625s

missing_values:
 shape=(19326, 7) took 0:00:00.002860s

remove_duplicates:
 shape=(10580, 8) took 0:00:00.023138s

adjust_price:
 shape=(10580, 8) took 0:00:00.009926s

adjust_data_types:
 shape=(10580, 8) took 0:00:00.005690s

add_columns:
 shape=(10580, 8) took 0:00:00.000004s

drop_columns:
 shape=(10580, 6) took 0:00:00.002158s

create_csv:
 shape=(10580, 6) took 0:00:00.081678s



Summarize dataset: 100%|██████████| 12/12 [00:00<00:00, 51.14it/s, Completed]                           
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

## Profile Report

### Types

#### Before

In [None]:
init_df.dtypes

sku            object
name           object
desc           object
price          object
promo_price    object
in_stock        int64
type           object
dtype: object

#### After

In [None]:
products.dtypes

df_index                int64
sku                    string
product_name           string
product_description    string
product_price          string
product_in_stock        int64
dtype: object

### Samples

#### Before

In [None]:
init_df.head()

Unnamed: 0,sku,name,desc,price,promo_price,in_stock,type
0,RAI0007,Silver Rain Design mStand Support,Aluminum support compatible with all MacBook,59.99,499.899,1,8696
1,APP0023,Apple Mac Keyboard Keypad Spanish,USB ultrathin keyboard Apple Mac Spanish.,59.0,589.996,0,13855401
2,APP0025,Mighty Mouse Apple Mouse for Mac,mouse Apple USB cable.,59.0,569.898,0,1387
3,APP0072,Apple Dock to USB Cable iPhone and iPod white,IPhone dock and USB Cable Apple iPod.,25.0,229.997,0,1230
4,KIN0007,Mac Memory Kingston 2GB 667MHz DDR2 SO-DIMM,2GB RAM Mac mini and iMac (2006/07) MacBook Pr...,34.99,31.99,1,1364


#### After

In [None]:
products.head()

Unnamed: 0,df_index,sku,product_name,product_description,product_price,product_in_stock
0,0,RAI0007,Silver Rain Design mStand Support,Aluminum support compatible with all MacBook,59.99,1
1,1,APP0023,Apple Mac Keyboard Keypad Spanish,USB ultrathin keyboard Apple Mac Spanish.,59.0,0
2,2,APP0025,Mighty Mouse Apple Mouse for Mac,mouse Apple USB cable.,59.0,0
3,3,APP0072,Apple Dock to USB Cable iPhone and iPod white,IPhone dock and USB Cable Apple iPod.,25.0,0
4,4,KIN0007,Mac Memory Kingston 2GB 667MHz DDR2 SO-DIMM,2GB RAM Mac mini and iMac (2006/07) MacBook Pr...,34.99,1


### Info - Types Range, Non Null Count, Dtype

#### Before

In [None]:
init_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19326 entries, 0 to 19325
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   sku          19326 non-null  object
 1   name         19326 non-null  object
 2   desc         19319 non-null  object
 3   price        19280 non-null  object
 4   promo_price  19326 non-null  object
 5   in_stock     19326 non-null  int64 
 6   type         19276 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.0+ MB


#### After

In [None]:
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10580 entries, 0 to 10579
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   df_index             10580 non-null  int64 
 1   sku                  10580 non-null  string
 2   product_name         10580 non-null  string
 3   product_description  10573 non-null  string
 4   product_price        10580 non-null  string
 5   product_in_stock     10580 non-null  int64 
dtypes: int64(2), string(4)
memory usage: 496.1 KB
