# Clean products

## Initialize Cleaning

### Import Packages and Raw Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
import chime

# Global variables
# df_name is reused for the raw input file name, the cleaned CSV name and
# the profile report title/file name further down.
df_name = "products" 
# Directory the raw CSV is read from.
data_path = '../../data/raw_data/' 
# Directory holding the already-cleaned CSVs that are read below.
cl_data_path = '../../data/clean_data/' 

# Raw products table; the pipeline starts from this frame.
init_df = pd.read_csv(f"{data_path}{df_name}.csv")
# Cleaned order lines; read by adjust_price to look up the max unit price per sku.
ol_cleaned = pd.read_csv(f"{cl_data_path}clean_orderlines.csv")
# Cleaned brands lookup; merged on "brand_short" in add_columns.
brands_cleaned = pd.read_csv(f"{cl_data_path}clean_brands.csv")

# Print column names, non-null counts and dtypes of the order lines.
ol_cleaned.info()


ModuleNotFoundError: No module named 'chime'

### Logging

In [103]:
from functools import wraps
import datetime as dt

def log_step(func):
    """Decorate a pipeline step so each call prints its result shape and runtime.

    The decorated callable's return value must expose ``.shape``; it is
    handed back to the caller unchanged after the log line is printed.
    """

    @wraps(func)
    def timed_step(*args, **kwargs):
        started_at = dt.datetime.now()
        result = func(*args, **kwargs)
        # str() of a timedelta renders as H:MM:SS.ffffff.
        elapsed = dt.datetime.now() - started_at
        time_taken = str(elapsed)
        print(f"{func.__name__}:\n shape={result.shape} took {time_taken}s\n")
        return result

    return timed_step

## Pipeline - Functions

### Start Pipeline

In [104]:
@log_step
def start_pipeline(df):
    """Return a copy of ``df`` to serve as the pipeline's working frame."""
    working_copy = df.copy()
    return working_copy

### Rename Columns

In [105]:
# Show the raw column names, non-null counts and dtypes before renaming.
init_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19326 entries, 0 to 19325
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   sku          19326 non-null  object
 1   name         19326 non-null  object
 2   desc         19319 non-null  object
 3   price        19280 non-null  object
 4   promo_price  19326 non-null  object
 5   in_stock     19326 non-null  int64 
 6   type         19276 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.0+ MB


In [106]:
def rename_columns(df):
    """Return ``df`` with the raw column names mapped to the ``product_*`` names.

    ``sku``, ``promo_price`` and ``type`` are not part of the mapping and
    keep their names. The input frame is not modified.
    """
    new_names = {
        "name": "product_name",
        "desc": "product_description",
        "price": "product_price_dirty",
        "in_stock": "product_in_stock",
    }
    renamed = df.rename(columns=new_names)
    return renamed

### Remove duplicates

In [107]:
@log_step
def remove_duplicates(df):
    """Drop fully duplicated rows, then keep only the first row of each ``sku``."""
    # First pass: rows that are identical in every column.
    unique_rows = df.drop_duplicates()
    # Second pass: a sku may still repeat with differing values. Per the
    # original author's note there is one such sku, whose second appearance
    # has no price; only the first occurrence is kept.
    unique_skus = unique_rows.drop_duplicates(subset="sku")
    return unique_skus

### Adjust price

In [108]:
@log_step
def adjust_price(df):
    """Derive a numeric ``product_price`` from the raw price strings and order lines.

    Steps:
      1. Replace unusable values in ``product_price_dirty`` with the sentinel
         ``"dirty"``: strings ending in a dot followed by three digits, and
         strings that are zero or empty.
      2. Compute the highest ``ol_unit_price`` per ``sku`` from the
         module-level ``ol_cleaned`` frame and left-join it as
         ``max_price_ol`` (NaN for skus without a match).
      3. Set ``product_price`` to ``max_price_ol`` for dirty rows and to the
         parsed raw price otherwise.
      4. Where ``max_price_ol`` is greater than ``product_price``, use
         ``max_price_ol`` instead and flag the row in
         ``max_price_gretaer_product_price``.

    Args:
        df: frame with at least ``sku`` and ``product_price_dirty`` columns.

    Returns:
        A new DataFrame with the additional columns ``max_price_ol``,
        ``product_price`` and ``max_price_gretaer_product_price``. The input
        frame is left unmodified.

    Raises:
        ValueError: from ``pd.to_numeric`` when a price that was not flagged
            dirty cannot be parsed as a number.
    """
    # assign() builds a new frame, so the caller's DataFrame is not written to.
    df = df.assign(
        product_price_dirty=(
            df["product_price_dirty"]
            # make sure everything is a string
            .astype("string")
            # mark every price with three decimals as dirty
            .str.replace(r".*\.\d{3}$", "dirty", regex=True)
            # mark every price that is 0 (or empty) as dirty
            .str.replace(r"^0*\.?(0*)?$", "dirty", regex=True)
        )
    )

    # get maximum price from orderlines
    max_ol_df = (
        ol_cleaned[["sku", "ol_unit_price"]]
        .groupby("sku")
        .max()
        .reset_index()
    )
    # merge with max_ol_df
    df = (
        df
        .merge(max_ol_df, how="left", on="sku")
        .rename(columns={"ol_unit_price": "max_price_ol"})
    )

    # replace dirty prices with the max_price_ol
    # Object dtype lets price strings and float replacements share one column
    # until to_numeric parses everything in a single pass.
    price_text = df["product_price_dirty"].astype(object)
    is_dirty = price_text == "dirty"
    df["product_price"] = pd.to_numeric(
        price_text.where(~is_dirty, df["max_price_ol"])
    )

    # Use max_price_ol where it is greater than product_price. Comparisons
    # against NaN are False, so rows without order lines keep their price.
    # The misspelt flag name is kept because it is part of the output columns.
    df = df.assign(
        max_price_gretaer_product_price=lambda x: x["product_price"] < x["max_price_ol"]
    )
    df["product_price"] = df["product_price"].mask(
        df["max_price_gretaer_product_price"], df["max_price_ol"]
    )

    return df
    
    

### Missing Values

In [109]:
@log_step
def missing_values(df):
    #df = df["product_price_dirty"].fillna("dirty")
    df.product_price_dirty = df.product_price_dirty.fillna("dirty")
    return df
    # return df.fillna({"product_price_dirty":"dirty"})

### Adjust data types

In [110]:
@log_step
def adjust_data_types(df):
   df["sku"] = df["sku"].astype("string")
   df["product_name"] = df["product_name"].astype("string")
   df["product_description"] = df["product_description"].astype("string")
   df["product_brand"] = df["product_brand"].astype("category")
   return df

### Drop Columns

In [111]:
@log_step
def drop_columns(df, cols):
    
    return df.drop(columns=cols)


### Add columns

In [112]:
@log_step
def add_columns(df):
    # extract brand_short from sku
    df = df.assign(brand_short = df["sku"].str.extract("(^\w{3})"))
    
    # merge brands on "brands_short"

    df = df.merge(brands_cleaned, how="left", on="brand_short")
    df = df.rename(columns={"brand_long":"product_brand"})
    return df

### CSV

In [113]:
@log_step
def create_csv(df):
    df.to_csv(f"../../data/clean_data/clean_{df_name}.csv", index = False)
    return df

### Report

In [114]:
@log_step
def create_report(df):
    ProfileReport(df, title=f"{df_name.title()} - Report", dark_mode=True, minimal = True).to_file(f"../../data/profile_reports/{df_name}_report.html")
    return df

## Run Cleaning

In [115]:
# Run every cleaning step in order; each step receives the previous step's
# frame and returns the frame for the next one.
products = (
    init_df
    .pipe(start_pipeline)       # copy df
    .pipe(rename_columns)       # naming conventions
    .pipe(missing_values)       # missing price --> replaced with: "dirty"
    .pipe(remove_duplicates)    # duplicated rows + duplicated sku (first occurrence kept)
    .pipe(adjust_price)
      # make sure everything is a string
      # mark every price with three decimals as dirty
      # mark every price that is 0 as dirty
      # get maximum price from orderlines
      # merge with max_ol_df
      # replace dirty prices with the max_price_ol
      # use max_price_ol if > product_price
    .pipe(add_columns)          # extract brand short from sku --> merge brand_long from brands
    .pipe(drop_columns, cols=["promo_price", "type", "brand_short", "product_price_dirty"])
    .pipe(adjust_data_types)    # sku, product_name, product_description --> STRING, product_brand --> CATEGORY
    .pipe(create_csv)           # save clean data as .csv
    .pipe(create_report)        # save report as .html
)


start_pipeline:
 shape=(19326, 7) took 0:00:00.001691s

missing_values:
 shape=(19326, 7) took 0:00:00.002643s

remove_duplicates:
 shape=(10579, 7) took 0:00:00.026822s

adjust_price:
 shape=(10579, 10) took 0:00:00.406756s

add_columns:
 shape=(10579, 12) took 0:00:00.032482s

drop_columns:
 shape=(10579, 8) took 0:00:00.001220s

adjust_data_types:
 shape=(10579, 8) took 0:00:00.006089s

create_csv:
 shape=(10579, 8) took 0:00:00.079239s



Summarize dataset: 100%|██████████| 14/14 [00:00<00:00, 69.07it/s, Completed]                               
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.14s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  5.32it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 817.60it/s]


create_report:
 shape=(10579, 8) took 0:00:01.573545s



## Profile Report

### Types

#### Before

In [116]:
# Column dtypes of the raw frame, for comparison with the cleaned frame.
init_df.dtypes

sku            object
name           object
desc           object
price          object
promo_price    object
in_stock        int64
type           object
dtype: object

#### After

In [117]:
# Column dtypes of the cleaned frame produced by the pipeline.
products.dtypes

sku                                  string
product_name                         string
product_description                  string
product_in_stock                      int64
max_price_ol                        float64
product_price                       float64
max_price_gretaer_product_price        bool
product_brand                      category
dtype: object

### Samples

#### Before

In [118]:
# Ten random rows of the raw frame (no seed is set, so rows differ per run).
init_df.sample(10)

Unnamed: 0,sku,name,desc,price,promo_price,in_stock,type
15000,MOP0101,Mophie Juice Pack Reserve Battery Case 1840 mA...,Case 1840 mAh battery and LED indicator for iP...,69.95,599.918,0,11865403
6437,PAC1069,"Apple iMac 27 ""Core i7 Retina 5K 4Hz | 16GB | ...",IMac desktop computer 27 inch 5K Retina i5 3.3...,4189.0,38.449.904,0,"5,74E+15"
13349,APP1653,Apple iPhone 7 256GB Black,New Apple iPhone 7 Free Black 256GB,990.33,9.903.306,0,85641716
7271,PAC1616,"Apple iMac 27 ""Core i7 Retina 5K 4GHz | 8GB RA...",IMac desktop computer 27 inch Retina 5K 4GHz C...,4369.0,34.999.903,0,"5,74E+15"
19290,AP20474,Like new - Apple Watch GPS 38mm Case Series 3 ...,Reconditioned Apple Watch 38mm series 3 with G...,369.0,3.189.996,0,24885185
2392,IFX0015,iFixit disassembly tool iSclack iPhone 5 5S 5C...,Tweezers with suction to remove iPhone 5 / 5S ...,29.99,299.899,1,12645406
15788,PLA0029,Plantronics BackBeat Fit Wireless Headphones Blue,secure wireless headset sport waterproof desig...,129.99,99.99,0,5384
17044,AP20197,Like new - Apple Dock to USB Cable iPhone iPad...,Cable iPhone iPad and iPod dock USB refitted.,25.0,119.899,0,1230
9871,PAC0974,"Apple iMac 27 ""Core i5 3.2GHz Retina 5K | 32GB...",IMac desktop computer 27 inch 5K Retina i5 3.2...,3169.0,26.309.901,0,"5,74E+15"
15673,BEZ0191-A,"Open - Be.ez LArobe ONE Macbook 12 ""Red",Macbook thin sheath 12 inches.,29.9,198.695,0,1298


#### After

In [119]:

    # Ten random rows of the cleaned frame; reset_index() moves the original
    # row labels into an "index" column.
    products.sample(10).reset_index()
    

Unnamed: 0,index,sku,product_name,product_description,product_in_stock,max_price_ol,product_price,max_price_gretaer_product_price,product_brand
0,4170,WDT0274-A,Open - Western Digital My Passport Ultra 2TB h...,WD External Hard Drive 2TB Mac and PC USB 3.0.,0,91.53,124.9,False,Western Digital
1,1555,CRU0038,Crucial MX200 500GB SSD 7mm,25-inch 7mm SSD 500GB for Mac and PC.,0,,197.99,False,Crucial
2,6281,SAT0049,Satechi Support Aluminum Gray Space Headphones,Support Headphones with aluminum finish and st...,1,29.99,39.99,False,Satechi
3,5249,TPL0036,TP-Link TL-PA4010P Kit Powerline AV500 Passthr...,amplifiers internet kit with transfer speeds u...,1,46.99,46.99,False,TP-Link
4,3658,APP1470,Apple Dock Base magnetic charging for Apple Watch,Magnetic base load minimalist design for Apple...,0,89.0,89.0,False,Apple
5,5689,APP1731,Apple Watch Series 2 42mm Stainless Steel Case...,Apple Watch 42 mm dual core processor with GPS...,0,,819.0,False,Apple
6,4430,APP1577,"Apple MacBook Retina 12 ""Core m5 12GHz | 8GB R...",New MacBook Retina Display 12-inch Core 8GB RA...,0,1679.99,1799.0,False,Apple
7,4764,TRK0009,Bravo Trackr Locator Plata,Bluetooth locator objects APP for iPhone,0,29.99,29.99,False,TrackR
8,10540,REP0410,repair rear camera plus iPhone 6s,Repair service including parts and labor for i...,0,,,False,Repair
9,7819,LAC0233,LaCie Porsche Design Desktop Drive 6TB USB 3.0...,External Hard Drive 6TB 35-inch USB 3.0 connec...,0,192.79,199.99,False,LaCie


### Info - Types Range, Non Null Count, Dtype

#### Before

In [120]:
# Non-null counts and dtypes of the raw frame.
init_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19326 entries, 0 to 19325
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   sku          19326 non-null  object
 1   name         19326 non-null  object
 2   desc         19319 non-null  object
 3   price        19280 non-null  object
 4   promo_price  19326 non-null  object
 5   in_stock     19326 non-null  int64 
 6   type         19276 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.0+ MB


#### After

In [121]:
# Non-null counts and dtypes of the cleaned frame.
products.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10579 entries, 0 to 10578
Data columns (total 8 columns):
 #   Column                           Non-Null Count  Dtype   
---  ------                           --------------  -----   
 0   sku                              10579 non-null  string  
 1   product_name                     10579 non-null  string  
 2   product_description              10572 non-null  string  
 3   product_in_stock                 10579 non-null  int64   
 4   max_price_ol                     7850 non-null   float64 
 5   product_price                    10334 non-null  float64 
 6   max_price_gretaer_product_price  10579 non-null  bool    
 7   product_brand                    10368 non-null  category
dtypes: bool(1), category(1), float64(2), int64(1), string(3)
memory usage: 615.0 KB


In [122]:
# Count the remaining missing values per column of the cleaned frame.
# (product_price_dirty is no longer present here; the pipeline drops it.)
products.isna().sum()

sku                                   0
product_name                          0
product_description                   7
product_in_stock                      0
max_price_ol                       2729
product_price                       245
max_price_gretaer_product_price       0
product_brand                       211
dtype: int64