# Clean Products

## Import Packages and Raw_data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Global variables
df_name = "products" 
data_path = '../../data/raw_data/' 
cl_data_path = '../../data/clean_data/' 


init_df = pd.read_csv(f"{data_path }{df_name}.csv")
o_lines_cleaned = pd.read_csv(f"{cl_data_path}clean_orderlines.csv")
brands_cleaned = pd.read_csv(f"{cl_data_path}clean_brands.csv")

init_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19326 entries, 0 to 19325
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   sku          19326 non-null  object
 1   name         19326 non-null  object
 2   desc         19319 non-null  object
 3   price        19280 non-null  object
 4   promo_price  19326 non-null  object
 5   in_stock     19326 non-null  int64 
 6   type         19276 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.0+ MB


## Logging

In [2]:
from functools import wraps
import datetime as dt

def log_step(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        tic = dt.datetime.now()
        result = func(*args, **kwargs)
        time_taken = str(dt.datetime.now() - tic)
        print(f"{func.__name__}:\n shape={result.shape} took {time_taken}s\n")
        return result

    return wrapper

## Plot Function

In [3]:
def plot_lines(
    lines,
    x,
    title = "", 
    legend ="", 
    x_label="", 
    y_label="",
    x_ticks ="",
    width = 22,
    height= 5
    ):
    ax = plt.subplots(figsize=(width, height))
    plt.plot(x, lines[0], "" )
    plt.plot(x,lines[1], "r")
    plt.title(title)
    plt.legend(legend)
    plt.ylabel(y_label)
    plt.xlabel(x_label)
    plt.xticks(x_ticks, rotation=90)
    plt.show()
    

## Pipeline - Functions

- [x]  Start
- [x]  Remove duplicates
- [_]  dots in price
- [_]  other weird numbers
- [_]  Missing values
- [_]  Prices to Numeric

### Start Pipeline

In [4]:
@log_step
def start_pipeline(df):
    return df.copy()

### Rename Columns

In [5]:
def rename_columns(df):
    return (
        df
        .rename(columns={
            "name":  "product_name",      
            "desc":"product_description",
            "price":"product_price",        
            "promo_price":"product_promo_price",
            "in_stock": "product_in_stock", 
            "type":"product_type"
            }
        )
    )       

### Remove duplicates

Use `DataFrame.drop_duplicates()` to remove rows that appear twice (or more) in the dataset: that means that these rows contain exactly the same values across all columns. Set the argument `inplace=True` so that changes actually take place. You can count the rows before and after running this code with `len()` and see how many rows were dropped this way.

It is possible that, after doing this, you still have a duplicate sku, which you should not have —Stock Keeping Units are meant to be unique`

In [6]:
@log_step
def remove_duplicates(df):
    df.drop_duplicates(inplace = True)
    # there is one dupiceted sku --> removed the one with missing price. Index 8000 (second appearence)
    df.drop_duplicates(subset='sku', keep="first", inplace = True )
    return df 
    


### Adjust price

In [7]:



@log_step
def adjust_price(df):
    # Make sure that every price is a string
    df["product_price"] = df["product_price"].astype("string")
    # fill na values with "000.000"
    df['product_price'] = df['product_price'].fillna("000.000")
    df = df.assign(price_check = df["product_price"].str.contains("\.\d{3}$"))
    
    # Get maximum unit_price from orderlines  



    return df

### Missing Values

In [8]:
@log_step
def missing_values(df):
    return df

### Adjust data types

In [9]:
@log_step
def adjust_data_types(df):
    df["sku"] = df["sku"].astype("string")
    df["product_name"] = df["product_name"].astype("string")
    df["product_description"] = df["product_description"].astype("string")

    return df

### Plot Samples

In [10]:
@log_step
def plot_samples(df, num_samples = 50):
    return df

### Drop Columns

In [11]:
@log_step
def drop_columns(df, columns):
    return df.drop(columns=columns)


### Add columns

In [12]:
@log_step
def add_columns(df):
    df = df.assign(product_brand_short = df["sku"].replace("^(\w{3})","\g<1>", regex = True))
    df = df.assign(product_brand = brands_cleaned["brand_long"])
    
    return df

### End Pipeline

In [13]:
@log_step
def end_pipeline(df):
    df.to_csv(f"../../data/clean_data/clean_{df_name}.csv", index = False)
    return df

## Run Cleaning

In [14]:
products = (
init_df
    .pipe(start_pipeline)
    .pipe(rename_columns)
    .pipe(remove_duplicates)
    .pipe(adjust_price)
    .pipe(missing_values)  
    .pipe(adjust_data_types) 
    .pipe(plot_samples) 
    .pipe(drop_columns, columns=["product_type"])
    .pipe(add_columns) # product_brand
    .pipe(end_pipeline)
)

start_pipeline:
 shape=(19326, 7) took 0:00:00.001832s

remove_duplicates:
 shape=(10579, 7) took 0:00:00.027897s

adjust_price:
 shape=(10579, 8) took 0:00:00.011594s

missing_values:
 shape=(10579, 8) took 0:00:00.000004s

adjust_data_types:
 shape=(10579, 8) took 0:00:00.005883s

plot_samples:
 shape=(10579, 8) took 0:00:00.000004s

drop_columns:
 shape=(10579, 7) took 0:00:00.000834s

add_columns:
 shape=(10579, 9) took 0:00:00.059692s

end_pipeline:
 shape=(10579, 9) took 0:00:00.076133s



In [15]:
products.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10579 entries, 0 to 19325
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   sku                  10579 non-null  string 
 1   product_name         10579 non-null  string 
 2   product_description  10572 non-null  string 
 3   product_price        10579 non-null  string 
 4   product_promo_price  10579 non-null  object 
 5   product_in_stock     10579 non-null  int64  
 6   price_check          10579 non-null  boolean
 7   product_brand_short  10579 non-null  string 
 8   product_brand        110 non-null    object 
dtypes: boolean(1), int64(1), object(2), string(5)
memory usage: 764.5+ KB


In [16]:
sample = (
products[products["price_check"]]
)

sample

Unnamed: 0,sku,product_name,product_description,product_price,product_promo_price,product_in_stock,price_check,product_brand_short,product_brand
34,TWS0019,Twelve South MagicWand support Apple Magic Tra...,MagicWand for wireless keyboard and Magic Trac...,000.000,299.899,0,True,TWS0019,Elago
362,REP0043,Speaker lower repair iPhone 4,Repair service including parts and labor for i...,499.004,499.004,0,True,REP0043,
480,PIE0011,Internal Battery for iPhone 3G,Replacement AC Adapter for Apple iPhone 3G.,98.978,98.978,0,True,PIE0011,
515,SEN0061,Sennheiser EZX 80 Handsfree iPhone iPad and iP...,IPhone bluetooth headset with microphone iPad ...,649.891,649.891,0,True,SEN0061,
518,SEV0026,Service installation RAM + HDD + SSD MacBook /...,RAM + HDD installation + SSD in your MacBook /...,599.918,599.918,0,True,SEV0026,
...,...,...,...,...,...,...,...,...,...
19312,REP0424,Input repair Headphones iPad,Repair service including parts and labor for iPad,6.999.003,69.99,0,True,REP0424,
19313,REP0421,iPad charging connector repair,Repair service including parts and labor for iPad,6.999.003,69.99,0,True,REP0421,
19314,REP0416,iPad front camera repair,Repair service including parts and labor for iPad,6.999.003,69.99,0,True,REP0416,
19315,REP0413,repair rear camera iPad,Repair service including parts and labor for iPad,6.999.003,69.99,0,True,REP0413,


In [18]:
o_lines_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 293983 entries, 0 to 293982
Data columns (total 7 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   o_line_id           293983 non-null  int64  
 1   order_id            293983 non-null  int64  
 2   o_line_product_id   293983 non-null  int64  
 3   o_line_product_qty  293983 non-null  int64  
 4   sku                 293983 non-null  object 
 5   o_line_unit_price   293983 non-null  float64
 6   o_line_date         293983 non-null  object 
dtypes: float64(1), int64(4), object(2)
memory usage: 15.7+ MB


In [19]:
o_lines_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 293983 entries, 0 to 293982
Data columns (total 7 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   o_line_id           293983 non-null  int64  
 1   order_id            293983 non-null  int64  
 2   o_line_product_id   293983 non-null  int64  
 3   o_line_product_qty  293983 non-null  int64  
 4   sku                 293983 non-null  object 
 5   o_line_unit_price   293983 non-null  float64
 6   o_line_date         293983 non-null  object 
dtypes: float64(1), int64(4), object(2)
memory usage: 15.7+ MB


In [22]:
  max_unit_price_df = (
    o_lines_cleaned
        .groupby("sku")
        .agg("max")
        .sort_values("o_line_unit_price", ascending=False)
        .reset_index()
    )

max_unit_price_df

Unnamed: 0,sku,o_line_id,order_id,o_line_product_id,o_line_product_qty,o_line_unit_price,o_line_date
0,NEA0009,1645998,525385,0,7,159989.83,2018-03-11 11:42:22
1,LAC0223,1623118,515661,0,10,15349.00,2018-02-21 19:59:58
2,APP2660,1639306,522845,0,4,14725.00,2018-03-07 16:40:22
3,APP2659,1625977,516941,0,1,13799.00,2018-02-24 20:48:48
4,APP2648,1639157,522781,0,1,12428.03,2018-03-07 14:39:08
...,...,...,...,...,...,...,...
7946,PAR0037,1593909,503425,0,1,2.24,2018-01-31 23:46:51
7947,CAD0008,1309160,384544,0,1,1.39,2017-08-01 17:57:23
7948,CAD0010,1309165,384617,0,1,0.29,2017-08-01 18:00:43
7949,LIBRO,1264215,359326,0,24,0.00,2017-06-08 18:11:40
