# Clean Products

## Import Packages and Raw_data

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Notebook-wide settings: the table this notebook cleans and the folder
# that holds the raw CSV files.
df_name = "products"
data_path = "../../data/raw_data/"

# Read the raw table from <data_path><df_name>.csv.
init_df = pd.read_csv(f"{data_path}{df_name}.csv")

# Column names, non-null counts and dtypes of the raw data.
init_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19326 entries, 0 to 19325
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   sku          19326 non-null  object
 1   name         19326 non-null  object
 2   desc         19319 non-null  object
 3   price        19280 non-null  object
 4   promo_price  19326 non-null  object
 5   in_stock     19326 non-null  int64 
 6   type         19276 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.0+ MB


## Logging

In [16]:
from functools import wraps
import datetime as dt

def log_step(func):
    """Decorator that reports the result shape and run time of a pipeline step.

    After ``func`` returns, the wrapper prints the function name, the
    ``shape`` attribute of the returned object and the elapsed wall-clock
    time (a ``datetime.timedelta`` rendered as a string, followed by a
    literal ``s``). The return value of ``func`` is passed through unchanged.

    Results that have no ``shape`` attribute are reported as ``shape=n/a``
    instead of raising ``AttributeError`` after the step has already run.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        tic = dt.datetime.now()
        result = func(*args, **kwargs)
        time_taken = str(dt.datetime.now() - tic)
        # Not every step necessarily returns a DataFrame/array; do not let
        # the logging itself break the pipeline.
        shape = getattr(result, "shape", "n/a")
        print(f"{func.__name__}:\n shape={shape} took {time_taken}s\n")
        return result

    return wrapper

## Plot Function

In [17]:
def plot_lines(
    lines,
    x,
    title="",
    legend="",
    x_label="",
    y_label="",
    x_ticks="",
    width=22,
    height=5,
):
    """Draw one or more series against a shared x axis and show the figure.

    Parameters
    ----------
    lines : sequence
        The y-value series to draw. The first one uses the default line
        style and the second one is drawn in red; any further series use
        the default style as well.
    x : sequence
        x values shared by every series.
    title, x_label, y_label : str
        Figure title and axis labels.
    legend : sequence of str
        Legend labels, passed to ``Axes.legend`` as given.
    x_ticks : sequence
        Tick positions, passed to ``plt.xticks`` as given; tick labels are
        rotated by 90 degrees.
    width, height : number
        Figure size, passed as ``figsize=(width, height)``.
    """
    # plt.subplots returns a (figure, axes) pair; unpack it so the axes
    # object can be used directly.
    fig, ax = plt.subplots(figsize=(width, height))

    # Format strings for the first two series; later series fall back to "".
    formats = ["", "r"]
    for position, series in enumerate(lines):
        fmt = formats[position] if position < len(formats) else ""
        ax.plot(x, series, fmt)

    ax.set_title(title)
    ax.legend(legend)
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    # The axes created above is the current axes, so pyplot acts on it.
    plt.xticks(x_ticks, rotation=90)
    plt.show()
    

## Pipeline - Functions

- [x]  Start
- [x]  Remove duplicates
- [ ]  dots in price
- [ ]  other weird numbers
- [ ]  Missing values
- [ ]  Prices to Numeric

### Start Pipeline

In [18]:
@log_step
def start_pipeline(df):
    """Return a copy of ``df`` so that later steps do not modify the input frame."""
    working_copy = df.copy(deep=True)
    return working_copy

### Rename Columns

In [19]:
def rename_columns(df):
    """Rename the raw product columns to ``product_``-prefixed names.

    The rename happens in place and the same frame object is returned, so
    the function can be used as a ``DataFrame.pipe`` step. Columns that are
    not listed in the mapping keep their name.
    """
    new_names = {
        "sku": "product_code",
        "name": "product_name",
        "desc": "product_description",
        "price": "product_price",
        "promo_price": "product_promo_price",
        "in_stock": "product_in_stock",
        "type": "product_type",
    }
    df.rename(columns=new_names, inplace=True)
    return df

### Remove duplicates

Use `DataFrame.drop_duplicates()` to remove rows that appear twice (or more) in the dataset: that means that these rows contain exactly the same values across all columns. Set the argument `inplace=True` so that changes actually take place. You can count the rows before and after running this code with `len()` and see how many rows were dropped this way.

It is possible that, after doing this, you still have a duplicate sku, which you should not have — Stock Keeping Units are meant to be unique.

In [20]:
@log_step
def remove_duplicates(df):
    """Drop duplicated rows in place and return the same frame.

    Two passes are made, each keeping the first occurrence:

    1. rows that are identical across all columns;
    2. rows that share a ``product_code`` with an earlier row.

    Which row survives the second pass depends only on row order.
    Original author's note: there is one duplicated sku left after the
    first pass; the row removed is the one with the missing price
    (index 8000, the second appearance).
    """
    # subset=None means "compare all columns".
    for key_columns in (None, "product_code"):
        df.drop_duplicates(subset=key_columns, keep="first", inplace=True)
    return df
    


### Adjust price

In [21]:
# Raw order lines, read from the same raw-data folder as the products table.
# No active pipeline step reads this frame yet.
orderlines = pd.read_csv(f"{data_path}orderlines.csv")


@log_step
def adjust_price(df):
    """Normalise ``product_price`` to text and flag suspicious values.

    Work in progress: prices are annotated here, not yet converted to numbers.

    Steps
    -----
    * ``product_price`` is cast to the pandas ``string`` dtype and missing
      values are replaced by the placeholder ``"000.000"``. This column is
      overwritten on the frame that is passed in.
    * ``product_decimal_count`` is added. Despite its name it holds the list
      of pieces obtained by splitting the price on dots, not a count.
    * ``price_check`` is added. It is True when the price ends in a dot
      followed by three digits; the ``"000.000"`` placeholder matches too.

    Returns
    -------
    A new DataFrame with the two helper columns appended.
    """
    # Make sure every price is a string and fill missing values with the placeholder.
    df["product_price"] = df["product_price"].astype("string").fillna("000.000")

    price_text = df["product_price"]
    # Raw string literals: a backslash-dot or backslash-d in a normal literal is
    # an invalid escape sequence and triggers a warning on recent Python versions.
    df = df.assign(
        product_decimal_count=price_text.str.split(r"\."),
        price_check=price_text.str.contains(r"\.\d{3}$"),
    )

    # TODO: cross-check prices against the maximum unit_price per sku from
    # the orderlines table (planned, not implemented yet).
    return df

### Missing Values

In [22]:
@log_step
def missing_values(df):
    """Placeholder pipeline step for missing-value handling.

    Not implemented yet: the frame is returned untouched.
    """
    return df

### Adjust data types

In [23]:
@log_step
def adjust_data_types(df):
    """Cast the product code, name and description columns to ``string`` dtype.

    The columns are replaced on the frame that is passed in, and that same
    frame is returned. No other column is converted here.
    """
    for text_column in ("product_code", "product_name", "product_description"):
        df[text_column] = df[text_column].astype("string")
    return df

### Plot Samples

In [24]:
@log_step
def plot_samples(df, num_samples = 50):
    """Placeholder pipeline step for plotting sample rows.

    Not implemented yet: nothing is plotted, ``num_samples`` is ignored and
    the frame is returned untouched.
    """
    return df

### Drop Columns

In [25]:
@log_step
def drop_columns(df, columns):
    """Return a new frame without the given column label(s); ``df`` is not modified."""
    trimmed = df.drop(labels=columns, axis="columns")
    return trimmed


### End Pipeline

In [26]:
@log_step
def end_pipeline(df):
    """Write the frame to the clean-data folder as CSV and pass it through.

    The file name is built from the notebook-level ``df_name``; the index
    is not written to the file.
    """
    target_file = f"../../data/clean_data/clean_{df_name}.csv"
    df.to_csv(target_file, index=False)
    return df

### Clean the rest of the DataFrames

The products table is probably the one that needed the most cleaning, but the other tables might need some work too. Check them one by one and, when you’re done, move on to the next lesson — Data Quality.

## Run Cleaning

In [27]:
# Run the cleaning steps in order. start_pipeline copies init_df first, so
# the raw frame is left untouched; end_pipeline writes the result to CSV.
products = (
init_df
    .pipe(start_pipeline)
    .pipe(rename_columns)
    .pipe(remove_duplicates)
    .pipe(adjust_price)
    .pipe(missing_values)  # Open: step is still a stub that returns the frame unchanged
    .pipe(adjust_data_types) # Open: only the text columns are cast so far
    .pipe(plot_samples) 
    .pipe(drop_columns, columns=["product_type"]) # Open: list of columns to drop not final
    .pipe(end_pipeline)
)

start_pipeline:
 shape=(19326, 7) took 0:00:00.001523s

remove_duplicates:
 shape=(10579, 7) took 0:00:00.025665s

adjust_price:
 shape=(10579, 9) took 0:00:00.024377s

missing_values:
 shape=(10579, 9) took 0:00:00.000006s

adjust_data_types:
 shape=(10579, 9) took 0:00:00.004414s

plot_samples:
 shape=(10579, 9) took 0:00:00.000004s

drop_columns:
 shape=(10579, 8) took 0:00:00.001142s

end_pipeline:
 shape=(10579, 8) took 0:00:00.077196s



In [28]:
# Column names, non-null counts and dtypes of the cleaned frame.
products.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10579 entries, 0 to 19325
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   product_code           10579 non-null  string 
 1   product_name           10579 non-null  string 
 2   product_description    10572 non-null  string 
 3   product_price          10579 non-null  string 
 4   product_promo_price    10579 non-null  object 
 5   product_in_stock       10579 non-null  int64  
 6   product_decimal_count  10579 non-null  object 
 7   price_check            10579 non-null  boolean
dtypes: boolean(1), int64(1), object(2), string(4)
memory usage: 681.8+ KB


In [29]:
# Keep only the rows flagged by the price_check column.
sample = products[products["price_check"]]

sample

Unnamed: 0,product_code,product_name,product_description,product_price,product_promo_price,product_in_stock,product_decimal_count,price_check
34,TWS0019,Twelve South MagicWand support Apple Magic Tra...,MagicWand for wireless keyboard and Magic Trac...,000.000,299.899,0,"[000, 000]",True
362,REP0043,Speaker lower repair iPhone 4,Repair service including parts and labor for i...,499.004,499.004,0,"[499, 004]",True
480,PIE0011,Internal Battery for iPhone 3G,Replacement AC Adapter for Apple iPhone 3G.,98.978,98.978,0,"[98, 978]",True
515,SEN0061,Sennheiser EZX 80 Handsfree iPhone iPad and iP...,IPhone bluetooth headset with microphone iPad ...,649.891,649.891,0,"[649, 891]",True
518,SEV0026,Service installation RAM + HDD + SSD MacBook /...,RAM + HDD installation + SSD in your MacBook /...,599.918,599.918,0,"[599, 918]",True
...,...,...,...,...,...,...,...,...
19312,REP0424,Input repair Headphones iPad,Repair service including parts and labor for iPad,6.999.003,69.99,0,"[6, 999, 003]",True
19313,REP0421,iPad charging connector repair,Repair service including parts and labor for iPad,6.999.003,69.99,0,"[6, 999, 003]",True
19314,REP0416,iPad front camera repair,Repair service including parts and labor for iPad,6.999.003,69.99,0,"[6, 999, 003]",True
19315,REP0413,repair rear camera iPad,Repair service including parts and labor for iPad,6.999.003,69.99,0,"[6, 999, 003]",True


In [30]:
# Column names, non-null counts and dtypes of the raw order lines.
orderlines.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 293983 entries, 0 to 293982
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   id                293983 non-null  int64 
 1   id_order          293983 non-null  int64 
 2   product_id        293983 non-null  int64 
 3   product_quantity  293983 non-null  int64 
 4   sku               293983 non-null  object
 5   unit_price        293983 non-null  object
 6   date              293983 non-null  object
dtypes: int64(4), object(3)
memory usage: 15.7+ MB
