# Clean Products

## Import Packages and Raw_data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Global variables
df_name = "products" 
data_path = '../../data/raw_data' 


init_df = pd.read_csv(f"{data_path }/{df_name}.csv")

init_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19326 entries, 0 to 19325
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   sku          19326 non-null  object
 1   name         19326 non-null  object
 2   desc         19319 non-null  object
 3   price        19280 non-null  object
 4   promo_price  19326 non-null  object
 5   in_stock     19326 non-null  int64 
 6   type         19276 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.0+ MB


## Logging

In [2]:
from functools import wraps
import datetime as dt

def log_step(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        tic = dt.datetime.now()
        result = func(*args, **kwargs)
        time_taken = str(dt.datetime.now() - tic)
        print(f"{func.__name__}:\n shape={result.shape} took {time_taken}s\n")
        return result

    return wrapper

## Plot Function

In [3]:
def plot_lines(
    lines,
    x,
    title = "", 
    legend ="", 
    x_label="", 
    y_label="",
    x_ticks ="",
    width = 22,
    height= 5
    ):
    ax = plt.subplots(figsize=(width, height))
    plt.plot(x, lines[0], "" )
    plt.plot(x,lines[1], "r")
    plt.title(title)
    plt.legend(legend)
    plt.ylabel(y_label)
    plt.xlabel(x_label)
    plt.xticks(x_ticks, rotation=90)
    plt.show()
    

## Pipeline - Functions

- [x]  Start
- [x]  Remove duplicates
- [_]  dots in price
- [_]  other weird numbers
- [_]  Missing values
- [_]  Prices to Numeric

### Start Pipeline

In [4]:
@log_step
def start_pipeline(df):
    return df.copy()

### Rename Columns

In [5]:
def rename_columns(df):
    df.rename(columns={
        "sku": "product_code",
        "name":  "product_name",      
        "desc":"product_description",
        "price":"product_price",        
        "promo_price":"product_promo_price",
        "in_stock": "product_in_stock", 
        "type":"product_type"
    },
             inplace = True)
    return df

### Remove duplicates

Use `DataFrame.drop_duplicates()` to remove rows that appear twice (or more) in the dataset: that means that these rows contain exactly the same values across all columns. Set the argument `inplace=True` so that changes actually take place. You can count the rows before and after running this code with `len()` and see how many rows were dropped this way.

It is possible that, after doing this, you still have a duplicate sku, which you should not have —Stock Keeping Units are meant to be unique`

In [6]:
@log_step
def remove_duplicates(df):
    df.drop_duplicates(inplace = True)
    # there is one dupiceted sku --> removed the one with missing price. Index 8000 (second appearence)
    df.drop_duplicates(subset='product_code', keep="first", inplace = True )
    return df 
    


### Adjust price

In [7]:
@log_step
def adjust_price(df):
    # Make sure that every price is a string
    df["product_price"] = df["product_price"].astype("string")
    # fill na values with "000.000"
    df['product_price'] = df['product_price'].fillna("000.000")
    df = df.assign(product_decimal_count = df["product_price"].str.split("\.") )
    df = df.assign(price_check = df["product_price"].str.contains("\.\d{3}$"))
    return df

### Missing Values

In [8]:
@log_step
def missing_values(df):
    return df

### Adjust data types

In [9]:
@log_step
def adjust_data_types(df):
    return df

### Plot Samples

In [10]:
@log_step
def plot_samples(df, num_samples = 50):
    return df

### Drop Columns

In [11]:
@log_step
def drop_columns(df):
    return df


### End Pipeline

In [12]:
def end_pipeline(df):
    products.to_csv(f"../../data/clean_data/clean_{df_name}.csv")
    return df

### Clean the rest of the DataFrames

The products table is probably the one that needed more cleaning, but the other tables might need some work too. Check them one by one and, when you’re done, move on to the next lesson —Data Quality.

## Run Cleaning

In [13]:
products = (
init_df
    .pipe(start_pipeline)
    .pipe(rename_columns)
    .pipe(remove_duplicates)
    .pipe(adjust_price)
    .pipe(missing_values)  # Open
    .pipe(adjust_data_types) # Open
    .pipe(plot_samples) 
    .pipe(drop_columns) # Open
    .pipe(end_pipeline)
)
products.info()

start_pipeline:
 shape=(19326, 7) took 0:00:00.001976s

remove_duplicates:
 shape=(10579, 7) took 0:00:00.032687s

adjust_price:
 shape=(10579, 9) took 0:00:00.030382s

missing_values:
 shape=(10579, 9) took 0:00:00.000006s

adjust_data_types:
 shape=(10579, 9) took 0:00:00.000003s

plot_samples:
 shape=(10579, 9) took 0:00:00.000002s

drop_columns:
 shape=(10579, 9) took 0:00:00.000001s



NameError: name 'products' is not defined

## Discount Analysis

In [None]:
products["discount_to_standard_rate"].describe()

KeyError: 'discount_to_standard_rate'

In [None]:
discount_check =(
    products.
    query("discount_to_standard_rate > 20")
    .sort_values("discount_to_standard_rate", ascending = False)
    .reset_index()
    
    [["product_name","product_price","product_price_final","product_promo_price","product_promo_price_final","discount_to_standard_rate"]]
)
discount_check

Unnamed: 0,product_name,product_price,product_price_final,product_promo_price,product_promo_price_final,discount_to_standard_rate
0,iPhone 5 GSM antenna repair,69.989.909,6998.99,699.899,69.98,100.014147
1,Home button repair iPad mini,69.989.909,6998.99,699.899,69.98,100.014147
2,Open - Withings Blood Pressure Monitor Wireles...,92.197.644,9219.76,921.976,92.19,100.008244
3,"Apple MacBook Pro 13 ""Core i5 Touch Bar 31GHz ...",300.559.402,30055.94,3.005.594,300.55,100.003128
4,"Apple MacBook Pro 13 ""Core i5 Touch Bar 31GHz ...",300.559.402,30055.94,3.005.594,300.55,100.003128
5,Brenthaven Fusion case for MacBook Pro Retina ...,99.99,99.99,15.488,1.54,64.928571
6,Open - OtterBox Symmetry Case iPhone 6 / 6S Bl...,49.99,49.99,10.017,1.0,49.99
7,(Open) Jawbone UP2 Activity Monitor Black,119.99,119.99,24.214,2.42,49.582645
8,(Open) NewerTech Cable HDMI to HDMI 1.4A 180m,17.99,17.99,3.808,0.38,47.342105
9,Open - NewerTech Mini DisplayPort to DVI Adapter,29.99,29.99,7.607,0.76,39.460526


## 


In [None]:
products.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10579 entries, 0 to 19325
Data columns (total 18 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   product_code                     10579 non-null  object 
 1   product_name                     10579 non-null  object 
 2   product_description              10572 non-null  object 
 3   product_price                    10534 non-null  string 
 4   product_promo_price              10579 non-null  object 
 5   product_in_stock                 10579 non-null  int64  
 6   product_type                     10529 non-null  object 
 7   dots                             10534 non-null  Int64  
 8   promo_dots                       10579 non-null  int64  
 9   single_dot_price_1               10534 non-null  string 
 10  single_dot_price                 10534 non-null  string 
 11  regex                            10579 non-null  bool   
 12  promo_single_dot_p