# Products

# Imports

In [1]:
import pandas as pd
import seaborn as sns
from functools import wraps
import datetime as dt

# Log-Wrapper

In [2]:
def log_step(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        tic = dt.datetime.now()
        result = func(*args, **kwargs)
        time_taken = str(dt.datetime.now() - tic)
        print(f"{func.__name__}:\n shape={result.shape} took {time_taken}s\n")
        global df_rows

        df_rows.append(result.shape[0])
        return result


    return wrapper

## Pipeline Functions

### Pipeline Init

In [3]:
filename = "products"
init_df = pd.read_csv("../../eniac-data/"+filename+".csv")

df_rows = []


@log_step
def start_pipeline(data_frame):
    global df_rows
    df_rows = []
    df_rows.append(init_df.shape[0])
    return data_frame.copy()

init_df.shape


(19326, 7)

### Rename Columns

In [4]:
@log_step
def rename_columns(data_frame):
    data_frame.rename(columns={"sku":"product_sku",
                             "name":"product_name",
                             "desc":"product_description",
                             "price":"product_price",
                             "promo_price":"product_promo_price",
                             "type":"product_type"}, inplace = True)
    
    return data_frame

### Set Data Types

In [5]:
removePeriod = lambda price_str: price_str.replace(".", "",1) if price_str.count(".") > 1 else price_str


@log_step
def set_data_types(data_frame):
    data_frame.product_price = str(data_frame.product_price).replace("\n","").strip()
    
    data_frame["new_price"] = data_frame.product_price.apply(removePeriod)
    data_frame.new_price.astype("float")
    return data_frame

### Remove Duplicates

In [6]:
@log_step
def remove_duplicates(data_frame):
    return data_frame.drop_duplicates()

In [7]:
@log_step
def drop_nan(data_frame):
    return data_frame.dropna()


### Remove Outliers

In [8]:
@log_step
def remove_outliers(data_frame):
    return data_frame

## Pipeline UI

In [9]:
df = init_df\
        .pipe(start_pipeline)\
        .pipe(rename_columns)\
        .pipe(set_data_types)\
        .pipe(remove_duplicates)\
        .pipe(drop_nan)\
        .pipe(remove_outliers)
sns.lineplot(list(range(len(df_rows))), df_rows) 

start_pipeline:
 shape=(19326, 7) took 0:00:00.006472s

rename_columns:
 shape=(19326, 7) took 0:00:00.001010s



ValueError: could not convert string to float: '0        59991           592           593           254        34.99         ...  19321    29.9919322    69.9519323    69.9519324    69.9519325    69.95Name: product_price, Length: 19326, dtype: object'

In [12]:
init_df.head()

Unnamed: 0,sku,name,desc,price,promo_price,in_stock,type
0,RAI0007,Silver Rain Design mStand Support,Aluminum support compatible with all MacBook,59.99,499.899,1,8696
1,APP0023,Apple Mac Keyboard Keypad Spanish,USB ultrathin keyboard Apple Mac Spanish.,59.0,589.996,0,13855401
2,APP0025,Mighty Mouse Apple Mouse for Mac,mouse Apple USB cable.,59.0,569.898,0,1387
3,APP0072,Apple Dock to USB Cable iPhone and iPod white,IPhone dock and USB Cable Apple iPod.,25.0,229.997,0,1230
4,KIN0007,Mac Memory Kingston 2GB 667MHz DDR2 SO-DIMM,2GB RAM Mac mini and iMac (2006/07) MacBook Pr...,34.99,31.99,1,1364


In [11]:
df.dtypes

NameError: name 'df' is not defined

In [None]:
df.describe()

In [None]:
df.to_csv("../../clean_data/clean_"+filename+".csv")