# Clean Brands

## Import packages and File

In [19]:
from pathlib import Path

import pandas as pd
from pandas_profiling import ProfileReport

# Dataset identifier; it selects the raw file to read and is reused by later
# cells to name the cleaned CSV and the profile report.
df_name = "brands"

# Folder holding the raw input files, given relative to this notebook.
path_name = "../../data/raw_data/"

# Raw table exactly as read from disk.
init_df = pd.read_csv(path_name + df_name + ".csv")


## Log-Wrapper

In [20]:
from functools import wraps
import datetime as dt

def log_step(func):
    """Decorate a pipeline step so every call prints its result shape and runtime.

    Parameters
    ----------
    func : callable
        The step to wrap. It is called with whatever positional and keyword
        arguments the wrapper receives.

    Returns
    -------
    callable
        A wrapper that returns exactly what ``func`` returns. The wrapped
        function's name and docstring are preserved via ``functools.wraps``.

    Notes
    -----
    The printed shape is ``result.shape`` when the result has that attribute
    and ``None`` otherwise, so the logger itself never raises for steps that
    return something other than a frame-like object.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        tic = dt.datetime.now()
        result = func(*args, **kwargs)
        time_taken = str(dt.datetime.now() - tic)
        # Reading ``result.shape`` directly would raise AttributeError for
        # results without a shape; fall back to None so logging stays passive.
        shape = getattr(result, "shape", None)
        print(f"{func.__name__}:\n shape={shape} took {time_taken}s\n")
        return result
    return wrapper

## Pipeline Functions

### Pipeline Init

In [21]:
@log_step
def start_pipeline(df):
    """Open the pipeline by handing back a copy of ``df``.

    Later steps then operate on the copy rather than on the frame that was
    passed in.
    """
    working_copy = df.copy()
    return working_copy

### Rename Columns

In [22]:
@log_step
def rename_columns(df):
    """Rename the raw columns ``short`` and ``long`` to ``brand_short`` and ``brand_long``.

    Returns the renamed frame produced by ``DataFrame.rename``.
    """
    new_names = {"short": "brand_short", "long": "brand_long"}
    return df.rename(columns=new_names)

### Set Data Types

In [23]:
@log_step
def adjust_dtypes(df):
    """Placeholder for dtype adjustments; currently returns ``df`` unchanged."""
    return df

### Remove Duplicates

In [24]:
@log_step
def remove_duplicates(df):
    """Return ``df`` without duplicated rows (``DataFrame.drop_duplicates`` defaults)."""
    deduplicated = df.drop_duplicates()
    return deduplicated

In [25]:
@log_step
def drop_nan(df):
    """Return ``df`` after ``DataFrame.dropna`` with its default arguments."""
    without_missing = df.dropna()
    return without_missing


In [26]:
@log_step
def create_csv(df):
    """Write ``df`` to the clean-data folder as CSV and pass it through.

    The file name is built from the notebook-level ``df_name``
    (``clean_<df_name>.csv``) and the index is not written.

    Returns
    -------
    The same ``df`` that was passed in, so the step can sit inside a
    ``.pipe`` chain.
    """
    out_path = Path(f"../../data/clean_data/clean_{df_name}.csv")
    # Create the output folder if it is missing so a run on a fresh checkout
    # does not fail at the write.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_path, index=False)
    return df

In [27]:
@log_step
def end_pipeline(df):
    """Build a profile report for ``df``, save it as HTML, and pass ``df`` through.

    The report title and file name are derived from the notebook-level
    ``df_name``; the report is rendered with ``dark_mode=True``.

    Returns
    -------
    The same ``df`` that was passed in.
    """
    report_path = Path(f"../../data/profile_reports/{df_name}_report.html")
    # Make sure the report folder exists before the report is written.
    report_path.parent.mkdir(parents=True, exist_ok=True)
    report = ProfileReport(df, title=f"{df_name.title()} - Report", dark_mode=True)
    report.to_file(report_path)
    return df

## Pipeline UI

In [28]:
# Run the cleaning steps in order; every step is wrapped by log_step and
# therefore prints its output shape and runtime.
brands = (
    init_df
    .pipe(start_pipeline)   # work on a copy of the raw frame
    .pipe(rename_columns)   # short -> brand_short, long -> brand_long
    # remove_duplicates is left out: original note says there are none
    # adjust_dtypes is left out: columns keep their string type
    .pipe(create_csv)       # save the cleaned .csv file
    .pipe(end_pipeline)     # create and save the Profile Report .html
)

start_pipeline:
 shape=(187, 2) took 0:00:00.000192s

rename_columns:
 shape=(187, 2) took 0:00:00.000957s

create_csv:
 shape=(187, 2) took 0:00:00.001647s



Summarize dataset: 100%|██████████| 15/15 [00:01<00:00, 14.45it/s, Completed]                    
Generate report structure: 100%|██████████| 1/1 [00:00<00:00,  2.36it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  9.37it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 154.32it/s]

end_pipeline:
 shape=(187, 2) took 0:00:01.595851s






In [29]:
# Summarize the raw input frame: columns, non-null counts, dtypes and memory usage.
init_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187 entries, 0 to 186
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   short   187 non-null    object
 1   long    187 non-null    object
dtypes: object(2)
memory usage: 3.0+ KB
