# Clean Brands

## Import packages and File

In [29]:
import pandas as pd
from pandas_profiling import ProfileReport

# Base name of the dataset; the raw CSV is "<df_name>.csv".
df_name = "brands"
# Relative directory that holds the raw CSV files.
path_name = "../../data/raw_data/"
# Read the raw table into the frame the cleaning pipeline starts from.
init_df = pd.read_csv(path_name + df_name + ".csv")


## Log-Wrapper

In [30]:
from functools import wraps
import datetime as dt

def log_step(func):
    """Decorate a pipeline step so every call prints its result shape and runtime.

    Parameters
    ----------
    func : callable
        The step to wrap. It is called with exactly the arguments the wrapper
        receives.

    Returns
    -------
    callable
        A wrapper that carries ``func``'s name and docstring (via
        ``functools.wraps``) and returns whatever ``func`` returns, unchanged.

    Notes
    -----
    The printed line has the form ``"<name>:\\n shape=<shape> took <seconds>s\\n"``.
    ``shape`` is ``None`` when the result exposes no ``shape`` attribute.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        tic = dt.datetime.now()
        result = func(*args, **kwargs)
        elapsed = dt.datetime.now() - tic
        # Logging must not break the pipeline: the step has already run, so fall
        # back to None instead of raising when the result has no ``shape``.
        shape = getattr(result, "shape", None)
        # str(timedelta) renders as "H:MM:SS.ffffff"; print plain seconds so the
        # trailing "s" unit is accurate.
        print(f"{func.__name__}:\n shape={shape} took {elapsed.total_seconds():.6f}s\n")
        return result
    return wrapper

## Pipeline Functions

### Pipeline Init

In [31]:
@log_step
def start_pipeline(df):
    """Return a deep copy of ``df`` to serve as the pipeline's working frame.

    The input object itself is not modified.
    """
    working_copy = df.copy(deep=True)
    return working_copy

### Rename Columns

In [32]:
@log_step
def rename_columns(df):
    """Return ``df`` with ``short``/``long`` renamed to ``brand_short``/``brand_long``.

    Columns not listed in the mapping keep their names.
    """
    new_names = {
        "short": "brand_short",
        "long": "brand_long",
    }
    return df.rename(columns=new_names)

### Set Data Types

In [33]:
@log_step
def adjust_dtypes(df):
    """Dtype-adjustment stage; currently returns ``df`` exactly as received.

    The ``category`` casts for ``brand_short`` and ``brand_long`` are commented
    out, so this step performs no conversion.
    """
    # Disabled casts, kept for reference:
    #   df["brand_short"] = df["brand_short"].astype("category")
    #   df["brand_long"] = df["brand_long"].astype("category")
    return df

### Remove Duplicates

In [34]:
@log_step
def remove_duplicates(df):
    """Return ``df`` without fully duplicated rows, keeping the first occurrence."""
    deduplicated = df.drop_duplicates(keep="first")
    return deduplicated

In [35]:
@log_step
def drop_nan(df):
    """Return ``df`` without any row that contains at least one missing value."""
    complete_rows = df.dropna(axis=0, how="any")
    return complete_rows


In [36]:
@log_step
def end_pipeline(df):
    """Persist the cleaned frame and its profiling report, then return ``df``.

    Writes ``df`` as CSV (without the index) and renders a dark-mode
    ``ProfileReport`` to HTML. Both file names are derived from the
    module-level ``df_name``.
    """
    csv_target = f"../../data/clean_data/clean_{df_name}.csv"
    report_target = f"../../data/profile_reports/{df_name}_report.html"

    df.to_csv(csv_target, index=False)

    report = ProfileReport(df, title=f"{df_name.title()} - Report", dark_mode=True)
    report.to_file(report_target)
    return df

## Pipeline UI

In [37]:
# Run the cleaning stages in order on the raw frame; every stage is wrapped by
# log_step, so each one prints its output shape and runtime.
# NOTE(review): drop_nan is defined above but is not part of this chain —
# confirm that is intentional.
brands = (
    init_df
    .pipe(start_pipeline)
    .pipe(rename_columns)
    .pipe(adjust_dtypes)
    .pipe(remove_duplicates)
    .pipe(end_pipeline)
)

start_pipeline:
 shape=(187, 2) took 0:00:00.000174s

rename_columns:
 shape=(187, 2) took 0:00:00.000775s

adjust_dtypes:
 shape=(187, 2) took 0:00:00.000005s

remove_duplicates:
 shape=(187, 2) took 0:00:00.002581s



Summarize dataset: 100%|██████████| 15/15 [00:00<00:00, 25.29it/s, Completed]                    
Generate report structure: 100%|██████████| 1/1 [00:00<00:00,  2.71it/s]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  8.36it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 794.38it/s]

end_pipeline:
 shape=(187, 2) took 0:00:01.118978s






In [38]:
# Summarise the raw (unprocessed) input frame: column names, non-null counts and dtypes.
init_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187 entries, 0 to 186
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   short   187 non-null    object
 1   long    187 non-null    object
dtypes: object(2)
memory usage: 3.0+ KB
