In [1]:
"""
functions that report the memory usage of a dataframe and help
to optimise its memory footprint by downcasting numeric columns or
converting string columns into categories where appropriate
""";

In [None]:
# memory usage/saving

def get_memory_usage(pd_obj: Union[pd.DataFrame, pd.Series]) -> None:
    """
    print the deep memory usage of a dataframe or series in megabytes,
    formatted to two decimal places
    """
    usage = pd_obj.memory_usage(deep=True)
    # a dataframe reports one figure per column (plus the index), so total them;
    # a series already reports a single figure
    total_bytes = usage.sum() if isinstance(pd_obj, pd.DataFrame) else usage
    print(f'{total_bytes/1024**2:.2f} MB')

    
def downcast_numeric(df: pd.DataFrame) -> pd.DataFrame:
    """
    downcast the int and float cols of a dataframe to the smallest dtype that
    can hold their values, returns the dataframe

    int cols with no negative values are downcast to an unsigned type; int cols
    containing negatives are downcast to a signed type instead
    float cols are downcast with pd.to_numeric(downcast='float')

    note: the columns are reassigned on the dataframe passed in, so the caller's
    object is modified as well as returned
    """
    int_cols = df.select_dtypes(['int']).columns.tolist()
    float_cols = df.select_dtypes(['float']).columns.tolist()

    for col in int_cols:
        # pd.to_numeric skips an 'unsigned' downcast when any value is negative,
        # which would leave the column at its original width, so use the signed
        # variant for those columns
        target = 'integer' if (df[col] < 0).any() else 'unsigned'
        df[col] = pd.to_numeric(df[col], downcast=target)

    for col in float_cols:
        df[col] = pd.to_numeric(df[col], downcast='float')

    return df

def set_objs_to_cats(df: pd.DataFrame, threshold: float = 0.5, logger=None) -> pd.DataFrame:
    """
    sets any column of type object to type categorical if the ratio of unique values
    to rows is below the threshold (default=0.5)

    df: dataframe to process; its columns are reassigned, so the caller's object
        is modified as well as returned
    threshold: a column is converted when n_unique / n_rows < threshold
    logger: optional, when given each column's ratio is sent to logger.debug
    returns a dataframe
    """
    len_index = len(df.index)
    if len_index == 0:
        # no rows: the unique ratio is undefined (would divide by zero), so
        # there is nothing to convert
        return df

    cols_obj = df.select_dtypes(['object']).columns

    for col in cols_obj:
        try:
            n_unique = len(df[col].unique())
            ratio = n_unique / len_index
            if logger:
                logger.debug(f'{col} ratio: {ratio}')

            # compare against the caller-supplied threshold, not a fixed 0.5
            if ratio < threshold:
                df[col] = df[col].astype('category')
        except TypeError:
            # raised by unique() when the column holds unhashable values
            # (e.g. lists); such columns are left as they are
            print(f'Type error: {col}, unhasable type?')

    return df


def make_df_memory_efficent(df: pd.DataFrame, **kwargs):
    """
    print the memory usage of the dataframe, downcast its numeric cols, convert
    its object cols to categories, then print the memory usage again

    any keyword arguments are forwarded to set_objs_to_cats
    returns the processed dataframe
    """
    # memory usage before processing
    get_memory_usage(df)

    optimised = set_objs_to_cats(downcast_numeric(df), **kwargs)

    # memory usage after processing
    get_memory_usage(optimised)
    return optimised


def get_dtype_dict(df: pd.DataFrame) -> Dict:
    """
    map each column header of the dataframe to the name of that column's dtype
    """
    return {
        header: dtype.name
        for header, dtype in zip(df.columns.tolist(), df.dtypes.values)
    }

In [None]:
def pickle_obj(obj, file_nm: str) -> None:
    """
    serialise an object with pickle and write it to the file at file_nm
    (the file is opened in binary write mode)
    """
    with open(file_nm, mode='wb') as out_file:
        pickle.dump(obj, out_file)