exp/nb_.py


#################################################
### THIS FILE WAS AUTOGENERATED! DO NOT EDIT! ###
#################################################
# files to edit: 01-main_train.ipynb 02-main_train-experiments.ipynb 03-main_simple-FI.ipynb 04-main_retrain-FI.ipynb 05-main_part_dep.ipynb 06-main_dendrogram-and-dem-red.ipynb _functions.ipynb check_data.ipynb contract_till_interpret_importance_clean.ipynb interpret_tabular.ipynb test_mixup.ipynb
from fastai.layers import FlattenedLoss

from fastai.tabular import *

from fastai.basic_train import _loss_func2activ

from fastai.callbacks import CSVLogger

from scipy.cluster import hierarchy as hc

from sklearn import manifold

import pickle

def _list_diff(list_1, list_2):
    diff = set(list_1) - set(list_2)
    return [item for item in list_1 if item in diff]

def list_diff(list1, list2, *args):
    diff = _list_diff(list1, list2)
    for arg in args:
        diff = _list_diff(diff, arg)
    return diff

def exp_mmape(pred:Tensor, targ:Tensor)->Rank0Tensor:
    "Exp median absolute percentage error between `pred` and `targ`."
    pred,targ = flatten_check(pred,targ)
    pred, targ = torch.exp(pred), torch.exp(targ)
    pct_var = (targ - pred)/targ
    return torch.abs(pct_var).median()

def MAELossFlat(*args, axis:int=-1, floatify:bool=True, **kwargs):
    "Same as `nn.MAELoss`, but flattens input and target."
    return FlattenedLoss(nn.L1Loss, *args, axis=axis, floatify=floatify, is_2d=False, **kwargs)

def _list_diff(list_1, list_2):
    diff = set(list_1) - set(list_2)
    return [item for item in list_1 if item in diff]

def list_diff(list1, list2, *args):
    diff = _list_diff(list1, list2)
    for arg in args:
        diff = _list_diff(diff, arg)
    return diff

def which_elms(values, in_list):
    '''
    Just outputs elements from values that are in list in_list
    '''
    return [x for x in values if (x in in_list)]

def is_in_list(values, in_list):
    '''
    Just outputs is one of the elements from values is in list in_list
    '''
    if (len(which_elms(values, in_list)) > 0):
        return True
    else:
        return False

def apply_fill_n_catf(df:DataFrame, learn:Learner)->DataFrame:
    '''
    Reapplies FillMissing and Categorify to given dataframe.
    '''

    df_copy = df.copy()
    fill, catf = None, None
    is_alone = True if (len(df) == 1) else False

    proc = learn.data.processor[0]
    if (is_alone):
        df_copy = df_copy.append(df_copy.iloc[0])

    for prc in proc.procs:
        if (type(prc) == FillMissing):
            fill = prc
        elif (type(prc) == Categorify):
            catf = prc
    if (fill is not None):
        fill.apply_test(df_copy)

    if (catf is not None):
        catf.apply_test(df_copy)
        for c in catf.cat_names:
            df_copy[c] = (df_copy[c].cat.codes).astype(np.int64) + 1
        cats = df_copy[catf.cat_names].to_numpy()

    # ugly workaround as apperently catf.apply_test doesn't work with lone row
    if (is_alone):
        df_copy = df_copy[:1]

    return df_copy


def apply_fill(df:DataFrame, learn:Learner)->DataFrame:
    '''
    Reapplies FillMissing to given dataframe.
    '''

    df_copy = df.copy()
    fill = None
    is_alone = True if (len(df) == 1) else False

    proc = learn.data.processor[0]
    if (is_alone):
        df_copy = df_copy.append(df_copy.iloc[0])

    for prc in proc.procs:
        if (type(prc) == FillMissing):
            fill = prc
    if (fill is not None):
        fill.apply_test(df_copy)

    # ugly workaround as apperently catf.apply_test doesn't work with lone row
    if (is_alone):
        df_copy = df_copy[:1]

    return df_copy


def get_model_real_input(df:DataFrame, learn:Learner, bs:int=None)->Tensor:

    df_copy = df.copy()
    fill, catf, norm = None, None, None
    cats, conts = None, None
    is_alone = True if (len(df) == 1) else False


    proc = learn.data.processor[0]
    if (is_alone):
        df_copy = df_copy.append(df_copy.iloc[0])

    for prc in proc.procs:
        if (type(prc) == FillMissing):
            fill = prc
        elif (type(prc) == Categorify):
            catf = prc
        elif (type(prc) == Normalize):
            norm = prc
    if (fill is not None):
        fill.apply_test(df_copy)
    if (catf is not None):
        catf.apply_test(df_copy)
        for c in catf.cat_names:
            df_copy[c] = (df_copy[c].cat.codes).astype(np.int64) + 1
        cats = df_copy[catf.cat_names].to_numpy()

    if (norm is not None):
        norm.apply_test(df_copy)
        conts = df_copy[norm.cont_names].to_numpy().astype('float32')

    # ugly workaround as apperently catf.apply_test doesn't work with lone row
    if (is_alone):
        xs = [torch.tensor([cats[0]], device=learn.data.device), torch.tensor([conts[0]], device=learn.data.device)]
    else:
        if (bs is None):
            xs = [torch.tensor(cats, device=learn.data.device), torch.tensor(conts, device=learn.data.device)]
        elif (bs > 0):
            xs = [list(chunks(l=torch.tensor(cats, device=learn.data.device), n=bs)),
                  list(chunks(l=torch.tensor(conts, device=learn.data.device), n=bs))]

    return xs


def get_cust_preds(df:DataFrame, learn:Learner, bs:int=None, parent=None)->Tensor:
    '''
    Using existing model to predict output (learn.model) on a new dataframe at once (learn.predict does it for
    one row which is pretty slow).
    '''
    def turn_to_activ(learn, acts):
        activ = _loss_func2activ(learn.loss_func)
        if activ is not None:
            return to_np(activ(acts))
        else:
            return to_np(acts)

    xs = get_model_real_input(df=df, learn=learn, bs=bs)
    learn.model.eval();
    if (bs is None):
        outp = learn.model(x_cat=xs[0], x_cont=xs[1])

    elif (bs > 0):
        res = []
        for ca, co in zip(xs[0], xs[1]):
            res.append(to_np(learn.model(x_cat=ca, x_cont=co)))
        #double translation to save gpu memory
        outp = tensor(np.concatenate(res, axis=0))
    return turn_to_activ(learn=learn, acts=outp)


def convert_dep_col(df:DataFrame, dep_col:AnyStr, learn:Learner)->Tensor:
    '''
    Converts dataframe column, named "depended column", into tensor, that can later be used to compare with predictions.
    Log will be applied if it was done in a training dataset
    '''
    actls = df[dep_col].T.to_numpy()[np.newaxis].T.astype('float32')
    actls = np.log(actls) if (hasattr(learn.data, 'log') and learn.data.log) else actls
    return torch.tensor(actls, device=learn.data.device)


def calc_loss(func:Callable, pred:Tensor, targ:Tensor, device=None)->Rank0Tensor:
    '''
    Calculates error from predictions and actuals with a given metrics function
    '''
    if (device is None):
        return func(pred, targ)
    else:
        return func(torch.tensor(pred, device=device), targ)


def calc_error(df:DataFrame, learn:Learner, dep_col:AnyStr,
               func:Callable, bs:int=None)->float:
    '''
    Wrapping function to calculate error for new dataframe on existing learner (learn.model)
    See following functions' docstrings for details
    '''
    preds = get_cust_preds(df=df, learn=learn, bs=bs)
    actls = convert_dep_col(df, dep_col, learn)
    error = calc_loss(func, pred=preds, targ=actls, device=learn.data.device)
    return float(error)

def emb_fwrd_sim(model, x_cat:Tensor, x_cont:Tensor)->Tensor:
    '''
    Part that was completely taking from fastai Tabular model source :)
    Gets inner representation of input dataframe (Catigorified, Filled and Normalized)
    and process it with embeddings 'prelayer'. Also continuous variables are processed with BatchNorm if needed.
    As a result output is model gets on it's layers as input (embedding in fact are not layers, but before them)
    '''
    if model.n_emb != 0:
        x = [e(x_cat[:,i]) for i,e in enumerate(model.embeds)]
        x = torch.cat(x, 1)
        x = model.emb_drop(x)
    if model.n_cont != 0:
        x_cont = model.bn_cont(x_cont)
        x = torch.cat([x, x_cont], 1) if model.n_emb != 0 else x_cont
    return x


def get_inner_repr(df:DataFrame, learn:Learner)->Tensor:
    '''
    Gets new dataframe that has categorical and continuous columns the learner war learnt with
    (are being taken from learner automatically)
    And outputs inner representation of these data -- what model gets after embeddings
    Is useful for ex. to use learnt embeddings in random forest
    This output can be directly feed to RF learner (after turning it to numpy if needed)
    '''
    xs = get_model_real_input(df=df, learn=learn)
    return emb_fwrd_sim(model=learn.model, x_cat=xs[0], x_cont=xs[1])

def calc_error_mixed_col(df:DataFrame,
                         learn:Learner,
                         dep_col:AnyStr,
                         sampl_col:AnyStr,
                         func:Callable,
                         bs:int=None,
                         rounds=5)->float:
    df_temp = pd.concat([df]*rounds, ignore_index=True).copy()
    df_temp[sampl_col] = np.random.permutation(df_temp[sampl_col].values)
    return calc_error(df=df_temp, learn=learn, dep_col=dep_col, func=func, bs=bs)


def get_columns(learn:Learner)->tuple:
    cats, cats_temp, conts, conts_temp = [], [], [], []
    proc = learn.data.processor[0]
    for prc in proc.procs:
        if (type(prc) == Categorify):
            cats_temp = prc.cat_names
        elif (type(prc) == Normalize):
            conts = prc.cont_names

    #delete _na columns
    conts_temp = [cont+'_na' for cont in conts]
    for cat in cats_temp:
        if (cat not in conts_temp):
             cats.append(cat)

    return cats, conts


def calc_feat_importance(df:DataFrame,
                         learn:Learner,
                         dep_col:AnyStr,
                         func:Callable,
                         bs:int=None,
                         rounds=5)->OrderedDict:

    base_error = calc_error(df=df, learn=learn, dep_col=dep_col, func=func, bs=bs)
    cats, conts = get_columns(learn=learn)
    importance = {}
    pbar = master_bar(cats+conts, total=len(cats+conts))
    for col in pbar:
        importance[col] = calc_error_mixed_col(df=df, learn=learn, dep_col=dep_col,
                                               sampl_col=col, func=func, bs=bs, rounds=rounds)
        _ = progress_bar(range(1), display=False, parent=pbar) #looks like fastprogress doesn't work without 2nd bar :(
    for key, value in importance.items():
        importance[key] = (value - base_error)/base_error
    return collections.OrderedDict(sorted(importance.items(), key=lambda kv: kv[1], reverse=True))


def calc_fi_custom(df:DataFrame,
                   learn:Learner,
                   dep_col:AnyStr,
                   fields:List,
                   func:Callable,
                   bs:int=None,
                   rounds=5)->OrderedDict:

    base_error = calc_error(df=df, learn=learn, dep_col=dep_col, func=func, bs=bs)
    importance = {}
    pbar = master_bar(fields, total=len(fields))
    for field in pbar:
        key = field if isinstance(field, str) else ', '.join(str(e) for e in field)
        importance[key] = calc_error_mixed_col(df=df, learn=learn, dep_col=dep_col,
                                               sampl_col=field, func=func, bs=bs, rounds=rounds)
        _ = progress_bar(range(1), display=False, parent=pbar) #looks like fastprogress doesn't work without 2nd bar :(
    for key, value in importance.items():
        importance[key] = (value - base_error)/base_error
    return collections.OrderedDict(sorted(importance.items(), key=lambda kv: kv[1], reverse=True))

def ord_dic_to_df(ord_dict:OrderedDict)->DataFrame:
    return pd.DataFrame([[k, v] for k, v in ord_dict.items()], columns=['feature', 'importance'])

def plot_importance(df:DataFrame, limit=20, asc=False):
    df_copy = df.copy()
    df_copy['feature'] = df_copy['feature'].str.slice(0,25)
    ax = df_copy.sort_values(by='importance', ascending=asc)[:limit].sort_values(by='importance', ascending=not(asc)).plot.barh(x="feature", y="importance", sort_columns=True, figsize=(10, 10))
    for p in ax.patches:
        ax.annotate(f'{p.get_width():.4f}', ((p.get_width() * 1.005), p.get_y()  * 1.005))


#     implement function that returns learner object in your notebook
#
#     For ex.
#     def build_learner_cur(df:DataFrame,
#                       bs:int,
#                       acc_func:Callable,
#                       dep_var:str,
#                       to_drop_cat:tuple=(),
#                       to_drop_cont:tuple=()):
#         cat_vars_mod = list_diff(cat_vars, to_drop_cat)
#         cont_vars_mod = list_diff(cont_vars, to_drop_cont)
#         data = (TabularList.from_df(df, path=path, cat_names=cat_vars_mod, cont_names=cont_vars_mod, procs=procs)
#                 .split_by_idx(valid_idx)
#                 .label_from_df(cols=dep_var, label_cls=FloatList, log=True)
#                 .databunch(bs=bs))
#         np.random.seed(1001)
#         learn = tabular_learner(data,
#                                 layers=p['layers'],
#                                 ps=p['layers_drop'],
#                                 emb_drop=p['emb_drop'],
#                                 y_range=y_range,
#                                 metrics=acc_func,
#                                 loss_func=MAELossFlat(),
#                                 callback_fns=[CSVLogger])
#         return learn
#

#     implement function does 1 training loop in your notebook
#
#     For ex.
#     def do_train_loop_cur(learn:Learner, cycles):
#         learn.fit_one_cycle(cyc_len=cycles, max_lr=p['max_lr'], wd=p['w_decay'])

def clear_pbar():
    # Just to clear the output. Yes, I know, I agree It's awfull should be refactored
    for _ in progress_bar(range(1), parent=None, leave=False):
        1==1

def extract_metrics_median(metrics_df:DataFrame, acc_func:Callable, bottom_X:float=0.2)->float:
    func_name = acc_func.__name__
    metr = metrics_df[func_name].to_numpy()
    subset = metr[np.argsort(metr)][-math.ceil(len(metr)*bottom_X):] if (func_name =='accuracy') else metr[np.argsort(metr)][:math.ceil(len(metr)*bottom_X)]
    metrics = np.median(subset)
    return float(metrics)

def calc_valid_acc(learn:Learner, func:Callable)->float:
    metr = learn.csv_logger.read_logged_file()
    acc = extract_metrics_median(metrics_df=metr, acc_func=func)
    return float(acc)

def calc_acc(df:DataFrame,
             bs:int,
             acc_func:Callable,
             dep_var:str,
             to_drop_cat:tuple=(),
             to_drop_cont:tuple=(),
             load_learn:str=None,
             trains:int=1,
             cycles:int=80,
             is_overall_mode:bool=None)->float:
    learn = build_learner(df=df,
                  bs=bs,
                  acc_func=acc_func,
                  dep_var=dep_var,
                  to_drop_cat=to_drop_cat,
                  to_drop_cont=to_drop_cont)
    if (load_learn is not None):
        learn = learn.load(load_learn)
    else:
        for i in range(trains):
            print(f"Train {i+1} of {trains}")
            do_train_loop(learn, cycles)
            clear_pbar()
    if (is_overall_mode is None) or (is_overall_mode == False):
        acc = calc_valid_acc(learn=learn, func=acc_func)
    else:
        acc = calc_error(df=df, learn=learn, dep_col=dep_var, func=acc_func, bs=bs)
    return acc

def calc_1_imp_relearn(base_error:float,
                       df:DataFrame,
                       bs:int,
                       acc_func:Callable,
                       dep_var:str,
                       to_drop_cat:tuple=(),
                       to_drop_cont:tuple=(),
                       load_learn:str=None,
                       trains:int=1,
                       cycles:int=80,
                       is_overall_mode:bool=None)->float:
    error = calc_acc(df, bs, acc_func, dep_var, to_drop_cat, to_drop_cont, load_learn,
                     trains=trains, cycles=cycles, is_overall_mode=is_overall_mode)
    if (acc_func.__name__ == 'accuracy'):
        base_acc, accuracy = base_error, error # Just rename for better understanding
        importance = (base_acc - accuracy)/base_acc
    else:
        importance = (error - base_error)/base_error
    return (list(to_drop_cat)+list(to_drop_cont), importance)

def print_importance_res(dropped:List, importance:float):
    print('Features '+', '.join(dropped)+' have accumulated importance of')
    print(importance)

def calc_many_imps_relearn(base_error:float,
                           df:DataFrame,
                           bs:int,
                           acc_func:Callable,
                           dep_var:str,
                           to_drop_cats:tuple=(),
                           to_drop_conts:tuple=(),
                           load_learn:str=None,
                           trains:int=1,
                           cycles:int=80,
                           is_overall_mode:bool=None)->float:

    to_drop_cats = listify(to_drop_cats)
    to_drop_conts = listify(to_drop_conts)
    importances = {}

    overall = len(list(to_drop_cats)+list(to_drop_conts))
    for i, var in enumerate(to_drop_cats):
        var = listify(var)
        print(f"Categorical feature {i+1} of {len(to_drop_cats)}")
        imp = calc_1_imp_relearn(base_error, df, bs, acc_func,
                                 dep_var=dep_var, to_drop_cat=var, trains=trains,
                                 cycles=cycles, is_overall_mode=is_overall_mode)
        key = imp[0] if isinstance(imp[0], str) else ', '.join(str(e) for e in imp[0])
        importances[key] = imp

    for i, var in enumerate(to_drop_conts):
        var = listify(var)
        print(f"Continuous feature {i+1} of {len(to_drop_conts)}")
        imp = calc_1_imp_relearn(base_error, df, bs, acc_func,
                                  dep_var=dep_var, to_drop_cont=var, trains=trains,
                                 cycles=cycles, is_overall_mode=is_overall_mode)
        key = imp[0] if isinstance(imp[0], str) else ', '.join(str(e) for e in imp[0])
        importances[key] = imp

    return importances

def calc_mean_dict(lst):
    mean_dict = {}
    ln = len(lst)
    for key, value in lst[0].items():
        mean_dict[key] = np.zeros(ln)
    for i, row in enumerate(lst):
        for key, value in row.items():
            mean_dict[key][i] = value[1]
    for key, value in mean_dict.items():
        mean_dict[key] = np.median(value)

    return mean_dict


def calc_many_imps_relearn_steps(base_error:float,
                                 df:DataFrame,
                                 bs:int,
                                 acc_func:Callable,
                                 dep_var:str,
                                 to_drop_cats:tuple=(),
                                 to_drop_conts:tuple=(),
                                 load_learn:str=None,
                                 trains=1,
                                 cycles=80,
                                 rounds=5,
                                 is_overall_mode:bool=None)->dict:
    '''
    to_drop_cats and to_drop_conts:tuple can be tupple of tuples (lists of lists)
    this means we measure every item in the first list and retrain without every item in the second one in one turn
    (treat it as one entity)
    '''
    acc = []
    for i in range(rounds):
        print(f"Round {i+1} of {rounds}")
        acc_ = calc_many_imps_relearn(base_error=base_error,
                                      df=df,
                                      bs=bs,
                                      acc_func=acc_func,
                                      dep_var=dep_var,
                                      to_drop_cats=to_drop_cats,
                                      to_drop_conts=to_drop_conts,
                                      trains=trains,
                                      cycles=cycles,
                                      is_overall_mode=is_overall_mode)
        acc.append(acc_)
    imp = calc_mean_dict(acc)
    return collections.OrderedDict(sorted(imp.items(), key=lambda kv: kv[1], reverse=True))


def calc_base_acc_steps(df:DataFrame,
                        bs:int,
                        acc_func:Callable,
                        dep_var:str,
                        trains=1,
                        cycles=80,
                        rounds=5,
                        is_overall_mode:bool=None)->float:
    base_acc=np.empty((rounds))
    for i in range(rounds):
        print(f"Round {i+1} of {rounds}")
        base_acc[i] = calc_acc(df=df, bs=bs,
                               acc_func=acc_func, dep_var=dep_var, trains=trains,
                               cycles=cycles, is_overall_mode=is_overall_mode)
    return np.median(base_acc)

def get_field_uniq_x_coef(df:DataFrame, field:str, coef:float)->list:
    '''
    This function outputs threshold to number of occurrences different variants of list of columns (fields)
    In short if coef for ex. is 0.9, then function outputs number of occurrences for all but least 10%
    of the least used
    If coef is more 1.0, then 'coef' itself is used as threshold
    '''
    if (coef > 1):
        return math.ceil(coef)
    coef = 0. if (coef < 0) else coef
    occs = df.groupby(field).size().reset_index(name="Times").sort_values(['Times'], ascending=False)
    num = math.ceil(coef*len(occs))
    if (num <= 0):
        # number of occurances is now = max_occs+1 (so it will be no items with this filter)
        return occs.iloc[0]['Times'] + 1
    else:
        return occs.iloc[num-1]['Times']


def get_part_dep_one_list(df:DataFrame,
                     learn:Learner, bs:int=None, fields:list=(), coef:float=1.0, to_int:bool=False,
                     dep_name:str=None, is_sorted:bool=True)->DataFrame:
    '''
    Function calculate partial dependency for column in fields.
    Fields is a list of lists of what columns we want to test. The inner items are treated as connected fields.
    For ex. fields = [['Store','StoreType']] mean that Store and StoreType is treated as one entity
    (it's values are substitute as a pair, not as separate values)
    coef is useful when we don't want to deal with all the variants, but only with most common
    '''
    NAN_SUBST = '###na###'
    CONT_COLS = get_cont_cols(learn)
    if (dep_name is None):
        dep_name = 'dep_var'

    fields = listify(fields)
    df = apply_fill(df=df, learn=learn)

    #divide cont variables into groups
    if is_in_list(values=fields, in_list=CONT_COLS):
        for col in which_elms(values=fields, in_list=CONT_COLS):
            edges = np.histogram_bin_edges(a=df[col].dropna(), bins='auto')
            for x,y in zip(edges[::],edges[1::]):
                df.loc[(df[col] > x) & (df[col] < y), col] = (x+y)/2

    field_min_occ = get_field_uniq_x_coef(df=df, field=fields, coef=coef)
    df[fields] = df[fields].fillna(NAN_SUBST) #to treat None as a separate field
    occs = df.groupby(fields).size().reset_index(name="Times").sort_values(['Times'], ascending=False)
    occs[fields] = occs[fields].replace(to_replace=NAN_SUBST, value=np.nan) #get back Nones from NAN_SUBST
    df[fields] = df[fields].replace(to_replace=NAN_SUBST, value=np.nan) #get back Nones from NAN_SUBST
    occs = occs[occs['Times'] >= field_min_occ]
    df_copy = df.merge(occs[fields]).copy()

    frame = []
    ln = len(occs)
    if (ln > 0):
        pbar = master_bar(occs.iterrows(), total=ln)
        for _, row in pbar:
            # We don't need to do df_copy = df.merge(occs[field]).copy() every time
            # as every time we change the same column (set of columns)
            record = []
            pb = progress_bar(fields, display=False, parent=pbar)
            for fld in pb:
                df_copy[fld] = row[fld]
            preds = get_cust_preds(df=df_copy, learn=learn, bs=bs)
            preds = np.exp(np.median(preds)) if (hasattr(learn.data, 'log') and learn.data.log) else np.median(preds)
            pred = int(preds) if to_int else preds
            for fld in fields:
                record.append(row[fld])
            record.append(pred)
            record.append(row['Times'])
            frame.append(record)
    out = pd.DataFrame(frame, columns=fields+[dep_name, 'times'])
    median = out[dep_name].median()
    out[dep_name] /= median
    if (is_sorted == True):
        out = out.sort_values(by=dep_name, ascending=False)
    return out

def get_cat_cols(learn:Learner,  is_wo_na=True)->List:
    '''
    Just outputs category fields from LabelLists object
    '''
    catf = None
    result = []
    proc = learn.data.processor[0]
    for prc in proc.procs:
        if (type(prc) == Categorify):
            catf = prc
    if (catf is not None):
        result = [c for c in catf.cat_names if ((is_wo_na is not None) and (is_wo_na == True) and (c[-3:] != "_na"))]
    return result


def get_cont_cols(learn:Learner)->List:
    '''
    Just outputs continuous fields from LabelLists object
    '''
    norm = None
    result = []
    proc = learn.data.processor[0]

    for prc in proc.procs:
        if (type(prc) == Normalize):
            norm = prc

    if (norm is not None):
        result = norm.cont_names

    return result


def get_part_dep(df:DataFrame, learn:Learner, bs:int=None,
                 fields:tuple=None, coef:float=1.0, to_int:bool=False,
                 dep_name:str=None, is_sorted:bool=True)->List:
    '''
    Makes a datafreme with partial dependencies for every categorical variable in df
    '''
    result = []
    if (fields is None):
        fields = get_cat_cols(learn=learn) + get_cont_cols(learn=learn)

    for field in fields:
        new_df = get_part_dep_one_list(df=df, learn=learn, bs=bs, fields=field, to_int=to_int,
                                       dep_name=dep_name, coef=coef, is_sorted=is_sorted)
        new_df['feature'] = str(field)
        if is_listy(field):
            new_df['value'] = new_df[field].values.tolist()
            new_df.drop(columns=field, inplace=True)
        else:
            new_df = new_df.rename(index=str, columns={str(field): "value"})
        result.append(new_df)
    clear_pbar()
    result = pd.concat(result, ignore_index=True, sort=True)
    result = result[['feature', 'value', dep_name, 'times']]

    return result

def build_correlation_matr(df:DataFrame):
    '''
    Build Spearman rank-order correlation matrix
    NA in df should be fixed before pass here
    '''
    corr = np.round(scipy.stats.spearmanr(df).correlation, 4)
    corr[np.isnan(corr)] = 0.0
    np.fill_diagonal(corr, 1.0)
    return corr

def plot_dendrogram_corr(corr_matr, columns, figsize=None, leaf_font_size=16):
    '''
    Plots dendrogram for a given correlation matrix
    '''
    if (figsize is None):
        figsize = (15, 0.02*leaf_font_size*len(columns))
    corr_condensed = hc.distance.squareform(1-corr_matr)
    z = hc.linkage(corr_condensed, method='average')
    fig = plt.figure(figsize=figsize)
    dendrogram = hc.dendrogram(z, labels=columns, orientation='left', leaf_font_size=leaf_font_size)
    plt.show()

def plot_dendrogram(df:DataFrame, figsize=None, leaf_font_size=16):
    corr = build_correlation_matr(df)
    plot_dendrogram_corr(corr_matr=corr, columns=df.columns, figsize=figsize, leaf_font_size=leaf_font_size)

def cramers_corrected_stat(confusion_matrix):
    """ calculate Cramers V statistic for categorial-categorial association.
        uses correction from Bergsma and Wicher,
        Journal of the Korean Statistical Society 42 (2013): 323-328
    """
    chi2 = scipy.stats.chi2_contingency(confusion_matrix)[0]
    if (chi2 == 0):
        return 0.0
    n = confusion_matrix.sum().sum()
    phi2 = chi2/n
    r,k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min( (kcorr-1), (rcorr-1)))

def get_cramer_v_matr(df:DataFrame)->np.ndarray:
    '''
    Calculate Cramers V statistic for every pair in df's columns
    '''
    cols = list(df.columns)
    corrM = np.zeros((len(cols), len(cols)))
    pbar = master_bar(list(itertools.combinations(cols, 2)))
    for col1, col2 in pbar:
        _ = progress_bar(range(1), parent=pbar) #looks like fastprogress doesn't work without 2nd bar :(
        idx1, idx2 = cols.index(col1), cols.index(col2)
        corrM[idx1, idx2] = cramers_corrected_stat(pd.crosstab(df[col1], df[col2]))
        corrM[idx2, idx1] = corrM[idx1, idx2]
    np.fill_diagonal(corrM, 1.0)
    return corrM

def get_top_corr_df(df:DataFrame, corr_thr:float=0.8, corr_matr:array=None)->DataFrame:
    if (corr_matr is not None):
        corr = corr_matr
    else:
        corr = build_correlation_matr(df=df)
    corr = np.where(abs(corr)<corr_thr, 0, corr)
    idxs = []
    for i in range(corr.shape[0]):
        if (corr[i, :].sum() + corr[:, i].sum() > 2):
            idxs.append(i)
    cols = df.columns[idxs]
    return pd.DataFrame(corr[np.ix_(idxs, idxs)], columns=cols, index=cols)

def get_top_corr_dict_corrs(top_corrs:DataFrame)->OrderedDict:
    cols = top_corrs.columns
    top_corrs_np = top_corrs.to_numpy()
    corr_dict = {}
    for i in range(top_corrs_np.shape[0]):
        for j in range(i+1, top_corrs_np.shape[0]):
            if (top_corrs_np[i, j] > 0):
                corr_dict[cols[i]+' vs '+cols[j]] = np.round(top_corrs_np[i, j], 3)
    return collections.OrderedDict(sorted(corr_dict.items(), key=lambda kv: abs(kv[1]), reverse=True))

def get_top_corr_dict(df:DataFrame, corr_thr:float=0.8, corr_matr:array=None)->OrderedDict:
    '''
    Outputs top pairs of correlation in a given dataframe with a given correlation matrix
    Filters output mith minimal correlation of corr_thr
    '''
    top_corrs = get_top_corr_df(df, corr_thr, corr_matr)
    return get_top_corr_dict_corrs(top_corrs)

def get_classes_o_list(learn:Learner):
    procs = learn.data.processor[0]
    return procs.classes

def get_rev_emb_idxs(learn:Learner)->dict:
    classes_dict = get_classes_o_list(learn=learn)
    return {c:i for i, (c, _) in enumerate(classes_dict.items()) if (c[-3:] != "_na")}


def get_emb_outp(learn:Learner, field:str, inp:str, rev_emb_idxs:dict, classes, embs):
    emb = embs[rev_emb_idxs[field]]
    idx, = np.where(classes[field] == inp)
    if (len(idx) == 1):
        cat_idx = idx[0]
    else:
        cat_idx = 0
    return emb(torch.tensor(cat_idx, device=learn.data.device))


def get_embs_map(learn:Learner)->OrderedDict:
    '''
    Output embedding vector for every item of every cafegirical column as a dictionary of dicts

    '''
    cat_cols = get_cat_cols(learn=learn, is_wo_na=True)
    rev_emb_idxs = get_rev_emb_idxs(learn=learn)
    classes = get_classes_o_list(learn=learn)
    embs = learn.model.embeds
    learn.model.eval();
    result = OrderedDict()

    for cat in cat_cols:
        cat_res = OrderedDict()
        for val in classes[cat]:
            cat_res[val] = get_emb_outp(learn=learn,
                                        field=cat, inp=str(val),
                                        rev_emb_idxs=rev_emb_idxs,
                                        classes=classes, embs=embs)
        result[cat] = cat_res

    return result


def emb_map_reduce_dim(embs_map:OrderedDict, outp_dim:int=3, to_df:bool=True, method:str='pytorch', exclude:list=None):
    '''
    Reduces dimention of embedding map upto outp_dim
    Can use 'pytorch' approach (pca)
    or 'scilearn' for manifold.TSNE (longer, but not sure that it is better)
    '''
    exclude = listify(exclude)
    result = OrderedDict()
    for feat, val in embs_map.items():
        reformat = []
        names = []
        for k,v in val.items():
            reformat.append(v)
            names.append(k)
        reformat = torch.stack(reformat)
        if (exclude is not None) and (feat in exclude):
            continue
        if (method == 'scilearn'):
            tsne = manifold.TSNE(n_components=outp_dim, init='pca')
            reduced = tsne.fit_transform(to_np(reformat))
        else:
            reduced = reformat.pca(outp_dim)
        record = OrderedDict({k:v for k, v in zip(names, reduced)})
        result[feat] = record

    if (to_df == True):
        data = []
        for feat, val in result.items():
            for k,v in val.items():
                dt = list(v) if (method == 'scilearn') else list(to_np(v))
                data.append([feat] + [k]  + dt)
        names = ['feature', 'value'] + ['axis_' + str(i) for i in range(outp_dim)]
        result = pd.DataFrame(data, columns=names)

    return result


def add_times_col(embs_map:DataFrame, df:DataFrame)->DataFrame:
    '''
    Adds to embeddings map dataframe new column with times of value's number of occurrences
    Usefull for estimation of how accurate the value is (more time means more sure you can be)
    '''
    times = np.zeros(len(embs_map))
    last_feat = ''
    vc = None
    for i, (f, v) in enumerate(zip(embs_map['feature'], embs_map['value'])):
        if (f != last_feat):
            vc = df[f].value_counts(dropna=False)
            vc.index = vc.index.map(str)
            last_feat = f
        if (v != '#na#'):
            times[i] = vc[v]
        else:
            times[i] = vc['nan'] if ('nan' in vc.index) else 0
    result = embs_map.copy()
    result['times'] = times
    return result

# Little helpers for saving/loading variables with pickle
def sv_var(var, name, path):
    f = open(path/f"{name}.pkl","wb")
    pickle.dump(var, f)
    f.close()

def ld_var(name, path):
    f = open(path/f"{name}.pkl","rb")
    var = pickle.load(f)
    f.close()
    return var

def plot_2d_emb(emb_map:DataFrame, feature:str, top_x:int=10):
    sub_df = emb_map.query(f"feature == '{feature}'").sort_values('times', ascending=False).head(top_x)
    X = sub_df['axis_0']
    Y = sub_df['axis_1']
    plt.figure(figsize=(15, 8))
    plt.scatter(X, Y)
    for name, x, y in zip(sub_df['value'], X, Y):
        plt.text(x, y, name, color=np.random.rand(3)*0.7, fontsize=11)
    plt.show()