In [1]:
# default_exp codexplainer.utils

In [21]:
# export

import pandas as pd
import numpy as np

from typing import Tuple, List, Optional, Dict, Set

In [10]:
# export
#Logging configuration
# Route records from the root logger to 'mylog.log' (append mode) at INFO level.

import logging
import os

logger = logging.getLogger()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Re-running this cell (or re-importing the exported module) must not stack a
# second FileHandler on the root logger, otherwise every record would be
# written to the file once per execution. Reuse the handler if it is there.
_log_path = os.path.abspath('mylog.log')
fhandler = next(
    (handler for handler in logger.handlers
     if isinstance(handler, logging.FileHandler) and handler.baseFilename == _log_path),
    None,
)
if fhandler is None:
    fhandler = logging.FileHandler(filename='mylog.log', mode='a')
    logger.addHandler(fhandler)

fhandler.setFormatter(formatter)
logger.setLevel(logging.INFO)

## Module codexplainer.utils

> Module providing utilities for handling and organizing data

In [24]:
# export

def get_tabulardata_distances_data(data: List,
                                   div_nm: Optional[str]='JS-Divergence',
                                   dist_nm: Optional[str]='JS-Distance') -> Tuple:
    """
    Build one table of distances and one table of divergences from experiment data

    Every experiment contributes one row to each table and every dimension
    found in an experiment becomes a column.

    :param data: List with one mapping per experiment, keyed by dimension; each
                 value is indexed with dist_nm and div_nm
    :param div_nm: Optional[str] key under which the divergence is stored
    :param dist_nm: Optional[str] key under which the distance is stored

    :return: Tuple[pd.DataFrame, pd.DataFrame] -> (distances, divergences)
    """
    distance_rows, divergence_rows = [], []

    for run in data:
        # Read both metrics of each dimension in a single pass (distance first).
        pairs = {dim: (vals[dist_nm], vals[div_nm]) for dim, vals in run.items()}
        distance_rows.append({dim: pair[0] for dim, pair in pairs.items()})
        divergence_rows.append({dim: pair[1] for dim, pair in pairs.items()})

    return pd.DataFrame(distance_rows), pd.DataFrame(divergence_rows)

In [13]:
# export

def get_data_stats_dataframe(stats_data: List,
                             measures: Optional[List[str]]=['mean']) -> Dict:
    """
    Get a set of dataframes for the stats data gathered across all experiments
    Available for metrics & error analysis

    One dataframe is produced per measure: each experiment is a row and each
    metric found in that experiment is a column.

    :param stats_data: List containing experiments data with metrics stats;
                       each experiment maps a metric to its stats, and the
                       stats are indexed by measure
    :param measures: List[str] containing all the measures to consider
                     - ['mean'] considered by default (also when None is given)

    :return: Dictionary containing pd.DataFrames for all the specified measures
    """
    # The annotation allows None; treat it as "use the default measure"
    # instead of failing while iterating over it.
    if measures is None:
        measures = ['mean']

    records_by_measure = {measure: [] for measure in measures}

    for experiment in stats_data:
        for measure in measures:
            records_by_measure[measure].append(
                {metric: stats[measure] for metric, stats in experiment.items()}
            )

    return {measure: pd.DataFrame(rows) for measure, rows in records_by_measure.items()}

In [18]:
# export

def clean_dict_dataset_nans(data: Dict, measures: Optional[List[str]]=['mean']):
    """
    Replace NaN values by the mean of their column

    :param data: Dict containing all "raw" dataframes for corresponding measures
    :param measures: List[str] with the measures (keys of data) to clean
                     - ['mean'] considered by default (also when None is given)

    :return: Dictionary containing the cleaned dataframe for each requested
             measure; the dataframes in data are not modified
    """
    # The annotation allows None; treat it as "use the default measure"
    # instead of failing while iterating over it.
    if measures is None:
        measures = ['mean']

    return {
        measure: data[measure].fillna(data[measure].mean())
        for measure in measures
    }

In [26]:
# export

def clean_dataset_nans(data: pd.DataFrame):
    """
    Replace NaN values of a dataframe by the mean of their column

    :param data: pd.DataFrame possibly containing NaN values

    :return: pd.DataFrame with the NaN values filled in; data is not modified
    """
    column_means = data.mean()
    return data.fillna(column_means)

In [20]:
# export

def integrate_missing_error_dims(df: pd.DataFrame, dimensions: Set):
    """
    Pad a dataframe with zero-filled columns so that it covers every given
    dimension, standardizing the dimensions for comparisons

    :param df: pd.DataFrame whose columns are the dimensions already present
    :param dimensions: Set with all the dimensions the result must contain

    :return: pd.DataFrame copy of df with a column of zeros added for every
             dimension it lacked; df itself is left untouched
    """
    existing = list(df.columns)
    n_rows = len(df)

    padded_df = df.copy()
    for missing_dim in (d for d in dimensions if d not in existing):
        padded_df[missing_dim] = np.zeros(n_rows)

    return padded_df

In [27]:
# Export step: the cell output below lists the notebooks that were converted.
from nbdev.export import notebook2script
notebook2script()

Converted 0.1_mgmnt.prep.ipynb.
Converted 0.2_mgmnt.prep.files_mgmnt.ipynb.
Converted 0.3_mgmnt.prep.bpe_tokenization.ipynb.
Converted 0.4_mgmnt.prep.tokenization_counting.ipynb.
Converted 0.5_mgmnt.prep.token_mgmnt.ipynb.
Converted 1.1_exp.info.ipynb.
Converted 1.2_exp.desc.metrics.java.ipynb.
Converted 1.4_exp.metrics_python.ipynb.
Converted 1.5_exp.metrics_java.ipynb.
Converted 2.0_repr.codebert.ipynb.
Converted 2.0_repr.i.ipynb.
Converted 2.1_repr.codeberta.ipynb.
Converted 2.1_repr.roberta.train.ipynb.
Converted 2.2_repr.roberta.eval.ipynb.
Converted 2.3_repr.word2vec.train.ipynb.
Converted 2.6_repr.word2vec.eval.ipynb.
Converted 2.7_repr.distmetrics.ipynb.
Converted 2.8_repr.sentence_transformers.ipynb.
Converted 3.1_traceability.unsupervised.eda.ipynb.
Converted 3.2_traceability.unsupervised.approach.d2v.ipynb.
Converted 3.2_traceability.unsupervised.approach.w2v.ipynb.
Converted 4.0_infoxplainer.ir.ipynb.
Converted 4.1_infoxplainer.ir.unsupervised.d2v.ipynb.
Converted 4.2_infox