In [1]:
# default_exp mgmnt.prep.files_mgmnt

In [4]:
# export

import pandas as pd
import numpy as np

from pathlib import Path, PosixPath
from typing import List

In [1]:
# export
#Logging configuration

import logging
# getLogger() without a name returns the root logger, so the INFO threshold
# set here applies process-wide, not only to this module.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

## files_mgmnt

> Module to handle loading of data sources (e.g., CSV, JSONL) and related files (e.g., NumPy serialized arrays).

> @Alvaro May 20th 2021

In [6]:
# export

def _check_file_existence(file_path: str) -> bool:
    """
    Reports whether something exists at the given location.

    :param file_path: Location to look up on the file system
    :return: True when the location exists; False otherwise. In the latter
             case an error is logged, but no exception is raised.
    """
    found = Path(file_path).exists()
    if not found:
        logging.error('Provided file cannot be found.')
    return found

def _check_dir_existence(path: Path) -> None:
    """
    Validates the existence of a given directory.

    Only existence is checked: a regular file located at ``path`` also passes.

    :param path: Location expected to exist on the file system
    :raises FileNotFoundError: If nothing exists at ``path``. The error is
        logged before being raised. ``FileNotFoundError`` is a subclass of
        ``Exception``, so callers handling the generic ``Exception`` still
        catch it.
    """
    if not path.exists():
        msg = "Provided directory cannot be found."
        logging.error(msg)
        raise FileNotFoundError(msg)
    

In [None]:
# export

def get_file_name(full_dir: str):
    """
    Extracts the name of the file a path points to, without its last extension.

    :param full_dir: Path (absolute or relative) to a file
    :return: Final path component with its last suffix removed,
             e.g. 'data/run.tar.gz' -> 'run.tar'
    """
    final_component = Path(full_dir).name
    return Path(final_component).stem

In [7]:
# export

def get_files_list(directory: str, file_extension: str) -> List[Path]:
    """
    Get a list of files (with a specific extension) within a directory.
    The search is recursive: files inside nested sub-directories are included.

    :param directory: Directory to extract list of files
    :param file_extension: File extension of files to include in the list,
                           given with or without leading dot ('csv' or '.csv')

    :return: List of ``Path`` objects for the files within the directory with
             the provided extension, in the order yielded by the file system
    :raises Exception: If the directory does not exist (raised by
                       ``_check_dir_existence``)
    """
    path = Path(directory)
    _check_dir_existence(path)

    # Strip leading dot(s) so that '.csv' does not turn into the pattern
    # '*..csv', which would silently match (almost) nothing.
    extension = file_extension.lstrip('.')
    return list(path.glob(f'**/*.{extension}'))

In [30]:
# export

def jsonl_list_to_dataframe(file_list: List[str]) -> pd.DataFrame:
    """
    Reads several gzip-compressed jsonl files into one DataFrame.

    :param file_list: Locations of the jsonl.gz files, read in the given order
    :return: Concatenation of the per-file DataFrames; each file keeps its
             own row index (indices are not renumbered)
    """
    frames = []
    for gz_file in file_list:
        frame = pd.read_json(gz_file, orient='records', compression='gzip', lines=True)
        frames.append(frame)
    return pd.concat(frames, sort=False)

def jsonl_to_dataframe(file_path: str) -> pd.DataFrame:
    """
    Gets a DataFrame from a jsonl file
    :param file_path: Location of the jsonl file (a path on the local file system)
    :return: DataFrame built from the records of the file, one JSON object per line
    :raises FileNotFoundError: If nothing exists at ``file_path``
    """
    # Act on the result of the check instead of discarding it. Without this,
    # a missing file still reaches pandas, and pandas versions that accept
    # literal JSON strings try to parse the path text itself, which ends in
    # an unrelated parsing error.
    if not _check_file_existence(file_path):
        raise FileNotFoundError(f"Provided file cannot be found: {file_path}")
    return pd.read_json(file_path, orient='records', lines=True)

In [4]:
# export

def csv_to_dataframe(file_path: str) -> pd.DataFrame:
    """
    Gets a DataFrame from a csv file

    :param file_path: Location of the csv file (a path on the local file system)
    :return: DataFrame with the contents of the csv file
    :raises FileNotFoundError: If nothing exists at ``file_path``
    """
    # Act on the result of the check instead of discarding it, so a missing
    # file stops here with a message that names the offending path.
    if not _check_file_existence(file_path):
        raise FileNotFoundError(f"Provided file cannot be found: {file_path}")
    return pd.read_csv(file_path)

In [5]:
# export

def load_np_vectors(path: str) -> np.ndarray:
    """
    Loads vectors previously serialized with numpy.

    :param path: Location of the .npy file to be loaded

    :return: Np array corresponding to the loaded vectors
    :raises FileNotFoundError: If nothing exists at ``path``. The error is
        logged before being raised. ``FileNotFoundError`` is a subclass of
        ``Exception``, so callers handling the generic ``Exception`` still
        catch it.
    """
    # Use a separate name rather than rebinding the `path` argument
    vectors_file = Path(path)
    if not vectors_file.exists():
        msg = "Vectors could not be found"
        logging.error(msg)
        raise FileNotFoundError(msg)
    return np.load(str(vectors_file))

In [None]:
# export

def get_vector_paths_4_sample_set(set_name: str, base_path: str) -> List[Path]:
    """
    Gets the vector files for a directory containing a set of samples.

    Every sub-directory of ``<base_path>/<set_name>`` is treated as one sample
    and searched recursively for a file matching ``*-ft_vecs.npy``. One path
    is collected per sample; samples without such a file are skipped with a
    warning. Entries of the set directory that are not directories are ignored.

    :param set_name: Str indicating the name of the directory for a given set of samples
    :param base_path: Str indicating the location directory of samples
    :return: One vectors file path per sample that has one, ordered by
             sample directory
    :raises FileNotFoundError: If ``<base_path>/<set_name>`` does not exist
    """
    set_dir = Path(f"{base_path}/{set_name}")

    # Fail with an explicit, logged error instead of relying on iterdir()
    if not set_dir.exists():
        msg = f"Sample set directory cannot be found: {set_dir}"
        logging.error(msg)
        raise FileNotFoundError(msg)

    paths = []
    # Sorted so the result does not depend on the file-system listing order
    for sample_directory in sorted(set_dir.iterdir()):
        if not sample_directory.is_dir():
            # Stray files next to the sample directories are not samples
            continue

        # Sorted so the chosen file is deterministic if several match
        matches = sorted(sample_directory.rglob('*-ft_vecs.npy'))
        if not matches:
            logging.warning(f"Could not load vectors for sample {sample_directory}")
            continue

        paths.append(matches[0])

    return paths

In [9]:
# Run nbdev's notebook export; the cell output lists every notebook it
# converted (presumably writing the cells marked `# export` to modules).
from nbdev.export import notebook2script
notebook2script()

Converted 0.1_mgmnt.prep.ipynb.
Converted 0.2_mgmnt.prep.files_mgmnt.ipynb.
Converted 0.3_mgmnt.prep.bpe_tokenization.ipynb.
Converted 0.4_mgmnt.prep.tokenization_counting.ipynb.
Converted 1.1_exp.info.ipynb.
Converted 1.2_exp.desc.metrics.java.ipynb.
Converted 1.4_exp.metrics_python.ipynb.
Converted 1.5_exp.metrics_java.ipynb.
Converted 2.0_repr.codebert.ipynb.
Converted 2.0_repr.i.ipynb.
Converted 2.1_repr.codeberta.ipynb.
Converted 2.1_repr.roberta.train.ipynb.
Converted 2.2_repr.roberta.eval.ipynb.
Converted 2.3_repr.word2vec.train.ipynb.
Converted 2.6_repr.word2vec.eval.ipynb.
Converted 2.7_repr.distmetrics.ipynb.
Converted 2.8_repr.sentence_transformers.ipynb.
Converted 3.1_traceability.unsupervised.eda.ipynb.
Converted 3.2_traceability.unsupervised.approach.d2v.ipynb.
Converted 3.2_traceability.unsupervised.approach.w2v.ipynb.
Converted 4.0_infoxplainer.ir.ipynb.
Converted 4.1_infoxplainer.ir.unsupervised.d2v.ipynb.
Converted 4.2_infoxplainer.ir.unsupervised.w2v.ipynb.
Converted