In [None]:
'''
File import for .parquet, .tsv and .csv files
At the moment a mix of .parquet, .tsv and .csv files will also be combined into one dataframe.

Providing a column mapping is optional. If no column mapping is provided, the default mapping will be used 

input: 
dir_path: path to the directory containing the files
file_types: list of file types to be imported
column_mapping: dictionary passed to DataFrame.rename; keys are the column names as they appear in the files, values are the names they are renamed to

defaults: 
dir_path: 'data/'
file_types: ['.parquet', '.tsv', '.csv']
column_mapping: {'modified_sequence': 'modified_sequence', 'precursor_charge': 'precursor_charge', 'precursor_intensity': 'precursor_intensity'}

output: 
df: dataframe containing the imported data
'''
def combine_files_into_df(dir_path='data/', file_types=['.parquet', '.tsv', '.csv'], column_mapping=None):
    dfs = []
    
    if column_mapping is None:
        column_mapping = {
            'modified_sequence': 'modified_sequence',
            'precursor_charge': 'precursor_charge',
            'precursor_intensity': 'precursor_intensity'
        }
    
    for file in os.listdir(dir_path):
        if any(file.endswith(file_type) for file_type in file_types):
            file_path = os.path.join(dir_path, file)
            
            if file.endswith('.parquet'):
                df = pd.read_parquet(file_path, engine='fastparquet')
            elif file.endswith('.tsv'):
                df = pd.read_csv(file_path, sep='\t')
            elif file.endswith('.csv'):
                df = pd.read_csv(file_path)
            else:
                raise ValueError(f'File type {file_type} not supported')
            
            # Rename columns based on the provided mapping
            df = df.rename(columns=column_mapping)
            dfs.append(df)

    df = pd.concat(dfs, ignore_index=True)
    return df