## First, we import the functions we want to use and load our default configuration file (named 'config', located in this same directory)

In [None]:
import configparser

from utilities.execution_modes import ExecutionType
import utilities.fileutils as fileutils

# Name of the configuration file; a relative name like this one is resolved
# against the current working directory.
configFilePath = 'config'

# Parse the file once so the following cells can look settings up by section
# and key.
# NOTE: ConfigParser.read() silently skips files it cannot open, so a missing
# 'config' file only surfaces later as a KeyError on the first lookup.
config = configparser.ConfigParser()
config.read(configFilePath)


## Next, we set the input directory (where CSV files are read from) and the output directory (where our CSV output is written to)

In [None]:
# Both directories are taken from the DEFAULT section of the parsed config.
default_section = config['DEFAULT']
# input directory
file_directory = default_section['input']
# output directory
output_path = default_section['output']

# Echo the two values so they can be checked before any data is processed.
print("input directory is: {}\noutput directory is: {}".format(file_directory, output_path))

## We can set the names of the protein sequence column and the protein group accessions column below

In [None]:
# Both column names are taken from the "DATA DESCRIPTOR" section of the config.
data_descriptor = config["DATA DESCRIPTOR"]
# name of the protein sequence column
sequence_column_name = data_descriptor["sequenceColumn"]
# name of the protein group accessions column
add_column_name = data_descriptor["addColumn"]

# Echo the two names (quoted, so stray whitespace is visible).
print("sequence column name is : \"{}\"".format(sequence_column_name))
print("protein descriptor column name is : \"{}\"".format(add_column_name))

## We set the list of regular expressions we want to use below.
## Separate them with a comma and a newline for readability!

In [None]:
# Regular expressions to apply; add more entries to this list as needed.
# Reference patterns:
#   HLA-A11: r'\b.[VIFY][MLFYIA]\w+[LIYVF].[KR]\b'
#   HLA-A24: r'\b.[YF]\w+[LFI]\b'
# Keep one pattern per line, each followed by a comma (including the last):
# two adjacent string literals without a comma are silently concatenated into
# a single pattern.
regex_list = [
    r'^[FL]',
    r'\b.[VIFY][MLFYIA]\w+[LIYVF].[KR]\b',
]

In [None]:
# Hand the regex list, the input directory and the two column names to the
# fileutils helper and unpack its three results as all / some / none
# (presumably the rows matching all, some, or none of the patterns -- confirm
# against utilities.fileutils.create_anchor_match_dataframes).
# NOTE(review): the name `all` shadows Python's built-in all() for the rest of
# the session; it is kept unchanged because the cell that writes the CSV files
# refers to it by this name.
all, some, none = fileutils.create_anchor_match_dataframes(
    regex_list,
    file_directory,
    sequence_column_name,
    add_column_name,
)
# Every label is followed by a space so it does not run into the printed value
# (the "all" label previously had none).
print("returned dataframes were: all {}\nsome {}\nnone {}".format(all, some, none))


In [None]:
    # NOTE(review): every statement in this cell is indented one level with no
    # enclosing block -- confirm that this is intentional.
    print("writing output to {}".format(output_path))
    # Write each result to output_path under its own name, in the order
    # all / some / none.
    for frame, file_stem in (
            (all, "sieved_compilation_problem_2_all"),
            (some, "sieved_compilation_problem_2_some"),
            (none, "sieved_compilation_problem_2_none"),
    ):
        fileutils.write_dataframe_to_csv_with_path(frame, output_path, file_stem)