#### GCsnap Example Notebook
This notebook contains some examples of how to use the GCsnap modules, as well as some code that needs to be run in advance.

##### Generate the parquet files for the mapping in advance

In [4]:
# This works here because we are in the same folder as the scripts. Once built, it should work outside this folder.
from gcsnap.mapping import SequenceMapping
from gcsnap.configuration import Configuration

# Generate a Configuration instance with all arguments read from config.yaml.
config = Configuration()

In [5]:
# Build the SequenceMapping instance from the configuration and the
# idmapping_selected.tab file.
mapping = SequenceMapping(
    config,
    '/scicore/home/schwede/GROUP/gcsnap_db/mappings/idmapping_selected.tab',
)

# Create the parquet partitions ahead of time (see the section heading:
# the parquet files for the mapping are generated in advance).
mapping.create_parquet_partitions()

Output()

In [12]:
# Test the databases
# NOTE(review): AssemblyDB is imported but not used anywhere in this cell, and the
# following cell imports AssembliesDBHandler from the same module instead --
# confirm which name gcsnap.db_handler_assemblies actually exports.
from gcsnap.db_handler_assemblies import AssemblyDB

# Per the original comment this is meant to get a list from NCBI codes.
# NOTE(review): `mapping` is the SequenceMapping instance created in an earlier cell;
# confirm it exposes a `.mapping` attribute providing get_targets_and_ncbi_codes().
# The return value is not assigned, and the saved notebook shows no output for this cell.
mapping.mapping.get_targets_and_ncbi_codes() 




In [7]:

# Look up the assemblies for a handful of NCBI protein codes and check whether
# the GFF annotation file of the first assembly is available locally.
from gcsnap.db_handler_assemblies import AssembliesDBHandler
import os

# Root folder of the local GCsnap databases; all paths below are derived from
# it so the location only has to be changed in one place.
GCSNAP_DB_ROOT = '/scicore/home/schwede/GROUP/gcsnap_db'

ncbi_codes = ['ERG67661.1', 'MBG2912773.1','STQ80048.1','PUX20837.1','WP_008915955.1','WP_004166538.1','WP_012770342.1','WP_020439301.1']

assembly_db = AssembliesDBHandler(os.path.join(GCSNAP_DB_ROOT, 'ncbi_db'))

# get the assembly accessions for the ncbi codes (from default table 'mapping');
# the accession is taken from position 1 of every returned tuple
result_tuples = assembly_db.select(ncbi_codes)
ass_accessions = [element[1] for element in result_tuples]

# get the assembly file links for the accessions
assemblies = assembly_db.select(ass_accessions, table = 'assemblies')

# Fail with a clear message instead of a bare "list index out of range"
# when none of the codes could be resolved to an assembly.
if not assemblies:
    raise IndexError('no assembly records found for the given NCBI codes')

# get the first file: position 1 of an assembly record is its URL (see the
# saved output), and the last URL component is the file name prefix
ass_file = os.path.basename(assemblies[0][1]) + '_genomic.gff.gz'
print(assemblies)
print(ass_file)

# search the file: names starting with 'GCA' are looked up in the 'genbank'
# folder, everything else in 'refseq'
db = 'genbank' if ass_file.startswith('GCA') else 'refseq'
data_path = os.path.join(GCSNAP_DB_ROOT, db, 'data')

# check file exists
if os.path.exists(os.path.join(data_path, ass_file)):
    print('exists')
else:
    print('file not found')








[('GCA_000416965.1', 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/416/965/GCA_000416965.1_23.K55', 1345023, 'Exiguobacterium chiriqhucha RW-2'), ('GCA_001922905.1', 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/001/922/905/GCA_001922905.1_ASM192290v1', 413502, 'Cronobacter turicensis'), ('GCA_015722865.1', 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/015/722/865/GCA_015722865.1_ASM1572286v1', 626774, 'Proteus terrae subsp. cibarius'), ('GCA_900451105.1', 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/900/451/105/GCA_900451105.1_31180_B01', 569, 'Hafnia alvei'), ('GCF_000314895.2', 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/314/895/GCF_000314895.2_ASM31489v2', 1141660, 'Providencia sneebia DSM 19967'), ('GCF_021498305.1', 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/021/498/305/GCF_021498305.1_ASM2149830v1', 82996, 'Serratia plymuthica'), ('GCF_029823335.1', 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/029/823/335/GCF_029823335.1_ASM2982333v1', 1254, 'Pediococcus acidilact

In [20]:
from gcsnap.db_handler_uniprot_mappings import UniprotMappingsDBHandler

# handler for the UniProt mappings, pointed at the mappings folder
mapping_db = UniprotMappingsDBHandler('/scicore/home/schwede/GROUP/gcsnap_db/mappings')

# fetch every record, then report the record count and preview the first five
result = mapping_db.select_all()
print(len(result))
print(result[:5])

# position 3 of each record is used as the RefSeq id;
# entries 1000-1009 serve as a small sample to query with
list_refseq = [record[3] for record in result][1000:1010]

# query the sample by RefSeq id, printing the raw select() result ...
result = mapping_db.select(list_refseq, field='RefSeq')
print(result)

# ... and the same query fetched as a dataframe
df = mapping_db.fetch_records_as_dataframe(list_refseq, field='RefSeq')
print(df)







30000
[('Q6GZX4', '001R_FRG3G', '2947773', 'YP_031579.1', None, 'UPI00003B0FD4', '654924', 'AAT09660.1', None), ('Q6GZX3', '002L_FRG3G', '2947774', 'YP_031580.1', None, 'UPI00003B0FD5', '654924', 'AAT09661.1', None), ('Q197F8', '002R_IIV3', '4156251', 'YP_654574.1', None, 'UPI0000D83464', '345201', 'ABF82032.1', None), ('Q197F7', '003L_IIV3', '4156252', 'YP_654575.1', None, 'UPI0000D83465', '345201', 'ABF82033.1', None), ('Q6GZX2', '003R_FRG3G', '2947775', 'YP_031581.1', None, 'UPI00003B0FD6', '654924', 'AAT09662.1', None)]
[('P26670', '3BHS_VACCW', '3707700', 'YP_233052.1', None, 'UPI0000124ED4', '10254', 'AAA48311.1; BAA01818.1; AAO89449.1', None), ('Q89187', '3BHS_VAR67', '1486530', 'NP_042199.1', None, 'UPI00000F6E4F', '587200', 'CAA49096.1', None), ('Q9Y3L3', '3BP1_HUMAN', '23616', 'NP_061830.3', '4J9D:B; 4J9D:D; 4J9D:F; 4J9F:B; 4J9F:D; 4J9F:F', 'UPI000004EE00', '9606', 'CAG30462.1; BAC85842.1; -; AAH08282.1; CAB75671.2', 'ENSG00000100092.24'), ('P55194', '3BP1_MOUSE', '20401', 'N