#### Examples of how to use GCsnap modules in a Jupyter notebook

In [None]:
# Presently, the exit() calls from GCsnap cause kernel interruptions in Jupyter notebooks.
import os

from gcsnap.configuration import Configuration
from gcsnap.parallel_tools import ParallelTools

from gcsnap.assemblies import Assemblies

# Create the config for testing; this uses the default configuration.
# `config` is shared by the cells below.
config = Configuration()
# Start the cluster; mainly important when using Dask, as the workers need to be started.
parallel = ParallelTools(config)
# These examples run on one node with a Dask LocalCluster, as specified in the configuration.


# Example targets given in a mix of identifier formats: 'NAME_SPECIES'-style entry
# names, purely numeric IDs, 'WP_'-prefixed accessions, 'UniRef'-prefixed cluster
# IDs and versioned protein accessions.
# NOTE(review): the exact ID standard of each group is inferred from its format only - confirm.
targets_all = ['R8WYQ6_9ENTR','Q6D966_PECAS','H2J113_RAHAC','I8TT21_9FIRM','C5S0I8_9PAST','D5DC35_PRIM3','G8LK95_9ENTR',
                'C6D8X9_PECCP','B2VEG9_ERWT9','X4ZG39_9BACL','H2J0C3_RAHAC','I0QQ93_9GAMM','B7L4Z9_ECO55','W7P5B4_9ENTR',
                'K8WTQ9_9GAMM','A0A9J9GIQ1_ENT38','A0A837FI48_9ENTR','A0A1Q6B4T0_ECOLX','A0A376DCF6_ECOLX','A0A3T0R815_ECOLX',
                '57208299','9733087','57292271','34290897','45716609','57332559','61349497','76522546','78506826','57906279',
                '69058994','56891587','76607510','15144448','61037681','76392646','64207687','45565517','66961501','83586024',
                'WP_008915955.1','WP_001538355.1','WP_013371219.1','WP_013448625.1','WP_025421344.1','WP_013084448.1','WP_004955457.1',
                'WP_004851226.1','WP_023335015.1','WP_009653630.1','WP_010732931.1','WP_006121662.1','WP_020439301.1','WP_012770342.1',
                'WP_004166538.1','WP_013574938.1','WP_070367577.1','WP_000291547.1','WP_001313400.1','WP_072833029.1','UniRef100_U1N0X1',
                'UniRef50_W9BCH3','UniRef50_E3EIU6','UniRef50_A7ML49','UniRef100_I3I4H6','UniRef50_P16552','UniRef50_C9XXN2','UniRef90_K7ZYB1',
                'UniRef100_A7Z937','UniRef50_A0A0C7KCH0','UniRef100_J1F361','UniRef100_I0QML3','UniRef90_J3A509','UniRef50_A9MER2','UniRef90_K6Y668',
                'UniRef100_A0A0M2F4Z0','UniRef100_A0A0L7AG82','UniRef100_A0A8T3L1L2','UniRef100_A0AAE8CJK3','UniRef90_A0A0L7TEX5',
                'EFK22699.1','AEX54717.1','AGB81004.1','ADG39479.1','AAU25613.1','AGK98652.1','ACZ77400.1','CCJ74421.1','EUK18068.1',
                'EHD21217.1','ACS85132.1','GAC24184.1','EWG74888.1','AHF73891.1','OIZ68276.1','ADW71651.1','ADW75107.1','KGA35409.1',
                'SEE95312.1','KPD02558.1']
# Small 'WP_'-only subset (all four also appear in targets_all); it is passed to the
# direct mapping-database lookup below with field = 'RefSeq'.
targets_refseq = ['WP_001538355.1','WP_013371219.1','WP_013448625.1','WP_025421344.1']


**Mapping**  
Code to demonstrate the use of the SequenceMapping class to map targets from different ID standards.

In [None]:
# With the class
from gcsnap.mapping import SequenceMapping
# Build the mapper from the shared config and the full mixed-format target list.
mapping = SequenceMapping(config, targets_all)
mapping.run()
# The result is kept in a notebook-level variable because the Assemblies cell
# further down takes it as input.
targets_and_ncbi_codes = mapping.get_targets_and_ncbi_codes()
print(targets_and_ncbi_codes)

In [None]:
# Directly from the database: you need to specify the path to the database and the field you want to search for.
# NOTE(review): the path below is a hard-coded absolute path; adapt it to your own installation.
# NOTE(review): the name `db_handler` is reassigned to a different handler in the Assemblies database cell.
from gcsnap.db_handler_uniprot_mappings import UniprotMappingsDBHandler
db_handler = UniprotMappingsDBHandler('/scicore/home/schwede/GROUP/gcsnap_db/db')
# as tuples
tuples = db_handler.select(targets_refseq, field = 'RefSeq')
print(tuples)
# or directly as a dataframe
df = db_handler.fetch_records_as_dataframe(targets_refseq, field = 'RefSeq')
print(df)

**Assemblies**  
Code to demonstrate the use of the Assemblies class to find and parse assemblies.

In [None]:
# With the class
from gcsnap.genomic_context import GenomicContext
# `gc` is reused by the Sequences cell further down.
gc = GenomicContext(config)
# Assemblies is already imported in the first cell; the import is repeated here.
from gcsnap.assemblies import Assemblies
# Input: the mapping result produced by the SequenceMapping cell above.
assemblies = Assemblies(config, targets_and_ncbi_codes)
assemblies.run()
flanking_genes = assemblies.get_flanking_genes()
print(flanking_genes)
# NOTE(review): get_flanking_genes() is called a second time here; presumably it
# returns the same object as `flanking_genes` - confirm before reusing the variable.
gc.update_syntenies(assemblies.get_flanking_genes())

In [None]:
# Directly from the database: you need to specify the path to the database.
# NOTE(review): the path below is a hard-coded absolute path; adapt it to your own installation.
from gcsnap.db_handler_assemblies import AssembliesDBHandler
db_handler = AssembliesDBHandler('/scicore/home/schwede/GROUP/gcsnap_db/db')
# Same four accessions as in targets_refseq, defined in the first cell.
refseqID = ['WP_001538355.1','WP_013371219.1','WP_013448625.1','WP_025421344.1']
# the assembly accessions
code_and_accessions = db_handler.select(refseqID) # table='mappings' is the default
print(code_and_accessions)
# The second element of each returned row is used as the assembly accession.
accessions = [x[1] for x in code_and_accessions]
# the assembly information
info = db_handler.select(accessions, table='assemblies')
print(info)


**Sequences**  
Code to demonstrate the use of the Sequences class to find and parse sequences.

In [None]:
# Input from the cells before: `config` (first cell) and `gc` (Assemblies cell),
# so those cells must have been run first.
from gcsnap.sequences import Sequences
sequences = Sequences(config, gc)
sequences.run()
# Feed the retrieved sequences back into the shared GenomicContext object.
gc.update_syntenies(sequences.get_sequences())

In [None]:
# Example codes to look up directly in the sequence database.
ncbi_codes = ['WP_008843544.1', 'WP_008843542.1', 'WP_008843541.1', 'WP_052154931.1', 
             'WP_008843539.1', 'WP_008843538.1', 'WP_008843537.1', 'WP_008843536.1', 'WP_153886269.1']
# directly from the database
# NOTE(review): the path below is a hard-coded absolute path; adapt it to your own installation.
from gcsnap.db_handler_sequences import SequenceDBHandler
sequences_db = SequenceDBHandler('/scicore/home/schwede/GROUP/gcsnap_db/db')
# NOTE(review): `tuples` overwrites the variable of the same name from the mapping-database cell.
tuples = sequences_db.select(ncbi_codes)
print(tuples)

In [None]:
import sqlite3
from contextlib import closing

# Inspect the raw SQLite file: list its tables, print each table's schema and
# then up to 100 sample rows per table.
# A sqlite3 connection used as a context manager only wraps a transaction and
# does not close the connection, so closing() is used to make sure the database
# file is released even if one of the queries fails.
with closing(sqlite3.connect('/scicore/home/schwede/GROUP/gcsnap_db/db/sequences.db')) as conn:
    cursor = conn.cursor()
    # List all tables; each row is a 1-tuple holding the table name.
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()
    print(tables)

    # The table names come from sqlite_master itself (not from user input), so
    # they are interpolated into the statements directly.
    for table in tables:
        print(f"\nSchema for table {table[0]}:")
        cursor.execute(f"PRAGMA table_info({table[0]});")
        # table_info rows: index 1 is the column name, index 2 the declared type.
        for column in cursor.fetchall():
            print(f"  Column: {column[1]}, Type: {column[2]}")

    for table in tables:
        print(f"\nSample data from table {table[0]}:")
        cursor.execute(f"SELECT * FROM {table[0]} LIMIT 100;")
        for row in cursor.fetchall():
            print(row)

In [None]:
import gzip
# Read one gzipped protein FASTA file and show how its entries are split into
# the sequence ID and the sequence string. `os` is imported in the first cell.
file = os.path.join('/scicore/home/schwede/GROUP/gcsnap_db/refseq/data','GCF_002831425.1_ASM283142v1_protein.faa.gz')
# Use a separate name for the open handle so that `file` keeps holding the path
# instead of being rebound to a (closed) file object.
with gzip.open(file, 'rt', encoding='utf-8') as faa_handle:
    content = faa_handle.read()
lines = content.splitlines()
# Join all lines with the marker '$%' so that the header line and the sequence
# lines of one entry can be told apart again after splitting on '>'.
content = '$%'.join(lines)
# split that string to extract each sequence id
# EFB12766.1 hypothetical protein PANDA_022614, partial [Ailuropoda melanoleuca]WSDGHLIYYDDQTRQSVEDKVHMPVDCINIRTGHECRGT
# the first is an empty result
entries = content.split('>')[1:]
# Print only a short preview instead of dumping every entry of the file.
print(len(entries), 'entries; first two:')
print(entries[:2])
for entry in entries[:2]:
    # split the info from the actual sequence str
    entry_split = entry.split('$%')
    sequence = ''.join(entry_split[1:])
    # split off the organism name, which is given in [] at the end of the header
    info_split = entry_split[0].split('[')
    # orgname is same for the assembly, put it into assembly table
    #orgname = info_split[-1].replace(']','')
    # the sequence ID is the first whitespace-separated token of the header
    ncbi_code = info_split[0].split(' ')[0]
    print(ncbi_code,sequence)




In [None]:

for entry in entries:
    # split the info from the actual sequence str
    # `entries` was built from a string whose lines were joined with '$%', so it
    # contains no '\n'; splitting on '\n' left the whole entry in one piece and
    # made `sequence` always empty. Split on the '$%' marker instead.
    entry_split = entry.split('$%')
    sequence = ''.join(entry_split[1:])
    # split off the organism name, which is given in [] at the end of the header
    info_split = entry_split[0].split('[')
    # orgname is same for the assembly, put it into assembly table
    #orgname = info_split[-1].replace(']','')
    # the sequence ID is the first whitespace-separated token of the header
    ncbi_code = info_split[0].split(' ')[0]