In [9]:
import polars as pl 
import re
import yaml 
from utils.process import get_specie


# Read the project configuration from env.yaml.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere without editing it.
with open('/home/simon.herman/Bureau/Gits/Elongates/env.yaml', 'r') as f:
    yaml_data = yaml.safe_load(f)

# Merge every mapping stored under 'Regex' into one flat dict; when the same
# key appears in several mappings, the last one encountered wins.
species_dict = dict(
    pair
    for mapping in yaml_data['Regex'].values()
    for pair in mapping.items()
)

# Entries registered under the 'Scer' key.
re_dict = yaml_data['Regex']['Scer']
species_order = yaml_data['Species_order']['Scer']
def _split_cluster_id(identifier):
    """Split '<seq_id>-cluster_n<digits>' into (seq_id, cluster); (None, None) if it does not match."""
    found = re.match(r'(.*?)-(cluster_n\d+)', identifier)
    if found is None:
        return None, None
    return found.group(1), found.group(2)


def parse_blast_dataframe(row : tuple) -> tuple:
    """Annotate one BLAST hit row with cluster and species information.

    Parameters
    ----------
    row : tuple
        One row of the BLAST table. Only positions 0-11 are read, in this
        order: query id, subject id, evalue, qstart, qend, sstart, send,
        qseq, sseq, length, blast_gapopen, blast_gaps. Both ids are expected
        to look like ``<seq_id>-cluster_n<digits>``.

    Returns
    -------
    tuple
        17 values: the query and subject sequence ids with the cluster suffix
        removed, the ten remaining BLAST fields unchanged, the query cluster,
        the subject cluster, a 0/1 flag telling whether both hits belong to
        the same cluster, and the species returned by ``get_specie`` for the
        query and for the subject.

    Notes
    -----
    When an id does not match the expected layout, its sequence id and cluster
    are ``None``. The same-cluster flag is then 0: two unparsed ids are not
    treated as members of a common cluster.
    """
    # Indexing (rather than slicing) keeps the IndexError on rows that are too short.
    blast_fields = tuple(row[position] for position in range(12))
    query_id, subject_id = blast_fields[0], blast_fields[1]

    query_seq_id, query_cluster = _split_cluster_id(query_id)
    subject_seq_id, subject_cluster = _split_cluster_id(subject_id)

    # A missing cluster (None) never counts as a match, even against another None.
    is_same_cluster = int(query_cluster is not None and query_cluster == subject_cluster)

    # NOTE(review): get_specie receives None when an id did not match the
    # pattern; confirm that it tolerates that input.
    q_specie = get_specie(re_dict, query_seq_id)
    s_specie = get_specie(re_dict, subject_seq_id)

    return (query_seq_id, subject_seq_id, *blast_fields[2:],
            query_cluster, subject_cluster,
            is_same_cluster, q_specie, s_specie)


In [11]:
# Names for the twelve fields of the headerless, tab-separated BLAST table.
columns = [
    "qseqid", "sseqid", "evalue",
    "qstart", "qend", "sstart", "send",
    "qseq", "sseq", "length",
    "blast_gapopen", "blast_gaps",
]
nuc_blast = pl.read_csv("~/Bureau/test.blast", separator="\t", has_header=False)
nuc_blast.columns = columns

# Append five "NA" placeholder columns so the frame already carries the names
# of the extra values that parse_blast_dataframe returns for each row.
nuc_blast = nuc_blast.with_columns(
    [
        pl.lit("NA").alias(placeholder)
        for placeholder in ('query_cluster', 'subject_cluster', 'same_cluster', 'q_specie', 's_specie')
    ]
)

# Remember the full list of names, parse every row, then put the names back
# on the frame produced by the row-wise apply.
columns = nuc_blast.columns
nuc_blast = nuc_blast.apply(parse_blast_dataframe)
nuc_blast.columns = columns


In [12]:
# Display the parsed BLAST table (the last expression of a cell is rendered).
nuc_blast

qseqid,sseqid,evalue,qstart,qend,sstart,send,qseq,sseq,length,blast_gapopen,blast_gaps,query_cluster,subject_cluster,same_cluster,q_specie,s_specie
str,str,f64,i64,i64,i64,i64,str,str,i64,i64,i64,str,str,i64,str,str
"""rna-NM_0011817…","""rna-XM_0339114…",3.8600e-133,3,342,356,689,"""GAACCGTTCGCATT…","""GAACCGTCCGCATG…",341,4,8,"""cluster_n6542""","""cluster_n6542""",1,"""Scer_NCBI""","""Spar_NCBI"""
"""Skud_9.62""","""rna-XM_0339110…",3.3700e-26,1,161,171,327,"""TGGAAAGAACGAAA…","""TGGAAAGAACGAAA…",164,8,10,"""cluster_n744""","""cluster_n744""",1,"""Skud""","""Spar_NCBI"""
"""Sbay_10.162""","""Sbay_10.163""",1.5600e-41,1,87,819,905,"""ATGTTCAAAGAACC…","""ATGTTCAAAGAACC…",87,0,0,"""cluster_n6775""","""cluster_n4451""",0,"""Sbay""","""Sbay"""
"""rna-NM_0011842…","""rna-NM_0011843…",3.3700e-20,1,48,2736,2783,"""ATGTCAGAAGATCA…","""ATGTCAGAAGATCA…",48,0,0,"""cluster_n5063""","""cluster_n3439""",0,"""Scer_NCBI""","""Scer_NCBI"""
"""rna-NM_0011842…","""Skud_13.4""",4.3600e-19,3,48,51,96,"""GTCAGAAGATCAAA…","""GTCAGAAGATCAAA…",46,0,0,"""cluster_n5063""","""cluster_n5063""",1,"""Scer_NCBI""","""Skud"""
"""rna-XM_0339089…","""rna-NM_0011843…",3.3700e-20,1,48,2736,2783,"""ATGTCAGAAGATCA…","""ATGTCAGAAGATCA…",48,0,0,"""cluster_n5063""","""cluster_n3439""",0,"""Spar_NCBI""","""Scer_NCBI"""
"""rna-XM_0339089…","""Skud_13.4""",4.3600e-19,3,48,51,96,"""GTCAGAAGATCAAA…","""GTCAGAAGATCAAA…",46,0,0,"""cluster_n5063""","""cluster_n5063""",1,"""Spar_NCBI""","""Skud"""
"""rna-XM_0339107…","""rna-NM_0011843…",3.3700e-20,1,48,2736,2783,"""ATGTCAGAAGATCA…","""ATGTCAGAAGATCA…",48,0,0,"""cluster_n5063""","""cluster_n3439""",0,"""Spar_NCBI""","""Scer_NCBI"""
"""rna-XM_0339107…","""Skud_13.4""",4.3600e-19,3,48,51,96,"""GTCAGAAGATCAAA…","""GTCAGAAGATCAAA…",46,0,0,"""cluster_n5063""","""cluster_n5063""",1,"""Spar_NCBI""","""Skud"""
"""rna-NM_0011803…","""Skud_4.258""",3.2900e-115,71,462,463,854,"""CCTTTTCTTCCTTA…","""CCTTGTTTTCCTTA…",395,2,6,"""cluster_n2561""","""cluster_n2561""",1,"""Scer_NCBI""","""Skud"""


In [13]:
# `cov` names both the output sub-directory and the file prefix of the table.
# NOTE(review): presumably a coverage threshold; confirm against the pipeline.
cov = 0.5001
nter_elongates_data = pl.read_csv(
    f"output/{cov}/{cov}_elongates.csv",
    infer_schema_length=5000,  # look at up to 5000 rows when inferring dtypes
)

In [14]:
# Keep the rows flagged is_max_Nter == 1 whose max_Nter is greater than 15.
(
    nter_elongates_data
    .filter(pl.col("is_max_Nter") == 1)
    .filter(pl.col("max_Nter") > 15)
)

cluster_size,cluster_name,seq_id,species,sequence,sequence_length,Nter_dashes,Cter_dashes,max_Nter,max_Cter,Nter_elongate,Nter_gaps,Nter_gap_openings,Nter_nb_aa,Nter_elongate_length,Cter_elongate,Cter_gaps,Cter_gap_openings,Cter_nb_aa,Cter_elongate_length,Nter_ratio,Cter_ratio,is_max_Nter,is_max_Cter,is_min_Nter,is_min_Cter,Meth_after_Nter,Nter_event_ID,Cter_event_ID,Nter_events,Cter_events
i64,str,str,str,str,i64,i64,i64,i64,i64,str,i64,i64,i64,i64,str,i64,i64,i64,i64,f64,f64,i64,i64,i64,i64,str,i64,i64,i64,i64
4,"""cluster_n5821""","""rna-NM_0011823…","""Scer_NCBI""","""MNIDCLCRWVVLPL…",330,0,4,71,4,"""MNIDCLCRWVVLPL…",0,0,71,71,,0,0,0,0,0.215152,0.0,1,0,0,1,"""0""",2,1,1,0
4,"""cluster_n5821""","""rna-XM_0339122…","""Spar_NCBI""","""MNIDCLCHWIVLPL…",330,0,4,71,4,"""MNIDCLCHWIVLPL…",0,0,71,71,,0,0,0,0,0.215152,0.0,1,0,0,1,"""0""",2,1,1,0
4,"""cluster_n5821""","""Sarb_12G04260.…","""Sarb""","""MNIDFLCHWVVLPI…",330,0,0,71,4,"""MNIDFLCHWVVLPI…",0,0,71,71,"""GHIE""",0,0,4,4,0.215152,0.012121,1,1,0,0,"""0""",2,1,1,0
4,"""cluster_n2254""","""Sarb_07G00270.…","""Sarb""","""MIAKNHAAMGEKRK…",299,0,1,68,1,"""MIAKNHAAMGEKRK…",0,0,68,68,,0,0,0,0,0.227425,0.0,1,0,0,1,"""0""",3,1,2,0
4,"""cluster_n6542""","""rna-NM_0011817…","""Scer_NCBI""","""MEPFAFGRGAPALC…",1401,0,0,115,0,"""MEPFAFGRGAPALC…",0,0,115,115,,0,0,0,0,0.082084,0.0,1,1,0,1,"""1""",2,1,1,0
4,"""cluster_n706""","""Sarb_13G00390.…","""Sarb""","""MSSQKIDLTKLNPE…",163,0,0,38,0,"""MSSQKIDLTKLNPE…",0,0,38,38,,0,0,0,0,0.233129,0.0,1,1,0,1,"""1""",2,1,1,0
4,"""cluster_n706""","""rna-NM_0011824…","""Scer_NCBI""","""MSSQKIDLTKLNPE…",163,0,0,38,0,"""MSSQKIDLTKLNPE…",0,0,38,38,,0,0,0,0,0.233129,0.0,1,1,0,1,"""1""",2,1,1,0
4,"""cluster_n706""","""rna-XM_0339123…","""Spar_NCBI""","""MSSQKIDLTKLNPE…",163,0,0,38,0,"""MSSQKIDLTKLNPE…",0,0,38,38,,0,0,0,0,0.233129,0.0,1,1,0,1,"""1""",2,1,1,0
4,"""cluster_n2666""","""Skud_3.127""","""Skud""","""MRMIYKKVSSGRGR…",696,0,0,36,0,"""MRMIYKKVSSGRGR…",0,0,36,36,,0,0,0,0,0.051724,0.0,1,1,0,1,"""1""",2,1,1,0
4,"""cluster_n744""","""Skud_9.62""","""Skud""","""MERTKTKKNKTCKE…",986,0,0,55,0,"""MERTKTKKNKTCKE…",0,0,55,55,,0,0,0,0,0.055781,0.0,1,1,0,1,"""1""",2,1,1,0
