In [118]:
import polars as pl 
import numpy as np
cov = 0.50001


elongates = pl.read_csv(f"output/{cov}/{cov}_elongates.csv", infer_schema_length =10000)



In [148]:

from utils.handle_UTRs import translate_frames
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import numpy as np

def custom_target_elongate(cluster, scer_length, seq_id, specie, elongate_length, side, genome_dict, gff_dict, coeff = 1): 


    coordinates = list()
    result_dict = dict()

    if side != "Nter" and side != "Cter":

        raise ValueError("Side must be either Nter or Cter")
    
    gff = gff_dict[specie].filter(

        (pl.col("Type") == "CDS") & ((pl.col("Name") == seq_id) | (pl.col("Parent") == seq_id))

    )[["Start","End","Strand","Seqid"]] # Keep only necessary columns
    
    # Store datas necessary to compute the elongate sequence

    strand = gff[0]["Strand"].to_list()[0] # + or -
    strand_id = gff[0]["Seqid"].to_list()[0] # chromosome or scaffold id

    for row in gff.iter_rows(named=True): # Named = True to iter with column names

        coordinates.append(sorted((int(row['Start'])-1, int(row['End'])-1))) # -1 for python indexing
        
    coordinates = sorted(coordinates, key=lambda x: x[0]) # Sort coordinates by start position


    custom_elongate_length = int(np.ceil((scer_length - elongate_length) * 3 * coeff)) # Get the length of the elongate sequence in nucleotides

    if strand == "+":

        if side == "Nter":

            start_5 = coordinates[0][0]-custom_elongate_length if coordinates[0][0]-custom_elongate_length >= 0 else 0 # Get the start position of the 5' UTR
            
            elongate = genome_dict[specie][strand_id]["seq"][

                start_5:coordinates[0][0]

            ] 

        elif side == "Cter": # Useless check but it's for the sake of clarity
        
            end_3 = coordinates[-1][1]+1+custom_elongate_length if coordinates[-1][1]+1+custom_elongate_length <= genome_dict[specie][strand_id]["len"] else genome_dict[specie][strand_id]["len"] # Get the end position of the 3' UTR
            # +1 for -1,1 because GFF points to the last nucleotide of the stop codon

            elongate = genome_dict[specie][strand_id]["seq"][
                coordinates[-1][1]+1:end_3
                ] # Get the 3' sequence

    
    # Reverse complement if the strand is negative, don't forget to reverse the coordinates
    if strand == "-":

        if side == "Nter":

            end_5 = coordinates[-1][1]+1+custom_elongate_length if coordinates[-1][1]+1+custom_elongate_length <= genome_dict[specie][strand_id]["len"] else genome_dict[specie][strand_id]["len"] # Get the start position of the 5' UTR

            elongate = genome_dict[specie][strand_id]["seq"][
                coordinates[-1][1]+1:end_5
            ].reverse_complement() # Get the 5' sequence
        

        if side == "Cter":
        
            start_3 = coordinates[0][0]-custom_elongate_length if coordinates[0][0]-custom_elongate_length >= 0 else 0 # Get the end position of the 3' UTR

            elongate = genome_dict[specie][strand_id]["seq"][
                start_3:coordinates[0][0]
            ].reverse_complement() # Get the 3' sequence


    result_dict["nuc"] = SeqRecord(seq = Seq(elongate), id = f"{seq_id}", description = "") 

    # def translate_frames(dna_sequence, specie, seq_id, length, utr, cluster)

    result_dict["prot"] = translate_frames(dna_sequence = elongate, specie = specie, seq_id = seq_id, length = custom_elongate_length, utr = side, cluster = cluster)

    return result_dict
    
    




In [137]:
def custom_scer_elongates(scer_id : str, specie : str, scer_length : int, scer_elongate : str or Seq, infos_dict, cluster, gff_dict, genome_dict, side):

    coordinates = list()
    result_dict = dict()


    ########
    # Get infos from gff file 
    ########

    if side != "Nter" and side != "Cter":

        raise ValueError("Side must be either Nter or Cter")
    
    gff = gff_dict[specie].filter(

        (pl.col("Type") == "CDS") & ((pl.col("Name") == scer_id) | (pl.col("Parent") == scer_id))

    )[["Start","End","Strand","Seqid"]] # Keep only necessary columns
    
    # Store datas necessary to compute the elongate sequence

    strand = gff[0]["Strand"].to_list()[0] # + or -
    strand_id = gff[0]["Seqid"].to_list()[0] # chromosome or scaffold id

    for row in gff.iter_rows(named=True): 

        coordinates.append(sorted((int(row['Start'])-1, int(row['End'])-1))) # -1 for python indexing
        
    coordinates = sorted(coordinates, key=lambda x: x[0]) # Sort coordinates by start position

    ########
    # Check that input data are correct
    ########

    scer_elongate = scer_elongate.replace("-","")

    if len(scer_elongate) != scer_length:

        print(f"Seq id : {scer_id}")
        print(f"Cluster : {cluster}")
        raise ValueError("Mismatch between theorical and real length of the Scer peptidic elongate")

    for subject_id, subject_length in infos_dict.items():

        result_dict[subject_id] = dict()
        custom_length = (scer_length-subject_length)*3

        if strand == "+":

            if side == "Nter":
                
               nuc_elongate = genome_dict[specie][strand_id]["seq"][

                    coordinates[0][0]:coordinates[0][0] + custom_length # Get the 5' sequence

                ] 

            elif side == "Cter": # Useless check but it's for the sake of clarity
            

                nuc_elongate = genome_dict[specie][strand_id]["seq"][
                    
                    coordinates[-1][1]-(custom_length):coordinates[-1][1]] # Get the 3' sequence

    
        # Reverse complement if the strand is negative, don't forget to reverse the coordinates
        if strand == "-":

            if side == "Nter":


                nuc_elongate = genome_dict[specie][strand_id]["seq"][
                    coordinates[-1][1]-(custom_length):coordinates[-1][1]
                ].reverse_complement() # Get the 5' sequence
            

            elif side == "Cter":
            
                nuc_elongate = genome_dict[specie][strand_id]["seq"][
                    coordinates[0][0]:coordinates[0][0] + (custom_length)
                ].reverse_complement() # Get the 3' sequence



        result_dict[subject_id]["nuc"] = SeqRecord( seq = Seq(nuc_elongate), id = f"{scer_id}", description = "")
        result_dict[subject_id]["prot"] = SeqRecord( seq = Seq(scer_elongate[:int(custom_length/3)]), id = f"{scer_id}", description = "")
    
    return result_dict

In [144]:
import yaml
# Load your species regex from the yaml file
with open('/home/simon.herman/Bureau/Gits/Elongates/env.yaml', 'r') as f:
    yaml_data = yaml.safe_load(f)
    species_order = yaml_data['Species_order']['Scer']

seuil = 4

Nter_scer_conditions = ((pl.col("species") == "Scer_NCBI") & (abs(pl.col("max_Nter") - pl.col("Nter_nb_aa")) < seuil) & (pl.col("max_Nter") >= 10))
Cter_scer_conditions = ((pl.col("species") == "Scer_NCBI") & (abs(pl.col("max_Cter") - pl.col("Cter_nb_aa")) < seuil) & (pl.col("max_Cter") >= 10))
test = elongates.filter(Nter_scer_conditions | Cter_scer_conditions)["cluster_name"].unique().to_list()

nter_clusters = []
cter_clusters = []
i = 0

for cluster,sequences in elongates.filter(pl.col("cluster_name").is_in(test)).groupby("cluster_name"):

    species = sequences["species"].to_list()

    if len(species) < 3:
            
        continue


    species_ordered = [ s for s in species_order if s in species]
    
    if species_ordered[-1] != "Scer_NCBI" and species.count(species_ordered[-1]) > 1:
            
        continue

    if sequences.filter(pl.col("species") == species_ordered[-1])["Nter_nb_aa"].to_list()[0] < seuil and sequences["max_Nter"].max() >= 10:
    
        nter_clusters.append(cluster)

    if sequences.filter(pl.col("species") == species_ordered[-1])["Cter_nb_aa"].to_list()[0] < seuil and sequences["max_Cter"].max() >= 10:
    
        cter_clusters.append(cluster)



full_filtered_Nter = elongates.filter(pl.col("cluster_name").is_in(nter_clusters))
full_filtered_Cter = elongates.filter(pl.col("cluster_name").is_in(cter_clusters))



In [120]:
import gff3_parser
from utils.files import multifasta_to_dict
gff_dict = dict()
genome_dict = dict()

for specie in species_order:

    gff_dict[specie] = pl.from_pandas(gff3_parser.parse_gff3(f"input/{specie}.gff", parse_attributes = True, verbose = False))
    genome_dict[specie] = multifasta_to_dict(f"input/{specie}.fna", genome = True)

In [145]:
import os 
import shutil
from Bio import SeqIO

current_path = f"/home/simon.herman/Bureau/Gits/Elongates/work/{cov}"



dataframes = {
    "Nter": full_filtered_Nter,
    "Cter": full_filtered_Cter,
}

if os.path.exists(f"{current_path}/local_align_files"):

    shutil.rmtree(f"{current_path}/local_align_files")

os.mkdir(f"{current_path}/local_align_files")

for side in ["Nter", "Cter"]:

    df = dataframes[side]

    os.mkdir(f"{current_path}/local_align_files/{side}")

    for cluster, sequences in df.groupby("cluster_name"):

        scer_length = sequences.filter(pl.col("species") == "Scer_NCBI")[f"{side}_nb_aa"].max() # Maybe several Scer sequences in the cluster, we take the longest elongate
        
        if scer_length < 10:
            continue

        scer_id = sequences.filter((pl.col("species") == "Scer_NCBI") & (pl.col(f"{side}_nb_aa") == scer_length))["seq_id"].to_list()[0]
        os.mkdir(f"{current_path}/local_align_files/{side}/{cluster}")
     
        infos_for_scer_sequences = dict()
        for sequence in sequences.iter_rows(named = True): 

            if scer_length - sequence[f"{side}_nb_aa"] >= 10:


                os.mkdir(f"{current_path}/local_align_files/{side}/{cluster}/{sequence['seq_id']}")
                os.mkdir(f"{current_path}/local_align_files/{side}/{cluster}/{sequence['seq_id']}/nucleotide")
                os.mkdir(f"{current_path}/local_align_files/{side}/{cluster}/{sequence['seq_id']}/protein")


                dict_ = custom_target_elongate(cluster, scer_length, sequence["seq_id"], sequence["species"], sequence[f"{side}_nb_aa"], side, genome_dict, gff_dict)

            
                
                SeqIO.write(dict_["nuc"], f"{current_path}/local_align_files/{side}/{cluster}/{sequence['seq_id']}/nucleotide/{sequence['seq_id']}.fna", "fasta")
                
                SeqIO.write([ seq for seq in dict_["prot"].values()], f"{current_path}/local_align_files/{side}/{cluster}/{sequence['seq_id']}/protein/{sequence['seq_id']}.faa", "fasta")
                
                infos_for_scer_sequences[sequence["seq_id"]] = sequence[f"{side}_nb_aa"] 
                
        
        scer_cds = sequences.filter(pl.col("seq_id") == scer_id)
        
        scer_elongates = custom_scer_elongates(scer_id = scer_id, specie = "Scer_NCBI", scer_length = scer_length, 
                                                    scer_elongate = scer_cds[f"{side}_elongate"].to_list()[0], infos_dict = infos_for_scer_sequences, 
                                                    cluster = cluster, gff_dict = gff_dict, genome_dict = genome_dict, side = side)


        for subject_sequence in scer_elongates.keys():

            SeqIO.write(scer_elongates[subject_sequence]["nuc"], f"{current_path}/local_align_files/{side}/{cluster}/{subject_sequence}/nucleotide/{scer_id}_custom.fna", "fasta")
            SeqIO.write(scer_elongates[subject_sequence]["prot"], f"{current_path}/local_align_files/{side}/{cluster}/{subject_sequence}/protein/{scer_id}_custom.faa", "fasta")

In [146]:
import os
import subprocess
import polars as pl
from polars import NoDataError
from Bio import SeqIO
import re

# Fields: query id, subject id, % identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score
dtypes = [pl.Utf8, pl.Utf8, pl.Float64, pl.Int64, pl.Int64, pl.Int64, pl.Int64, pl.Int64, pl.Int64, pl.Int64, pl.Float64, pl.Float64]
dfs = list()
trunc_files = open(f"output/{cov}/trunc_files.txt", "a")
current_path = f"/home/simon.herman/Bureau/Gits/Elongates/work/{cov}/local_align_files"
i = 0
for side in os.listdir(current_path):

    for cluster in os.listdir(f"{current_path}/{side}"):

        for subject_seq in os.listdir(f"{current_path}/{side}/{cluster}"):

            for condition in ["nucleotide","protein"]:

                root = f"{current_path}/{side}/{cluster}/{subject_seq}/{condition}/"
                if os.path.exists(output_csv):
                    os.remove(output_csv)

                to_align = os.listdir(f"{current_path}/{side}/{cluster}/{subject_seq}/{condition}")

                query_index = next((i for i, item in enumerate(to_align) if "custom" in item))
                subject_index = not query_index # List length 2 in every scenario

                query = os.path.join(root, to_align[query_index])
                subject = os.path.join(root, to_align[subject_index])      
                output_csv = os.path.join(root,"temp.tsv")      
                
                if condition == "nucleotide":
                    command = f"lalign36 -3 -E 1000 -m 8 {query} {subject} > {output_csv}"
                elif condition == "protein":
                    command = f"lalign36 -p -3 -E 1000 -m 8 {query} {subject} > {output_csv}"
                
                subprocess.run(command, shell=True)
                try:
                    df = pl.read_csv(output_csv, separator="\t", has_header=False, dtypes = dtypes)
                except:
                    trunc_files.write(f"{query}\n{subject_index}\n----------------\n")
                    NoDataError()

                if condition == "nucleotide":
                    df = df.sort("column_11", descending = False).head(1).with_columns(
                        Condition = pl.lit(condition),
                        Side = pl.lit(side))

                if condition == "protein":
                    tmp_ = []
                    for frame, align_res in df.groupby("column_2"):
                        tmp_.append(align_res.sort("column_11", descending = False).head(1).with_columns(
                            Condition = pl.lit(condition),
                            Side = pl.lit(side)))
                    df = pl.concat(tmp_)

                dfs.append(df)
                os.remove(output_csv)
                i += 1
                
        if i > 20:
            break

    if i > 20:
        break


                                    
                
df = pl.concat(dfs)
trunc_files.close()


df = pl.concat(dfs)
df = df.with_columns(

    pl.lit("NA").alias('subject'),
    pl.lit("NA").alias('relative_frame'),
    pl.lit("NA").alias('q_specie'),
    pl.lit("NA").alias('s_specie')
)


                
                

In [147]:
df.select("column_2").to_series().to_list()

['rna-XM_033908579.1-cluster_n3567-Nter-f2-53',
 'rna-XM_033908579.1-cluster_n3567-Nter-f1-53',
 'rna-XM_033908579.1-cluster_n3567-Nter-f0-53',
 'rna-XM_033908579.1-cluster_n3567-Nter-f2-53',
 'Sarb_01G00560.1-cluster_n3567-Nter-f2-562',
 'Sarb_01G00560.1-cluster_n3567-Nter-f1-562',
 'Sarb_01G00560.1-cluster_n3567-Nter-f0-562',
 'Sarb_01G00560.1-cluster_n3567-Nter-f2-562',
 'rna-XM_033911492.1-cluster_n4261-Nter-f2-10',
 'rna-XM_033911492.1-cluster_n4261-Nter-f1-10',
 'rna-XM_033911492.1-cluster_n4261-Nter-f2-10',
 'rna-XM_033911492.1-cluster_n4261-Nter-f0-10',
 'Sbay_12.217-cluster_n4261-Nter-f2-24',
 'Sbay_12.217-cluster_n4261-Nter-f0-24',
 'Sbay_12.217-cluster_n4261-Nter-f2-24',
 'Sbay_12.217-cluster_n4261-Nter-f1-24',
 'Smik_10.419-cluster_n4261-Nter-f2-24',
 'Smik_10.419-cluster_n4261-Nter-f0-24',
 'Smik_10.419-cluster_n4261-Nter-f2-24',
 'Smik_10.419-cluster_n4261-Nter-f1-24',
 'Skud_10.350-cluster_n4261-Nter-f2-24',
 'Skud_10.350-cluster_n4261-Nter-f0-24',
 'Skud_10.350-cluster_

In [None]:
def parse_blast_dataframe(row : tuple) -> tuple:



    ###################


    # MANQUE A PARSER LES SEQ ID DU DATAFRAME EN FONCTION DE SI CEST NUC OU PROT : FAIRE DEUX FONCTIONS POUR APPLY ET ENSUITE GROUPBY







    prot_subject_pattern = r'(.*?)-(cluster_n\d+)-(.*)-(f\d+)-(\d+)'

    query_id = row[0]
    subject_id = row[1]
    identity = row[2]
    align_lenght = row[3]
    mismatches = row[4]
    gap_opens = row[5]
    qstart = row[6]
    qend = row[7]
    sstart = row[8]
    send = row[9]
    evalue = row[10]
    bitscore = row[11]

    match = re.search(prot_subject_pattern, subject_id)


    tuple_ = tuple([query_seq_id, subject_seq_id, evalue, qstart, qend, sstart, send, 
                    qseq, sseq, length, blast_gapopen, blast_gaps, query_cluster, subject_cluster, subject_relative_frame, 
                    is_same_cluster, q_specie, s_specie])
    return tuple_