In [2]:
import polars as pl 
import numpy as np
# Coverage value used to locate this run's files under output/<cov>/.
cov = 0.50001

# Table of elongates for that coverage; the schema is inferred from the
# first 10000 rows.
elongates = pl.read_csv(
    f"output/{cov}/{cov}_elongates.csv",
    infer_schema_length=10000,
)



In [3]:

from utils.handle_UTRs import translate_frames
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import numpy as np

def custom_target_elongate(cluster, scer_length, seq_id, specie, elongate_length, side, genome_dict, gff_dict, coeff = 1): 
    """Extract the genomic region flanking a CDS and translate it.

    The CDS rows of ``seq_id`` are looked up in ``gff_dict[specie]``; up to
    ``ceil((scer_length - elongate_length) * 3 * coeff)`` nucleotides lying
    immediately outside the CDS are then sliced from the genome: the 5' flank
    for ``side == "Nter"``, the 3' flank for ``side == "Cter"``, both relative
    to the strand of the feature. The slice is truncated at the contig
    boundaries and reverse-complemented for "-" strand features.

    Parameters
    ----------
    cluster : identifier used in the record id and forwarded to translate_frames.
    scer_length : length (amino acids) of the reference elongate.
    seq_id : value matched against the GFF ``Name`` or ``Parent`` column.
    specie : key into ``gff_dict`` and ``genome_dict``.
    elongate_length : length (amino acids) already present in this sequence.
    side : "Nter" or "Cter".
    genome_dict : ``genome_dict[specie][contig]`` must expose ``"seq"``
        (sliceable, with ``reverse_complement()``) and ``"len"``.
    gff_dict : ``gff_dict[specie]`` must be a polars DataFrame with the columns
        Type, Name, Parent, Start, End, Strand and Seqid.
    coeff : multiplier applied to the flank length (default 1).

    Returns
    -------
    dict with ``"nuc"`` (a SeqRecord of the flank) and ``"prot"`` (the value
    returned by ``translate_frames`` for that flank).

    Raises
    ------
    ValueError
        If ``side`` is invalid, no CDS row matches ``seq_id``, or the strand
        is neither "+" nor "-".
    """

    if side not in ("Nter", "Cter"):

        raise ValueError("Side must be either Nter or Cter")

    # CDS rows of the requested sequence, restricted to the columns used below.
    gff = gff_dict[specie].filter(

        (pl.col("Type") == "CDS") & ((pl.col("Name") == seq_id) | (pl.col("Parent") == seq_id))

    )[["Start","End","Strand","Seqid"]]

    if gff.height == 0:

        # Without this guard the lookup below fails with an opaque IndexError.
        raise ValueError(f"No CDS found for {seq_id} in the {specie} annotation")

    strand = gff[0]["Strand"].to_list()[0] # + or -
    strand_id = gff[0]["Seqid"].to_list()[0] # chromosome or scaffold id

    if strand not in ("+", "-"):

        # Previously an unexpected strand left `elongate` unbound (NameError).
        raise ValueError(f"Unexpected strand {strand!r} for {seq_id} in {specie}")

    # 0-based inclusive [first, last] index pairs, ordered along the contig.
    coordinates = sorted(
        (sorted((int(row['Start'])-1, int(row['End'])-1)) for row in gff.iter_rows(named=True)),
        key=lambda x: x[0],
    )

    # Length of the flank to extract, in nucleotides.
    custom_elongate_length = int(np.ceil((scer_length - elongate_length) * 3 * coeff))

    contig = genome_dict[specie][strand_id]
    cds_first = coordinates[0][0]          # index of the first CDS nucleotide
    after_cds = coordinates[-1][1] + 1     # index of the first nucleotide past the CDS

    # The 5' flank of a "+" feature and the 3' flank of a "-" feature both sit
    # at lower contig coordinates than the CDS; the two other cases sit above it.
    flank_is_below_cds = (strand == "+") == (side == "Nter")

    if flank_is_below_cds:

        flank_start = max(cds_first - custom_elongate_length, 0) # clamp at contig start
        elongate = contig["seq"][flank_start:cds_first]

    else:

        flank_end = min(after_cds + custom_elongate_length, contig["len"]) # clamp at contig end
        elongate = contig["seq"][after_cds:flank_end]

    if strand == "-":

        # Report the flank in the orientation of the feature.
        elongate = elongate.reverse_complement()

    result_dict = dict()

    result_dict["nuc"] = SeqRecord(seq = Seq(elongate), id = f"{seq_id}-{cluster}", description = "") 

    result_dict["prot"] = translate_frames(dna_sequence = elongate, specie = specie, seq_id = seq_id, length = custom_elongate_length, utr = side, cluster = cluster)

    return result_dict
    
    




In [4]:
def custom_scer_elongates(scer_id : str, specie : str, scer_length : int, scer_elongate : "str | Seq", infos_dict, cluster, gff_dict, genome_dict, side):
    """Cut, for each subject sequence, the matching end of the reference CDS.

    For every ``subject_id -> subject_length`` pair in ``infos_dict`` the
    function takes ``(scer_length - subject_length) * 3`` nucleotides at the
    5' end (``side == "Nter"``) or 3' end (``side == "Cter"``) of the CDS of
    ``scer_id``, in the orientation of the feature, together with the first
    ``scer_length - subject_length`` residues of ``scer_elongate``.

    Parameters
    ----------
    scer_id : value matched against the GFF ``Name`` or ``Parent`` column.
    specie : key into ``gff_dict`` and ``genome_dict``.
    scer_length : expected length of ``scer_elongate`` once gaps are removed.
    scer_elongate : elongate sequence; "-" characters are stripped.
    infos_dict : mapping of subject id to subject elongate length.
    cluster : cluster name, only used in diagnostics.
    gff_dict : ``gff_dict[specie]`` must be a polars DataFrame with the columns
        Type, Name, Parent, Start, End, Strand and Seqid.
    genome_dict : ``genome_dict[specie][contig]["seq"]`` must be sliceable and
        expose ``reverse_complement()``.
    side : "Nter" or "Cter".

    Returns
    -------
    dict mapping each subject id to ``{"nuc": SeqRecord, "prot": SeqRecord}``.

    Raises
    ------
    ValueError
        If ``side`` is invalid, no CDS row matches ``scer_id``, the strand is
        neither "+" nor "-", or the ungapped elongate length differs from
        ``scer_length``.
    TypeError
        If ``scer_elongate`` has no ``replace`` method (e.g. it is None).
    """

    if side not in ("Nter", "Cter"):

        raise ValueError("Side must be either Nter or Cter")

    # CDS rows of the reference sequence, restricted to the columns used below.
    gff = gff_dict[specie].filter(

        (pl.col("Type") == "CDS") & ((pl.col("Name") == scer_id) | (pl.col("Parent") == scer_id))

    )[["Start","End","Strand","Seqid"]]

    if gff.height == 0:

        raise ValueError(f"No CDS found for {scer_id} in the {specie} annotation")

    strand = gff[0]["Strand"].to_list()[0] # + or -
    strand_id = gff[0]["Seqid"].to_list()[0] # chromosome or scaffold id

    if strand not in ("+", "-"):

        # Previously an unexpected strand reused a stale or unbound `elongate`.
        raise ValueError(f"Unexpected strand {strand!r} for {scer_id} in {specie}")

    # 0-based inclusive [first, last] index pairs, ordered along the contig.
    coordinates = sorted(
        (sorted((int(row['Start'])-1, int(row['End'])-1)) for row in gff.iter_rows(named=True)),
        key=lambda x: x[0],
    )

    try:
        scer_elongate = scer_elongate.replace("-","")
    except AttributeError:
        # Fail here with context instead of printing and crashing on len() below.
        raise TypeError(
            f"Elongate of {scer_id} in cluster {cluster} is not a sequence "
            f"(got {type(scer_elongate).__name__})"
        ) from None

    if len(scer_elongate) != scer_length:

        print(f"Seq id : {scer_id}")
        print(f"Scer length : {scer_length}")
        print(f"Scer elongate length : {len(scer_elongate)}")
        print(f"Cluster : {cluster}")
        raise ValueError("The length of the elongate sequence must be equal to the length of the Scer sequence")

    cds_first = coordinates[0][0]      # index of the first CDS nucleotide
    cds_stop = coordinates[-1][1] + 1  # exclusive end: one past the last CDS nucleotide

    # The 5' end of a "+" feature and the 3' end of a "-" feature are both the
    # low-coordinate end of the CDS; the two other cases are the high end.
    use_low_end = (strand == "+") == (side == "Nter")

    result_dict = dict()

    for subject_id, subject_length in infos_dict.items():

        custom_length = (scer_length-subject_length)*3
        contig_seq = genome_dict[specie][strand_id]["seq"]

        # NOTE(review): the window is cut straight from the contig, so a CDS
        # made of several GFF rows would include whatever lies between them.
        if use_low_end:

            elongate = contig_seq[cds_first:cds_first + custom_length]

        else:

            # The window must end at cds_stop (exclusive). Ending it at
            # coordinates[-1][1] dropped the last CDS nucleotide and shifted
            # the whole window by one base.
            elongate = contig_seq[max(cds_stop - custom_length, 0):cds_stop]

        if strand == "-":

            # Report the window in the orientation of the feature.
            elongate = elongate.reverse_complement()

        record_id = f"{scer_id}_{side}_{subject_id}"

        # NOTE(review): the protein part always keeps the FIRST residues of the
        # elongate, while the Cter nucleotide window is taken at the END of the
        # CDS - confirm this is intended when subject_length > 0.
        result_dict[subject_id] = {
            "nuc": SeqRecord( seq = Seq(elongate), id = record_id, description = ""),
            "prot": SeqRecord( seq = Seq(scer_elongate[:int(custom_length/3)]), id = record_id, description = ""),
        }

    return result_dict

In [12]:
import yaml
# Read the ordered species list (entry Species_order -> Scer) from the
# project configuration file.
with open('/home/simon.herman/Bureau/Gits/Elongates/env.yaml', 'r') as f:
    yaml_data = yaml.safe_load(f)
    species_order = yaml_data['Species_order']['Scer']

# Tolerance, in amino acids, used by the comparisons below.
seuil = 4

# A Scer_NCBI row qualifies on one side when its elongate is within `seuil`
# residues of the cluster maximum and that maximum is at least 10.
Nter_scer_conditions = (
    (pl.col("species") == "Scer_NCBI")
    & ((pl.col("max_Nter") - pl.col("Nter_nb_aa")).abs() < seuil)
    & (pl.col("max_Nter") >= 10)
)
Cter_scer_conditions = (
    (pl.col("species") == "Scer_NCBI")
    & ((pl.col("max_Cter") - pl.col("Cter_nb_aa")).abs() < seuil)
    & (pl.col("max_Cter") >= 10)
)
test = (
    elongates
    .filter(Nter_scer_conditions | Cter_scer_conditions)["cluster_name"]
    .unique()
    .to_list()
)

nter_clusters = []
cter_clusters = []
i = 0

candidate_rows = elongates.filter(pl.col("cluster_name").is_in(test))

for cluster, sequences in candidate_rows.groupby("cluster_name"):

    species = sequences["species"].to_list()

    # Clusters with fewer than three sequences are ignored.
    if len(species) < 3:
        continue

    # Species present in the cluster, in the order given by species_order.
    species_ordered = [s for s in species_order if s in species]
    last_species = species_ordered[-1]

    # Skip clusters whose last species (when it is not Scer_NCBI) has several sequences.
    if last_species != "Scer_NCBI" and species.count(last_species) > 1:
        continue

    last_species_rows = sequences.filter(pl.col("species") == last_species)

    # Keep the cluster for a side when the first row of the last species has
    # fewer than `seuil` residues there and the cluster maximum is at least 10.
    if last_species_rows["Nter_nb_aa"].to_list()[0] < seuil and sequences["max_Nter"].max() >= 10:
        nter_clusters.append(cluster)

    if last_species_rows["Cter_nb_aa"].to_list()[0] < seuil and sequences["max_Cter"].max() >= 10:
        cter_clusters.append(cluster)


full_filtered_Nter = elongates.filter(pl.col("cluster_name").is_in(nter_clusters))
full_filtered_Cter = elongates.filter(pl.col("cluster_name").is_in(cter_clusters))



In [21]:
import gff3_parser
from utils.files import multifasta_to_dict
# Annotation tables and genome sequences, both keyed by species name.
gff_dict = {}
genome_dict = {}

for specie in species_order:

    # The parsed GFF3 is converted from pandas to a polars DataFrame.
    gff_dict[specie] = pl.from_pandas(
        gff3_parser.parse_gff3(
            f"input/{specie}.gff",
            parse_attributes=True,
            verbose=False,
        )
    )

    # Genome FASTA of the same species, loaded with genome=True.
    genome_dict[specie] = multifasta_to_dict(
        f"input/{specie}.fna",
        genome=True,
    )

In [64]:
import os 
from Bio import SeqIO

# NOTE(review): absolute, user-specific path - consider moving it to a config cell.
current_path = f"/home/simon.herman/Bureau/Gits/Elongates/work/{cov}"

dataframes = {
    "Nter": full_filtered_Nter,
    "Cter": full_filtered_Cter,
}

# exist_ok=True lets the cell be re-run without failing on existing folders.
os.makedirs(f"{current_path}/local_align_files", exist_ok=True)

for side in ["Nter", "Cter"]:

    df = dataframes[side]
    side_dir = f"{current_path}/local_align_files/{side}"

    os.makedirs(side_dir, exist_ok=True)

    for cluster, sequences in df.groupby("cluster_name"):

        cluster_dir = f"{side_dir}/{cluster}"
        os.makedirs(cluster_dir, exist_ok=True)

        # Several Scer sequences may be in the cluster: use the longest elongate.
        scer_length = sequences.filter(pl.col("species") == "Scer_NCBI")[f"{side}_nb_aa"].max()
        if scer_length is None or scer_length < 10:
            continue
        scer_id = sequences.filter((pl.col("species") == "Scer_NCBI") & (pl.col(f"{side}_nb_aa") == scer_length))["seq_id"].to_list()[0]

        # TODO: why does this cluster escape the rules?

        # subject seq_id -> length of its own elongate on the current side
        infos_for_scer_sequences = dict()
        for sequence in sequences.iter_rows(named = True): 

            seq_id = sequence["seq_id"]
            subject_length = sequence[f"{side}_nb_aa"]

            # Only sequences at least 10 residues shorter than the reference are processed.
            if scer_length - subject_length >= 10:

                seq_dir = f"{cluster_dir}/{seq_id}"
                os.makedirs(f"{seq_dir}/nucleotide", exist_ok=True)
                os.makedirs(f"{seq_dir}/protein", exist_ok=True)

                dict_ = custom_target_elongate(cluster, scer_length, seq_id, sequence["species"], subject_length, side, genome_dict, gff_dict)

                SeqIO.write(dict_["nuc"], f"{seq_dir}/nucleotide/{seq_id}.fna", "fasta")

                # Write every frame through one handle: calling SeqIO.write with
                # the path for each frame truncated the file each time, so only
                # the last frame was kept.
                with open(f"{seq_dir}/protein/{seq_id}.faa", "w") as protein_handle:
                    for frame in dict_["prot"].keys():
                        SeqIO.write(dict_["prot"][frame], protein_handle, "fasta")

                # Store the length for the current side (it was always the
                # Nter length, which is wrong when side == "Cter").
                infos_for_scer_sequences[seq_id] = subject_length

        scer_cds = sequences.filter(pl.col("seq_id") == scer_id)

        scer_elongates = custom_scer_elongates(scer_id = scer_id, specie = "Scer_NCBI", scer_length = scer_length, 
                                                    scer_elongate = scer_cds[f"{side}_elongate"].to_list()[0], infos_dict = infos_for_scer_sequences, 
                                                    cluster = cluster, gff_dict = gff_dict, genome_dict = genome_dict, side = side)

        # Temporary debugging output for one cluster under investigation.
        if cluster == "cluster_n1071": 
            print(sequences.select(["seq_id", "Nter_nb_aa", "max_Nter"]))
            print(scer_elongates.keys())

        # One reference nucleotide/protein pair per subject, written next to the subject's own files.
        for subject_sequence in scer_elongates.keys():

            subject_dir = f"{cluster_dir}/{subject_sequence}"

            SeqIO.write(scer_elongates[subject_sequence]["nuc"], f"{subject_dir}/nucleotide/Scer_custom.fna", "fasta")
            SeqIO.write(scer_elongates[subject_sequence]["prot"], f"{subject_dir}/protein/Scer_custom.faa", "fasta")
            


KeyError: 'prot'

In [24]:
# Inspect the per-side elongate columns of every sequence of one cluster.
(
    elongates
    .filter(pl.col("cluster_name") == "cluster_n3725")
    .select(
        "seq_id",
        "Cter_nb_aa",
        "Nter_nb_aa",
        "max_Nter",
        "max_Cter",
        "Nter_elongate_length",
        "Cter_elongate_length",
        "Cter_elongate",
    )
)




seq_id,Cter_nb_aa,Nter_nb_aa,max_Nter,max_Cter,Nter_elongate_length,Cter_elongate_length,Cter_elongate
str,i64,i64,i64,i64,i64,i64,str
"""Sarb_12G00060.…",0,0,72,11,0,0,
"""Smik_12.6""",0,72,72,11,72,0,
"""rna-XM_0339118…",11,72,72,11,72,11,"""EEEYQFERHKL"""
"""rna-NM_0011818…",0,72,72,11,72,0,
"""Sbay_10.18""",0,72,72,11,72,0,
"""Skud_12.17""",0,72,72,11,72,0,
