In [39]:
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import gff3_parser
from utils.files import multifasta_to_dict
import yaml 
cov = 0.5
import polars as pl



species = yaml.safe_load(open('env.yaml'))["Species_order"]["Scer"] # Species will be passed as argument in the future
gff_dict = dict()
genome_dict = dict()
test = dict()
for specie in species:

    genome_dict[specie] = multifasta_to_dict(f"input/{specie}.fna", genome = True)
    test[specie] = SeqIO.to_dict(SeqIO.parse(f"input/{specie}.fna", "fasta"))




In [42]:
genome_dict["Scer_NCBI"]
test["Scer_NCBi"]

{'NC_001133.9': {'seq': Seq('CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACCCACA...GGG'),
  'len': 230218},
 'NC_001134.8': {'seq': Seq('AAATAGCCCTCATGTACGTCTCCTCCAAGCCCTGTTGTCTCTTACCCGGATGTT...TGT'),
  'len': 813184},
 'NC_001135.5': {'seq': Seq('CCCACACACCACACCCACACCACACCCACACACCACACACACCACACCCACACA...GTG'),
  'len': 316620},
 'NC_001136.10': {'seq': Seq('ACACCACACCCACACCACACCCACACACACCACACCCACACACCACACCCACAC...TGG'),
  'len': 1531933},
 'NC_001137.3': {'seq': Seq('CGTCTCCTCCAAGCCCTGTTGTCTCTTACCCGGATGTTCAACCAAAAGCTACTT...TTT'),
  'len': 576874},
 'NC_001138.5': {'seq': Seq('GATCTCGCAAGTGCATTCCTAGACTTAATTCATATCTGCTCCTCAACTGTCGAT...TGG'),
  'len': 270161},
 'NC_001139.9': {'seq': Seq('CCACACCCACACACACCACACCCACACCCACACACTACCCTAACACTACCCTAA...TTT'),
  'len': 1090940},
 'NC_001140.6': {'seq': Seq('CCCACACACACCACACCCACACACCACACCCACACTTTTCACATCTACCTCTAC...TGG'),
  'len': 562643},
 'NC_001141.2': {'seq': Seq('CACACACACCACACCCACACCACACCACACCACACCCACACCCACACACACCAC...TGT'),
  'len': 439888}

In [32]:
good = dict()
for specie in ["Skud","Smik","Sbay"]:

    # Check for each specie if the CDS in the gff_dict dataframes end with a stop codon : TGA TAA TAG 
    # Also keep track of the sense of the gene : + or -
    good_sequences = list()
    for sequence in gff_dict[specie].filter(pl.col("Type") == "CDS").iter_rows(named = True):

        cds_name = sequence["Name"]
        sense = sequence["Strand"]
        stop = int(sequence["End"])
        contig = sequence["Seqid"]

        if sense == "+":

            if genome_dict[specie][contig]["seq"][stop-3:stop] == "TGA" or genome_dict[specie][contig]["seq"][stop-3:stop] == "TAA" or genome_dict[specie][contig]["seq"][stop-3:stop] == "TAG":

                good_sequences.append(cds_name)

        elif sense == "-":

            if genome_dict[specie][contig]["seq"][stop:stop+3][::-1] == "TGA" or genome_dict[specie][contig]["seq"][stop:stop+3][::-1] == "TAA" or genome_dict[specie][contig]["seq"][stop:stop+3][::-1] == "TAG":
                good_sequences.append(cds_name)


    good[specie] = good_sequences


In [34]:
print(good["Sbay"])

['Sbay_5.1', 'Sbay_5.2', 'Sbay_5.5', 'Sbay_5.10', 'Sbay_5.12', 'Sbay_5.17', 'Sbay_5.21', 'Sbay_5.25', 'Sbay_5.26', 'Sbay_5.27', 'Sbay_5.28', 'Sbay_5.29', 'Sbay_5.31', 'Sbay_5.35', 'Sbay_5.37', 'Sbay_5.38', 'Sbay_5.39', 'Sbay_5.42', 'Sbay_5.43', 'Sbay_5.45', 'Sbay_5.49', 'Sbay_5.59', 'Sbay_5.62', 'Sbay_5.67', 'Sbay_5.68', 'Sbay_5.71', 'Sbay_5.79', 'Sbay_5.80', 'Sbay_5.82', 'Sbay_5.83', 'Sbay_5.87', 'Sbay_5.89', 'Sbay_5.92', 'Sbay_5.93', 'Sbay_5.94', 'Sbay_5.97', 'Sbay_5.99', 'Sbay_5.102', 'Sbay_5.103', 'Sbay_5.104', 'Sbay_5.105', 'Sbay_5.108', 'Sbay_5.112', 'Sbay_5.115', 'Sbay_5.116', 'Sbay_5.117', 'Sbay_5.118', 'Sbay_5.119', 'Sbay_5.120', 'Sbay_5.127', 'Sbay_5.129', 'Sbay_5.131', 'Sbay_5.132', 'Sbay_5.133', 'Sbay_5.136', 'Sbay_5.139', 'Sbay_5.140', 'Sbay_5.142', 'Sbay_5.148', 'Sbay_5.154', 'Sbay_5.156', 'Sbay_5.158', 'Sbay_5.166', 'Sbay_5.167', 'Sbay_5.169', 'Sbay_5.170', 'Sbay_5.173', 'Sbay_5.178', 'Sbay_5.179', 'Sbay_5.180', 'Sbay_5.181', 'Sbay_5.182', 'Sbay_5.184', 'Sbay_5.185', 'Sb