In [40]:
import gff3_parser
import polars as pl

# Parse the ORF mapping GFF3 (with attribute parsing enabled) and convert the
# frame returned by gff3_parser to polars.
orfs = pl.from_pandas(gff3_parser.parse_gff3("input/mapping_orf_Scer_SGD_noMT.gff", parse_attributes=True))

# Parse the reference annotation GFF3; this one is kept as returned by
# gff3_parser (no polars conversion on this line).
GFF = gff3_parser.parse_gff3("input/Scer.gff", parse_attributes=True)


# NOTE(review): this is a list containing one list, not a flat list of column
# names -- confirm the nesting is intended.
COLUMNS = [["seq_id", "start", "end", "strand", "phase", "attributes"]]

 Input genomic fasta file: Scer_SGD.fna
 Input gff file: Scer_SGD.gff
Building structured data...


100%|██████████| 351379/351379 [00:01<00:00, 269144.45it/s]


Adding Supplemental Attribute table...
Finding unique attribute keys...


100%|██████████| 351377/351377 [00:01<00:00, 318773.00it/s]


Making attribute table...


100%|██████████| 351377/351377 [00:03<00:00, 111395.98it/s]


date Tue Jan 13 13:06:13 2015
 Created by Saccharomyces Genome Database (http://www.yeastgenome.org/)
 Weekly updates of this file are available for download from:
 http://downloads.yeastgenome.org/curation/chromosomal_feature/saccharomyces_cerevisiae.gff
Building structured data...


100%|██████████| 23076/23076 [00:00<00:00, 301025.29it/s]


Adding Supplemental Attribute table...
Finding unique attribute keys...


100%|██████████| 23058/23058 [00:00<00:00, 194367.59it/s]


Making attribute table...


100%|██████████| 23058/23058 [00:00<00:00, 88183.10it/s]


In [78]:
from collections import OrderedDict as OD
import polars as pl
import warnings
import re

# Polars version of the reference annotation, used for the pl.col(...) filters
# in this notebook; GFF itself is left as is.
GFF_POLARS = pl.from_pandas(GFF)

# Ordered column names of the ORF table; rows handled as plain tuples are
# zipped against this list to recover named fields.
ORF_DF_COLUMNS = [
    "Seqid", "Source", "Type", "Start", "End", "Score", "Strand", "Phase",
    "Status", "color", "Parent", "ID", "Ovp_with", "Ovp_gene",
]

class Gene:
    """Container for one gene and the features attached to it.

    Attributes
    ----------
    ID, chromosome, start, end, sense, multi
        Stored exactly as passed to the constructor.
    aORFs : OrderedDict
        ORFs attached with :meth:`add_orf`, keyed by ORF identifier.
    exons : OrderedDict
        Exons attached with :meth:`add_exon`, keyed by exon identifier.
    """

    def __init__(self, ID, chromosome, start, end, multi, sense):

        self.ID = ID
        self.chromosome = chromosome
        self.start = start
        self.end = end
        self.sense = sense
        self.multi = multi
        self.aORFs = OD()
        self._exons = OD()

    @property
    def exons(self):
        """Mapping of exon key -> exon description."""
        return self._exons

    @exons.setter
    def exons(self, value):
        # Replaces the whole mapping; no validation is performed.
        self._exons = value

    def add_exon(self, key, value):
        """Register one exon under ``key``.

        Raises
        ------
        KeyError
            If an exon with the same key is already registered.
        """
        if key in self._exons:
            raise KeyError(f'exon {key} already exists')
        self._exons[key] = value

    def add_orf(self, key, value):
        """Register one ORF under ``key`` in ``aORFs``.

        Mirrors :meth:`add_exon`: an existing key is never overwritten.

        Raises
        ------
        KeyError
            If an ORF with the same key is already registered.
        """
        if key in self.aORFs:
            raise KeyError(f'ORF {key} already exists')
        self.aORFs[key] = value

    def sort(self, feature):
        """Return ``feature`` ordered by ascending ``"Start"``.

        NOTE(review): the original body stopped right after the ``list``
        check; the ordering rule (by "Start") and the handling of mappings
        are an interpretation -- confirm they match the intended design.

        Parameters
        ----------
        feature : list or dict
            Either a list of records exposing a ``"Start"`` key, or a mapping
            of key -> such a record (the shape of ``exons`` and ``aORFs``).

        Returns
        -------
        list or OrderedDict
            A new, sorted container; the input is not modified.

        Raises
        ------
        TypeError
            If ``feature`` is neither a list nor a dict.
        """
        if isinstance(feature, list):
            return sorted(feature, key=lambda record: record["Start"])

        if isinstance(feature, dict):
            return OD(sorted(feature.items(), key=lambda pair: pair[1]["Start"]))

        raise TypeError('feature must be a list or a dict of records with a "Start" key')

    
 
class Orf:
    """An ORF attached to a gene.

    ``ribospike`` and ``frame`` both start as ``None``. Assigning an ``int`` to
    ``ribospike`` also sets ``frame`` to ``gene.start + ribospike``; ``frame``
    itself is read-only.
    """

    def __init__(self, ID, start, end, gene ):
        self.ID, self.start, self.end = ID, start, end
        self.gene = gene
        self._ribospike = None
        self._frame = None

    @property
    def frame(self):
        """Value derived from the last ``ribospike`` assignment (or None)."""
        return self._frame

    @property
    def ribospike(self):
        """Last value assigned to ``ribospike`` (or None)."""
        return self._ribospike

    @ribospike.setter
    def ribospike(self, value):
        # Exact type check: subclasses of int (bool included) are rejected.
        if type(value) is not int:
            raise TypeError('ribospike must be int')

        self._ribospike = value
        self._frame = self.gene.start + value


class GeneStructureError(Exception):
    """Error describing an unexpected gene structure; text kept in ``message``."""

    def __init__(self, message):
        super().__init__(message)
        self.message = message


def get_good_column_names(columns):
    """Locate the parent and name columns among ``columns``.

    A column qualifies when its name starts with ``Parent``/``parent``
    (respectively ``Name``/``name``).

    Returns
    -------
    tuple of (str, str)
        ``(parent_column, name_column)``.

    Raises
    ------
    AttributeError
        Unless exactly one column of each kind is found.
    """
    import re

    def _starting_with(regex):
        # re.match anchors at the beginning of the column name only.
        return [candidate for candidate in columns if re.match(regex, candidate)]

    parents = _starting_with(r"[Pp]arent")
    names = _starting_with(r"[Nn]ame")

    if len(parents) != 1 or len(names) != 1:
        raise AttributeError("Problem with GFF columns : Parent or Name not found. See get_good_column_names()")

    return str(parents[0]), str(names[0])
        
def return_gene_infos(gene_infos, all_exons = True) -> dict:
    """Summarise the rows describing one gene.

    Parameters
    ----------
    gene_infos
        Polars frame with at least the columns ``Seqid``, ``Type``, ``Start``,
        ``End`` and ``Strand``.
    all_exons
        Currently unused; kept so existing calls stay valid.

    Returns
    -------
    dict
        ``chromosome`` (first unique ``Seqid``), ``start``, ``end`` and
        ``sense`` (taken from the first row whose ``Type`` is ``"gene"``) and
        ``multi`` (True when more than one ``"CDS"`` row is present).

    Raises
    ------
    IndexError
        If ``gene_infos`` has no row of ``Type`` ``"gene"``.
    """
    # Filter once instead of once per field.
    gene_rows = gene_infos.filter(pl.col("Type") == "gene")

    starts = gene_rows["Start"].to_list()
    if not starts:
        raise IndexError("no row of Type 'gene' found in gene_infos")

    # Plain scalars: a trailing comma after any of these would wrap the value
    # in a 1-tuple and break arithmetic on gene coordinates downstream.
    return {
        "chromosome": gene_infos["Seqid"].unique().to_list()[0],
        "start": starts[0],
        "end": gene_rows["End"].to_list()[0],
        "sense": gene_rows["Strand"].to_list()[0],
        "multi": gene_infos["Type"].to_list().count("CDS") > 1,
    }

def init_gene_object(gene_id, gff_dataframe):
    """Build a ``Gene`` for ``gene_id`` from the annotation frame.

    Every row whose parent or name column contains ``gene_id`` is collected,
    the gene-level fields are read through ``return_gene_infos()`` and each
    ``CDS`` row is registered as an exon.

    Parameters
    ----------
    gene_id
        Identifier of the gene; matched literally (regex metacharacters in
        the identifier are escaped).
    gff_dataframe
        pandas frame of the annotation (pandas ``.str`` accessors are used
        on it before the selection is converted to polars).

    Returns
    -------
    Gene
        The gene with one exon per ``CDS`` row, keyed ``<ID or name>-<rank>``
        where rank counts CDS rows from 1 in frame order.

    NOTE(review): matching is by substring, so an identifier contained in a
    longer one also selects the longer gene's rows -- confirm this is
    acceptable for the identifiers in use.
    """
    import re

    # str.contains() searches anywhere in the value, so the escaped
    # identifier alone is enough; escaping keeps characters such as "." or
    # "(" in an identifier from being interpreted as regex syntax.
    pattern = re.escape(str(gene_id))

    parent, name = get_good_column_names(gff_dataframe.columns)

    # Row selection is done with pandas string methods, then handed to polars.
    related = (
        gff_dataframe[parent].str.contains(pattern, regex=True, na=False)
        | gff_dataframe[name].str.contains(pattern, regex=True, na=False)
    )
    gene_rows = pl.from_pandas(gff_dataframe[related])  # all rows related to this gene

    gene_infos = return_gene_infos(gene_rows)

    gene = Gene(
        ID = gene_id,
        chromosome = gene_infos["chromosome"],
        start = gene_infos["start"],
        end = gene_infos["end"],
        sense = gene_infos["sense"],
        multi = gene_infos["multi"],
    )

    cds_rows = gene_rows.filter(pl.col("Type") == "CDS").iter_rows(named = True)
    for cds_counter, exon in enumerate(cds_rows, start = 1):

        # Fall back on the detected name column when the row has no ID.
        label = exon["ID"] if exon["ID"] else exon[name]

        gene.add_exon(
            key = f'{label}-{cds_counter}',
            value = OD(
                [
                    ("Start", exon["Start"]),
                    ("End", exon["End"]),
                ]
            ),
        )

    return gene



def check_double_overlap(row : tuple):
    """Fill the ``Ovp_gene`` field of one ORF row.

    The row is read positionally against ``ORF_DF_COLUMNS``. Overlapped
    feature names are extracted from ``Ovp_with`` (``|``-separated), first
    from ``<name>_mRNA`` tokens and, if there are none, from ``<name>_CDS``
    tokens.

    * exactly one name extracted: that name is used as is;
    * otherwise only names whose ``ID`` has a row in ``GFF_POLARS`` on the
      same ``Strand`` as the ORF are kept (each name counted once):
      one kept -> that name, none -> ``"NA"``, several ->
      ``"Two_or_more_genes"``.

    Names absent from ``GFF_POLARS`` are ignored instead of raising.

    Returns
    -------
    tuple
        The row values, in the same order, with ``Ovp_gene`` updated.
    """
    orf = dict(zip(ORF_DF_COLUMNS, row))
    items = orf["Ovp_with"].split("|")

    overlaps = [match for item in items for match in re.findall(r"\b([\w-]+)_mRNA\b", item)]
    if not overlaps:
        overlaps = [match for item in items for match in re.findall(r"\b([\w-]+)_CDS\b", item)]

    if len(overlaps) == 1:
        # A single overlapped feature is accepted without a strand check.
        orf["Ovp_gene"] = overlaps[0]
        return tuple(orf.values())

    same_strand = []
    for overlap in overlaps:

        if overlap in same_strand:
            # The same gene listed twice must not count as two genes.
            continue

        strands = GFF_POLARS.filter(pl.col("ID") == overlap)["Strand"].unique().to_list()

        # Membership test: an empty list (ID not in the GFF) is simply skipped,
        # and the result does not depend on the order returned by unique().
        if orf["Strand"] in strands:
            same_strand.append(overlap)

    if len(same_strand) == 1:    # only one overlapped gene is on the ORF's strand
        orf["Ovp_gene"] = same_strand[0]
    elif not same_strand:        # no overlapped gene on the ORF's strand
        orf["Ovp_gene"] = "NA"
    else:                        # several genes on the ORF's strand
        orf["Ovp_gene"] = "Two_or_more_genes"

    return tuple(orf.values())



Décider de la structure de chaque ORF/Exon + de la structure qui les contient 
Comment implémenter ?


In [52]:
# NOTE(review): `pattern` is not referenced anywhere else in this cell;
# check_double_overlap() carries its own copy of this regex.
pattern = r'\b([\w-]+)_CDS\b'

# Keep only the ORFs typed "nc_ovp_same-CDS" and add the two working columns.
dframe = (
    orfs
    .filter(pl.col("Type") == "nc_ovp_same-CDS")
    .with_columns([
        pl.col("ID").apply(lambda value: value.split("_")[3]).alias("Phase"), # 4th "_"-separated field of the ID; presumably the phase encoded by ORFMine -- confirm the ID layout
        pl.lit("NA").alias("Ovp_gene") # Placeholder column for the gene ID, filled by check_double_overlap()
    ])
)


# Row-wise call of a Python function: check_double_overlap() receives each row
# as a tuple. NOTE(review): this runs Python code per row, so it should not be
# expected to be parallelised by polars -- see the polars documentation.
dframe = dframe.apply(check_double_overlap)
# Restore the column names on the frame returned by .apply().
dframe.columns = ORF_DF_COLUMNS




    

In [74]:
# One group of ORF rows per value of Ovp_gene.
grouped = dframe.groupby("Ovp_gene")

# Gene objects built below; one per group whose overlapped feature is a gene.
gene_list = list()
for name, data in grouped:

    # Features that are not annotated as "gene" (a transposable element, for
    # example) are not stored.
    if "gene" in GFF_POLARS.filter(pl.col("ID") == name)["Type"].unique().to_list():

        gene = init_gene_object(name, GFF)

        # Attach every ORF of the group to its gene, keyed by the ORF ID.
        for row in data.iter_rows(named = True):

            gene.add_orf(
                key = row["ID"],
                value = OD(
                    [
                        ("Start", row["Start"]),
                        ("End", row["End"]),
                        ("Strand", row["Strand"]),
                        ("Phase", row["Phase"]),
                        ("Ovp_with", row["Ovp_with"])
                    ]
                )
            )

        # Keep the populated gene; without this the list stays empty and each
        # gene is lost at the next iteration.
        gene_list.append(gene)

In [53]:
def return_gene_infos(gene_infos : "pl.DataFrame", all_exons = True):
    """Summarise the rows describing one gene.

    NOTE(review): the notebook contains another definition of
    ``return_gene_infos`` that derives ``multi`` from the number of ``CDS``
    rows instead of ``mRNA`` rows; whichever cell runs last wins. Keep only
    one of them.

    Parameters
    ----------
    gene_infos
        Polars frame with at least the columns ``Seqid``, ``Type``, ``Start``,
        ``End`` and ``Strand``.
    all_exons
        Currently unused.

    Returns
    -------
    dict
        ``chromosome`` (first unique ``Seqid``), ``start``, ``end`` and
        ``sense`` (from the first row whose ``Type`` is ``"gene"``) and
        ``multi`` (True when more than one ``"mRNA"`` row is present).

    Raises
    ------
    IndexError
        If ``gene_infos`` has no row of ``Type`` ``"gene"``.
    """
    # The annotation is a string so that defining the function does not
    # require the name to be resolvable at definition time.
    gene_rows = gene_infos.filter(pl.col("Type") == "gene")

    starts = gene_rows["Start"].to_list()
    if not starts:
        raise IndexError("no row of Type 'gene' found in gene_infos")

    # Plain scalars (no trailing commas, which would create 1-tuples).
    chromosome = gene_infos["Seqid"].unique().to_list()[0]
    start = starts[0]
    end = gene_rows["End"].to_list()[0]
    sense = gene_rows["Strand"].to_list()[0]
    # count(...) > 1 is already a bool; wrapping it in len() raises TypeError.
    multi = gene_infos["Type"].to_list().count("mRNA") > 1

    return {
        "chromosome" : chromosome,
        "start" : start,
        "end" : end,
        "sense" : sense,
        "multi" : multi,
    }

def check_gene_structure(gene_infos : "pl.DataFrame"):
    """Check the feature counts of one gene's rows.

    Parameters
    ----------
    gene_infos
        Frame exposing a ``Type`` column.

    Returns
    -------
    int or GeneStructureError
        ``0`` when every check passes; otherwise a ``GeneStructureError``
        instance describing the first failed check. The error is returned,
        not raised.
    """
    # The annotation is a string so that defining the function does not
    # require the name to be resolvable at definition time.
    feature_types = gene_infos["Type"].to_list()

    if feature_types.count("CDS") != feature_types.count("exon"):
        return GeneStructureError("The counts of 'CDS' and 'exon' in the 'Type' column do not match.")

    if feature_types.count("mRNA") != 1:
        return GeneStructureError("The count of 'mRNA' in the 'Type' column is not 1.")

    # Add more error checks here as needed in the future.

    return 0





OrderedDict([('a', 1), ('b', 2)])
OrderedDict([('b', 2), ('a', 1)])


In [80]:
from collections import OrderedDict

# Three Start/End records, deliberately out of order
data_list = [
    OrderedDict(zip(('Start', 'End'), bounds))
    for bounds in ((20, 30), (10, 15), (25, 35))
]

# Order a copy of the records by their 'Start' value; data_list is untouched
sorted_list = list(data_list)
sorted_list.sort(key=lambda record: record['Start'])

# Show the ordered records
print(sorted_list)

[OrderedDict([('Start', 10), ('End', 15)]), OrderedDict([('Start', 20), ('End', 30)]), OrderedDict([('Start', 25), ('End', 35)])]


In [79]:
# Minimal manual check: build a Gene by hand and attach a single exon to it.
gene = Gene(ID="gene_1", chromosome="chr1", start=1, end=100, sense="+", multi=False)

gene.add_exon(key="exon_1", value=OD([("Start", 1), ("End", 10)]))
