In [3]:
import gff3_parser
import polars as pl

orfs = pl.from_pandas(gff3_parser.parse_gff3("input/mapping_orf_Scer_SGD_noMT.gff", parse_attributes=True))

gff = gff3_parser.parse_gff3("input/Scer.gff", parse_attributes=True)


COLUMNS = [["seq_id", "start", "end", "strand", "phase", "attributes"]]

 Input genomic fasta file: Scer_SGD.fna
 Input gff file: Scer_SGD.gff
Building structured data...


100%|██████████| 351379/351379 [00:01<00:00, 282639.50it/s]


Adding Supplemental Attribute table...
Finding unique attribute keys...


100%|██████████| 351377/351377 [00:01<00:00, 311708.14it/s]


Making attribute table...


100%|██████████| 351377/351377 [00:02<00:00, 118708.58it/s]


date Tue Jan 13 13:06:13 2015
 Created by Saccharomyces Genome Database (http://www.yeastgenome.org/)
 Weekly updates of this file are available for download from:
 http://downloads.yeastgenome.org/curation/chromosomal_feature/saccharomyces_cerevisiae.gff
Building structured data...


100%|██████████| 23076/23076 [00:00<00:00, 236653.57it/s]


Adding Supplemental Attribute table...
Finding unique attribute keys...


100%|██████████| 23058/23058 [00:00<00:00, 280141.07it/s]


Making attribute table...


100%|██████████| 23058/23058 [00:00<00:00, 81527.79it/s]


In [40]:
from collections import OrderedDict as OD

class Gene:

    def __init__(self, ID, chromosome, start, end, multi, sense):

        self.ID = ID
        self.chromosome = chromosome
        self.start = start
        self.end = end
        self.sense = sense
        self.multi = multi
        self.aORFs = OD()
        self._exons = OD()

    def add_ORF(self, key, value):

        if type(value) != OD:
            raise TypeError('aORF must be an OrderedDict')
        
        elif key in self.aORFs:
            raise KeyError(f'aORF {key} already exists')
        
        else:
            self.aORFs[key] = value

    @property
    def exons(self):
        return self._exons

    @exons.setter
    def exons(self, value):
        if type(value) != OD:
            raise TypeError('exon must be an OrderedDict')

        self._exons = value
        self.sort_exons()

    def add_exon(self, key, value):
        if type(value) != OD:
            raise TypeError('exon must be an OrderedDict')
        
        elif key in self._exons:
            raise KeyError(f'exon {key} already exists')
        
        else:
            self._exons[key] = value
            self.sort_exons()

    def sort_exons(self):
        if self.sense == "+":
            self._exons = OD(sorted(self._exons.items(), key=lambda x: (x[1]['Start'], x[1]['End'])))
        elif self.sense == "-":
            self._exons = OD(sorted(self._exons.items(), key=lambda x: (x[1]['Start'], x[1]['End']), reverse=True))



    def get_exons(self):
        return self._exons
    

    
class Orf: 

    def __init__(self, ID, start, end, gene ):

        self.ID = ID
        self.start = start
        self.end = end
        self.gene = gene
        self._frame = None
        self._ribospike = None

    @property
    def ribospike(self):
        return self._ribospike
    
    @ribospike.setter
    def ribospike(self, value):
        if type(value) != int:
            raise TypeError('ribospike must be int')
        else:
            self._ribospike = value
            self._frame = self.gene.start + self._ribospike

    @property
    def frame(self):
        return self._frame

In [41]:
import re

pattern = r'\b([\w-]+)_CDS\b'

dframe = (
    orfs
    .filter(pl.col("Type") == "nc_ovp_same-CDS")
    .with_columns([
        pl.col("ID").apply(lambda value: value.split("_")[3]).alias("Phase"),
        pl.col("Ovp_with").apply(lambda value: re.findall(pattern, value)[0] if re.findall(pattern, value) else '').alias("Ovp_CDS")
    ])
)

grouped = dframe.groupby("Ovp_CDS")

gene_list = list()

for name, data in grouped:

    gene = init_gene_object(name, gff)
    
    if gene:
        print(gene.get_exons())
        print(gene.sense)
        

    # for orf in data.iter_rows(named = True):

        # print(orf)


    

['gene', 'CDS', 'mRNA']
OrderedDict([('YNL065W_CDS-1', OrderedDict([('Start', '503724'), ('End', '505484')]))])
+
['CDS', 'mRNA', 'gene']
OrderedDict([('YPR074W-A_CDS-1', OrderedDict([('Start', '695018'), ('End', '695188')]))])
+
['CDS', 'gene', 'mRNA']
OrderedDict([('YGL052W_CDS-1', OrderedDict([('Start', '403437'), ('End', '403742')]))])
+
['CDS', 'gene', 'mRNA']
OrderedDict([('YPL002C_CDS-1', OrderedDict([('Start', '553627'), ('End', '554328')]))])
-
['gene', 'mRNA', 'CDS']
OrderedDict([('YER136W_CDS-1', OrderedDict([('Start', '439616'), ('End', '440971')]))])
+
['CDS', 'gene', 'mRNA']
OrderedDict([('YML087C_CDS-1', OrderedDict([('Start', '94431'), ('End', '95369')]))])
-
['intron', 'gene', 'CDS', 'mRNA']
OrderedDict([('YKL006C-A_CDS-2', OrderedDict([('Start', '430597'), ('End', '430632')])), ('YKL006C-A_CDS-1', OrderedDict([('Start', '430198'), ('End', '430455')]))])
-
['CDS', 'mRNA', 'gene']
OrderedDict([('YGL243W_CDS-1', OrderedDict([('Start', '43307'), ('End', '44509')]))])
+
['

IndexError: list index out of range

In [35]:
import re

def get_good_column_names(columns):

    import re
    from itertools import chain

    parent_col = [col for col in columns if re.match(r"[Pp]arent", col)]
    name_col = [col for col in columns if re.match(r"[Nn]ame", col)]

    if len(parent_col) == 1 and len(name_col) == 1:

        return str(parent_col[0]), str(name_col[0])
    
    else:

        raise AttributeError("Problem with GFF columns : Parent or Name not found. See get_good_column_names()")
        


def init_gene_object(gene_id, gff_dataframe):

    import warnings

    """
    
    On veut pour la gene_id donnée initiliaser un objet Gene avec ses attributs
    tirés du gff_dataframe.
    
    """

    pattern = fr".*{gene_id}.*"

    parent, name = get_good_column_names(gff_dataframe.columns)

    gene_rows = pl.from_pandas(gff_dataframe[
        gff_dataframe[parent].str.contains(pattern, regex=True, na=False) |
        gff_dataframe[name].str.contains(pattern, regex=True, na=False)
    ])


    types = gene_rows["Type"].unique().to_list()

    if ("exon" not in types) and ("CDS" not in types):

        raise AttributeError(f"Problem with gene feature type in GFF : exon or CDS not found. See {gene_id}")

    mRNAs = gene_rows.filter(pl.col("Type") == "mRNA")
    mRNAs = (
        mRNAs
        .with_columns(pl.col('Start').cast(int).alias('Start'),
                    pl.col('End').cast(int).alias('End'))
        .with_columns((pl.col('End') - pl.col('Start')).alias('Length'))
        .sort('Length', descending=True)

    ).head(1)

    # Once we have the longest mRNA, we can get the exons and CDSs

    exons = gene_rows.filter(
        (pl.col("Type") == "exon") & (pl.col("Parent") == mRNAs["ID"].to_list()[0])
    )

    cdss = gene_rows.filter(
        (pl.col("Type") == "CDS") & (pl.col("Parent") == mRNAs["ID"].to_list()[0])
    )

    
    if exons["ID"].len() > cdss["ID"].len():
    
        warnings.warn(f"Number of exons and CDS for gene {gene_id} does not match\nExons will be used for exon number computation")

    if gene_rows["Seqid"].unique().len() != 1:

        raise AttributeError(f"Problem for gene {gene_id} : chromosome is not unique")

    multi = [ False if cdss["ID"].len() == 1 else True ]


    gene = Gene(

        ID = gene_id,
        chromosome = gene_rows["Seqid"].unique().to_list()[0],
        start = gene_rows.filter(pl.col("Type") == "gene")["Start"].to_list()[0],
        end = gene_rows.filter(pl.col("Type") == "gene")["End"].to_list()[0],
        multi = multi,
        sense = gene_rows.filter(pl.col("Type") == "gene")["Strand"].to_list()[0]
        
    )

    cds_counter = 1
    for exon in cdss.iter_rows(named = True):

        key = f'{exon["ID"]}-{cds_counter}' if exon["ID"] else f'{exon["Name"]}-{cds_counter}'
        
        gene.add_exon(
            key = key,
            value = OD(
                [
                    ("Start", exon["Start"]),
                    ("End", exon["End"]),
                    
                ]
            )
        )

        cds_counter = cds_counter + 1



    return gene



    





In [53]:

# Your ordered dict
a = OD([("a", 1), ("b", 2)])

print(a)

# Sort by keys in reverse order
a = OD(sorted(a.items(), key=lambda x: x[0], reverse=True))

print(a)

OrderedDict([('a', 1), ('b', 2)])
OrderedDict([('b', 2), ('a', 1)])
