In [1]:
# Predicting operons for E. coli
import pandas as pd

def parse_ptt_file(ptt_file_path, gap_threshold=50, header_lines=3):
    """Predict operons from a tab-separated .ptt gene table.

    Each data line is expected to hold ``start..end`` in column 0, the strand
    in column 1 and the gene name in column 4. Consecutive genes are grouped
    into the same operon when they lie on the same strand and the distance
    from the previous gene's end to the current gene's start is strictly less
    than ``gap_threshold``. Only groups with more than one gene are reported.

    Args:
        ptt_file_path: Path of the .ptt file to read.
        gap_threshold: Exclusive upper bound on ``start - previous_end`` for
            two neighbouring genes to share an operon (default 50).
        header_lines: Number of leading lines to skip before the gene rows
            (default 3).

    Returns:
        A list of operons; each operon is a list of
        ``(start, end, gene_name)`` tuples in file order.

    Raises:
        ValueError: If a non-blank data line has a location column that is
            not of the form ``<int>..<int>``.
        IndexError: If a non-blank data line has fewer than five columns.
    """
    operons = []  # All multi-gene operons found so far
    present_operon = []  # Genes collected for the operon being built
    previous_end = -1  # End coordinate of the previously read gene
    previous_strand = '+'  # Strand of the previously read gene

    with open(ptt_file_path, 'r') as file:
        # Skip the header; a default of None avoids StopIteration on files
        # that are shorter than the header itself.
        for _ in range(header_lines):
            next(file, None)

        for line in file:
            stripped = line.strip()
            if not stripped:
                # Blank (e.g. trailing) lines carry no gene and would
                # otherwise fail in int('').
                continue

            parts = stripped.split('\t')
            start, end = map(int, parts[0].split('..'))
            strand, gene_name = parts[1], parts[4]

            same_operon = (
                bool(present_operon)
                and strand == previous_strand
                and (start - previous_end) < gap_threshold
            )
            if same_operon:
                present_operon.append((start, end, gene_name))
            else:
                # Close the previous group; single genes are not operons.
                if len(present_operon) > 1:
                    operons.append(present_operon)
                present_operon = [(start, end, gene_name)]

            previous_strand = strand
            previous_end = end

    # The last group is still open when the file ends.
    if len(present_operon) > 1:
        operons.append(present_operon)

    return operons


ecoli_operons = parse_ptt_file("E_coli_K12_MG1655.ptt")

# Report every predicted operon: its genomic span, size and member genes
for operon in ecoli_operons:
    start = operon[0][0]  # Start of the first gene in the operon
    end = operon[-1][1]  # End of the last gene in the operon
    genes = ', '.join([entry[2] for entry in operon])  # Gene names, comma-separated
    print(f"Operon {start} - {end}, having {len(operon)} genes: {genes}")

# Report how many operons were predicted in total
print(len(ecoli_operons))

Operon 337 - 5020, having 3 genes: thrA, thrB, thrC
Operon 10643 - 11786, having 2 genes: yaaW, yaaI
Operon 16751 - 16903, having 2 genes: mokC, hokC
Operon 19811 - 20508, having 2 genes: insB, insA
Operon 21181 - 25701, having 4 genes: yaaY, ribF, ileS, lspA
Operon 25826 - 27227, having 2 genes: fkpB, ispH
Operon 29651 - 34038, having 2 genes: carA, carB
Operon 34781 - 36162, having 2 genes: caiE, caiD
Operon 39244 - 41931, having 2 genes: caiA, caiT
Operon 42403 - 44129, having 2 genes: fixA, fixB
Operon 44180 - 45750, having 2 genes: fixC, fixX
Operon 47246 - 49631, having 2 genes: kefF, kefC
Operon 50380 - 54702, having 5 genes: apaH, apaG, rsmA, pdxA, surA
Operon 59687 - 63264, having 2 genes: rluA, rapA
Operon 66835 - 70048, having 2 genes: araA, araB
Operon 72229 - 75480, having 3 genes: thiQ, thiP, thiB
Operon 78848 - 83529, having 4 genes: leuD, leuC, leuB, leuA
Operon 85630 - 87848, having 2 genes: ilvI, ilvH
Operon 89634 - 100711, having 10 genes: mraZ, rsmH, ftsL, ftsI, mur

In [2]:
# Predicting operons for B. subtilis


def parse_ptt_file(ptt_file_path, gap_threshold=50, header_lines=3):
    """Predict operons from a tab-separated .ptt gene table.

    Each data line is expected to hold ``start..end`` in column 0, the strand
    in column 1 and the gene name in column 4. Consecutive genes are grouped
    into the same operon when they lie on the same strand and the distance
    from the previous gene's end to the current gene's start is strictly less
    than ``gap_threshold``. Only groups with more than one gene are reported.

    Args:
        ptt_file_path: Path of the .ptt file to read.
        gap_threshold: Exclusive upper bound on ``start - previous_end`` for
            two neighbouring genes to share an operon (default 50).
        header_lines: Number of leading lines to skip before the gene rows
            (default 3).

    Returns:
        A list of operons; each operon is a list of
        ``(start, end, gene_name)`` tuples in file order.

    Raises:
        ValueError: If a non-blank data line has a location column that is
            not of the form ``<int>..<int>``.
        IndexError: If a non-blank data line has fewer than five columns.
    """
    operons = []  # All multi-gene operons found so far
    present_operon = []  # Genes collected for the operon being built
    previous_end = -1  # End coordinate of the previously read gene
    previous_strand = '+'  # Strand of the previously read gene

    with open(ptt_file_path, 'r') as file:
        # Skip the header; a default of None avoids StopIteration on files
        # that are shorter than the header itself.
        for _ in range(header_lines):
            next(file, None)

        for line in file:
            stripped = line.strip()
            if not stripped:
                # Blank (e.g. trailing) lines carry no gene and would
                # otherwise fail in int('').
                continue

            parts = stripped.split('\t')
            start, end = map(int, parts[0].split('..'))
            strand, gene_name = parts[1], parts[4]

            same_operon = (
                bool(present_operon)
                and strand == previous_strand
                and (start - previous_end) < gap_threshold
            )
            if same_operon:
                present_operon.append((start, end, gene_name))
            else:
                # Close the previous group; single genes are not operons.
                if len(present_operon) > 1:
                    operons.append(present_operon)
                present_operon = [(start, end, gene_name)]

            previous_strand = strand
            previous_end = end

    # The last group is still open when the file ends.
    if len(present_operon) > 1:
        operons.append(present_operon)

    return operons


# Store the B. subtilis prediction under its own name so the E. coli result
# from the earlier cell is not silently overwritten by a different species.
bsubtilis_operons = parse_ptt_file("B_subtilis_168.ptt")

# Print information about each operon
for operon in bsubtilis_operons:
    start, end = operon[0][0], operon[-1][1]  # Span: first gene start to last gene end
    genes = ', '.join(gene for _, _, gene in operon)  # Comma-separated gene names
    print(f"Operon {start} - {end}, having {len(operon)} genes: {genes}")

# Print the total number of operons
print(len(bsubtilis_operons))

Operon 3206 - 4812, having 3 genes: yaaA, recF, yaaB
Operon 19062 - 20558, having 2 genes: yaaD, yaaE
Operon 22496 - 23769, having 2 genes: dck, dgk
Operon 26814 - 29705, having 4 genes: dnaX, yaaK, recR, yaaL
Operon 35845 - 37638, having 2 genes: xpaC, yaaN
Operon 37720 - 39797, having 2 genes: yaaO, tmk
Operon 39871 - 42858, having 5 genes: yaaQ, yaaR, holB, yaaT, yabA
Operon 42917 - 44799, having 3 genes: yabB, yazA, yabC
Operon 50087 - 51518, having 2 genes: rnmV, ksgA
Operon 54441 - 55672, having 2 genes: purR, yabJ
Operon 56352 - 58698, having 2 genes: glmU, prs
Operon 64817 - 68137, having 3 genes: yabM, yabN, yabO
Operon 68216 - 69545, having 3 genes: yabP, yabQ, divIC
Operon 73106 - 74825, having 2 genes: yabS, yabT
Operon 74929 - 76886, having 2 genes: tilS, hprT
Operon 79092 - 81695, having 3 genes: coaX, hslO, yacD
Operon 82864 - 88635, having 8 genes: pabB, pabA, pabC, sul, folB, folK, yazB, dusB
Operon 101449 - 106004, having 4 genes: ctsR, mcsA, mcsB, clpC
Operon 106096 

In [3]:
# Predicting Operons for Synechocystis


def parse_ptt_file(ptt_file_path, gap_threshold=50, header_lines=3):
    """Predict operons from a tab-separated .ptt gene table.

    Each data line is expected to hold ``start..end`` in column 0, the strand
    in column 1 and the gene name in column 4. Consecutive genes are grouped
    into the same operon when they lie on the same strand and the distance
    from the previous gene's end to the current gene's start is strictly less
    than ``gap_threshold``. Only groups with more than one gene are reported.

    Args:
        ptt_file_path: Path of the .ptt file to read.
        gap_threshold: Exclusive upper bound on ``start - previous_end`` for
            two neighbouring genes to share an operon (default 50).
        header_lines: Number of leading lines to skip before the gene rows
            (default 3).

    Returns:
        A list of operons; each operon is a list of
        ``(start, end, gene_name)`` tuples in file order.

    Raises:
        ValueError: If a non-blank data line has a location column that is
            not of the form ``<int>..<int>``.
        IndexError: If a non-blank data line has fewer than five columns.
    """
    operons = []  # All multi-gene operons found so far
    present_operon = []  # Genes collected for the operon being built
    previous_end = -1  # End coordinate of the previously read gene
    previous_strand = '+'  # Strand of the previously read gene

    with open(ptt_file_path, 'r') as file:
        # Skip the header; a default of None avoids StopIteration on files
        # that are shorter than the header itself.
        for _ in range(header_lines):
            next(file, None)

        for line in file:
            stripped = line.strip()
            if not stripped:
                # Blank (e.g. trailing) lines carry no gene and would
                # otherwise fail in int('').
                continue

            parts = stripped.split('\t')
            start, end = map(int, parts[0].split('..'))
            strand, gene_name = parts[1], parts[4]

            same_operon = (
                bool(present_operon)
                and strand == previous_strand
                and (start - previous_end) < gap_threshold
            )
            if same_operon:
                present_operon.append((start, end, gene_name))
            else:
                # Close the previous group; single genes are not operons.
                if len(present_operon) > 1:
                    operons.append(present_operon)
                present_operon = [(start, end, gene_name)]

            previous_strand = strand
            previous_end = end

    # The last group is still open when the file ends.
    if len(present_operon) > 1:
        operons.append(present_operon)

    return operons


# Store the Synechocystis prediction under its own name instead of reusing
# the misleading `ecoli_operons` variable from the earlier cells.
synechocystis_operons = parse_ptt_file("Synechocystis_PCC6803_uid159873.ptt")

# Print information about each operon
for operon in synechocystis_operons:
    start, end = operon[0][0], operon[-1][1]  # Span: first gene start to last gene end
    genes = ', '.join(gene for _, _, gene in operon)  # Comma-separated gene names
    print(f"Operon {start} - {end}, having {len(operon)} genes: {genes}")

# Print the total number of operons
print(len(synechocystis_operons))

Operon 15937 - 17687, having 2 genes: fecE, fecB
Operon 21838 - 25144, having 3 genes: sll1204, sll1203, sll1202
Operon 32524 - 33544, having 2 genes: sll1405, exbB
Operon 39802 - 43397, having 2 genes: fhuA, fecB
Operon 43571 - 44885, having 2 genes: fecB, slr1493
Operon 48067 - 50231, having 4 genes: sll1400, ssl2733, sll1399, psbW
Operon 55669 - 57224, having 2 genes: rfbD, msrA
Operon 61834 - 64914, having 3 genes: slr1114, slr1115, slr1116
Operon 70016 - 74229, having 4 genes: sll1062, sll1061, sll1060, ssl2069
Operon 81696 - 83853, having 2 genes: sll1054, sll1053
Operon 102930 - 106539, having 2 genes: sll0710, sll0709
Operon 108290 - 110313, having 2 genes: slr0722, slr0723
Operon 114874 - 116477, having 2 genes: slr0730, slr0731
Operon 118333 - 119962, having 2 genes: sll0703, sll0702
Operon 123501 - 124348, having 2 genes: sll0700, sll0699
Operon 134437 - 138112, having 2 genes: infB, slr0236
Operon 142559 - 143826, having 2 genes: slr0240, slr0241
Operon 144061 - 145081, hav

In [4]:
# Predicting Operons for Halobacterium


def parse_ptt_file(ptt_file_path, gap_threshold=50, header_lines=3):
    """Predict operons from a tab-separated .ptt gene table.

    Each data line is expected to hold ``start..end`` in column 0, the strand
    in column 1 and the gene name in column 4. Consecutive genes are grouped
    into the same operon when they lie on the same strand and the distance
    from the previous gene's end to the current gene's start is strictly less
    than ``gap_threshold``. Only groups with more than one gene are reported.

    Args:
        ptt_file_path: Path of the .ptt file to read.
        gap_threshold: Exclusive upper bound on ``start - previous_end`` for
            two neighbouring genes to share an operon (default 50).
        header_lines: Number of leading lines to skip before the gene rows
            (default 3).

    Returns:
        A list of operons; each operon is a list of
        ``(start, end, gene_name)`` tuples in file order.

    Raises:
        ValueError: If a non-blank data line has a location column that is
            not of the form ``<int>..<int>``.
        IndexError: If a non-blank data line has fewer than five columns.
    """
    operons = []  # All multi-gene operons found so far
    present_operon = []  # Genes collected for the operon being built
    previous_end = -1  # End coordinate of the previously read gene
    previous_strand = '+'  # Strand of the previously read gene

    with open(ptt_file_path, 'r') as file:
        # Skip the header; a default of None avoids StopIteration on files
        # that are shorter than the header itself.
        for _ in range(header_lines):
            next(file, None)

        for line in file:
            stripped = line.strip()
            if not stripped:
                # Blank (e.g. trailing) lines carry no gene and would
                # otherwise fail in int('').
                continue

            parts = stripped.split('\t')
            start, end = map(int, parts[0].split('..'))
            strand, gene_name = parts[1], parts[4]

            same_operon = (
                bool(present_operon)
                and strand == previous_strand
                and (start - previous_end) < gap_threshold
            )
            if same_operon:
                present_operon.append((start, end, gene_name))
            else:
                # Close the previous group; single genes are not operons.
                if len(present_operon) > 1:
                    operons.append(present_operon)
                present_operon = [(start, end, gene_name)]

            previous_strand = strand
            previous_end = end

    # The last group is still open when the file ends.
    if len(present_operon) > 1:
        operons.append(present_operon)

    return operons


# Store the Halobacterium prediction under its own name instead of reusing
# the misleading `ecoli_operons` variable from the earlier cells.
halobacterium_operons = parse_ptt_file("Halobacterium_NRC1.ptt")

# Print information about each operon
for operon in halobacterium_operons:
    start, end = operon[0][0], operon[-1][1]  # Span: first gene start to last gene end
    genes = ', '.join(gene for _, _, gene in operon)  # Comma-separated gene names
    print(f"Operon {start} - {end}, having {len(operon)} genes: {genes}")

# Print the total number of operons
print(len(halobacterium_operons))

Operon 248 - 3254, having 3 genes: -, yvrO, -
Operon 5646 - 9860, having 3 genes: glmS, graD5, graD2
Operon 18357 - 19530, having 2 genes: -, -
Operon 26648 - 29429, having 2 genes: -, -
Operon 29476 - 30014, having 2 genes: -, -
Operon 31838 - 32355, having 2 genes: -, -
Operon 33182 - 33915, having 2 genes: -, -
Operon 36321 - 39432, having 3 genes: -, -, -
Operon 41880 - 44149, having 2 genes: -, -
Operon 64976 - 65472, having 2 genes: -, -
Operon 66054 - 66534, having 2 genes: -, -
Operon 74752 - 77867, having 2 genes: moeA2, moeA1
Operon 77923 - 79597, having 2 genes: pimT1, -
Operon 87668 - 90587, having 2 genes: rmeM, rmeS
Operon 90639 - 94359, having 2 genes: rmeR, -
Operon 102034 - 102692, having 2 genes: -, -
Operon 102863 - 103146, having 2 genes: -, -
Operon 104073 - 106791, having 2 genes: trp1, -
Operon 106803 - 107562, having 2 genes: -, -
Operon 112247 - 113712, having 2 genes: hpyA, aup
Operon 117867 - 118700, having 2 genes: -, -
Operon 119759 - 120364, having 2 genes

In [5]:
# Predicting operons for the crop microbiome.
def identify_and_display_operons(gff_filepath, max_gap=50, min_genes=1):
    """Group neighbouring GFF records into operon candidates and print them.

    Every non-comment line that contains the text ``gene`` is read as a
    tab-separated record with the sequence id in column 1, start/end in
    columns 4 and 5, and the strand in column 7. A record joins the current
    group when it is on the same sequence and strand as the previous record
    and ``abs(start - previous_end)`` is at most ``max_gap``; otherwise the
    current group is printed and a new one is started.

    Args:
        gff_filepath: Path of the GFF file to scan.
        max_gap: Largest allowed ``abs(start - previous_end)`` between two
            records of the same group (default 50).
        min_genes: Smallest group size that is printed (default 1, i.e.
            single-gene groups are printed too).

    Returns:
        None. Each group is written to stdout as ``Operon: [(start, end), ...]``.

    Raises:
        ValueError: If a selected record has non-integer start/end columns.
    """
    current_operon = []
    previous_seqid = None  # Sequence (contig) id of the previous record
    previous_gene_end = -1
    previous_strand_direction = ''

    def print_operon(operon):
        # Empty groups occur before the first record and are never printed.
        if operon and len(operon) >= min_genes:
            print(f"Operon: {operon}")

    with open(gff_filepath) as gff:
        for record in gff:
            # NOTE(review): this is a substring match on the whole line, not a
            # check of the feature-type column — confirm that is intended.
            if record.startswith("#") or 'gene' not in record:
                continue

            fields = record.strip().split('\t')
            if len(fields) < 7:
                # Not a full feature row (e.g. a sequence header that merely
                # contains "gene"); it has no coordinates to read.
                continue

            seqid = fields[0]
            gene_start, gene_end, strand_direction = int(fields[3]), int(fields[4]), fields[6]

            # Coordinates restart on every sequence, so a change of seqid must
            # end the group even when the numeric distance looks small.
            starts_new_group = (
                seqid != previous_seqid
                or previous_strand_direction != strand_direction
                or abs(gene_start - previous_gene_end) > max_gap
            )
            if starts_new_group:
                print_operon(current_operon)
                current_operon = []

            current_operon.append((gene_start, gene_end))

            previous_seqid = seqid
            previous_gene_end, previous_strand_direction = gene_end, strand_direction

    # Flush the group that is still open at end of file.
    print_operon(current_operon)


# Scan the GFF annotation file and print each operon candidate that is found.
identify_and_display_operons('2088090036.gff')


Operon: [(1, 849)]
Operon: [(3, 131)]
Operon: [(2, 313)]
Operon: [(2306, 3409)]
Operon: [(3, 200)]
Operon: [(57, 1028)]
Operon: [(3, 188)]
Operon: [(2, 172)]
Operon: [(976, 2079)]
Operon: [(2306, 3409)]
Operon: [(3, 643)]
Operon: [(136, 558)]
Operon: [(2, 286)]
Operon: [(3, 458)]
Operon: [(197, 382)]
Operon: [(25, 150), (194, 586)]
Operon: [(655, 1083)]
Operon: [(149, 1252)]
Operon: [(31, 966)]
Operon: [(1, 135)]
Operon: [(25, 522)]
Operon: [(3, 200)]
Operon: [(1850, 2017)]
Operon: [(29, 160)]
Operon: [(3, 188)]
Operon: [(1, 129)]
Operon: [(1, 129)]
Operon: [(3, 178)]
Operon: [(966, 1625)]
Operon: [(149, 1252)]
Operon: [(25, 150)]
Operon: [(149, 1252)]
Operon: [(1, 99)]
Operon: [(2310, 3413)]
Operon: [(51, 1625)]
Operon: [(2308, 3411)]
Operon: [(3, 188)]
Operon: [(3, 188)]
Operon: [(1, 135)]
Operon: [(1, 795)]
Operon: [(60, 1268)]
Operon: [(29, 160)]
Operon: [(976, 2079)]
Operon: [(3, 176)]
Operon: [(84, 401)]
Operon: [(2, 394)]
