In [46]:
import pandas as pd

In [47]:
import os

# Get the current working directory
current_dir = os.getcwd()

# Get the parent directory
parent_dir = os.path.dirname(current_dir)

print("Parent Directory:", parent_dir)

Parent Directory: /gpfs/fs1/home/j/jparkin/wongkoji/PathwayCompleteness


In [102]:
# generate a pandas dataframe from our kraken2 report, delimited by tab ('\t')
k2_report_file = "/scratch/j/jparkin/wongkoji/outputs/kraken2/pm_mtx.kreport2"
kraken_out_file = "/scratch/j/jparkin/wongkoji/outputs/kraken2/pm_mtx_kraken.out"
fastq_file_path = "/scratch/j/jparkin/wongkoji/data/pm_merged.fq"

report_df = pd.read_csv(filepath_or_buffer=k2_report_file, 
                        sep='\t',
                        names=["%_clade", "#_clade_frags", "#_taxon_frags", "class", "ncbi_taxon_id", "name"])

# Added a header explaining each column of the data.

# %_clade: Percentage of fragments covered by the clade rooted at this taxon
# #_clade_frags: Number of fragments covered by the clade rooted at this taxon
# #_taxon_frags: Number of fragments assigned directly to this taxon
# class: A rank code indicating (U)nclassified, (R)oot, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, (S)pecies.
# taxon_id: The Taxonomic ID number from NCBI
# name: scientific name of taxon

# print(report_df["class"].unique())


In [104]:
print(len(report_df[report_df['class'].isin(['S', 'S1', 'S2', 'S3', 'S4'])]))
# report_df['class']
lst = ['Pseudomonas chlororaphis subsp. piscium', 'Pseudomonas chlororaphis subsp. aurantiaca', 'Pseudomonas chlororaphis subsp. aureofaciens', 'Pseudomonas syringae pv. actinidiae']
for item in lst:
    if item in report_df['name']:
        print("Found")
    else:
        print(item)
'Pseudomonas sp. A2' in report_df['name']

622
Pseudomonas chlororaphis subsp. piscium
Pseudomonas chlororaphis subsp. aurantiaca
Pseudomonas chlororaphis subsp. aureofaciens
Pseudomonas syringae pv. actinidiae


False

In [None]:
report_df['class'].unique()

In [50]:
# get only classified species who cover 0.05% of all fragments
def filter_kraken2(class_name):
    if class_name == "S":
        return "S"
    elif "S" in class_name:
        return "S"
    else:
        return class_name
# report_df = report_df[report_df['class'].isin(["S", "S1", "S2", "S3"])]
report_df = report_df[report_df['class'].isin(["S"])]

report_df = report_df[report_df['%_clade'] >= 0.05]

print("sum of clade_% fragments for classified species:", round(report_df["%_clade"].sum(),2))
print("number of species identified", len(report_df))
report_df.head(20)

sum of clade_% fragments for classified species: 67.99
number of species identified 21


Unnamed: 0,%_clade,#_clade_frags,#_taxon_frags,class,ncbi_taxon_id,name
10,4.42,52890,52881,S,47883,Pseudomonas synxantha
12,4.16,49840,49840,S,76760,Pseudomonas rhodesiae
13,4.16,49774,49774,S,76758,Pseudomonas orientalis
14,3.53,42291,42253,S,380021,Pseudomonas protegens
16,3.1,37118,36090,S,294,Pseudomonas fluorescens
26,0.08,939,939,S,1114970,Pseudomonas ogarae
49,16.23,194296,168457,S,587753,Pseudomonas chlororaphis
60,8.29,99227,62886,S,317,Pseudomonas syringae
117,2.65,31682,31682,S,78327,Pseudomonas mosselii
118,2.53,30338,29940,S,303,Pseudomonas putida


In [91]:
report_df['name']

10                           Pseudomonas synxantha
12                           Pseudomonas rhodesiae
13                          Pseudomonas orientalis
14                           Pseudomonas protegens
16                         Pseudomonas fluorescens
26                              Pseudomonas ogarae
49                        Pseudomonas chlororaphis
60                            Pseudomonas syringae
117                           Pseudomonas mosselii
118                             Pseudomonas putida
133                          Pseudomonas monteilii
144                      Pseudomonas citronellolis
145                         Pseudomonas aeruginosa
182                             Pseudomonas simiae
183                          Pseudomonas siliginis
184                        Pseudomonas atacamensis
186                             Pseudomonas sp. A2
187                        Pseudomonas sp. B21-010
406                     Pseudomonas brassicacearum
409                           P

In [51]:
#TODO: extract k-mer count information corresponding to data in stdout in kraken.out
k2_output = pd.read_csv(filepath_or_buffer=kraken_out_file, 
                        sep='\t',
                        names=["is_classified", "seq_id", "taxon_id", "bp_length", "LCA_mapping"])

In [52]:
k2_output.head()
print("num reads:", len(k2_output))

num reads: 1197492


In [53]:
# Keep only reads that are in filtered classes in report_df
k2_output_filt = k2_output[k2_output["taxon_id"].isin(report_df['ncbi_taxon_id'])]
k2_output_filt = k2_output_filt[k2_output_filt["taxon_id"] != 0]
print("num reads:", len(k2_output_filt))
k2_output_filt.head()

num reads: 750143


Unnamed: 0,is_classified,seq_id,taxon_id,bp_length,LCA_mapping
0,C,lcl|NZ_CP010896.1_cds_WP_010207718.1_1-2/1,321846,199,286:10 321846:10 286:1 321846:26 286:5 321846:...
1,C,lcl|NZ_CP010896.1_cds_WP_010207718.1_1-4/1,321846,201,321846:67 2:5 321846:20 286:1 321846:5 286:69
2,C,lcl|NZ_CP010896.1_cds_WP_010207719.1_2-4/1,321846,216,321846:6 1224:5 286:29 321846:5 286:9 321846:5...
3,C,lcl|NZ_CP010896.1_cds_WP_010207719.1_2-2/1,321846,207,321846:3 286:7 1236:1 135621:2 286:2 2:5 286:3...
4,C,lcl|NZ_CP010896.1_cds_WP_021491432.1_3-2/1,321846,187,286:39 321846:3 286:4 321846:5 286:9 321846:3 ...


In [54]:
# And can create a map of taxon_id to seq_ids
seq_by_taxon = k2_output_filt.groupby("taxon_id")['seq_id'].apply(list).to_dict()
# number of taxon ids in our dataset
len(seq_by_taxon.keys())

21

In [55]:
# List all the taxon ids
print("Taxon ID:", list(seq_by_taxon.keys()))

# List the first five sequence ids by taxon 321846
print("Taxon 321846 Sequence IDs:", seq_by_taxon[321846][0:5])

# Get the number of taxons with ID 321846 in the list
print("# Sequences in taxon 321846:", len(seq_by_taxon[321846]))

Taxon ID: [287, 294, 303, 317, 47883, 53408, 76758, 76759, 76760, 78327, 107445, 321846, 380021, 587753, 930166, 1114970, 2219225, 2565368, 2745498, 2842346, 2895471]
Taxon 321846 Sequence IDs: ['lcl|NZ_CP010896.1_cds_WP_010207718.1_1-2/1', 'lcl|NZ_CP010896.1_cds_WP_010207718.1_1-4/1', 'lcl|NZ_CP010896.1_cds_WP_010207719.1_2-4/1', 'lcl|NZ_CP010896.1_cds_WP_010207719.1_2-2/1', 'lcl|NZ_CP010896.1_cds_WP_021491432.1_3-2/1']
# Sequences in taxon 321846: 51026


In [56]:
# # import regex to extract our k-mer count information
# import re

# # This block is for testing out how to get the kmer-counts in a sequence
# index = 0
# id = k2_output["taxon_id"][index]
# print("Assigned ID:", id)
# lca_mapping = k2_output["LCA_mapping"][index]
# mappings = lca_mapping.split()
# print("Mappings:", mappings)
# total = 0
# match_total = 0
# # loop over LCA mappings for this particular sequence
# for mapping in mappings:
#     query = fr'([0-9]*):([0-9]*)'
#     match = re.match(query, mapping) # use regex match to find query mapping
#     q_id = match.group(1)
#     kmer_count = int(match.group(2))
#     # we want to calculate the confidence score this assignment to the taxon_id
#     print(f"k-mer count for taxon {q_id}:", kmer_count)
#     if int(id) == int(q_id):
#         total += kmer_count
#         match_total += kmer_count
#     else:
#         total += kmer_count
# score = match_total/total
# print("Score:", score)

In [57]:
import re

# function to calculate k2 assignment score
def calculate_score(df, index: int) -> float:
    """
    Calculates the confidence score of taxonomic assignment to a sequence by
    dividing the number of k-mers mapped to a taxon by the total number of 
    k-mers from that sequence.

    Args:
        index (int): an index into the kraken2 output DataFrame, where each 
            index corresponds to a sequence
    """
    tax_id = df["taxon_id"][index]
    seq_id = k2_output["seq_id"][index]
    print("Assigned ID:", tax_id)
    lca_mapping = df["LCA_mapping"][index]
    mappings = lca_mapping.split()
    print("Mappings:", mappings)
    total = 0
    match_total = 0
    # loop over LCA mappings for this particular sequence
    for mapping in mappings:
        query = fr'([0-9]*):([0-9]*)'
        match = re.match(query, mapping) # use regex match to find query mapping
        q_id = match.group(1)
        kmer_count = int(match.group(2))
        # we want to calculate the confidence score this assignment to the taxon_id
        if int(tax_id) == int(q_id):
            total += kmer_count
            match_total += kmer_count
        else:
            total += kmer_count
    score = match_total/total
    print(f"Score for sequence {seq_id} assigned to taxon {tax_id}:", score)
    return score

In [58]:
# Now we can use this function to calculate the score from any index
calculate_score(k2_output, 1)
calculate_score(k2_output, 15)

Assigned ID: 321846
Mappings: ['321846:67', '2:5', '321846:20', '286:1', '321846:5', '286:69']
Score for sequence lcl|NZ_CP010896.1_cds_WP_010207718.1_1-4/1 assigned to taxon 321846: 0.5508982035928144
Assigned ID: 321846
Mappings: ['286:48', '321846:11', '286:1', '321846:17', '286:36', '135621:2', '286:4', '1224:5', '286:2', '2:5', '1224:2', '286:2', '1236:2', '286:2', '1224:3', '1236:5', '286:19']
Score for sequence lcl|NZ_CP010896.1_cds_WP_021491428.1_9-2/1 assigned to taxon 321846: 0.1686746987951807


0.1686746987951807

In [59]:
import pandas as pd
from Bio import SeqIO

def fasta_to_dataframe(fasta_file):
    data = {
        'seq_id': [],
        'sequence': [],
        'bp_length': []
    }

    for record in SeqIO.parse(fasta_file, "fasta"):
        data['seq_id'].append(record.id)
        data['sequence'].append(str(record.seq))
        data['bp_length'].append(len(record.seq))

    df = pd.DataFrame(data)
    return df

# fasta_file_path = "/home/j/jparkin/wongkoji/data/merged_paired_mtx.fasta"
# fasta_df = fasta_to_dataframe(fasta_file_path)

# Display the fasta df
# print(fasta_df)

In [60]:
import pandas as pd
from Bio import SeqIO

def fastq_to_dataframe(fastq_file):
    data = {
        'seq_id': [],
        'sequence': [],
        'bp_length': []
    }

    for record in SeqIO.parse(fastq_file, "fastq"):
        data['seq_id'].append(record.id)
        data['sequence'].append(str(record.seq))
        data['bp_length'].append(len(record.seq))

    df = pd.DataFrame(data)
    return df

fasta_df = fastq_to_dataframe(fastq_file_path)

# Display the fasta df
fasta_df.head()

Unnamed: 0,seq_id,sequence,bp_length
0,lcl|NZ_CP010896.1_cds_WP_010207718.1_1-2/1,GAGAATGAAGAGCCGTCCCGCGACAGCTTTGATCCGATGGCAGGCG...,199
1,lcl|NZ_CP010896.1_cds_WP_010207718.1_1-4/1,AGACGACTTGGAGGGCGCAGGAGCAGGAGACGGCGCACTGACAGCC...,201
2,lcl|NZ_CP010896.1_cds_WP_010207719.1_2-4/1,TGTGCTCTATGCGCGCCGATATCGGTCAGCCGGACCGTCACCAGGT...,216
3,lcl|NZ_CP010896.1_cds_WP_010207719.1_2-2/1,CTTGCCATCAACCAGCTTGGAGGTGAAGGTGAACTCGCCGGTGGTC...,207
4,lcl|NZ_CP010896.1_cds_WP_021491432.1_3-2/1,CCCGTCAATGCGAATCTGGAACTCTCCGCCGCGATCACGGGATATC...,187


In [61]:
fasta_df["seq_id"]

0             lcl|NZ_CP010896.1_cds_WP_010207718.1_1-2/1
1             lcl|NZ_CP010896.1_cds_WP_010207718.1_1-4/1
2             lcl|NZ_CP010896.1_cds_WP_010207719.1_2-4/1
3             lcl|NZ_CP010896.1_cds_WP_010207719.1_2-2/1
4             lcl|NZ_CP010896.1_cds_WP_021491432.1_3-2/1
                               ...                      
1197487    lcl|NZ_CP115258.1_cds_WP_128447624.1_5908-2/1
1197488    lcl|NZ_CP115258.1_cds_WP_023089992.1_5909-4/1
1197489    lcl|NZ_CP115258.1_cds_WP_023089992.1_5909-2/1
1197490    lcl|NZ_CP115258.1_cds_WP_022581007.1_5910-4/1
1197491    lcl|NZ_CP115258.1_cds_WP_022581007.1_5910-2/1
Name: seq_id, Length: 1197492, dtype: object

In [62]:
a = seq_by_taxon[321846]
sequences_taxon_321846 = fasta_df[fasta_df["seq_id"].isin(a)]


In [63]:
print(sequences_taxon_321846[0:5])

                                       seq_id  \
0  lcl|NZ_CP010896.1_cds_WP_010207718.1_1-2/1   
1  lcl|NZ_CP010896.1_cds_WP_010207718.1_1-4/1   
2  lcl|NZ_CP010896.1_cds_WP_010207719.1_2-4/1   
3  lcl|NZ_CP010896.1_cds_WP_010207719.1_2-2/1   
4  lcl|NZ_CP010896.1_cds_WP_021491432.1_3-2/1   

                                            sequence  bp_length  
0  GAGAATGAAGAGCCGTCCCGCGACAGCTTTGATCCGATGGCAGGCG...        199  
1  AGACGACTTGGAGGGCGCAGGAGCAGGAGACGGCGCACTGACAGCC...        201  
2  TGTGCTCTATGCGCGCCGATATCGGTCAGCCGGACCGTCACCAGGT...        216  
3  CTTGCCATCAACCAGCTTGGAGGTGAAGGTGAACTCGCCGGTGGTC...        207  
4  CCCGTCAATGCGAATCTGGAACTCTCCGCCGCGATCACGGGATATC...        187  


In [64]:
print(f"There are {len(sequences_taxon_321846)} sequences binned in taxon 321846")

There are 51026 sequences binned in taxon 321846


In [65]:
# id_to_seq_map = {}
# for tax_id in list(seq_by_taxon.keys()):
#     print(tax_id)
#     seq_ids = seq_by_taxon[tax_id]
#     id_to_seq_map[tax_id] = fasta_df[fasta_df["seq_id"].isin(seq_ids)]

# need to find a more efficient way

In [66]:
fasta_df = fasta_df[fasta_df["seq_id"].isin(k2_output_filt["seq_id"])]
merged_df = pd.merge(fasta_df, k2_output[['seq_id', 'taxon_id', "LCA_mapping"]], on='seq_id', how='left')

In [67]:
merged_df.dropna(inplace=True)
# Remove unclassified (taxon 0) reads
merged_df = merged_df[merged_df["taxon_id"] != 0]
print("number of taxons:", len(merged_df['taxon_id'].unique()))
print("number of reads:", len(merged_df))

number of taxons: 21
number of reads: 750143


In [68]:
k2_output_filt["taxon_id"]

0          321846
1          321846
2          321846
3          321846
4          321846
            ...  
1197487       287
1197488       287
1197489       287
1197490       287
1197491       287
Name: taxon_id, Length: 750143, dtype: int64

In [69]:
# Check for discrepancies
# discrepancies = merged_df[merged_df['taxon_id'] != pd.Series(k2_output_filt['taxon_id'])]

# # If there are no discrepancies, discrepancies dataframe will be empty
# if discrepancies.empty:
#     print("All taxon_ids were added correctly.")
# else:
#     print("Discrepancies found:")
#     print(discrepancies)

In [70]:
# Now that we have a DataFrame containing the sequence_ids, sequences, and taxon_ids, we can perform EC analysis with DeepEC
# First, let's create a fasta file containing all the seq_ids and sequences corresponding to a taxon

# Example taxon_id to filter
target_taxon_id = 321846

# Filter rows in merged_df based on the target_taxon_id
filtered_df = merged_df[merged_df['taxon_id'] == target_taxon_id]

# Extract seq_ids and sequences
seq_ids = filtered_df['seq_id'].tolist()
sequences = filtered_df['sequence'].tolist()

# Writing to a FASTA file
fasta_file = '/home/j/jparkin/wongkoji/data/taxon_321846_reads.fasta'

with open(fasta_file, 'w') as fasta_file:
    for seq_id, sequence in zip(seq_ids, sequences):
        fasta_file.write(f">{seq_id}\n{sequence}\n")

In [71]:
merged_df["taxon_id"].unique()

array([ 321846, 1114970,     294,   47883,   76760,     317,   53408,
       2895471,  930166,  587753,  380021, 2745498,     303,     287,
         76758, 2842346,   78327,   76759, 2565368,  107445, 2219225])

In [72]:
# Now that we have a DataFrame containing the sequence_ids, sequences, and taxon_ids, we can perform EC analysis with DeepEC
# First, let's create a fasta file containing all the seq_ids and sequences corresponding to a taxon

all_taxa = merged_df["taxon_id"].unique()
# Example taxon_id to filter
target_taxon_id = 321846

# Filter rows in merged_df based on the target_taxon_id
filtered_df = merged_df[merged_df['taxon_id'] == target_taxon_id]

# Extract seq_ids and sequences
seq_ids = filtered_df['seq_id'].tolist()
sequences = filtered_df['sequence'].tolist()

# Writing to a FASTA file
fasta_file = '/home/j/jparkin/wongkoji/data/taxon_321846_reads.fasta'

with open(fasta_file, 'w') as fasta_file:
    for seq_id, sequence in zip(seq_ids, sequences):
        fasta_file.write(f">{seq_id}\n{sequence}\n")

In [73]:
from Bio import SeqIO
import pandas as pd

# Step 1: Read FASTQ file and create DataFrame
def fastq_to_df(fastq_file):
    records = SeqIO.parse(fastq_file, "fastq")
    data = [(record.id, str(record.seq), record.letter_annotations["phred_quality"]) for record in records]
    df = pd.DataFrame(data, columns=["Sequence_ID", "Sequence", "Quality_Scores"])
    return df

# Step 2: Filter DataFrame based on taxon sequence IDs
def filter_sequences(df, taxon_sequence_ids):
    filtered_df = df[df['Sequence_ID'].isin(taxon_sequence_ids)]
    return filtered_df

# Step 3: Write filtered sequences to a new FASTQ file
def write_filtered_fastq(filtered_df, output_file):
    with open(output_file, "w") as f:
        for index, row in filtered_df.iterrows():
            f.write(f"@{row['Sequence_ID']}\n")
            f.write(f"{row['Sequence']}\n+\n{''.join(map(chr, pd.Series(row['Quality_Scores']) + 33))}\n")

# # Example usage
taxon = target_taxon_id
# fastq_file = fastq_file_path
# output_file = f"/home/j/jparkin/wongkoji/data/{taxon}.fastq"
# # Filter rows in merged_df based on the target_taxon_id
# filtered_df = merged_df[merged_df['taxon_id'] == taxon]
# taxon_sequence_ids = filtered_df['seq_id'] # Example list of sequence IDs for a specific taxon

# # Step 1: Read FASTQ file and create DataFrame
# df = fastq_to_df(fastq_file)

# # Step 2: Filter DataFrame based on taxon sequence IDs
# filtered_df = filter_sequences(df, taxon_sequence_ids)

# # Step 3: Write filtered sequences to a new FASTQ file
# write_filtered_fastq(filtered_df, output_file)


In [74]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

def translate_without_internal_stops(seq):
    stop_present = False
    translated_seq = seq.translate()
    if '*' in translated_seq:
        return None
    return str(translated_seq)

def main(input_file, output_file):
    valid_records = []
    with open(input_file, 'r') as f:
        records = SeqIO.parse(f, 'fasta')
        for record in records:
            translated_seq = translate_without_internal_stops(record.seq)
            if translated_seq:
                protein_record = SeqRecord(Seq(translated_seq), id=record.id, description="")
                valid_records.append(protein_record)

    with open(output_file, 'w') as f:
        SeqIO.write(valid_records, f, 'fasta')

input_file = f'/home/j/jparkin/wongkoji/data/taxon_{taxon}_reads.fasta'
output_file = f'/home/j/jparkin/wongkoji/data/taxon_{taxon}_reads_translated.fasta'
main(input_file, output_file)



In [75]:
import subprocess
#TODO: Get all sequences per taxon and add them to a single fasta for all taxons
taxon_ids = list(seq_by_taxon.keys())

#TODO: Translate each one

#TODO: Pass through deepEC

In [76]:
deepec_file = "/scratch/j/jparkin/wongkoji/deepec_outputs/deepEC_321846/DeepECv2_result.txt"
deepec_df = pd.read_csv(deepec_file, sep='\t')
print(len(deepec_df))
# Drop any unannotated reads
deepec_df = deepec_df[deepec_df['prediction'] != 'None']
# Number of unique enzymes (ECs) present in sample
len(deepec_df['prediction'].unique()) - 1
deepec_df

19392


Unnamed: 0,sequence_ID,prediction
4,lcl|NZ_CP010896.1_cds_WP_044286386.1_10-4/1,EC:6.1.1.14
7,lcl|NZ_CP010896.1_cds_WP_003187265.1_11-4/1,EC:6.1.1.14
16,lcl|NZ_CP010896.1_cds_WP_010207746.1_23-4/1,EC:1.3.3.3
17,lcl|NZ_CP010896.1_cds_WP_010207746.1_23-2/1,EC:1.3.3.3
18,lcl|NZ_CP010896.1_cds_WP_010207747.1_24-4/1,EC:1.1.1.25
...,...,...
19341,lcl|NZ_CP117448.1_cds_WP_021492572.1_5804-4/1,EC:1.1.1.3
19356,lcl|NZ_CP117448.1_cds_WP_305469641.1_5821-2/1,EC:3.6.-.-
19360,lcl|NZ_CP117448.1_cds_WP_042571091.1_5827-4/1,EC:3.5.4.16
19363,lcl|NZ_CP117448.1_cds_WP_044288079.1_5831-4/1,EC:3.5.4.19


In [77]:
# still contains incomplete ECs eg. EC:3.6.-.-
#TODO: these incomplete ECs
def clean_ec_data(ec_number):
    if not ec_number:
        return "None"
    if '-' in ec_number:
        return "None"
    # TODO: split these
    elif ';' in ec_number:
        return "None"
    else:
        return ec_number
    
deepec_df['prediction'] = deepec_df['prediction'].apply(clean_ec_data)

# Drop any unannotated reads
deepec_df = deepec_df[deepec_df['prediction'] != 'None']
# format ECs with lowercase for KEGG API
deepec_df['prediction'] = deepec_df['prediction'].apply(str.lower)
deepec_df.tail()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,sequence_ID,prediction
19337,lcl|NZ_CP117448.1_cds_WP_010207600.1_5797-4/1,ec:3.1.3.15
19341,lcl|NZ_CP117448.1_cds_WP_021492572.1_5804-4/1,ec:1.1.1.3
19360,lcl|NZ_CP117448.1_cds_WP_042571091.1_5827-4/1,ec:3.5.4.16
19363,lcl|NZ_CP117448.1_cds_WP_044288079.1_5831-4/1,ec:3.5.4.19
19370,lcl|NZ_CP117448.1_cds_WP_021491450.1_5840-4/1,ec:6.5.1.2


In [78]:
from Bio.KEGG.REST import kegg_link

# Define the pathway ID
pathway_id = 'ec00300'  # 'path:' prefix indicates a pathway in KEGG

In [79]:
# creating a df for our ecs
enzymes_file = os.path.join(parent_dir, "db/enzymes_ec.tsv")
enzymes_df = pd.read_csv(enzymes_file, delimiter='\t', names=["ec", "name"])
# creating a df for our pathways
pathways_file = os.path.join(parent_dir, "db/pathways_ec.tsv")
pathways_df = pd.read_csv(pathways_file, delimiter='\t', names=["pathway", "function"])

In [80]:
# load in our ec-pathways relationship
import json
json_file = open(os.path.join(parent_dir, "db/pathways.json"), "r")
ec_paths = json.load(json_file)

In [81]:
# rows = [(key, value) for key, values in ec_paths.items() for value in values]
# paths_to_ec_df = pd.DataFrame(rows, columns=['pathway', 'ec'])
# paths_to_ec_df.head()

paths_to_ec_df = pd.read_csv(os.path.join(parent_dir, "db/pathways_ecs.csv"))

In [82]:
def get_pathways(ec_id, df: pd.DataFrame):
    """get a list of pathways given an EC number in format ec:#.#.#.#"""
    if "ec:" not in ec_id:
        raise NameError
    return list(df[df['ec'] == ec_id]['pathway'])

def get_enzymes(pathway_id, df: pd.DataFrame):
    """get a list of enzymes given a KEGG pathway in format ec#####"""
    return list(df[df['pathway'] == pathway_id]['ec']) 

In [83]:
# to get all the ecs for a pathway
paths_to_ec_df[paths_to_ec_df['pathway'] == "ec00970"].head()
# to get all the pathways for an ec
paths_to_ec_df[paths_to_ec_df['ec'] == "ec:1.1.1.3"].head()

Unnamed: 0,pathway,ec
96,ec01100,ec:1.1.1.3
2866,ec01110,ec:1.1.1.3
4186,ec01120,ec:1.1.1.3
6759,ec00260,ec:1.1.1.3
6829,ec00270,ec:1.1.1.3


In [84]:
ec_pathways_dict = {}
# Finds pathways an enzyme is present in within half a second
def find_pathways_opt(ec_number, ec_pathways_dict):
    """Finds pathways an enzyme is present

    Args:
        ec_number (str): must be a valid Enzyme Commission number

    Returns:
        list of pathways
    """
    if 'ec' not in ec_number:
        ec_number = f"ec:{ec_number}"
    pathways = get_pathways(ec_number, paths_to_ec_df)
    for path in pathways:
        if path not in ec_pathways_dict:
            ec_pathways_dict[path] = [ec_number]
        else:
            ec_pathways_dict[path].append(ec_number)
    return ec_pathways_dict

In [85]:
present_enzymes = list(set(deepec_df['prediction']))
for ec in present_enzymes:
    find_pathways_opt(ec, ec_pathways_dict)
print(ec_pathways_dict)

{'ec00970': ['ec:6.1.1.1', 'ec:6.1.1.18', 'ec:6.1.1.5', 'ec:6.3.5.7', 'ec:6.1.1.10', 'ec:2.1.2.9', 'ec:6.1.1.6', 'ec:6.1.1.11', 'ec:6.1.1.14', 'ec:6.1.1.21', 'ec:6.1.1.19', 'ec:6.1.1.15', 'ec:6.1.1.7', 'ec:6.1.1.16', 'ec:6.1.1.20', 'ec:6.1.1.4', 'ec:6.1.1.9', 'ec:6.1.1.2'], 'ec01100': ['ec:2.3.1.15', 'ec:2.7.1.23', 'ec:1.14.13.131', 'ec:2.6.1.1', 'ec:5.4.99.2', 'ec:1.14.11.1', 'ec:2.7.9.2', 'ec:6.3.1.11', 'ec:5.3.3.2', 'ec:5.3.1.16', 'ec:2.7.4.23', 'ec:4.2.1.9', 'ec:2.5.1.18', 'ec:3.5.2.7', 'ec:2.5.1.3', 'ec:1.11.1.7', 'ec:3.5.3.4', 'ec:4.3.2.3', 'ec:2.1.3.3', 'ec:2.7.7.33', 'ec:4.2.1.164', 'ec:2.3.1.41', 'ec:2.1.1.152', 'ec:3.1.3.1', 'ec:2.7.1.25', 'ec:1.3.1.98', 'ec:6.3.5.2', 'ec:4.1.1.81', 'ec:4.3.2.1', 'ec:5.3.1.28', 'ec:4.1.3.40', 'ec:1.14.99.1', 'ec:2.7.1.202', 'ec:2.8.3.5', 'ec:2.3.1.274', 'ec:3.5.4.25', 'ec:1.5.1.42', 'ec:1.14.13.127', 'ec:1.2.1.19', 'ec:6.1.1.18', 'ec:3.1.2.6', 'ec:2.6.1.113', 'ec:1.1.1.60', 'ec:4.2.1.84', 'ec:1.2.1.41', 'ec:3.1.3.45', 'ec:2.7.2.2', 'ec:2.4.2.

In [86]:
# TODO: create a dictionary with keys as pathway and values as enzymes present in deep_ec_df['prediction']
completeness = {}
present_enzymes = list(set(deepec_df['prediction']))

In [87]:
# from Bio.KEGG.REST import kegg_link
import re
# Note: some ecs don't have a corresponding pathway such as ec:1.4.3.11
# calculate_pathway_completeness("ec00300", ec_pathways_dict)

# now we can calculate pathway completeness
def calculate_pathway_completeness(pathway_of_interest, ec_pathways):
    if pathway_of_interest not in ec_pathways:
        print(f"Pathway {pathway_of_interest} not present in dictionary.")
        return
    if pathway_of_interest not in list(paths_to_ec_df['pathway']):
        print(f"Pathway {pathway_of_interest} not present in DataFrame.") 
        return
    
    # Get the list of EC numbers involved in the pathway of interest
    total_enzymes = set(get_enzymes(pathway_of_interest, paths_to_ec_df))
    present_enzymes = list(set(ec_pathways[pathway_of_interest]))
    # print(present_enzymes)
    
    try:
        completeness = len(present_enzymes) / len(total_enzymes)
        return completeness
    except ZeroDivisionError as e:
        print(pathway_of_interest)
        print("division by zero, is the pathway of interest formatted correctly? (eg. 'ec00300')")
        return

# brute force
def calculate_all_paths_completeness(ec_pathways_dict):
    pathway_completeness = {}
    for path in ec_pathways_dict:
        if path not in pathway_completeness:
            pathway_completeness[path] = calculate_pathway_completeness(path, ec_pathways_dict)
    return pathway_completeness

completeness = calculate_all_paths_completeness(ec_pathways_dict)

In [88]:
ec00270 = calculate_pathway_completeness("ec00270", ec_pathways_dict)
print(ec00270)

0.24719101123595505


In [89]:
# convert completeness to a dataframe which we can export as a csv
completeness_df = pd.DataFrame(list(completeness.items()), columns=['pathway', 'completeness'])
completeness_df.to_csv(os.path.join(parent_dir, "output/321846_pathway_completeness.csv"), index=False)