### import libs
+ Note: after you edit a util function, re-running the import cell may not pick up your change.
+ If that happens, restart the kernel.

In [1]:
import pandas as pd
import sys
import nest_asyncio
import itertools
import os
nest_asyncio.apply() # for fetch_ec_improved to run in jupyter notebook

from EC40_loader import EC40_loader

from fetch_ec_improved import fetch_ec_async
from evaluate_ec import evaluate_ec
from abstracts.AbstractDataLoader import AbstractDataLoader
from Diamond.DiamondFeatureEngineer import DiamondFeatureEngineer
from Diamond.DiamondPredictor import DiamondPredictor
import constants
import numpy as np

### File Paths

In [2]:
# --- Directories, relative to the directory this notebook is run from ---
diamond_results_path = "../dataset/diamond_results"
ec40_path = "../dataset/ec40"

# --- Base file names; later cells prefix some of them with the split name ---
diamond_output_file = "diamond_output.m8"
filtered_output_file = "filtered_diamond_output.m8"
ec_results_file = "ec_results.csv"
evaluate_file = "evaluation_results.csv"

### Load EC40 Dataset

In [3]:
# Point the loader at the EC40 directory and select the raw CSV it should read.
ec40_loader = EC40_loader(_dir=ec40_path)
ec40_loader.set_source_file("ec40.csv")
# Run the loader's preprocessing step and render the result for a visual sanity check.
processed_ec40 = ec40_loader.preprocess()
display(processed_ec40)
# Partition into train / validation / test frames. The split logic lives in EC40_loader;
# its log output suggests clusters overlapping with test are removed — TODO confirm in the loader.
ec40_train, ec40_valid, ec40_test = ec40_loader.train_test_split(processed_ec40)

Unnamed: 0,accession,sequence,ec,traintest,negative_for,mainclass_set,sprot_version,len,cluster_ID,representative
0,Q7VRM4,MQAKILRIATRKSPLAICQACYVCNKLKHYHPHIQTELIPIITTGD...,['2.5.1.61'],0,,Transferases,2017_03,1,cdhit40.fasta_410186,False
1,A4XK06,MKKLRIGARDSKLSRIQVDIVARKIKQTLGIECEFVPIKTKGDIDK...,['2.5.1.61'],0,,Transferases,2017_03,1,cdhit40.fasta_171115,False
2,Q8KCJ4,MKKELIIGTRSSPLALWQAEFTKAELSRHFPELNITLKLVKTTGDV...,['2.5.1.61'],0,,Transferases,2017_03,1,cdhit40.fasta_410186,False
3,Q9VR91,MFNRQASGGAGSSGQGAGSSQTASAAPVSAGVGVGGGGGASGAAAG...,['2.3.2.26'],0,,Transferases,2017_03,1,cdhit40.fasta_134383,True
4,O95714,MPSESFCLAAQARLDSKWLKTDIQLAFTRDGLCGLWNEMVKDGEIV...,['2.3.2.26'],0,,Transferases,2017_03,1,cdhit40.fasta_42431,False
...,...,...,...,...,...,...,...,...,...,...
55125,Q54944,MANIVNFTDKQFENRLNDNLEELIQGKKAVESPTAFLLGGQPGSGK...,['2.7.1.176'],1,,Transferases,2017_03,1,cdhit40.fasta_18384,False
55126,Q9T080,MGGLKFHVLMYPWFATGHMTPFLFLANKLAEKGHTVTFLIPKKALK...,['2.4.1.-'],1,,Transferases,2017_03,1,cdhit40.fasta_405900,False
55127,Q03VR7,MAQTIDIANPTRTQAILNEYGLRAKKKFGQNFLTDLNVLHNIVEAA...,['2.1.1.182'],1,,Transferases,2017_03,1,cdhit40.fasta_32179,False
55128,A0LH38,MSKLVPPHGKEKKLKPLLLEGAALAAEKEKAKTLKVVPMTSREASD...,['2.7.7.4'],1,,Transferases,2017_03,1,cdhit40.fasta_437049,False


Removed 2622 cluster(s) from train_val_data to avoid overlap with test.
train: 30967
valid: 4921
test: 5459


### Prepare FASTA sequences

In [None]:
# Export each split through the loader (train, then valid, then test) and keep the
# returned values; the test one is printed later as the DIAMOND query FASTA path.
train_fasta_path, valid_fasta_path, test_fasta_path = [
    ec40_loader.get_fasta(split_name) for split_name in ("train", "valid", "test")
]

Found 30967 sequences.
Finished writting to ../dataset/ec40/train.fasta
Found 4921 sequences.
Finished writting to ../dataset/ec40/valid.fasta
Found 5459 sequences.
Finished writting to ../dataset/ec40/test.fasta


### DIAMOND Query - Feature Engineering

Download DIAMOND
+ If it is not downloaded yet, uncomment the cell below (Linux version).

In [None]:
# linux
# %cd Diamond
# !wget http://github.com/bbuchfink/diamond/releases/download/v2.0.4/diamond-linux64.tar.gz
# !tar xzf diamond-linux64.tar.gz
# %cd ..

Prepare DIAMOND Database Folder
+ If the folder has not been created yet, uncomment the cell below.

In [None]:
# !mkdir ../dataset/dimond_db/
# %cd ../dataset/dimond_db/

Download UniRef90
+ if not downloaded, uncomment below

In [None]:
# !wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/uniref/uniref90/uniref90.fasta.gz

In [None]:
# %cd ../../experiments

Generate Database
+ This takes roughly 45 minutes with 24 CPU threads.
+ If the database has not been generated yet, uncomment the cell below.

In [None]:
# !Diamond/diamond makedb --in ../dataset/dimond_db/uniref90.fasta.gz -d ../dataset/dimond_db/uniref90.dmnd

Run DIAMOND search 
+ Takes about 30 minutes for all test sequences.
+ DIAMOND output format (columns of the tabular `.m8` file, in order):

            qseqid means Query Seq-id
           sseqid means Subject Seq-id
           pident means Percentage of identical matches
           length means Alignment length
         mismatch means Number of mismatches
          gapopen means Number of gap openings
           qstart means Start of alignment in query
             qend means End of alignment in query
           sstart means Start of alignment in subject
             send means End of alignment in subject
           evalue means Expect value
         bitscore means Bit score

In [None]:
# Location of the DIAMOND tabular output for the test split: the base output
# file name prefixed with "test_", inside the DIAMOND results directory.
test_diamond_output_file = os.path.join(diamond_results_path, "test_" + diamond_output_file)

# Echo the query FASTA and the output location, one per line.
print(test_fasta_path, test_diamond_output_file, sep="\n")

../dataset/ec40/test.fasta
../dataset/diamond_results/test_diamond_output.m8


In [None]:
# !Diamond/diamond blastp --db ../dataset/dimond_db/uniref90.dmnd --query ../dataset/ec40/test.fasta --out ../dataset/diamond_results/test_diamond_output.m8 --quiet

In [None]:
# !Diamond/diamond blastp --db ../dataset/dimond_db/uniref90.dmnd --query ../dataset/ec40/train.fasta --out ../dataset/diamond_results/train_diamond_output.m8 --quiet

In [None]:
# !Diamond/diamond blastp --db ../dataset/dimond_db/uniref90.dmnd --query ../dataset/ec40/valid.fasta --out ../dataset/diamond_results/valid_diamond_output.m8 --quiet

In [16]:
!head $test_diamond_output_file

Q8EB91	UniRef90_Q8EB91	100.0	274	0	0	1	274	1	274	4.3e-160	573.5
Q8EB91	UniRef90_UPI001C65D99A	87.2	274	35	0	1	274	1	274	3.1e-142	514.2
Q8EB91	UniRef90_A0A2M7HU62	85.8	274	39	0	1	274	1	274	4.9e-140	506.9
Q8EB91	UniRef90_A3D186	84.3	274	43	0	1	274	1	274	1.1e-136	495.7
Q8EB91	UniRef90_A1RMV0	83.9	274	44	0	1	274	1	274	1.5e-136	495.4
Q8EB91	UniRef90_A0A1N6ZPP1	82.8	274	47	0	1	274	1	274	3.1e-134	487.6
Q8EB91	UniRef90_A0A972FYA5	71.8	273	77	0	1	273	1	273	5.5e-115	423.7
Q8EB91	UniRef90_A0A6G7LV78	72.2	273	75	1	1	273	1	272	7.9e-114	419.9
Q8EB91	UniRef90_UPI0037355830	72.3	271	75	0	1	271	1	271	1.0e-113	419.5
Q8EB91	UniRef90_Q07YJ6	69.4	271	83	0	1	271	1	271	6.2e-111	410.2


In [17]:
# Wrap the DIAMOND results directory in a loader and register the raw test-split output on it.
# NOTE(review): AbstractDataLoader is instantiated directly despite its name — confirm this is intended.
diamond_loader = AbstractDataLoader(diamond_results_path)
diamond_loader.test_path = test_diamond_output_file
# Run feature engineering on the test split. Per this cell's log output, the result is
# written to a "filtered_"-prefixed CSV inside diamond_results_path.
diamond_fe = DiamondFeatureEngineer(diamond_results_path, diamond_loader, prefix="filtered")
diamond_fe.apply_feature_engineering("test")

finished writting 476 query to ../dataset/diamond_results/filtered_test.csv


### Fetch EC number from UniProt API - Generate Prediction

In [19]:
# Destination CSV for the predicted EC numbers.
ec_results_path = os.path.join(diamond_results_path, ec_results_file)

# Predict EC numbers from the feature-engineered test-split hits; the bare last
# expression lets the notebook render whatever predict() returns.
diamond_pred = DiamondPredictor(predict_path=ec_results_path)
diamond_pred.predict(diamond_fe.feature_loader.test_path)

Parsing DIAMOND output and fetching EC numbers concurrently...


Processing queries: 100%|██████████| 476/476 [00:06<00:00, 73.72it/s] 

Saving results to CSV...
Results saved to '../dataset/diamond_results/ec_results.csv'





Unnamed: 0,Query,Subject,EC Number
0,Q9HJS3,UniRef90_A0A497Q6U7,3.1.21.2
1,P47618,UniRef90_A0AAW6HNF0,6.1.1.19
2,C0QU23,UniRef90_A0A432ERB0,6.3.4.19
3,Q75CI5,UniRef90_Q6CQ13,2.3.2.-
4,Q2GLU6,UniRef90_A0A218KP16,3.1.21.10
...,...,...,...
471,Q6DN58,,No EC number found
472,O94420,,No EC number found
473,P47490,,No EC number found
474,Q9Y7U9,,No EC number found


### Evaluate EC Result

+ No Prediction: No available diamond alignment result
+ No EC number found: No record in uniprot api

In [13]:
# !mkdir ../metrics

In [4]:
# Score the DIAMOND predictions against the EC40 test split.
# The prediction path is built from the config variables (the same expression the
# prediction cell passes to DiamondPredictor) rather than a hard-coded literal, so
# changing diamond_results_path or ec_results_file keeps both cells pointing at the same file.
eval_report = evaluate_ec(
    os.path.join(diamond_results_path, ec_results_file),          # predictions to score
    os.path.join(diamond_results_path, f"test_{evaluate_file}"),  # where the evaluation CSV is saved
    ec40_loader.test_path,                                        # test split registered on the loader
    method_name="Diamond Benchmark",
)
eval_report

Loading data...
Matching predictions...


Matching Predictions: 100%|██████████| 5459/5459 [00:00<00:00, 8663.42it/s]


Computing evaluation metrics...
Saved results to and ../dataset/diamond_results/test_evaluation_results.csv


Unnamed: 0,Method,Exact Match Accuracy,No EC number found,No Prediction,Position 1 Accuracy,Position 1 Precision,Position 1 Recall,Position 1 F1-Score,Position 2 Accuracy,Position 2 Precision,Position 2 Recall,Position 2 F1-Score,Position 3 Accuracy,Position 3 Precision,Position 3 Recall,Position 3 F1-Score,Position 4 Accuracy,Position 4 Precision,Position 4 Recall,Position 4 F1-Score
0,Diamond Benchmark,0.039201,0.941015,0.941015,0.055688,0.721168,0.037458,0.07089,0.053306,0.486238,0.031849,0.058538,0.04946,0.586344,0.039202,0.072057,0.039201,0.275194,0.044003,0.068529


In [None]:
# Merge this run's evaluation report into the shared experiment log.
# - Works on a first run: the metrics directory is created if needed and a
#   missing log file is treated as "no previous results" instead of raising.
# - Idempotent: rows already logged under the same "Method" are replaced by the
#   current report, so re-running this cell does not pile up duplicate rows.
metrics_path = '../metrics/experiment_results.csv'
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)

frames = [eval_report]
if os.path.exists(metrics_path):
    metrics = pd.read_csv(metrics_path)
    # Assumes eval_report is a DataFrame with a "Method" column, as displayed above.
    if "Method" in metrics.columns and "Method" in eval_report.columns:
        metrics = metrics[~metrics["Method"].isin(eval_report["Method"])]
    frames.insert(0, metrics)  # keep earlier results first, newest last

report_combined = pd.concat(frames, axis=0, ignore_index=True)
display(report_combined)
report_combined.to_csv(metrics_path, index=False)