### import libs
+ Note: after you update the util functions, re-running the import cell may not pick up your changes.
+ If that happens, try restarting the kernel.

In [None]:
import pandas as pd
import sys
sys.path.append("experiments")
import nest_asyncio
nest_asyncio.apply() # for fetch_ec_improved to run in jupyter notebook

from fetch_ec_improved import fetch_ec_async
from evaluate_ec import evaluate_ec

### Load EC40 Dataset

In [None]:
# Load the EC40 dataset from its pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load a .pkl that
# comes from a trusted source.
ec40 = pd.read_pickle('../dataset/ec40/ec40.pkl')
# Bare expression on the last line so the notebook renders the loaded object.
ec40

### Prepare test sequence

In [None]:
import os

# Select the test sequences: rows whose "traintest" column equals 0.
# (Adjust the filter if the dataset marks test rows with a different value.)
test_df = ec40[ec40["traintest"] == 0]

print(f"Found {len(test_df)} test sequences.")

# Write the test sequences to a FASTA file, using the 'accession' column as
# the record header and the 'sequence' column as the record body.
fasta_path = "../dataset/test_sequences/test_sequences.fasta"

# The output folder may not exist yet (e.g. on a fresh checkout); without this
# open() would raise FileNotFoundError.
os.makedirs(os.path.dirname(fasta_path), exist_ok=True)

with open(fasta_path, "w") as fout:
    # Walk the two needed columns in lockstep instead of DataFrame.iterrows(),
    # which builds a Series per row and is far slower on large frames.
    fout.writelines(
        f">{accession}\n{sequence}\n"
        for accession, sequence in zip(test_df["accession"], test_df["sequence"])
    )

### DIAMOND Query

Download DIAMOND
+ if not downloaded yet, uncomment the cell below (Linux version)

In [None]:
# linux
# !wget http://github.com/bbuchfink/diamond/releases/download/v2.0.4/diamond-linux64.tar.gz
# !tar xzf diamond-linux64.tar.gz

Prepare DIAMOND Database Folder
+ if the folder does not exist yet, uncomment the cell below

In [None]:
# !mkdir ../dataset/dimond_db/
# %cd ../dataset/dimond_db/

Download UniRef90
+ if not downloaded, uncomment below

In [None]:
# !wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/uniref/uniref90/uniref90.fasta.gz

In [None]:
# %cd ../../experiments

Generate Database
+ this takes roughly 45 minutes with 24 CPU threads
+ if the database has not been generated yet, uncomment the cell below

In [None]:
# !diamond makedb --in ../dataset/dimond_db/uniref90.fasta.gz -d ../dataset/dimond_db/uniref90.dmnd

Run DIAMOND search
+ 84 mins for all test_sequences

In [None]:
# Search every sequence in the test FASTA file against the UniRef90 DIAMOND
# database with `diamond blastp`; results go to test_sequences_results.m8.
# NOTE(review): assumes the `diamond` executable is on PATH — confirm (the
# download cell extracts the tarball into the current directory).
!diamond blastp --db ../dataset/dimond_db/uniref90.dmnd \
                --query ../dataset/test_sequences/test_sequences.fasta \
                --out ../dataset/test_sequences/test_sequences_results.m8

### Fetch EC number from UniProt API
+ 37 mins for all test sequences

In [None]:
# Same path as the `--out` file of the DIAMOND search; despite its name, this
# is the first (input) argument handed to fetch_ec_async.
output_file = "../dataset/test_sequences/test_sequences_results.m8"
# CSV path passed as the second argument.
ec_result_path = "../dataset/test_sequences/test_sequences_ec_results.csv"

# NOTE(review): presumably looks up EC numbers for the DIAMOND hits and writes
# them to ec_result_path — confirm against fetch_ec_improved.
fetch_ec_async(output_file, ec_result_path)

### Evaluate EC Result

In [None]:
import os

# Create the metrics folder idempotently: plain `mkdir` errors out when the
# cell is re-run and ../metrics already exists.
os.makedirs("../metrics", exist_ok=True)

In [None]:
# Paths handed to evaluate_ec, in call order.
# Same CSV path that the fetch step used as its second argument.
ec_results_file = "../dataset/test_sequences/test_sequences_ec_results.csv"
# Same path the "Visualize Metrics" cell reads back with pd.read_csv.
metrics_file = "../metrics/metrics.csv"
evaluate_file = "../dataset/test_sequences/evaluation_results.csv"
# CSV file under the same ec40 folder as the pickle loaded earlier in this notebook.
ec40_file = "../dataset/ec40/ec40.csv"
# NOTE(review): presumably compares the fetched EC numbers with the EC40 labels
# and writes metrics_file / evaluate_file — confirm against evaluate_ec.
evaluate_ec(ec_results_file, metrics_file, evaluate_file, ec40_file)

### Visualize Metrics

In [None]:
# Read the metrics CSV (the path passed to evaluate_ec as metrics_file).
metrics = pd.read_csv("../metrics/metrics.csv")
# Bare expression on the last line so the notebook renders the table.
metrics