# <font color=#c51b8a>VPOD 'Mine-n-Match':</font>
## <font color=#c994c7>Part 1 </font> - Use Species Names from Merged Accessory DBs to Query NCBI for All Related Opsin Sequences  

In [None]:
import os
import json
import pandas as pd
# Contact address passed as `email=` to ncbi_fetch_opsins when NCBI is queried.
# NOTE(review): the Part 4 setup cell rebinds `email` to a different address.
email = 'seth.frazer@embl.de'

In [None]:
from mnm_scripts.mine_n_match_functions import  merge_accessory_dbs
from mnm_scripts.ncbi_functions import ncbi_fetch_opsins
from mnm_scripts.utils import fasta_to_dataframe

## <font color=#c51b8a>Load data-tables with all of the species and Lambda Max data from accessory lmax databases</font> 

### <font color=#c994c7>VPOD Single Cell Microspectrophotometry (SCP) Datatable </font>
### In this case our dataframe does not have full species name in one column so we must create a list by directly combining the genus and species names. Then filter to create a list of all unique species names 


In [None]:
# Root folder holding the accessory lambda-max (lmax) source tables.
report_dir = './data_sources/lmax'
# One DataFrame per accessory database is appended here and the list is later
# handed to merge_accessory_dbs. Re-running a loader cell appends its table a
# second time, so re-run this cell first to start from an empty list.
df_list = []

In [None]:
# Read the tab-separated VPOD SCP table and derive the binomial name from the
# separate genus and species columns before queueing it for the merge.
scp_file = f'{report_dir}/vpod/scp_cleaned.txt'
scp_df = pd.read_csv(scp_file, sep='\t', index_col=0, encoding='utf-8')
scp_full_names = scp_df['Genus'] + ' ' + scp_df['Species']
scp_df['Full_Species'] = scp_full_names.to_list()
df_list.append(scp_df)
scp_df.head()

### <font color=#c994c7>Longcore - 'Animal Photopigments' Datatable</font>

### In this next case our dataframe has the full species name in one column so we can create a list directly and filter to create a list of all unique species names


In [None]:
# Longcore 'Animal Photopigments' table: first CSV column becomes the index.
longcore_file = f'{report_dir}/longcore_data/AnimalPhotopigmentsV1_1.csv'
longcore_df = pd.read_csv(
    longcore_file,
    index_col=0,
    encoding='utf-8',
)
df_list.append(longcore_df)
longcore_df.head()

### <font color=#c994c7>Murphy and Westerman Datatable</font>

In [None]:
# Murphy and Westerman table: first CSV column becomes the index.
murphy_westerman_file = f'{report_dir}/murphy_westerman/Murphy and Westerman.csv'
murphy_westerman_df = pd.read_csv(
    murphy_westerman_file,
    index_col=0,
    encoding='utf-8',
)
df_list.append(murphy_westerman_df)
murphy_westerman_df.head()

### <font color=#c994c7>Caves 'Fish' Datatable</font>

In [None]:
# Caves fish table: genus and species are separate columns, so the binomial
# name is assembled here before the table is queued for the merge.
caves_fish_db_file = f'{report_dir}/caves/caves_fish_db.csv'
caves_df = pd.read_csv(caves_fish_db_file, index_col=0)
caves_full_names = caves_df['Genus'] + ' ' + caves_df['Species']
caves_df['Full_Species'] = caves_full_names.to_list()
df_list.append(caves_df)
caves_df.head()

### <font color=#c994c7>Megan Porter's Datatables</font>

In [None]:
# Two Porter tables: a comma-separated 2005 extract and a tab-separated 2006 table.
porter_file1 = f'{report_dir}/megan_porter/megan_porter_extract_clean_2005.csv'
porter_file2 = f'{report_dir}/megan_porter/porter_2006_table1_clean.tsv'
porter_df1 = pd.read_csv(porter_file1, index_col=0)
porter_df2 = pd.read_csv(porter_file2, sep='\t', index_col=0, encoding='utf-8')
df_list.extend([porter_df1, porter_df2])
porter_df1.head()

In [None]:
porter_df2.head()

### <font color=#c994c7>J. Van Der Kooi's Datatable</font>

In [None]:
# Van der Kooi insect table: first CSV column becomes the index.
kooi_insect_db_file = f'{report_dir}/j_van_der_kooi/j_van_der_kooi_2021_mt_cleaned.csv'
kooi_df = pd.read_csv(
    kooi_insect_db_file,
    index_col=0,
    encoding='utf-8',
)
df_list.append(kooi_df)
kooi_df.head()

### <font color=#c994c7>Merge Accessory Lambda Max Databases</font>

In [None]:
# Call the function to merge all the species, lambdamax, and potential accession information into one dataframe.
# Inputs: every table collected in df_list, plus the lmax folder (report_dir).
# The second return value is kept as merged_df_file
# (presumably the path of the merged table that was written — TODO confirm).
merged_df, merged_df_file = merge_accessory_dbs(df_list, report_dir)
merged_df.head()

In [None]:
merged_df.shape

### <font color=#c994c7>All unique species names have been extracted from accessory databases. Now we iteratively query NCBI for opsins from each species.</font>

In [None]:
# Option to just load an existing merged database and by-pass replication
# NOTE: running this cell replaces any merged_df built by the merge step and
# rebinds report_dir (now with a trailing slash, so the path below contains a
# harmless double '/').
report_dir = './data_sources/lmax/'
merged_df = pd.read_csv(f'{report_dir}/VPOD_in_vivo_1.0_2025-09-22_15-52-28.csv')

In [None]:
# Species column of the merged table as a plain list; duplicates are still
# present here, so this length is the pre-filter count.
species_list = merged_df["Full_Species"].tolist()
len(species_list)

In [None]:
# This is the length of the species list which only includes unique species names.
# dict.fromkeys de-duplicates while keeping first-seen order. A plain set() of
# strings is ordered differently in every interpreter session (hash
# randomisation), which made the NCBI query order non-reproducible.
species_list = list(dict.fromkeys(species_list))
len(species_list)

## <font color=#c51b8a>Run NCBI Query Mining Process</font>

In [None]:
# Query NCBI for opsin records of every species in species_list.
# Returns the query results table and a second value kept as query_report_dir,
# which later cells use as the folder for every output file of this run.
ncbi_query_df, query_report_dir = ncbi_fetch_opsins(email=email, job_label='all_dbs', out='all_dbs', species_list=species_list)

In [None]:
ncbi_query_df.shape

In [None]:
len(ncbi_query_df['Full_Species'].unique())

## <font color=#c51b8a>Load Accessory Opsin Sequence Databases</font> 

### <font color=#c994c7>Load Previous MnM Data</font>

In [None]:
# Ignore this box, it's here for cases where you want to load existing query df
#query_report_dir = "mnm_data/mnm_on_all_dbs_2025-10-03_17-37-05"
#ncbi_query_file = f'{query_report_dir}/mnm_on_all_dbs_ncbi_q_data_cleaned.csv'
#ncbi_query_df = pd.read_csv(ncbi_query_file)

In [None]:
import json
# Taxonomy dictionary keyed by species name; later cells read the 'Synonyms'
# entry of each record to reconcile species names across databases.
taxon_file = './data_sources/taxonomy/ncbi_taxon_dict.json'
# JSON is UTF-8 by specification; pass the encoding explicitly so the platform
# default (e.g. cp1252 on Windows) cannot garble non-ASCII names.
with open(taxon_file, 'r', encoding='utf-8') as f:
    existing_taxon_dict = json.load(f)

In [None]:
# Species and protein sequences already returned by the NCBI query. The protein
# list is used below to drop accessory-database sequences that NCBI already
# supplied; the species list is used for the MATEdb2 comparison.
ncbi_sp_list = ncbi_query_df['Full_Species'].to_list()
ncbi_prot_list = ncbi_query_df['Protein'].to_list()
ncbi_query_df.head()

In [None]:
# Root folder of the accessory opsin-sequence databases.
report_dir2 = './data_sources/seqs'
# Parallel accumulators: each accessory database extends all five lists by the
# same number of entries, so position i in every list describes one sequence.
sequence_list = []      # protein sequences
acc_list = []           # accession / gene / transcript identifiers
db_sp_list = []         # full species names
prot_descriptions = []  # protein name or opsin-type label
source_list = []        # citation of the originating database

### <font color=#c994c7>Bilaterian Animal Opsins: Ramirez et al 2016</font>

In [None]:
# Ramirez et al. 2016 supplemental table (cleaned CSV), default integer index.
ramirez_file = f'{report_dir2}/bilaterian_animal_ops/Supplemental_Table_T1_cleaned.csv'
ramirez_df = pd.read_csv(filepath_or_buffer=ramirez_file)
ramirez_df.head()

In [None]:
ramirez_df.shape

In [None]:
# Rows whose organism name is not in species_list may still belong to a listed
# species under a synonym. For each such match, add a copy of the row relabelled
# with the species_list name so the isin() filter in the next cell keeps it.
#
# The copies are collected first and appended as a DataFrame in one step.
# Passing a bare Series to pd.concat alongside a DataFrame turns the Series into
# a one-column frame, so no relabelled row was added and a junk column appeared.
known_species = set(species_list)
synonym_rows = []
for i, org in enumerate(ramirez_df['Organism']):
    if org in known_species:
        continue
    org_synonyms = existing_taxon_dict.get(org, {}).get('Synonyms', [])
    for sp in species_list:
        sp_synonyms = existing_taxon_dict.get(sp, {}).get('Synonyms', [])
        if (org in sp_synonyms) or (sp in org_synonyms):
            new_row = ramirez_df.iloc[i].copy()
            new_row['Organism'] = sp
            synonym_rows.append(new_row)

if synonym_rows:
    ramirez_df = pd.concat([ramirez_df, pd.DataFrame(synonym_rows)], ignore_index=True)

In [None]:
# Keep only rows whose organism is one of the species of interest.
ramirez_in_species = ramirez_df['Organism'].isin(species_list)
ramirez_df_filtered = ramirez_df.loc[ramirez_in_species].copy()
ramirez_df_filtered.shape

In [None]:

# Strip alignment gap characters to recover the raw protein sequence.
ramirez_df_filtered['raw_seqs'] = ramirez_df_filtered['aligned_seqs'].str.replace('-', '', regex=False)
# Vectorised length: a missing sequence yields NaN here instead of raising
# TypeError as the per-row len() loop did, and is then dropped by the filter.
ramirez_df_filtered['prot_len'] = ramirez_df_filtered['raw_seqs'].str.len()
# Keep sequences of 300-600 residues (both bounds inclusive).
length_ok = ramirez_df_filtered['prot_len'].between(300, 600)
ramirez_df_filtered = ramirez_df_filtered[length_ok].reset_index(drop=True)
ramirez_df_filtered.head()

In [None]:
ramirez_df_filtered.shape

In [None]:
len(set(ramirez_df_filtered['Organism'].to_list()))

In [None]:
# Drop sequences the NCBI query already returned, then tag the provenance.
ramirez_is_novel = ~ramirez_df_filtered['raw_seqs'].isin(ncbi_prot_list)
ramirez_df_filtered2 = ramirez_df_filtered[ramirez_is_novel].reset_index(drop=True)
ramirez_df_filtered2['source'] = 'Ramirez et al 2016'
ramirez_df_filtered2.shape

In [None]:
# Persist the filtered table, then show how many distinct sequences remain.
ramirez_filtered_csv = f'{report_dir2}/bilaterian_animal_ops/Supplemental_Table_T1_filtered.csv'
ramirez_df_filtered2.to_csv(ramirez_filtered_csv)
len(set(ramirez_df_filtered2['raw_seqs']))

In [None]:
# Append the Ramirez records to the shared parallel accumulators.
sequence_list.extend(ramirez_df_filtered2['raw_seqs'].tolist())
acc_list.extend(ramirez_df_filtered2['gene_ID'].tolist())
db_sp_list.extend(ramirez_df_filtered2['Organism'].tolist())
prot_descriptions.extend(ramirez_df_filtered2['Protein names'].tolist())
source_list.extend(ramirez_df_filtered2['source'].tolist())

### <font color=#c994c7>Crustacean Conundrums: Palecanda et al 2022</font>

In [None]:
# Palecanda et al. 2022 supplementary table, default integer index.
palecanda_file = f'{report_dir2}/crustacean_conundrums/rstb20210289_si_003.csv'
palecanda_df = pd.read_csv(filepath_or_buffer=palecanda_file)
palecanda_df.head()

In [None]:
palecanda_df.shape

In [None]:
# Rows whose species name is not in species_list may still belong to a listed
# species under a synonym. For each such match, add a copy of the row relabelled
# with the species_list name so the isin() filter in the next cell keeps it.
#
# The copies are collected first and appended as a DataFrame in one step.
# Passing a bare Series to pd.concat alongside a DataFrame turns the Series into
# a one-column frame, so no relabelled row was added and a junk column appeared.
known_species = set(species_list)
synonym_rows = []
for i, org in enumerate(palecanda_df['Genus species']):
    if org in known_species:
        continue
    org_synonyms = existing_taxon_dict.get(org, {}).get('Synonyms', [])
    for sp in species_list:
        sp_synonyms = existing_taxon_dict.get(sp, {}).get('Synonyms', [])
        if (org in sp_synonyms) or (sp in org_synonyms):
            new_row = palecanda_df.iloc[i].copy()
            new_row['Genus species'] = sp
            synonym_rows.append(new_row)

if synonym_rows:
    palecanda_df = pd.concat([palecanda_df, pd.DataFrame(synonym_rows)], ignore_index=True)

In [None]:
# Keep only rows whose species is one of the species of interest.
palecanda_df_filtered = palecanda_df[palecanda_df['Genus species'].isin(species_list)].reset_index(drop=True)
# Vectorised length: a missing sequence yields NaN here instead of raising
# TypeError as the per-row len() loop did, and is then dropped by the filter.
palecanda_df_filtered['prot_len'] = palecanda_df_filtered['Protein Sequence'].str.len()
# Keep sequences of 300-600 residues (both bounds inclusive).
length_ok = palecanda_df_filtered['prot_len'].between(300, 600)
palecanda_df_filtered = palecanda_df_filtered[length_ok].reset_index(drop=True)

palecanda_df_filtered.head()

In [None]:
palecanda_df_filtered.shape

In [None]:
len(set(palecanda_df_filtered['Genus species'].to_list()))

In [None]:
# Drop sequences the NCBI query already returned, then tag the provenance.
palecanda_is_novel = ~palecanda_df_filtered['Protein Sequence'].isin(ncbi_prot_list)
palecanda_df_filtered2 = palecanda_df_filtered[palecanda_is_novel].reset_index(drop=True)
palecanda_df_filtered2['source'] = 'Palecanda et al 2022'
palecanda_df_filtered2.shape

In [None]:
# Persist the filtered table, then show how many distinct sequences remain.
palecanda_filtered_csv = f'{report_dir2}/crustacean_conundrums/palecanda_2022_filtered_seq_data.csv'
palecanda_df_filtered2.to_csv(palecanda_filtered_csv)
len(set(palecanda_df_filtered2['Protein Sequence']))

In [None]:
# Append the Palecanda records to the shared parallel accumulators.
sequence_list.extend(palecanda_df_filtered2['Protein Sequence'].tolist())
acc_list.extend(palecanda_df_filtered2['Transcript ID'].tolist())
db_sp_list.extend(palecanda_df_filtered2['Genus species'].tolist())
prot_descriptions.extend(palecanda_df_filtered2['Opsin Type'].tolist())
source_list.extend(palecanda_df_filtered2['source'].tolist())

### <font color=#c994c7>Ray-Finned Fish Opsin DB: Policarpo et al 2024</font>

In [None]:
# Parse the Policarpo et al. 2024 opsin alignment into a DataFrame. Later cells
# read the columns 'species_name', 'sequence', 'seq_length', 'accession' and
# 'opsin_type' from the result.
# NOTE(review): the input is an alignment file; if fasta_to_dataframe keeps gap
# characters in 'sequence', the later isin(ncbi_prot_list) comparison can never
# match — confirm the helper strips gaps.
ray_finned_fasta_file = f'{report_dir2}/ray_finned_fish/mined_alignments/Complete_opsins.aln.txt'
ray_finned_df = fasta_to_dataframe(ray_finned_fasta_file)

In [None]:
ray_finned_df

In [None]:
ray_finned_df.to_csv(f'{report_dir2}/ray_finned_fish/mined_alignments/extracted_fish_ops.csv')

In [None]:
# Rows whose species name is not in species_list may still belong to a listed
# species under a synonym. For each such match, add a copy of the row relabelled
# with the species_list name so the isin() filter in the next cell keeps it.
#
# The copies are collected first and appended as a DataFrame in one step.
# Passing a bare Series to pd.concat alongside a DataFrame turns the Series into
# a one-column frame, so no relabelled row was added and a junk column appeared.
known_species = set(species_list)
synonym_rows = []
for i, org in enumerate(ray_finned_df['species_name']):
    if org in known_species:
        continue
    org_synonyms = existing_taxon_dict.get(org, {}).get('Synonyms', [])
    for sp in species_list:
        sp_synonyms = existing_taxon_dict.get(sp, {}).get('Synonyms', [])
        if (org in sp_synonyms) or (sp in org_synonyms):
            new_row = ray_finned_df.iloc[i].copy()
            new_row['species_name'] = sp
            synonym_rows.append(new_row)

if synonym_rows:
    ray_finned_df = pd.concat([ray_finned_df, pd.DataFrame(synonym_rows)], ignore_index=True)

In [None]:
# Keep only rows whose species is one of the species of interest.
ray_finned_in_species = ray_finned_df['species_name'].isin(species_list)
ray_finned_df_filtered = ray_finned_df[ray_finned_in_species].reset_index(drop=True)
ray_finned_df_filtered.shape

In [None]:
# Drop sequences the NCBI query already returned.
ray_finned_is_novel = ~ray_finned_df_filtered['sequence'].isin(ncbi_prot_list)
ray_finned_df_filtered2 = ray_finned_df_filtered[ray_finned_is_novel].reset_index(drop=True)
ray_finned_df_filtered2.shape

In [None]:
# Keep sequences of 300-600 residues (both bounds inclusive) and tag the provenance.
ray_finned_length_ok = ray_finned_df_filtered2['seq_length'].between(300, 600)
ray_finned_df_filtered3 = ray_finned_df_filtered2[ray_finned_length_ok].reset_index(drop=True)
ray_finned_df_filtered3['source'] = 'Policarpo et al 2024'
ray_finned_df_filtered3.shape

In [None]:
ray_finned_df_filtered3.to_csv(f'{report_dir2}/ray_finned_fish/mined_alignments/extracted_fish_ops_filtered.csv')

In [None]:
# Append the ray-finned fish records to the shared parallel accumulators.
sequence_list.extend(ray_finned_df_filtered3['sequence'].tolist())
acc_list.extend(ray_finned_df_filtered3['accession'].tolist())
db_sp_list.extend(ray_finned_df_filtered3['species_name'].tolist())
prot_descriptions.extend(ray_finned_df_filtered3['opsin_type'].tolist())
source_list.extend(ray_finned_df_filtered3['source'].tolist())

### <font color=#c994c7>Visual Insect Opsin Compendium: Guignard et. al. 2022</font>

In [None]:
# Guignard et al. 2022 insect opsins (pre-extracted CSV); first column is the index.
insect_ops_file = f'{report_dir2}/Guignard_2022_insect_ops/extracted_insect_ops.csv'
insect_ops_df = pd.read_csv(filepath_or_buffer=insect_ops_file, index_col=0)

In [None]:
insect_ops_df.head()

In [None]:
# Rows whose species name is not in species_list may still belong to a listed
# species under a synonym. For each such match, add a copy of the row relabelled
# with the species_list name so the isin() filter in the next cell keeps it.
#
# The copies are collected first and appended as a DataFrame in one step.
# Passing a bare Series to pd.concat alongside a DataFrame turns the Series into
# a one-column frame, so no relabelled row was added and a junk column appeared.
known_species = set(species_list)
synonym_rows = []
for i, org in enumerate(insect_ops_df['species_name']):
    if org in known_species:
        continue
    org_synonyms = existing_taxon_dict.get(org, {}).get('Synonyms', [])
    for sp in species_list:
        sp_synonyms = existing_taxon_dict.get(sp, {}).get('Synonyms', [])
        if (org in sp_synonyms) or (sp in org_synonyms):
            new_row = insect_ops_df.iloc[i].copy()
            new_row['species_name'] = sp
            synonym_rows.append(new_row)

if synonym_rows:
    insect_ops_df = pd.concat([insect_ops_df, pd.DataFrame(synonym_rows)], ignore_index=True)

In [None]:
# Keep only rows whose species is one of the species of interest.
insect_in_species = insect_ops_df['species_name'].isin(species_list)
insect_ops_df_filtered = insect_ops_df[insect_in_species].reset_index(drop=True)
insect_ops_df_filtered.shape

In [None]:
# Drop sequences the NCBI query already returned.
insect_is_novel = ~insect_ops_df_filtered['sequence'].isin(ncbi_prot_list)
insect_ops_df_filtered = insect_ops_df_filtered[insect_is_novel].reset_index(drop=True)
insect_ops_df_filtered.shape

In [None]:
# Keep sequences of 300-600 residues (both bounds inclusive) and tag the provenance.
insect_length_ok = insect_ops_df_filtered['seq_length'].between(300, 600)
insect_ops_df_filtered = insect_ops_df_filtered[insect_length_ok].reset_index(drop=True)
insect_ops_df_filtered['source'] = 'Guignard et al 2022'
insect_ops_df_filtered.shape

In [None]:
insect_ops_df_filtered.to_csv(f'{report_dir2}/Guignard_2022_insect_ops/extracted_insect_ops_filtered.csv')

In [None]:
# Append the insect opsin records to the shared parallel accumulators.
sequence_list.extend(insect_ops_df_filtered['sequence'].tolist())
acc_list.extend(insect_ops_df_filtered['accession'].tolist())
db_sp_list.extend(insect_ops_df_filtered['species_name'].tolist())
prot_descriptions.extend(insect_ops_df_filtered['opsin_type'].tolist())
source_list.extend(insect_ops_df_filtered['source'].tolist())

### <font color=#c994c7>Extract all taxon info for the species this collection of sequences belongs to:</font>


In [None]:
# Split every full species name into genus (first token) and the remainder.
# str.partition never raises: a name without a space yields an empty species
# part, whereas split(' ', 1)[1] raised IndexError and aborted the cell.
gn_list = []
sp_list = []
for full_name in db_sp_list:
    genus, _, epithet = full_name.partition(' ')
    gn_list.append(genus)
    sp_list.append(epithet)

In [None]:
# Load our existing taxonomy dictionary and pull relevant taxon info.
# The file is opened unconditionally: with the former isfile() guard, a missing
# file either produced a NameError below or silently reused a stale
# species_taxon_dict left in the kernel. Now it fails with FileNotFoundError.
taxon_file = './data_sources/taxonomy/ncbi_taxon_dict.json'
with open(taxon_file, 'r', encoding='utf-8') as f:
    species_taxon_dict = json.load(f)

# One entry per sequence, in the same order as db_sp_list. A species missing
# from the dictionary raises KeyError so the gap is noticed.
phylum_list = []
subphylum_list = []
class_list = []
for sp in db_sp_list:
    taxon_info = species_taxon_dict[sp]
    phylum_list.append(taxon_info["Phylum"])
    subphylum_list.append(taxon_info["Subphylum"])
    class_list.append(taxon_info["Class"])

### <font color=#c994c7>MATEdb2: Martínez-Redondo et. al. 2024</font>
For now I have no pipeline set up to extract sequences from the assembled transcriptomes but this may be coming in the future

In [None]:
# MATEdb2 supplementary table S1 (cleaned CSV), default integer index.
matedb2_links_file = f'{report_dir2}/matedb2/Table_S1_cleaned.csv'
matedb2_links_df = pd.read_csv(filepath_or_buffer=matedb2_links_file)
matedb2_links_df.head()

In [None]:
matedb2_links_df.shape

In [None]:
# Keep only MATEdb2 entries for the species of interest.
matedb2_in_species = matedb2_links_df['SCIENTIFIC NAME'].isin(species_list)
matedb2_links_filtered = matedb2_links_df[matedb2_in_species].reset_index(drop=True)
matedb2_links_filtered.head()

In [None]:
matedb2_links_filtered.shape

In [None]:
# Of those, keep the species for which the NCBI query returned nothing.
matedb2_not_in_ncbi = ~matedb2_links_filtered['SCIENTIFIC NAME'].isin(ncbi_sp_list)
matedb2_links_filtered2 = matedb2_links_filtered[matedb2_not_in_ncbi]
matedb2_links_filtered2.shape

### <font color=#c994c7>Create Merged Dataframe From All Accessory Opsin Sequence DBs</font>

In [None]:
# Build one table from the parallel accumulators filled by the accessory
# sequence databases; redundant rows are removed in a later cell before the
# table is appended to the NCBI query sheet.
data = {
    'Accession': acc_list,
    'Phylum': phylum_list,
    'Subphylum': subphylum_list,
    'Class': class_list,
    'Genus': gn_list,
    'Species': sp_list,
    'Full_Species': db_sp_list,
    'Gene_Description': prot_descriptions,
    'Protein': sequence_list,
    'source': source_list,
}
# Variant without the taxonomy columns:
#data = {'Accession': acc_list, 'Genus': gn_list, 'Species': sp_list, 'Full_Species': db_sp_list, 'Gene_Description': prot_descriptions, 'Protein' : sequence_list, 'source' : source_list}

acc_seq_db_df = pd.DataFrame(data=data)
acc_seq_db_df.head()

In [None]:
acc_seq_db_df.shape

In [None]:
# Remove records that repeat the same protein for the same species, keeping the
# first occurrence, and renumber the rows.
acc_seq_db_df_filtered = (
    acc_seq_db_df
    .drop_duplicates(subset=['Full_Species', 'Protein'], keep='first')
    .reset_index(drop=True)
)
acc_seq_db_df_filtered.shape

In [None]:
len(set(acc_seq_db_df_filtered['Full_Species'].to_list()))

In [None]:
acc_seq_db_df_filtered.to_csv(f'{report_dir2}/vpod_comp_accessory_seq_dbs.csv')

In [None]:
# Write the accessory-database sequences as FASTA: one '>accession' header line
# followed by the protein sequence.
# The records are produced by a generator, so no loop variable leaks into the
# notebook namespace (the former loop variable `id` overwrote the builtin id()).
fasta_file = f'./{report_dir2}/acc_db_seqs.fasta'
with open(fasta_file, 'w') as f:
    f.writelines(
        f'>{accession}\n{protein}\n'
        for accession, protein in zip(acc_seq_db_df_filtered['Accession'], acc_seq_db_df_filtered['Protein'])
    )

### <font color=#c994c7>Merge the Formatted Accessory Sequence DBs w/the Mined NCBI Data</font>

In [None]:
ncbi_query_df.head()

In [None]:
# Remove the provenance column before merging with the NCBI table.
# errors='ignore' keeps the cell re-runnable: the in-place drop raised KeyError
# on a second run because the column was already gone.
acc_seq_db_df_filtered = acc_seq_db_df_filtered.drop(columns='source', errors='ignore')
acc_seq_db_df_filtered.head()

In [None]:
# Stack the NCBI results and the accessory-database records into one table with
# a fresh row index, record each protein's length, and save it to the run folder.
final_query_df = pd.concat([ncbi_query_df, acc_seq_db_df_filtered], ignore_index=True)
final_query_df['Prot_Len'] = final_query_df['Protein'].str.len()
merged_query_csv = f'{query_report_dir}/ncbi_q_merged_w_acc_seq_db.csv'
final_query_df.to_csv(merged_query_csv, index=False)
final_query_df.head()

In [None]:
final_query_df.shape

In [None]:
# Write every merged sequence as FASTA ('>accession' header, then the protein).
# This file is the BLAST query input in Part 2.
# The records are produced by a generator, so no loop variable leaks into the
# notebook namespace (the former loop variable `id` overwrote the builtin id()).
fasta_file = f'./{query_report_dir}/mined_and_acc_seqs.fasta'
with open(fasta_file, 'w') as f:
    f.writelines(
        f'>{accession}\n{protein}\n'
        for accession, protein in zip(final_query_df['Accession'], final_query_df['Protein'])
    )

## <font color=#c994c7>Part 2: Blast Filtering</font> - Filter non-visual opsins by using BLAST against our sequence database of visual and non-visual opsins.

- We  filter based on if the closest match is tagged as a non-visual or visual opsin.

In [None]:
from mnm_scripts.blastp import run_blastp_analysis
import pathlib
# Absolute path of the current working directory, rendered with forward
# slashes so it reads the same on Windows and POSIX.
script_path = pathlib.Path.cwd().resolve()
wrk_dir = script_path.as_posix()

In [None]:
# Uncomment the two lines below to resume from an earlier run instead of the
# query_report_dir / fasta_file produced above.
#query_report_dir = 'mnm_data/mnm_on_all_dbs_2025-10-03_17-37-05'
#fasta_file = f'./{query_report_dir}/mined_and_acc_seqs.fasta'

# Reference BLAST database and the output location inside the run folder; both
# are passed to run_blastp_analysis in the next cell.
blast_db_path = './data_sources/blastdbs/mnm_opsin_ref_db'
blast_output = f'./{query_report_dir}/blast_reference_report'

In [None]:
# BLAST the FASTA file against the reference opsin database. Later cells read
# the 'qseqid', 'sseqid' and 'pident' columns of the returned table.
blast_analysis_df = run_blastp_analysis('blastp', fasta_file, blast_db_path, blast_output, wrk_dir=wrk_dir)

In [None]:
blast_analysis_df.head()

Note: the size of our dataframe will decrease here because any sequence that returned a 'blast unsuccessful' result is dropped, since it is likely not even a member of the larger opsin protein family.

In [None]:
blast_analysis_df.shape

Now let's clean the blast dataframe to keep only the accessions/sequences matched to visual opsins

In [None]:
# Keep BLAST hits whose subject id carries the 'visual_opsin' tag and whose
# percent identity is at least 20.
# NOTE(review): this is a substring test, so a subject id such as
# 'non_visual_opsin' would also match — confirm the reference-DB label format.
hits_visual = blast_analysis_df['sseqid'].str.contains('visual_opsin')
identity_ok = blast_analysis_df['pident'] >= 20
clean_blast_analysis_df = blast_analysis_df[hits_visual & identity_ok]
clean_blast_analysis_df.shape

Now we filter the main dataframe with the accessions matched to visual opsins in the blast dataframe
- There will be more sequences in the main dataframe because there is a certain amount of permitted overlap

In [None]:
import pandas as pd
# Reload the merged query table from the CSV written at the end of Part 1, so
# this part can be run without re-executing the earlier cells.
final_query_df = pd.read_csv(f'./{query_report_dir}/ncbi_q_merged_w_acc_seq_db.csv')
final_query_df.head()

In [None]:
final_query_df.shape

In [None]:
# Query ids (accessions) whose BLAST hit passed the visual-opsin filter.
seqs_to_keep = clean_blast_analysis_df['qseqid'].to_list()
# Snapshot of the unfiltered table; it is filtered separately further down to
# produce the non-visual output.
non_visual_query_df = final_query_df.copy()

In [None]:
# Regex fragments naming non-visual opsins in a gene description. They are
# OR-joined and applied case-insensitively with Series.str.contains.
# All groups are non-capturing: capturing groups match the same text but make
# str.contains emit a "pattern ... has match groups" UserWarning.
exclusion_patterns = [
    r'\b(?:rh1_)?exorh',         # Matches 'exorh', 'exorha', 'exorh_1', 'rh1_exorh'
    r'\btmt(?:[1-9])?',          # Matches 'tmt', 'tmt1a', 'tmt2_b'
    r'\bmelanopsin',             # Matches 'melanopsin', 'melanopsin_a'
    r'\bop(?:n|sin)\s?[3-9]',    # Matches 'opn4m_1', 'opsin 5b', 'opn3' etc.
    r'\bneur',                   # Matches 'neur', 'neuropsin', 'neural'
    r'\bchain\b',                # Matches the standalone word 'chain'
]

In [None]:
# Join the patterns into a single regex string
exclusion_regex = '|'.join(exclusion_patterns)

# Build each criterion as its own boolean mask. na=False treats a missing
# description or species as "no match"; without it str.contains returns NaN for
# those rows and the `~` negation raises TypeError.
is_visual_hit = final_query_df['Accession'].isin(seqs_to_keep)
is_excluded_gene = final_query_df['Gene_Description'].str.contains(
    exclusion_regex, case=False, regex=True, na=False
)
length_ok = (final_query_df['Prot_Len'] <= 600) & (final_query_df['Prot_Len'] >= 300)
is_daphnia = final_query_df['Full_Species'].str.contains(
    r'\bDaphnia\b', case=False, regex=True, na=False
)

# Keep visual-opsin BLAST hits of 300-600 residues whose description is not a
# known non-visual opsin and whose species is not a Daphnia.
final_query_df = final_query_df[is_visual_hit & ~is_excluded_gene & length_ok & ~is_daphnia]
# As before, the old row labels are kept as an 'index' column.
final_query_df = final_query_df.reset_index()

In [None]:
final_query_df.shape

In [None]:
# Save the visual-only table (row index included) and the matching FASTA file;
# the FASTA is the input to the OPTICS predictions in Part 3.
final_query_df.to_csv(f'{query_report_dir}/ncbi_q_merged_w_acc_seq_db_visual_only.csv')

# The records are produced by a generator, so no loop variable leaks into the
# notebook namespace (the former loop variable `id` overwrote the builtin id()).
fasta_file = f'./{query_report_dir}/mined_and_acc_seqs_visual_only.fasta'
with open(fasta_file, 'w') as f:
    f.writelines(
        f'>{accession}\n{protein}\n'
        for accession, protein in zip(final_query_df['Accession'], final_query_df['Protein'])
    )

Let's also save all the non-visual opsin hits just for the sake of posterity

In [None]:
# BLAST hits whose subject id does NOT contain the 'visual_opsin' tag.
# NOTE(review): substring test — a subject id such as 'non_visual_opsin' also
# contains 'visual_opsin' and would be excluded here; confirm the label format.
lacks_visual_tag = ~blast_analysis_df['sseqid'].str.contains('visual_opsin')
non_visual_blast_df = blast_analysis_df[lacks_visual_tag]
non_visual_blast_df.shape

In [None]:
non_visual_blast_df.head()

In [None]:
# Accessions whose BLAST hit was not tagged as a visual opsin.
seqs_to_keep = non_visual_blast_df['qseqid']

# Exact membership test, consistent with the visual branch. The former
# str.contains('|'.join(...)) built a regex out of the accessions, so '.' acted
# as a wildcard, shorter ids matched inside longer ones, and an empty id list
# produced an empty pattern that matched every row.
# na=False treats a missing description as "no match".
non_visual_query_df = non_visual_query_df[
    non_visual_query_df['Accession'].isin(seqs_to_keep)
    & non_visual_query_df['Gene_Description'].str.contains(
        exclusion_regex, case=False, regex=True, na=False
    )
]

In [None]:
#non_visual_query_df = non_visual_query_df[(non_visual_query_df['Accession'].str.contains('|'.join(seqs_to_keep))) & (non_visual_query_df['Gene_Description'].str.contains('exorh|exoRH|tmt1|tm2|tmt3|melanopsin|opn4|opn5|opn6|opn8|tmt|opn7|NEUR'))]

In [None]:
non_visual_query_df.to_csv(f'./{query_report_dir}/ncbi_q_merged_w_acc_seq_db_nonvisual_only.csv', index=False)

## <font color=#c994c7>Part 3: OPTICS Predictions</font> - Predict Lmax of all queried opsin sequences 

In [None]:
%reload_ext autoreload
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
# Location of the local 'optics' checkout. Set the OPTICS_PATH environment
# variable to point at your own copy; the original hard-coded location remains
# the fallback so existing setups keep working.
optics_path = os.environ.get('OPTICS_PATH', 'D:\\safra\\Github\\optics')
# Add the 'optics' directory to the Python path
if optics_path not in sys.path: # Avoid adding multiple times
    sys.path.append(optics_path)
# Now you can import modules from 'optics' as usual
from optics_predictions import run_optics_predictions

In [None]:
# OPTICS predictions for the visual-only FASTA file written in Part 2.
# To resume from an earlier run, re-point the two names below first:
#query_report_dir = 'mnm_data/mnm_on_all_dbs_2025-10-01_14-51-49'
#fasta_file = f'./{query_report_dir}/mined_and_acc_seqs_visual_only.fasta'
optics_df, optics_pred_file = run_optics_predictions(
    input_sequence=fasta_file,
    pred_dir=f'./{query_report_dir}',
    output='mnm_data',
    model="whole-dataset",
    encoding_method='aa_prop',
    blastp=True,
    iden_report='blastp_report',
    refseq='bovine',
    bootstrap=False,
    bootstrap_num=100,
    visualize_bootstrap=False,
    preload_to_memory=False,
    n_jobs=10,
    tolerate_non_standard_aa=True,
)

In [None]:
optics_df.head()

In [None]:
optics_df.shape

## <font color=#c994c7>Part 4: Matching Predictions to Physiology Data</font> - Match each sequence to its closest physiologically measured lmax value based on OPTICS predictions

In [None]:
import warnings
import pandas as pd
from deepBreaks.preprocessing import read_data
from mnm_scripts.mine_n_match_functions import mine_n_match
# NOTE(review): this rebinds `email` to a different address than the one set in
# the first cell of the notebook — confirm which one is intended.
email = 'sethfrazer@ucsb.edu'
# Silence all Python warnings from here on (both calls install an "ignore" filter).
warnings.filterwarnings("ignore")
warnings.simplefilter("ignore")

#### <font color=#c994c7>Load NCBI Query Data</font>

In [None]:
#query_report_dir = 'mnm_data/mnm_on_mnm_on_all_dbs_2025-09-22_15-52-30' #re-define the report directory if needed
# Load the visual-only query table and keep the accessions that received an
# OPTICS prediction; the old row labels are kept as a column by reset_index().
ncbi_q_file = f'./{query_report_dir}/ncbi_q_merged_w_acc_seq_db_visual_only.csv'
ncbi = pd.read_csv(ncbi_q_file, index_col=0)
has_prediction = ncbi['Accession'].isin(optics_df['Names'])
ncbi_filtered = ncbi[has_prediction].reset_index()
ncbi_filtered.head()

In [None]:
ncbi_filtered.shape

#### <font color=#c994c7>Load OPTICS Predictions</font>

In [None]:
# Uncomment to load predictions from an earlier OPTICS run instead of the file
# produced in Part 3.
#pred_dir = '.\prediction_outputs\optics_on_mnm_all_dbs_2025-02-24_17-22-27'
#optics_pred_file = f'{pred_dir}/mnm_all_dbs_predictions.tsv'
# The predictions file is read as tab-separated.
optics = pd.read_csv(optics_pred_file, sep='\t')
optics.head()

In [None]:
optics.shape

#### <font color=#c994c7>Load Lmax Compendium Data</font>

In [None]:
# Lmax compendium that the predictions are matched against.
# NOTE(review): this hard-coded file name differs from the merged table loaded
# in Part 1 ('VPOD_in_vivo_1.0_2025-09-22_15-52-28.csv') — confirm it is the
# intended compendium, or switch to merged_df_file below.
source_file = './data_sources/lmax/VPOD_in_vivo_v1.0_2025-05-09_18-17-49.csv'
#source_file = merged_df_file
# First CSV column becomes the index; it is used later to drop matched entries.
comp_db = pd.read_csv(source_file,index_col=0)
comp_db.head()

#### <font color=#c994c7>Run Mine-n-Match Script!</font>

In [None]:
# Run the matching step. Later cells read the 'comp_db_id' and 'Accession'
# columns of the returned table.
final_err_filtered_df = mine_n_match(email, query_report_dir, source_file, ncbi_filtered, optics_pred_file, out='vpod_in_vivo', err_filter = 11, per_identity_minimum=20, prediction_to_use='Single_Prediction')

#### <font color=#c994c7>Check the final dataframe</font>

- Need to add a portion to the MNM backend which further filters for sequences matched to synonyms of the same species w/ same lmax. 

In [None]:
#final_mnm_file_name = './mnm_data/mnm_on_all_dbs_2025-02-24_16-29-54/mnm_on_vpod_in_vivo_results_fully_filtered.csv'
#final_err_filtered_df = pd.read_csv(final_mnm_file_name, index_col=0)
final_err_filtered_df.head()

In [None]:
final_err_filtered_df.shape

#### <font color=#c994c7>Generate dataframe of all in-vivo values that did not match any sequences</font>

In [None]:
# Compendium entries that were not matched to any sequence: remove every row
# whose index label appears in the 'comp_db_id' column of the match results.
matched_comp_ids = final_err_filtered_df['comp_db_id']
no_match_in_vivo_df = comp_db.drop(index=matched_comp_ids)
no_match_in_vivo_df.shape

In [None]:
no_match_in_vivo_df.head()

In [None]:
no_match_in_vivo_df.to_csv(f'./{query_report_dir}/no_match_in_vivo_mnm.csv', index=True)

#### <font color=#c994c7>Generate dataframe of all sequence data that did not match any in-vivo Lmax values</font>

In [None]:
# Sequences with a prediction whose accession is absent from the match results.
was_matched = ncbi_filtered['Accession'].isin(final_err_filtered_df['Accession'])
no_match_seq_data_df = ncbi_filtered.loc[~was_matched].copy()
no_match_seq_data_df.shape

In [None]:
no_match_seq_data_df.head()

In [None]:
no_match_seq_data_df.to_csv(f'./{query_report_dir}/no_match_seq_data_mnm.csv', index=True)