# <font color=#c51b8a>VPOD 'Mine-n-Match':</font>
## <font color=#c994c7>Part 1 Objective</font> - Use Species Names from Microspectrophotemetry Data Sheet to Query NCBI for All Related Opsin Sequences  

In [1]:
import os
import re
import datetime
import time
import json
import pandas as pd
import numpy as np
from deepBreaks.preprocessing import read_data
from Bio import Entrez, SeqIO
email = 'sethfrazer@ucsb.edu'

In [2]:
from mnm_scripts.mine_n_match_functions import ncbi_fetch_opsins, merge_accessory_dbs, fasta_to_dataframe, correct_species_name

## <font color=#c51b8a>Load data-tables with all of the species and Lambda Max data from accessory lmax databases</font> 

### <font color=#c994c7>VPOD Single Cell Microspectrophotmetry (SCP) Datatable </font>
### In this case our dataframe does not have full species name in one column so we must create a list by directly combining the genus and species names. Then filter to create a list of all unique species names 


In [3]:
report_dir = './data_sources/lmax'
species_list = []
df_list = []

In [4]:
scp_file = f'{report_dir}/vpod/scp_cleaned.txt'
scp_df = pd.read_csv(scp_file, index_col=0, sep='\t')
scp_df['Full_Species'] = (scp_df['Genus'] + ' ' + scp_df['Species']).to_list()
df_list.append(scp_df)
scp_df.head()

Unnamed: 0_level_0,Genus,Species,phylum,CellType,CellSubType,LambdaMax,error,Chromophore,Stage,useable,expid,opsinid,refid,Notes,Column1,isGenusInHetero,isSpeciesInHetero,isInHetero,isVert,Full_Species
maxid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,Acipenser,transmontanus,Chordata,rod,,540.0,,,adult,1.0,,463.0,30.0,partial cds but is rod opsin,,,,0,1.0,Acipenser transmontanus
2,Acipenser,transmontanus,Chordata,cone,,464.0,,,adult,0.0,,0.0,30.0,,,,,0,,Acipenser transmontanus
3,Acipenser,transmontanus,Chordata,cone,,531.0,,,adult,0.0,,0.0,30.0,,,,,0,,Acipenser transmontanus
4,Acipenser,transmontanus,Chordata,cone,,605.0,,,adult,0.0,,0.0,30.0,,,,,0,,Acipenser transmontanus
5,Acipenser,medirostris,Chordata,rod,,540.0,,,adult,0.0,,0.0,43.0,,,,,0,,Acipenser medirostris


In [5]:
species_list+=scp_df['Full_Species'].to_list()

### <font color=#c994c7>Longcore - 'Animal Photopigments' Datatable</font>

### In this next case our dataframe has the full species name in one column so we can create a list directly and filter to create a list of all unique species names


In [6]:
longcore_file = f'{report_dir}/longcore_data/AnimalPhotopigmentsV1_1.csv'
longcore_df = pd.read_csv(longcore_file, index_col=0)
df_list.append(longcore_df)
longcore_df.head()

Unnamed: 0_level_0,Phylum,Class,Order,Family,Full_Species,Type,LambdaMax,Band,Oil,Nocturnal Activity,Diurnal Activity,Reference
longcore_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,Arthropoda,Arachnida,Aranae,Araneidae,Argiope amoena,,360,,,Y,Y,Yamashita & Tateda 1978
1,Arthropoda,Arachnida,Aranae,Araneidae,Argiope amoena,,490,,,Y,Y,Yamashita & Tateda 1978
2,Arthropoda,Arachnida,Aranae,Araneidae,Argiope amoena,,540,,,Y,Y,Yamashita & Tateda 1978
3,Arthropoda,Arachnida,Aranae,Araneidae,Argiope bruennichi,,360,,,Y,Y,Yamashita & Tateda 1978
4,Arthropoda,Arachnida,Aranae,Araneidae,Argiope bruennichi,,490,,,Y,Y,Yamashita & Tateda 1978


In [7]:
species_list+=longcore_df['Full_Species'].to_list()

### <font color=#c994c7>Murphy and Westerman Datatable</font>

In [8]:
murphy_westerman_file = f'{report_dir}/murphy_westerman/Murphy and Westerman.csv'
murphy_westerman_df = pd.read_csv(murphy_westerman_file, index_col=0)
df_list.append(murphy_westerman_df)
murphy_westerman_df.head()

Unnamed: 0_level_0,Full_Species,LambdaMax,SD,N,refid,Lineage
murwes_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Aglais urticae,360.0,–,†,[1],Arthropod
1,Agraulis vanillae,370.0,–,†,[2],Arthropod
2,Alima pacifica,467.0,–,5,[4],Arthropod
3,Antheraea polyphemus,355.0,–,20,[5],Arthropod
4,Apodemia mormo,556.0,–,†,[3],Arthropod


In [9]:
species_list+=murphy_westerman_df['Full_Species'].to_list()

### <font color=#c994c7>Caves 'Fish' Datatable</font>

In [10]:
caves_fish_db_file = f'{report_dir}/caves/caves_fish_db.csv'
caves_df = pd.read_csv(caves_fish_db_file, index_col=0)
caves_df['Full_Species'] = (caves_df['Genus'] + ' ' + caves_df['Species']).to_list()
df_list.append(caves_df)
caves_df.head()

Unnamed: 0_level_0,Family,Genus,Species,LambdaMax,refid,Full_Species
caves_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Acipenseridae,Acipenser,baerii,549,[22],Acipenser baerii
1,Acipenseridae,Acipenser,medirostrus,540,[60],Acipenser medirostrus
2,Acipenseridae,Acipenser,ruthenus,545,[21],Acipenser ruthenus
3,Acipenseridae,Acipenser,transmontanus,540,[39] ; [58],Acipenser transmontanus
4,Acipenseridae,Scaphirhynchus,albus,538,[60],Scaphirhynchus albus


In [11]:
species_list+=caves_df['Full_Species'].to_list()

### <font color=#c994c7>Morgan Porter's Datatables</font>

In [12]:
porter_file1 = f'{report_dir}/megan_porter/megan_porter_extract_clean_2005.csv'
porter_df1 = pd.read_csv(porter_file1, index_col=0)
porter_df2 = pd.read_csv(f'{report_dir}/megan_porter/porter_2006_table1_clean.tsv', sep='\t', index_col=0)
df_list.append(porter_df1)
df_list.append(porter_df2)
porter_df1.head()

Unnamed: 0_level_0,Full_Species,Habitat,method,LambdaMax,Reference,Accession
porter2005_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Artemia salina,,EON,410,(Hertel 1972),
1,Daphnia magna,,VC,"348, 434, 525, 608",(Smith and Macagno 1990),
2,Acartia tonsa,,BP,450-520,(Stearns and Forward 1984),
3,Balanus amphitrite,,ERG,532,(Hillman et al. 1973),
4,Balanus amphitrite,,MSP,532,(Minke and Kirschfield 1978),


In [13]:
porter_df2.head()

Unnamed: 0_level_0,Full_Species,Accession,LambdaMax,Reference
porter2006_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Loligo forbesii,X56788,494,Morris et al. (1993)
1,Loligo pealii,AY450853,493,"Brown PK and Brown PS (1958), Hubbard and St G..."
2,Loligo subulata,Z49108,499,Morris et al. (1993)
3,Sepia officinalis,AF000947,492,Brown PK and Brown PS (1958)
4,Todarodes pacificus,X70498,480,Naito et al. (1981)


In [14]:
species_list += (porter_df1['Full_Species'].to_list() + porter_df2['Full_Species'].to_list())

### <font color=#c994c7>J. Van Der Kooi's Datatable</font>

In [27]:
kooi_insect_db_file = f'{report_dir}/j_van_der_kooi/j_van_der_kooi_2021_main_table.csv'
kooi_df = pd.read_csv(kooi_insect_db_file, index_col=0)
kooi_df['Full_Species'] = (kooi_df['Genus'] + ' ' + kooi_df['Species']).to_list()
df_list.append(caves_df)
kooi_df.head()

Unnamed: 0_level_0,Family,Genus,Species,First,Seco,Third,Fourth,Fifth,Sixth,Remaining,Full_Species
kooi_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,Pieridae,Colias,erate,360,440,480.0,580.0,660.0,,,Colias erate
1,Pieridae,Colias,erate,360,430,460.0,580.0,640.0,620.0,660.0,Colias erate
2,Pieridae,Leptidae,amurensis,360,440,450.0,520.0,,,,Leptidae amurensis
3,Pieridae,Pieris,brassicae,360,450,,560.0,560.0,,,Pieris brassicae
4,Pieridae,Pieris,rapae crucivora,360,440,460.0,560.0,620.0,640.0,,Pieris rapae crucivora


In [None]:
# Need a way to add/collapse the different lmax columns into a central column, making sure the corresponding species information is retained.

### <font color=#c994c7>Merge Accessory Lambda Max Databases</font>

In [15]:
# Call the function to merge all the species, lambdamax, and potential accession information into one dataframe
# merged_df = merge_accessory_dbs(df_list, report_dir)
#merged_df.head()

In [16]:
#merged_df.shape

### <font color=#c994c7>All unique species names have been extracted from accessory databases. Now we iteratively query NCBI for opsins from each species.</font>

In [17]:
len(species_list)

4098

In [18]:
species_list = list(set(species_list))
len(species_list)

1273

## <font color=#c51b8a>Run NCBI Query Mining Process</font>

In [21]:
ncbi_query_df, query_report_dir = ncbi_fetch_opsins(email=email, job_label='mnm_on_all_dbs', out='mnm_on_all_dbs', species_list=species_list[0:10])

Creating Job Directory

Saving Species Query List to Text

Constructing Taxon Dictionary, Including Species Synonyms

Existing Taxon Dictionary Found! One Moment While We Update It...

Taxon Dictionary Complete!

Starting Queries to NCBI for Opsin Sequences



  0% (0 of 10) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--


Here is the query ID return list:
['2154371071', '2154368583', '2154368061', '2154368053', '2154366825', '2154366823', '2154366821', '2154359214', '2154359197', '2154359195', '2154359192', '2154359184', '2154359181', '2154359180', '2154359175', '2154355812', '2154351657', '2154351655', '2154351652', '2154351649', '2154345030', '2154345027', '2154344058', '2154344036', '2154341425', '2154338363', '2154325160', '2154325154', '2154318689', '2154293566', '2154293562', '2154293558', '2154289014', '2154289010', '2154284796', '2154267127', '2154260939']

Fetching record 2154371071 from cache.
Fetching record 2154368583 from cache.
Fetching record 2154368061 from cache.
Fetching record 2154368053 from cache.
Fetching record 2154366825 from cache.
Fetching record 2154366823 from cache.
Fetching record 2154366821 from cache.
Fetching record 2154359214 from cache.
Fetching record 2154359197 from cache.
Fetching record 2154359195 from cache.
Fetching record 2154359192 from cache.
Fetching record 2

 10% (1 of 10) |##                       | Elapsed Time: 0:00:03 ETA:   0:00:27


Here is the query ID return list:
[]



 20% (2 of 10) |#####                    | Elapsed Time: 0:00:06 ETA:   0:00:24


Here is the query ID return list:
[]



 30% (3 of 10) |#######                  | Elapsed Time: 0:00:08 ETA:   0:00:20


Here is the query ID return list:
[]



 40% (4 of 10) |##########               | Elapsed Time: 0:00:11 ETA:   0:00:16


Here is the query ID return list:
[]



 50% (5 of 10) |############             | Elapsed Time: 0:00:14 ETA:   0:00:14


Here is the query ID return list:
[]



 60% (6 of 10) |###############          | Elapsed Time: 0:00:16 ETA:   0:00:10


Here is the query ID return list:
[]



 70% (7 of 10) |#################        | Elapsed Time: 0:00:19 ETA:   0:00:08


Here is the query ID return list:
[]



 80% (8 of 10) |####################     | Elapsed Time: 0:00:21 ETA:   0:00:05


Here is the query ID return list:
[]



 90% (9 of 10) |######################   | Elapsed Time: 0:00:24 ETA:   0:00:02
100% (10 of 10) |########################| Elapsed Time: 0:00:24 Time:  0:00:24
100% (10 of 10) |########################| Elapsed Time: 0:00:24 Time:  0:00:24


Here is the query ID return list:
[]

NCBI Queries Complete!
Now Extracting and Formatting Results For DataFrame...

DataFrame Formatted and Saved to CSV file for future use :)

FASTA File Saved...

Saving txt file with names of species that retrieved no results for opsins...



In [20]:
STOP

NameError: name 'STOP' is not defined

In [68]:
ncbi_query_df.shape

(1957, 12)

## <font color=#c51b8a>Load Accessory Opsin Sequence Databases</font> 

### <font color=#c994c7>Load Previous MnM Data</font>

In [69]:
#species_list = []
#with open(f'{query_report_dir}/species_queried.txt', "r") as f:
#    lines = f.readlines()
#    for line in lines:
#        species_list.append(line.replace('\n',''))
#len(species_list)

In [70]:
#query_report_dir = "mnm_data/mnm_on_mnm_on_all_dbs_2025-01-22_20-33-16 - Copy"
#ncbi_query_file = f'{query_report_dir}/mnm_on_all_dbs_ncbi_q_data_cleaned.csv'
#ncbi_query_file = f'./{query_report_dir}/mnm_on_all_dbs_ncbi_q_data_cleaned.csv'
#ncbi_query_df = pd.read_csv(ncbi_query_file)
ncbi_sp_list = ncbi_query_df['Full_Species'].to_list()
ncbi_prot_list = ncbi_query_df['Protein'].to_list()
ncbi_query_df.head()

Unnamed: 0,Accession,Phylum,Subphylum,Class,Genus,Species,Full_Species,DNA,Protein,Gene_Description,Species_Synonym_Used,Prot_Len
0,XM_024085765.2,Arthropoda,Hexapoda,Insecta,Bicyclus,anynana,Bicyclus anynana,ACACAATTCATTTAACGTGTTCAGAATAGATGCACCACCGCATTAG...,MATQSCFWSDIFDEQSRANLNTGFVVTGAVSTLLGSWLNLALLLTN...,PREDICTED: Bicyclus anynana rhodopsin (LOC1120...,,313
1,XM_024089198.2,Arthropoda,Hexapoda,Insecta,Bicyclus,anynana,Bicyclus anynana,GTTCGGGATGACAGATATACGCTCGAGAATAACATTTTGCTACGGG...,MSPATKIIVIIFLIDIKCKNVDTLPEVKAELWSFKNNLKKSVVNHF...,"PREDICTED: Bicyclus anynana opsin, ultraviolet...",,572
2,AY918895.2,Arthropoda,Hexapoda,Insecta,Bicyclus,anynana,Bicyclus anynana,TTTGGTATTAACCATCTCGGGGCGACCTATCCACCTGTAACCCTTT...,MAITSMDPGPGIAALQAWGGHAAAYGSNETVVDKVLPDMLHLVDPH...,Bicyclus anynana long wavelength sensitive ops...,,380
3,XM_061286956.1,Chordata,Craniata,Actinopteri,Syngnathus,typhle,Syngnathus typhle,ACCGCACGCGTGGATTACATTGCAGCGTGGAGGAAATGGGGAAAAA...,MDARMFSGQVALNLSGSNLSDDPELAEPSVEVEARLSPTGFVVLSV...,PREDICTED: Syngnathus typhle teleost multiple ...,,381
4,XM_061286026.1,Chordata,Craniata,Actinopteri,Syngnathus,typhle,Syngnathus typhle,GGACGCGCGCTCCAGCCGCACATCGAAGCCACCACAATGACCGAAA...,MNPYNETRAEHNLFAFGTYKLLAVTIGTIGVFGFCNNVLVILLYCK...,"PREDICTED: Syngnathus typhle opsin 3 (opn3), mRNA",,381


In [71]:
report_dir2 = './data_sources/seqs'
sequence_list = []
acc_list = []
db_sp_list = []
prot_descriptions = []
source_list = []

### <font color=#c994c7>Bilaterian Animal Opsins: Ramirez et al 2016</font>

In [72]:
ramirez_file = f'{report_dir2}/bilaterian_animal_ops/Supplemental_Table_T1_cleaned.csv'
ramirez_df = pd.read_csv(ramirez_file)
ramirez_df.head()

Unnamed: 0,gene_ID,Organism,Protein names,aligned_seqs
0,KX714605,Acanthopleura granulata,xenopsin,----------------------------------------------...
1,KX714606,Acanthopleura granulata,canonical_r-opsin,---------YII-------------------------------GVY...
2,1_Acroporadigitifera,Acropora digitifera,58percent_match_PREDICTED visual pigment-like ...,---------Y------------------------------------...
3,XP_015763203,Acropora digitifera,na,---------IVY-------------------------------VVI...
4,XP_015773304,Acropora digitifera,PREDICTED: melanopsin-like,---------IAY-------------------------------GVV...


In [73]:
ramirez_df.shape

(768, 4)

In [74]:
ramirez_df_filtered = ramirez_df[ramirez_df['Organism'].isin(species_list)]
ramirez_df_filtered['raw_seqs'] = ramirez_df_filtered['aligned_seqs'].str.replace('-', '')
prot_len_list = []
for prot in ramirez_df_filtered['raw_seqs']:
    prot_len = len(prot)
    prot_len_list.append(prot_len)
ramirez_df_filtered['prot_len'] = prot_len_list
ramirez_df_filtered = ramirez_df_filtered[ramirez_df_filtered['prot_len']>320].reset_index(drop=True)
ramirez_df_filtered.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ramirez_df_filtered['raw_seqs'] = ramirez_df_filtered['aligned_seqs'].str.replace('-', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ramirez_df_filtered['prot_len'] = prot_len_list


Unnamed: 0,gene_ID,Organism,Protein names,aligned_seqs,raw_seqs,prot_len
0,U3IU26,Anas platyrhynchos,Uncharacterized protein,TQTEH----NIV-------------------------------AAY...,TQTEHNIVAAYLITAGVISIFSNIVVLGIFVKYKELRTATNAIIIN...,340
1,A0A0R4IEG0,Danio rerio,Novopsin-10,SPSAD----LSI-------------------------------AVF...,SPSADLSIAVFLIITGVVSVFGNGLVLLVYGQRRKKLRAHELMTIN...,378
2,Q80XL3,Mus musculus,Rrh protein,SRTEH----SVI-------------------------------AAY...,SRTEHSVIAAYLIVAGITSILSNVVVLGIFIKYKELRTPTNAVIIN...,331
3,M3YAY1,Mustela putorius furo,Uncharacterized protein,SQTEH----NIV-------------------------------AAY...,SQTEHNIVAAYLITAGIISIFSNLIVLGIFIKYKELRTPTNAIIIN...,328
4,G1PJY3,Myotis lucifugus,Uncharacterized protein,SQTEH----NIV-------------------------------ATY...,SQTEHNIVATYLIMAGMISLLSNIIVLGIFITYKELRTPTNAIIIN...,331


In [75]:
ramirez_df_filtered.shape

(6, 6)

In [76]:
len(set(ramirez_df_filtered['Organism'].to_list()))

6

In [77]:
ramirez_df_filtered2 = ramirez_df_filtered[~ramirez_df_filtered['raw_seqs'].isin(ncbi_prot_list)]
ramirez_df_filtered2 = ramirez_df_filtered2.reset_index(drop=True)
ramirez_df_filtered2['source'] = 'Ramirez et al 2016'
ramirez_df_filtered2.shape

(6, 7)

In [78]:
ramirez_df_filtered2.to_csv(f'{report_dir2}/bilaterian_animal_ops/Supplemental_Table_T1_filtered.csv')
len(set(ramirez_df_filtered2['raw_seqs'].to_list()))

6

In [79]:
sequence_list += ramirez_df_filtered2['raw_seqs'].to_list()
acc_list += ramirez_df_filtered2['gene_ID'].to_list()
db_sp_list += ramirez_df_filtered2['Organism'].to_list()
prot_descriptions += ramirez_df_filtered2['Protein names'].to_list()
source_list += ramirez_df_filtered2['source'].to_list()

### <font color=#c994c7>Crustacean Conundrums: Palecanda et al 2022</font>

In [80]:
palecanda_file = f'{report_dir2}/crustacean_conundrums/rstb20210289_si_003.csv'
palecanda_df = pd.read_csv(palecanda_file)
palecanda_df.head()

Unnamed: 0,Class,Order,Family,Genus species,Opsin Type,NCBI Accession #,Transcript ID,Untrimmed Nucleotide Sequence,Protein Sequence
0,Malacostraca,Amphipoda,Corophiidae,Grandidierella japonica,LWS,BK059768,AMP|GrJap|LWS|IACS01070746.1,ACTGAGAAATATCTACGGAAAAATGTCTTGGAATAGCCCAATGAAC...,LPEGMASSNPFGNYTVVDAAPKEILHMVDPHWYQFPPMNPLWYGLL...
1,Malacostraca,Amphipoda,Corophiidae,Grandidierella japonica,MWS1,BK059769,AMP|GrJap|MWS|IACS01055269.1,GGAGTCACCCAGGTTGACCTGGTTCCGGATTACATGAAGGACATGA...,GVTQVDLVPDYMKDMIHPHWSNYPPVNPMWHHLLGLVYIIIGSCAT...
2,Malacostraca,Amphipoda,Corophiidae,Grandidierella japonica,NEUR,BK059770,AMP|GrJap|NEUR|IACS01028177.1,AGTGTACGGCTACGTATGCTTCGCTCTGGGTATCGTCAATATTCTC...,VYGYVCFALGIVNILSIYALAVVRYLKTCHYRNIGSKIDRKEVSIV...
3,Malacostraca,Amphipoda,Gammaridae,Echinogammarus berilloni,LWS,,AMP|EcBer|LWS|GHCT01076719.1,CGTGGCTTCAACGCGCTGTGTAGACCAACAGGACCTCCGCCTCACT...,LPEGVVSSNPFGNYTVVDAAPKELLPLIDSHWYQYPPLNPMWYGLL...
4,Malacostraca,Amphipoda,Gammaridae,Echinogammarus berilloni,LWS,,AMP|EcBer|LWS|GHCT01076725.1,CTCAATATATAATTTGAGCGTCCAGAAGATTCTGTCTCAGTGAGCC...,APEGAVSTNPFGNYTVVDTAAKDILHMISPHWYQFPPLNPMWYGLL...


In [81]:
palecanda_df.shape

(628, 9)

In [82]:
palecanda_df_filtered = palecanda_df[palecanda_df['Genus species'].isin(species_list)].reset_index(drop=True)
#palecanda_df_filtered['raw_seqs'] = palecanda_df_filtered['aligned_seqs'].str.replace('-', '')
prot_len_list = []
for prot in palecanda_df_filtered['Protein Sequence']:
    prot_len = len(prot)
    prot_len_list.append(prot_len)
palecanda_df_filtered['prot_len'] = prot_len_list
palecanda_df_filtered = palecanda_df_filtered[palecanda_df_filtered['prot_len']>320].reset_index(drop=True)

palecanda_df_filtered.head()

Unnamed: 0,Class,Order,Family,Genus species,Opsin Type,NCBI Accession #,Transcript ID,Untrimmed Nucleotide Sequence,Protein Sequence,prot_len
0,Malacostraca,Amphipoda,Talitridae,Talitrus saltator,LWS,,AMP|TeSal|LWS|GDUJ01056749.1,TCACGTGGCCTAGCGCGCTGTGTAGACCGAATCCAACTCCTTGTCT...,HPEGMVSSNPFGNFTVVDAAPKELLHMVDPHWYQYPPLNPLWYGLL...,336
1,Malacostraca,Amphipoda,Talitridae,Talitrus saltator,LWS,,AMP|TeSal|LWS|GDUJ01056750.1,CTCACAACGAATCCAACTCATCAGTAATAATGAGGTGGAGGCGCGT...,LPDSGYISTNPFGNFTVVDSAPKEILHMIDPHWYQFPPLNPMWYGL...,337
2,Malacostraca,Amphipoda,Talitridae,Talitrus saltator,MWS1,,AMP|TeSal|MWS|GDUJ01058424.1,TGGAGGGCTTCTTGGGGCCTCCCAGCTGCATTCATGTCCCTAACAC...,SYGSPSSSALTGANQFAFGYPPGVSVVDIVPPHMKDLIHPHWSNFP...,373
3,Malacostraca,Amphipoda,Talitridae,Talitrus saltator,NEUR,,AMP|TeSal|NEUR|GDUJ01037444.1,ATGGGTTAAAAATCAGCGCTGCATTGACGAGAAGTTTGACTCATCT...,VVQTRPIFIISTRFTLHFDTEWLLACMTAGVVALVGNSASVAMFWR...,330
4,Malacostraca,Decapoda,Cambaridae,Procambarus clarkii,LWS,ALJ26467,DEC|Proclar|LWS|ALJ26467,,SWSNQPAMDDYGLPSSNPYGNFTVVDMAPKDILHMIHPHWYQYPPM...,363


In [83]:
palecanda_df_filtered.shape

(84, 10)

In [84]:
len(set(palecanda_df_filtered['Genus species'].to_list()))

15

In [85]:
palecanda_df_filtered2 = palecanda_df_filtered[~palecanda_df_filtered['Protein Sequence'].isin(ncbi_prot_list)]
palecanda_df_filtered2=palecanda_df_filtered2.reset_index(drop=True)
palecanda_df_filtered2['source'] = 'Palecanda et al 2022'
palecanda_df_filtered2.shape

(84, 11)

In [86]:
palecanda_df_filtered2.to_csv(f'{report_dir2}/crustacean_conundrums/palecanda_2022_filtered_seq_data.csv')
len(set(palecanda_df_filtered2['Protein Sequence'].to_list()))

84

In [87]:
sequence_list += palecanda_df_filtered2['Protein Sequence'].to_list()
acc_list += palecanda_df_filtered2['Transcript ID'].to_list()
db_sp_list += palecanda_df_filtered2['Genus species'].to_list()
prot_descriptions += palecanda_df_filtered2['Opsin Type'].to_list()
source_list += palecanda_df_filtered2['source'].to_list()

### <font color=#c994c7>Ray-Finned Fish Opsin DB: Policarpo et al 2024</font>

In [88]:
ray_finned_fasta_file = f'{report_dir2}/ray_finned_fish/mined_alignments/Complete_opsins.aln.txt'
ray_finned_df = fasta_to_dataframe(ray_finned_fasta_file)

In [89]:
ray_finned_df

Unnamed: 0,species_name,opsin_type,accession,aln_sequence,sequence,seq_length
0,Lepisosteus oculatus,va,CM001408.1-11749388-11761955,MDSFRVSVNG-VV--------------------------YTEAAE-...,MDSFRVSVNGVVYTEAAEILKPGDPFSGPIENIAPWNFKFLAALMF...,386
1,Lepisosteus oculatus,pinopsin,CM001425.1-1263197-1266918,MPILVNSSAA-FP--------------------------L--EKN-...,MPILVNSSAAFPLEKNSTPGPFDGPQWHQAPRSTYLMVAVLMGTVV...,357
2,Lepisosteus oculatus,sws1,CM001411.1-10398717-10404663,MAGTEE--FY-L---------------------------F----E-...,MAGTEEFYLFENISSVGPWDGPQYHIAPKWAFYFQTIFMGLVFFWG...,346
3,Lepisosteus oculatus,rrh,CM001407.1-48235198-48246929,MENFSRIPIT-KV--------------------------LDNETS-...,MENFSRIPITKVLDNETSNLSQAVDTSEHSVFSQTEHNIVAAYLIT...,343
4,Lepisosteus oculatus,opn5,CM001404.1-37427058-37472360,MAQTDNGTSHNVP--------------------------HYL----...,MAQTDNGTSHNVPHYLLSGDPFASKLSKEADFVAAFYICIVGIMST...,352
...,...,...,...,...,...,...
17336,Callorhinchus milii,pinopsin,XP007894735.2,----MDLPTT-SV--------------------------G--IEN-...,MDLPTTSVGIENGTVGAFDGPQWYFAPKSTYMAVATLMGTVVILAS...,352
17337,Carcharodon carcharias,pinopsin,XP041052689.1,----MGFSTG-SI--------------------------T--THN-...,MGFSTGSITTHNITIGAFHGPQWDVAPRSTYMAVAALMGNVVILAT...,352
17338,Callorhinchus milii,rgr,XM007898380.1,MVTSHP----------------------------------------...,MVTSHPGLEGFTDFEVFGLLVEALVGLLLNGLTLLAFYKIKELRTP...,291
17339,Rhincodon typus,rh1,A0A7T8R2L6,MNGTEGENFY-IP--------------------------MSNKTG-...,MNGTEGENFYIPMSNKTGVVRSPFEYPQYYLAEPWKFSLLAAYMFF...,353


In [90]:
ray_finned_df.to_csv(f'{report_dir2}/ray_finned_fish/mined_alignments/extracted_fish_ops.csv')

In [91]:
ray_finned_df_filtered = ray_finned_df[ray_finned_df['species_name'].isin(species_list)].reset_index(drop=True)
ray_finned_df_filtered.shape

(2494, 6)

In [92]:
ray_finned_df_filtered2 = ray_finned_df_filtered[~ray_finned_df_filtered['sequence'].isin(ncbi_prot_list)].reset_index(drop=True)
ray_finned_df_filtered2.shape

(2457, 6)

In [93]:
ray_finned_df_filtered3 = ray_finned_df_filtered2[ray_finned_df_filtered2['seq_length'] > 320].reset_index(drop=True)
ray_finned_df_filtered3['source'] = 'Policarpo et al 2024'
ray_finned_df_filtered3.shape

(2135, 7)

In [94]:
ray_finned_df_filtered3.to_csv(f'{report_dir2}/ray_finned_fish/mined_alignments/extracted_fish_ops_filtered.csv')

In [95]:
sequence_list += ray_finned_df_filtered3['sequence'].to_list()
acc_list += ray_finned_df_filtered3['accession'].to_list()
db_sp_list += ray_finned_df_filtered3['species_name'].to_list()
prot_descriptions += ray_finned_df_filtered3['opsin_type'].to_list()
source_list += ray_finned_df_filtered3['source'].to_list()

In [96]:
gn_list = []
sp_list = []
for sp in db_sp_list:
#    print(sp)
    gn_list.append(sp.split(' ', 1)[0])
    sp_list.append(sp.split(' ', 1)[1])

In [97]:
# Load our existing taxonomy dictionary and pull relevant taxon info
taxon_file = './data_sources/taxonomy/ncbi_taxon_dict.json'
if os.path.isfile(taxon_file):
    with open(taxon_file, 'r') as f:
        species_taxon_dict = json.load(f)
        
phylum_list = []
subphylum_list = []
class_list = []      
for sp in db_sp_list:
    phylum_list.append(species_taxon_dict[sp]["Phylum"])
    subphylum_list.append(species_taxon_dict[sp]["Subphylum"])
    class_list.append(species_taxon_dict[sp]["Class"])

### <font color=#c994c7>MATEdb2: Martínez-Redondo et. al. 2024</font>
For now I have no pipeline set up to extract sequences from the assembled transcriptomes but this may be coming in the future

In [98]:
matedb2_links_file = f'{report_dir2}/matedb2/Table_S1_cleaned.csv'
matedb2_links_df = pd.read_csv(matedb2_links_file)
matedb2_links_df.head()

Unnamed: 0,PHYLUM,LINEAGE,CLASS,ORDER,FAMILY,COMMON NAME,CODE (5 letter code),SCIENTIFIC NAME,TAXON ID (see NCBI),GENOME OR TRANSCRIPTOME?,...,C (LONGEST),S (LONGEST),D (LONGEST),F (LONGEST),M (LONGEST),C+F (GOOD if >85*).1,Busco database.1,No. GENES,EggNOG-mapper,FANTASIA
0,ACOELA,Acoelomorpha,-,Acoela,Anaperidae,,NFUS1,Neochildia fusca,66402,Transcriptome,...,74.7,72.7,2.0,3.0,22.3,77.7,metazoa_odb10,22370,12845,22351
1,ACOELA,Acoelomorpha,-,Acoela,Childiidae,,CSUB1,Childia submaculatum,243778,Transcriptome,...,71.7,65.9,5.8,5.3,23.0,77.0,metazoa_odb10,15861,10576,15605
2,ACOELA,Acoelomorpha,-,Acoela,Convolutidae,,CNAI1,Convoluta (Praesagittifera) naikaiensis,31270,Transcriptome,...,74.1,73.2,0.9,5.2,20.7,79.3,metazoa_odb10,17797,10900,17779
3,ACOELA,Acoelomorpha,-,Acoela,Diopisthoporidae,,DLON1,Diopisthoporus longitubus,996724,Transcriptome,...,71.6,64.8,6.8,3.7,24.7,75.3,metazoa_odb10,19356,13097,18921
4,ACOELA,Acoelomorpha,-,Acoela,Hofsteniidae,,HMIA1,Hofstenia miamia,442651,Genome,...,80.0,78.1,1.9,5.5,14.5,85.5,metazoa_odb10,22632,15809,21891


In [99]:
matedb2_links_df.shape

(970, 33)

In [100]:
matedb2_links_filtered = matedb2_links_df[matedb2_links_df['SCIENTIFIC NAME'].isin(species_list)].reset_index(drop=True)
matedb2_links_filtered.head()

Unnamed: 0,PHYLUM,LINEAGE,CLASS,ORDER,FAMILY,COMMON NAME,CODE (5 letter code),SCIENTIFIC NAME,TAXON ID (see NCBI),GENOME OR TRANSCRIPTOME?,...,C (LONGEST),S (LONGEST),D (LONGEST),F (LONGEST),M (LONGEST),C+F (GOOD if >85*).1,Busco database.1,No. GENES,EggNOG-mapper,FANTASIA
0,ARTHROPODA,Chelicerata,??,Xiphosura,Limulidae,Horseshoe crabs,LPOL1,Limulus polyphemus,6850,Genome,...,92.5,86.0,6.5,4.7,2.8,97.2,metazoa_odb10,22879,20689,22686
1,ARTHROPODA,Chelicerata,Arachnida,Araneae,Araneidae,Spiders,ABRU1,Argiope bruennichi,94029,Genome,...,84.5,82.6,1.9,8.5,7.0,93.0,metazoa_odb10,23259,15836,23229
2,ARTHROPODA,Pancrustacea,Crustacea/Branchiopoda,Anostraca,Artemiidae,,ASAL1,Artemia salina,85549,Transcriptome,...,63.2,63.0,0.2,15.7,21.1,78.9,metazoa_odb10,17880,12058,17869
3,ARTHROPODA,Pancrustacea,Crustacea/Branchiopoda,Diplostraca,Daphniidae,Water fleas,DMAG2,Daphnia magna,35525,Genome,...,96.8,90.6,6.2,0.3,2.9,97.1,metazoa_odb10,16891,14670,16206
4,ARTHROPODA,Pancrustacea,Crustacea/Malacostraca,Amphipoda,Talitridae,,TASA1,Talitrus saltator,191375,Transcriptome,...,67.8,67.3,0.5,13.1,19.1,80.9,metazoa_odb10,15927,10006,15912


In [101]:
matedb2_links_filtered.shape

(37, 33)

In [102]:
matedb2_links_filtered2 = matedb2_links_filtered[~matedb2_links_filtered['SCIENTIFIC NAME'].isin(ncbi_sp_list)]
matedb2_links_filtered2.shape

(14, 33)

### <font color=#c994c7>Create Merged Dataframe From All Accessory Opsin Sequence DBs</font>

In [103]:
#make a merged df of all the accessory seq dbs, filter out reedundant datapoints, then append to the end of the NCBI query sheet?
data = {'Accession': acc_list, 'Phylum': phylum_list, 'Subphylum': subphylum_list, 'Class': class_list,'Genus': gn_list, 'Species': sp_list, 'Full_Species': db_sp_list, 'Gene_Description': prot_descriptions, 'Protein' : sequence_list, 'source' : source_list} 
#data = {'Accession': acc_list, 'Genus': gn_list, 'Species': sp_list, 'Full_Species': db_sp_list, 'Gene_Description': prot_descriptions, 'Protein' : sequence_list, 'source' : source_list} 

acc_seq_db_df = pd.DataFrame(data)
acc_seq_db_df.head()

Unnamed: 0,Accession,Phylum,Subphylum,Class,Genus,Species,Full_Species,Gene_Description,Protein,source
0,U3IU26,Chordata,Craniata,Aves,Anas,platyrhynchos,Anas platyrhynchos,Uncharacterized protein,TQTEHNIVAAYLITAGVISIFSNIVVLGIFVKYKELRTATNAIIIN...,Ramirez et al 2016
1,A0A0R4IEG0,Chordata,Craniata,Actinopteri,Danio,rerio,Danio rerio,Novopsin-10,SPSADLSIAVFLIITGVVSVFGNGLVLLVYGQRRKKLRAHELMTIN...,Ramirez et al 2016
2,Q80XL3,Chordata,Craniata,Mammalia,Mus,musculus,Mus musculus,Rrh protein,SRTEHSVIAAYLIVAGITSILSNVVVLGIFIKYKELRTPTNAVIIN...,Ramirez et al 2016
3,M3YAY1,Chordata,Craniata,Mammalia,Mustela,putorius furo,Mustela putorius furo,Uncharacterized protein,SQTEHNIVAAYLITAGIISIFSNLIVLGIFIKYKELRTPTNAIIIN...,Ramirez et al 2016
4,G1PJY3,Chordata,Craniata,Mammalia,Myotis,lucifugus,Myotis lucifugus,Uncharacterized protein,SQTEHNIVATYLIMAGMISLLSNIIVLGIFITYKELRTPTNAIIIN...,Ramirez et al 2016


In [104]:
acc_seq_db_df.shape

(2225, 10)

In [105]:
acc_seq_db_df_filtered = acc_seq_db_df.copy()
acc_seq_db_df_filtered.drop_duplicates(subset=['Full_Species', 'Protein'],  keep='first', inplace=True)
acc_seq_db_df_filtered=acc_seq_db_df_filtered.reset_index(drop=True)
acc_seq_db_df_filtered.shape

(2214, 10)

In [106]:
len(set(acc_seq_db_df_filtered['Full_Species'].to_list()))

87

In [107]:
acc_seq_db_df_filtered.to_csv(f'{report_dir2}/vpod_comp_accessory_seq_dbs.csv')

In [108]:
fasta_file = f'./{report_dir2}/acc_db_seqs.fasta'
with open(fasta_file, 'w') as f:
    for id, seq in zip(acc_seq_db_df_filtered['Accession'], acc_seq_db_df_filtered['Protein']):
        f.write(f'>{id}\n{seq}\n')

### <font color=#c994c7>Merge the Formated Accessory Sequence DBs w/the Mined NCBI Data</font>

In [109]:
ncbi_query_df.head()

Unnamed: 0,Accession,Phylum,Subphylum,Class,Genus,Species,Full_Species,DNA,Protein,Gene_Description,Species_Synonym_Used,Prot_Len
0,XM_024085765.2,Arthropoda,Hexapoda,Insecta,Bicyclus,anynana,Bicyclus anynana,ACACAATTCATTTAACGTGTTCAGAATAGATGCACCACCGCATTAG...,MATQSCFWSDIFDEQSRANLNTGFVVTGAVSTLLGSWLNLALLLTN...,PREDICTED: Bicyclus anynana rhodopsin (LOC1120...,,313
1,XM_024089198.2,Arthropoda,Hexapoda,Insecta,Bicyclus,anynana,Bicyclus anynana,GTTCGGGATGACAGATATACGCTCGAGAATAACATTTTGCTACGGG...,MSPATKIIVIIFLIDIKCKNVDTLPEVKAELWSFKNNLKKSVVNHF...,"PREDICTED: Bicyclus anynana opsin, ultraviolet...",,572
2,AY918895.2,Arthropoda,Hexapoda,Insecta,Bicyclus,anynana,Bicyclus anynana,TTTGGTATTAACCATCTCGGGGCGACCTATCCACCTGTAACCCTTT...,MAITSMDPGPGIAALQAWGGHAAAYGSNETVVDKVLPDMLHLVDPH...,Bicyclus anynana long wavelength sensitive ops...,,380
3,XM_061286956.1,Chordata,Craniata,Actinopteri,Syngnathus,typhle,Syngnathus typhle,ACCGCACGCGTGGATTACATTGCAGCGTGGAGGAAATGGGGAAAAA...,MDARMFSGQVALNLSGSNLSDDPELAEPSVEVEARLSPTGFVVLSV...,PREDICTED: Syngnathus typhle teleost multiple ...,,381
4,XM_061286026.1,Chordata,Craniata,Actinopteri,Syngnathus,typhle,Syngnathus typhle,GGACGCGCGCTCCAGCCGCACATCGAAGCCACCACAATGACCGAAA...,MNPYNETRAEHNLFAFGTYKLLAVTIGTIGVFGFCNNVLVILLYCK...,"PREDICTED: Syngnathus typhle opsin 3 (opn3), mRNA",,381


In [110]:
acc_seq_db_df_filtered.drop(columns='source', inplace =True)
acc_seq_db_df_filtered.head()

Unnamed: 0,Accession,Phylum,Subphylum,Class,Genus,Species,Full_Species,Gene_Description,Protein
0,U3IU26,Chordata,Craniata,Aves,Anas,platyrhynchos,Anas platyrhynchos,Uncharacterized protein,TQTEHNIVAAYLITAGVISIFSNIVVLGIFVKYKELRTATNAIIIN...
1,A0A0R4IEG0,Chordata,Craniata,Actinopteri,Danio,rerio,Danio rerio,Novopsin-10,SPSADLSIAVFLIITGVVSVFGNGLVLLVYGQRRKKLRAHELMTIN...
2,Q80XL3,Chordata,Craniata,Mammalia,Mus,musculus,Mus musculus,Rrh protein,SRTEHSVIAAYLIVAGITSILSNVVVLGIFIKYKELRTPTNAVIIN...
3,M3YAY1,Chordata,Craniata,Mammalia,Mustela,putorius furo,Mustela putorius furo,Uncharacterized protein,SQTEHNIVAAYLITAGIISIFSNLIVLGIFIKYKELRTPTNAIIIN...
4,G1PJY3,Chordata,Craniata,Mammalia,Myotis,lucifugus,Myotis lucifugus,Uncharacterized protein,SQTEHNIVATYLIMAGMISLLSNIIVLGIFITYKELRTPTNAIIIN...


In [111]:
final_query_df = pd.concat([ncbi_query_df, acc_seq_db_df_filtered]).reset_index(drop = True)
final_query_df.to_csv(f'{query_report_dir}/ncbi_q_merged_w_acc_seq_db.csv', index=False)
#final_query_df.to_csv('mnm_data/mnm_on_mnm_on_all_dbs_2024-12-10_23-31-03/ncbi_q_merged_w_acc_seq_db.csv')
final_query_df.head()

Unnamed: 0,Accession,Phylum,Subphylum,Class,Genus,Species,Full_Species,DNA,Protein,Gene_Description,Species_Synonym_Used,Prot_Len
0,XM_024085765.2,Arthropoda,Hexapoda,Insecta,Bicyclus,anynana,Bicyclus anynana,ACACAATTCATTTAACGTGTTCAGAATAGATGCACCACCGCATTAG...,MATQSCFWSDIFDEQSRANLNTGFVVTGAVSTLLGSWLNLALLLTN...,PREDICTED: Bicyclus anynana rhodopsin (LOC1120...,,313.0
1,XM_024089198.2,Arthropoda,Hexapoda,Insecta,Bicyclus,anynana,Bicyclus anynana,GTTCGGGATGACAGATATACGCTCGAGAATAACATTTTGCTACGGG...,MSPATKIIVIIFLIDIKCKNVDTLPEVKAELWSFKNNLKKSVVNHF...,"PREDICTED: Bicyclus anynana opsin, ultraviolet...",,572.0
2,AY918895.2,Arthropoda,Hexapoda,Insecta,Bicyclus,anynana,Bicyclus anynana,TTTGGTATTAACCATCTCGGGGCGACCTATCCACCTGTAACCCTTT...,MAITSMDPGPGIAALQAWGGHAAAYGSNETVVDKVLPDMLHLVDPH...,Bicyclus anynana long wavelength sensitive ops...,,380.0
3,XM_061286956.1,Chordata,Craniata,Actinopteri,Syngnathus,typhle,Syngnathus typhle,ACCGCACGCGTGGATTACATTGCAGCGTGGAGGAAATGGGGAAAAA...,MDARMFSGQVALNLSGSNLSDDPELAEPSVEVEARLSPTGFVVLSV...,PREDICTED: Syngnathus typhle teleost multiple ...,,381.0
4,XM_061286026.1,Chordata,Craniata,Actinopteri,Syngnathus,typhle,Syngnathus typhle,GGACGCGCGCTCCAGCCGCACATCGAAGCCACCACAATGACCGAAA...,MNPYNETRAEHNLFAFGTYKLLAVTIGTIGVFGFCNNVLVILLYCK...,"PREDICTED: Syngnathus typhle opsin 3 (opn3), mRNA",,381.0


In [112]:
final_query_df.shape

(4171, 12)

In [113]:
fasta_file = f'./{query_report_dir}/mined_and_acc_seqs.fasta'
with open(fasta_file, 'w') as f:
    for id, seq in zip(final_query_df['Accession'], final_query_df['Protein']):
        f.write(f'>{id}\n{seq}\n')

In [114]:
STOP

NameError: name 'STOP' is not defined

### Make this section a notebook version of using OPTICS 

In [None]:
#Optics function here

## <font color=#c994c7>Part 2 Objective</font> - Match sequence to it's closest MSP value based on OPTICS predictions

We'll need to...

- Query OPTICS with all the sequence data [bootstrap enabled]
- Extract Predictions
- Match to closest MSP value species-by-species [will need a list of the unique species names] // Match to MaxId as the foreign key

In [34]:
import os
import time 
import datetime
import warnings
import pandas as pd
from deepBreaks.preprocessing import read_data
from mnm_scripts.mine_n_match_functions import mine_n_match
email = 'sethfrazer@ucsb.edu'
warnings.filterwarnings("ignore")
warnings.simplefilter("ignore")

In [28]:
report_dir = 'mnm_data/mnm_on_mnm_on_all_dbs_2025-02-05_23-04-27' #re-define the report directory if needed
ncbi_q_file = f'./{report_dir}/ncbi_q_merged_w_acc_seq_db.csv'
ncbi = pd.read_csv(ncbi_q_file)
ncbi.head()

Unnamed: 0,Accession,Phylum,Subphylum,Class,Genus,Species,Full_Species,DNA,Protein,Gene_Description,Species_Synonym_Used,Prot_Len
0,XM_024085765.2,Arthropoda,Hexapoda,Insecta,Bicyclus,anynana,Bicyclus anynana,ACACAATTCATTTAACGTGTTCAGAATAGATGCACCACCGCATTAG...,MATQSCFWSDIFDEQSRANLNTGFVVTGAVSTLLGSWLNLALLLTN...,PREDICTED: Bicyclus anynana rhodopsin (LOC1120...,,313.0
1,XM_024089198.2,Arthropoda,Hexapoda,Insecta,Bicyclus,anynana,Bicyclus anynana,GTTCGGGATGACAGATATACGCTCGAGAATAACATTTTGCTACGGG...,MSPATKIIVIIFLIDIKCKNVDTLPEVKAELWSFKNNLKKSVVNHF...,"PREDICTED: Bicyclus anynana opsin, ultraviolet...",,572.0
2,AY918895.2,Arthropoda,Hexapoda,Insecta,Bicyclus,anynana,Bicyclus anynana,TTTGGTATTAACCATCTCGGGGCGACCTATCCACCTGTAACCCTTT...,MAITSMDPGPGIAALQAWGGHAAAYGSNETVVDKVLPDMLHLVDPH...,Bicyclus anynana long wavelength sensitive ops...,,380.0
3,XM_061286956.1,Chordata,Craniata,Actinopteri,Syngnathus,typhle,Syngnathus typhle,ACCGCACGCGTGGATTACATTGCAGCGTGGAGGAAATGGGGAAAAA...,MDARMFSGQVALNLSGSNLSDDPELAEPSVEVEARLSPTGFVVLSV...,PREDICTED: Syngnathus typhle teleost multiple ...,,381.0
4,XM_061286026.1,Chordata,Craniata,Actinopteri,Syngnathus,typhle,Syngnathus typhle,GGACGCGCGCTCCAGCCGCACATCGAAGCCACCACAATGACCGAAA...,MNPYNETRAEHNLFAFGTYKLLAVTIGTIGVFGFCNNVLVILLYCK...,"PREDICTED: Syngnathus typhle opsin 3 (opn3), mRNA",,381.0


In [29]:
ncbi.shape

(4171, 12)

In [30]:
# Example use for OPTICS
# python optics_predictions.py -in msp_mined_seqs.fasta -rd mined_msp_seqs -out mined_seq_predictions.tsv -m wildtype -e aa_prop -b True -ir msp_mined_seq_blastp_report.tsv -r bovine -s False -bsv msp_bs_viz.pdf
pred_dir = 'e:\safra\Documents\GitHub\optics\prediction_outputs\optics_on_mnm_all_dbs_2025-02-06_23-25-42'
optics_pred_file = f'{pred_dir}/mnm_all_dbs_predictions.tsv'
optics = pd.read_csv(optics_pred_file, sep='\t')

optics.head()

Unnamed: 0,Names,Single_Prediction,Prediction_Means,Prediction_Medians,Prediction_Lower_Bounds,Prediction_Upper_Bounds,Std_Deviation,%Identity_Nearest_VPOD_Sequence,Sequence_Length,Lmax_Hex_Color
0,XM_024085765.2,436.6,439.5,440.9,396.4,477.4,22.8,29.814,313,#0600fe
1,XM_024089198.2,388.9,425.6,427.3,369.3,475.4,28.3,47.619,572,#4600dc
2,AY918895.2,549.2,538.2,538.1,518.0,554.4,10.1,81.746,380,#7bff00
3,XM_061286956.1,483.8,477.8,477.2,434.9,522.0,24.6,40.161,381,#00ccff
4,XM_061286026.1,461.7,489.1,488.0,449.0,526.6,20.7,39.245,381,#00fbff


In [31]:
optics.shape

(4171, 10)

In [32]:
source_file = './data_sources/lmax/clean_vpod_comp_acc_dbs_2025-01-24_18-44-29_2.tsv'
comp_db = pd.read_csv(source_file,delimiter='\t',index_col=0)
comp_db.head()

Unnamed: 0_level_0,Full_Species,Accession,maxid,longcore_id,murwes_id,caves_id,porter2005_id,porter2006_id,LambdaMax,Column1
comp_db_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,Drosophila melanogaster,Z86118,,,,,,42.0,508.0,
1,Loligo subulata,Z49108,,,,,,2.0,499.0,
2,Schistocerca gregaria,X80072,,,,,,46.0,430.0,
3,Schistocerca gregaria,X80071,,,,,,39.0,520.0,
4,Sphodromantis sp.,X71665,,,,,,40.0,515.0,


In [35]:
final_err_filtered_df = mine_n_match(email, report_dir, source_file, ncbi_q_file, optics_pred_file, out='vpod_acc_dbs', err_filter = 15)

There were 0 unmatched species
Existing Taxon Dictionary Found! One Moment While We Update It...



In [36]:
final_err_filtered_df.head()

Unnamed: 0_level_0,Accession,Phylum,Subphylum,Class,Genus,Species,Full_Species,%Identity_Nearest_VPOD_Sequence,prediction_value,LambdaMax,abs_diff,comp_db_id,Protein,Gene_Description,Notes
mnm_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,OU342715.1-13552739-13554338,Chordata,Craniata,Actinopteri,Taurulus,bubalis,Taurulus bubalis,87.755,510.7,507.0,3.7,1605,ANGTEGKNFYIPMSNRTGIVRSPFEYQQYYLADPIMFKLLAFYMFF...,rh2,
1,STO|Pscil|UVS|contig03244,Arthropoda,Crustacea,Malacostraca,Pseudosquilla,ciliata,Pseudosquilla ciliata,55.172,409.6,403.17,6.43,646,APLPKLDALVALPPALLANLTLRIGRGIRHCSGPKGSVFASRYEVK...,SWS/UVS,
2,DEC|Nenor|LWS|TRINITY_DN39887_c5_g3_i2,Arthropoda,Crustacea,Malacostraca,Nephrops,norvegicus,Nephrops norvegicus,70.642,510.8,515.0,4.2,891,ASNNLPSTNPYGNYTVVDTVPKEILHMVDPHWYQFPPMNPLWYGLV...,LWS,
3,CM010448.1-19773550-19776969,Chordata,Craniata,Actinopteri,Carassius,auratus,Carassius auratus,34.328,444.3,452.0,7.7,835,DIHVLNVTVYRVSNGGETAIGVYLVILGLSWIGNGVVILLLTKQRK...,opn6,
4,DEC|Sydeb|MWS|KP234272.1,Arthropoda,Crustacea,Malacostraca,Systellaspis,debilis,Systellaspis debilis,40.659,483.8,493.0,9.2,765,DKMAGTQNASYAMSFTRGGSLISYGYPEGVTLMDLLPEDVKPLIHS...,MWS2,


In [37]:
final_err_filtered_df.shape

(498, 15)