# <font color=#c51b8a>VPOD 'Mine-n-Match':</font>
## <font color=#c994c7>Part 1 Objective</font> - Use Species Names from Microspectrophotemetry Data Sheet to Query NCBI for All Related Opsin Sequences  

In [37]:
import os
import re
import datetime
import time
import pandas as pd
import numpy as np
from deepBreaks.preprocessing import read_data
from Bio import Entrez, SeqIO
email = 'sethfrazer@ucsb.edu'

In [38]:
from mnm_scripts.mine_n_match_functions import ncbi_fetch_opsins, merge_accessory_dbs, fasta_to_dataframe

## <font color=#c51b8a>Load data-tables with all of the species and Lambda Max data from accessory lmax databases</font> 

### <font color=#c994c7>VPOD Single Cell Microspectrophotmetry (SCP) Datatable </font>
### In this case our dataframe does not have full species name in one column so we must create a list by directly combining the genus and species names. Then filter to create a list of all unique species names 


In [39]:
report_dir = './data_sources/lmax'
species_list = []
df_list = []

In [40]:
scp_file = f'{report_dir}/vpod/scp_raw.tsv'
scp_df = pd.read_csv(scp_file, index_col=0, sep='\t')
scp_df['Full_Species'] = (scp_df['Genus'] + ' ' + scp_df['Species']).to_list()
df_list.append(scp_df)
scp_df.head()

Unnamed: 0_level_0,Genus,Species,phylum,CellType,CellSubType,LambdaMax,error,Chromophore,Stage,useable,expid,opsinid,refid,Notes,Unnamed: 15,isGenusInHetero,isSpeciesInHetero,isInHetero,isVert,Full_Species
maxid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,Acipenser,transmontanus,Chordata,rod,,540.0,,,adult,1.0,,463,30,partial cds but is rod opsin,,,,0,1.0,Acipenser transmontanus
2,Acipenser,transmontanus,Chordata,cone,,464.0,,,adult,0.0,,0,30,,,,,0,,Acipenser transmontanus
3,Acipenser,transmontanus,Chordata,cone,,531.0,,,adult,0.0,,0,30,,,,,0,,Acipenser transmontanus
4,Acipenser,transmontanus,Chordata,cone,,605.0,,,adult,0.0,,0,30,,,,,0,,Acipenser transmontanus
5,Acipenser,medirostris,Chordata,rod,,540.0,,,adult,0.0,,0,43,,,,,0,,Acipenser medirostris


In [41]:
species_list+=scp_df['Full_Species'].to_list()

### <font color=#c994c7>Longcore - 'Animal Photopigments' Datatable</font>

### In this next case our dataframe has the full species name in one column so we can create a list directly and filter to create a list of all unique species names


In [42]:
longcore_file = f'{report_dir}/longcore_data/AnimalPhotopigmentsV1_1.csv'
longcore_df = pd.read_csv(longcore_file, index_col=0)
df_list.append(longcore_df)
longcore_df.head()

Unnamed: 0_level_0,Phylum,Class,Order,Family,Full_Species,Type,LambdaMax,Band,Oil,Nocturnal Activity,Diurnal Activity,Reference
longcore_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,Arthropoda,Arachnida,Aranae,Araneidae,Argiope amoena,,360,,,Y,Y,Yamashita & Tateda 1978
1,Arthropoda,Arachnida,Aranae,Araneidae,Argiope amoena,,490,,,Y,Y,Yamashita & Tateda 1978
2,Arthropoda,Arachnida,Aranae,Araneidae,Argiope amoena,,540,,,Y,Y,Yamashita & Tateda 1978
3,Arthropoda,Arachnida,Aranae,Araneidae,Argiope bruennichi,,360,,,Y,Y,Yamashita & Tateda 1978
4,Arthropoda,Arachnida,Aranae,Araneidae,Argiope bruennichi,,490,,,Y,Y,Yamashita & Tateda 1978


In [43]:
species_list+=longcore_df['Full_Species'].to_list()

### <font color=#c994c7>Murphy and Westerman Datatable</font>

In [44]:
murphy_westerman_file = f'{report_dir}/murphy_westerman/Murphy and Westerman.csv'
murphy_westerman_df = pd.read_csv(murphy_westerman_file, index_col=0)
df_list.append(murphy_westerman_df)
murphy_westerman_df.head()

Unnamed: 0_level_0,Full_Species,LambdaMax,SD,N,refid,Lineage
murwes_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Aglais urticae,360.0,–,†,[1],Arthropod
1,Agraulis vanillae,370.0,–,†,[2],Arthropod
2,Alima pacifica,467.0,–,5,[4],Arthropod
3,Antheraea polyphemus,355.0,–,20,[5],Arthropod
4,Apodemia mormo,556.0,–,†,[3],Arthropod


In [45]:
species_list+=murphy_westerman_df['Full_Species'].to_list()

### <font color=#c994c7>Caves 'Fish' Datatable</font>

In [46]:
caves_fish_db_file = f'{report_dir}/caves/caves_fish_db.csv'
caves_df = pd.read_csv(caves_fish_db_file, index_col=0)
caves_df['Full_Species'] = (caves_df['Genus'] + ' ' + caves_df['Species']).to_list()
df_list.append(caves_df)
caves_df.head()

Unnamed: 0_level_0,Family,Genus,Species,LambdaMax,refid,Full_Species
caves_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Acipenseridae,Acipenser,baerii,549,[22],Acipenser baerii
1,Acipenseridae,Acipenser,medirostrus,540,[60],Acipenser medirostrus
2,Acipenseridae,Acipenser,ruthenus,545,[21],Acipenser ruthenus
3,Acipenseridae,Acipenser,transmontanus,540,[39] ; [58],Acipenser transmontanus
4,Acipenseridae,Scaphirhynchus,albus,538,[60],Scaphirhynchus albus


In [47]:
species_list+=caves_df['Full_Species'].to_list()

### <font color=#c994c7>Morgan Porter's Datatables</font>

In [48]:
porter_file1 = f'{report_dir}/megan_porter/megan_porter_extract_clean_2005.csv'
porter_df1 = pd.read_csv(porter_file1, index_col=0)
porter_df2 = pd.read_csv(f'{report_dir}/megan_porter/porter_2006_table1_clean.tsv', sep='\t', index_col=0)
df_list.append(porter_df1)
df_list.append(porter_df2)
porter_df1.head()

Unnamed: 0_level_0,Full_Species,Habitat,method,LambdaMax,Reference,Accession
porter2005_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Artemia salina,,EON,410,(Hertel 1972),
1,Daphnia magna,,VC,"348, 434, 525, 608",(Smith and Macagno 1990),
2,Acartia tonsa,,BP,450-520,(Stearns and Forward 1984),
3,Balanus amphitrite,,ERG,532,(Hillman et al. 1973),
4,Balanus amphitrite,,MSP,532,(Minke and Kirschfield 1978),


In [49]:
porter_df2.head()

Unnamed: 0_level_0,Full_Species,Accession,LambdaMax,Reference
porter2006_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Loligo forbesii,X56788,494,Morris et al. (1993)
1,Loligo pealii,AY450853,493,"Brown PK and Brown PS (1958), Hubbard and St G..."
2,Loligo subulata,Z49108,499,Morris et al. (1993)
3,Sepia officinalis,AF000947,492,Brown PK and Brown PS (1958)
4,Todarodes pacificus,X70498,480,Naito et al. (1981)


In [50]:
species_list += (porter_df1['Full_Species'].to_list() + porter_df2['Full_Species'].to_list())

### <font color=#c994c7>Merge Accessory Lambda Max Databases</font>

In [None]:
# Call the function to merge all the species, lambdamax, and potential accession information into one dataframe
merged_df = merge_accessory_dbs(df_list, report_dir)
merged_df.head()

In [None]:
merged_df.shape

### <font color=#c994c7>All unique species names have been extracted from accessory databases. Now we iteratively query NCBI for opsins from each species.</font>

In [51]:
len(species_list)

4098

In [52]:
species_list = list(set(species_list))
len(species_list)

1289

## <font color=#c51b8a>Run NCBI Query Mining Process</font>

In [None]:
ncbi_query_df, query_report_dir = ncbi_fetch_opsins(email=email, job_label='mnm_on_all_dbs', out='mnm_on_all_dbs', species_list=species_list[0:5])

## <font color=#c51b8a>Load Accessory Opsin Sequence Databases</font> 

### <font color=#c994c7>Load Previous MnM Data</font>

In [125]:
#ncbi_query_file = f'{query_report_dir}/mnm_on_all_dbs_ncbi_q_data_cleaned.csv'
#ncbi_query_file = f'mnm_data/mnm_on_mnm_on_all_dbs_2024-12-10_23-31-03/mnm_on_all_dbs_ncbi_q_data_cleaned.csv'
#ncbi_query_df = pd.read_csv(ncbi_query_file)
ncbi_query_df.drop_duplicates(subset=['Full_Species', 'Protein'],  keep='first', inplace=True)
ncbi_sp_list = ncbi_query_df['Full_Species'].to_list()
ncbi_prot_list = ncbi_query_df['Protein'].to_list()
ncbi_query_df.head()

Unnamed: 0,Accession,Genus,Species,Full_Species,Protein,Gene_Description
0,MN519158.1,Carcharhinus,melanopterus,Carcharhinus melanopterus,MNGTEGENFYVPFSNKTGVVRSPFEYPQYYLAEPWQFSIIAAYVFL...,Carcharhinus melanopterus isolate U16228 green...
1,MN519147.1,Carcharhinus,melanopterus,Carcharhinus melanopterus,MNGTEGENFYVPMSNKTGVVRSPFEYSQHYLAEPWMFSVLTAYMFF...,Carcharhinus melanopterus isolate CL4386 rhodo...
2,XM_022258443.2,Pieris,rapae,Pieris rapae,MMQITKIILILIPIIVPCDNSVTNELDIKCVKKSVLTKVYCTNLVR...,"PREDICTED: Pieris rapae opsin, ultraviolet-sen..."
3,AB208675.1,Pieris,rapae,Pieris rapae,MFDTVNATADGGAIAYAFKMVSSEVQENMLGFNIPPEHQDLVHEHW...,"Pieris rapae PrB mRNA for opsin, complete cds"
4,AB208674.1,Pieris,rapae,Pieris rapae,MELNYTAGDPIAFPFKMVSGEVQQHMLGWNIPAEHQGLVHEHWRQF...,"Pieris rapae PrV mRNA for opsin, complete cds"


In [15]:
mnm_df.shape

(2248, 6)

In [103]:
report_dir2 = './data_sources/seqs'
sequence_list = []
acc_list = []
db_sp_list = []
prot_descriptions = []
source_list = []

### <font color=#c994c7>Bilaterian Animal Opsins: Ramirez et al 2016</font>

In [54]:
ramirez_file = f'{report_dir2}/bilaterian_animal_ops/Supplemental_Table_T1_cleaned.csv'
ramirez_df = pd.read_csv(ramirez_file)
ramirez_df.head()

Unnamed: 0,gene_ID,Organism,Protein names,aligned_seqs
0,KX714605,Acanthopleura granulata,xenopsin,----------------------------------------------...
1,KX714606,Acanthopleura granulata,canonical_r-opsin,---------YII-------------------------------GVY...
2,1_Acroporadigitifera,Acropora digitifera,58percent_match_PREDICTED visual pigment-like ...,---------Y------------------------------------...
3,XP_015763203,Acropora digitifera,na,---------IVY-------------------------------VVI...
4,XP_015773304,Acropora digitifera,PREDICTED: melanopsin-like,---------IAY-------------------------------GVV...


In [55]:
ramirez_df.shape

(768, 4)

In [56]:
ramirez_df_filtered = ramirez_df[ramirez_df['Organism'].isin(species_list)]
ramirez_df_filtered['raw_seqs'] = ramirez_df_filtered['aligned_seqs'].str.replace('-', '')
prot_len_list = []
for prot in ramirez_df_filtered['raw_seqs']:
    prot_len = len(prot)
    prot_len_list.append(prot_len)
ramirez_df_filtered['prot_len'] = prot_len_list
ramirez_df_filtered = ramirez_df_filtered[ramirez_df_filtered['prot_len']>320].reset_index(drop=True)
ramirez_df_filtered.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ramirez_df_filtered['raw_seqs'] = ramirez_df_filtered['aligned_seqs'].str.replace('-', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ramirez_df_filtered['prot_len'] = prot_len_list


Unnamed: 0,gene_ID,Organism,Protein names,aligned_seqs,raw_seqs,prot_len
0,U3IU26,Anas platyrhynchos,Uncharacterized protein,TQTEH----NIV-------------------------------AAY...,TQTEHNIVAAYLITAGVISIFSNIVVLGIFVKYKELRTATNAIIIN...,340
1,A0A0R4IEG0,Danio rerio,Novopsin-10,SPSAD----LSI-------------------------------AVF...,SPSADLSIAVFLIITGVVSVFGNGLVLLVYGQRRKKLRAHELMTIN...,378
2,Q80XL3,Mus musculus,Rrh protein,SRTEH----SVI-------------------------------AAY...,SRTEHSVIAAYLIVAGITSILSNVVVLGIFIKYKELRTPTNAVIIN...,331
3,M3YAY1,Mustela putorius furo,Uncharacterized protein,SQTEH----NIV-------------------------------AAY...,SQTEHNIVAAYLITAGIISIFSNLIVLGIFIKYKELRTPTNAIIIN...,328
4,G1PJY3,Myotis lucifugus,Uncharacterized protein,SQTEH----NIV-------------------------------ATY...,SQTEHNIVATYLIMAGMISLLSNIIVLGIFITYKELRTPTNAIIIN...,331


In [57]:
ramirez_df_filtered.shape

(6, 6)

In [58]:
len(set(ramirez_df_filtered['Organism'].to_list()))

6

In [108]:
ramirez_df_filtered2 = ramirez_df_filtered[~ramirez_df_filtered['raw_seqs'].isin(ncbi_prot_list)]
ramirez_df_filtered2['source'] = 'Ramirez et al 2016'
ramirez_df_filtered2.shape

(6, 7)

In [104]:
ramirez_df_filtered2.to_csv(f'{report_dir2}/bilaterian_animal_ops/Supplemental_Table_T1_filtered.csv')
len(set(ramirez_df_filtered2['raw_seqs'].to_list()))

6

In [105]:
sequence_list += ramirez_df_filtered2['raw_seqs'].to_list()
acc_list += ramirez_df_filtered2['gene_ID'].to_list()
db_sp_list += ramirez_df_filtered2['Organism'].to_list()
prot_descriptions += ramirez_df_filtered2['Protein names'].to_list()
source_list += ramirez_df_filtered2['source'].to_list()

### <font color=#c994c7>Crustacean Conundrums: Palecanda et al 2022</font>

In [62]:
palecanda_file = f'{report_dir2}/crustacean_conundrums/rstb20210289_si_003.csv'
palecanda_df = pd.read_csv(palecanda_file)
palecanda_df.head()

Unnamed: 0,Class,Order,Family,Genus species,Opsin Type,NCBI Accession #,Transcript ID,Untrimmed Nucleotide Sequence,Protein Sequence
0,Malacostraca,Amphipoda,Corophiidae,Grandidierella japonica,LWS,BK059768,AMP|GrJap|LWS|IACS01070746.1,ACTGAGAAATATCTACGGAAAAATGTCTTGGAATAGCCCAATGAAC...,LPEGMASSNPFGNYTVVDAAPKEILHMVDPHWYQFPPMNPLWYGLL...
1,Malacostraca,Amphipoda,Corophiidae,Grandidierella japonica,MWS1,BK059769,AMP|GrJap|MWS|IACS01055269.1,GGAGTCACCCAGGTTGACCTGGTTCCGGATTACATGAAGGACATGA...,GVTQVDLVPDYMKDMIHPHWSNYPPVNPMWHHLLGLVYIIIGSCAT...
2,Malacostraca,Amphipoda,Corophiidae,Grandidierella japonica,NEUR,BK059770,AMP|GrJap|NEUR|IACS01028177.1,AGTGTACGGCTACGTATGCTTCGCTCTGGGTATCGTCAATATTCTC...,VYGYVCFALGIVNILSIYALAVVRYLKTCHYRNIGSKIDRKEVSIV...
3,Malacostraca,Amphipoda,Gammaridae,Echinogammarus berilloni,LWS,,AMP|EcBer|LWS|GHCT01076719.1,CGTGGCTTCAACGCGCTGTGTAGACCAACAGGACCTCCGCCTCACT...,LPEGVVSSNPFGNYTVVDAAPKELLPLIDSHWYQYPPLNPMWYGLL...
4,Malacostraca,Amphipoda,Gammaridae,Echinogammarus berilloni,LWS,,AMP|EcBer|LWS|GHCT01076725.1,CTCAATATATAATTTGAGCGTCCAGAAGATTCTGTCTCAGTGAGCC...,APEGAVSTNPFGNYTVVDTAAKDILHMISPHWYQFPPLNPMWYGLL...


In [63]:
palecanda_df.shape

(628, 9)

In [69]:
palecanda_df_filtered = palecanda_df[palecanda_df['Genus species'].isin(species_list)].reset_index(drop=True)
#palecanda_df_filtered['raw_seqs'] = palecanda_df_filtered['aligned_seqs'].str.replace('-', '')
prot_len_list = []
for prot in palecanda_df_filtered['Protein Sequence']:
    prot_len = len(prot)
    prot_len_list.append(prot_len)
palecanda_df_filtered['prot_len'] = prot_len_list
palecanda_df_filtered = palecanda_df_filtered[palecanda_df_filtered['prot_len']>320].reset_index(drop=True)

palecanda_df_filtered.head()

Unnamed: 0,Class,Order,Family,Genus species,Opsin Type,NCBI Accession #,Transcript ID,Untrimmed Nucleotide Sequence,Protein Sequence,prot_len
0,Malacostraca,Amphipoda,Talitridae,Talitrus saltator,LWS,,AMP|TeSal|LWS|GDUJ01056749.1,TCACGTGGCCTAGCGCGCTGTGTAGACCGAATCCAACTCCTTGTCT...,HPEGMVSSNPFGNFTVVDAAPKELLHMVDPHWYQYPPLNPLWYGLL...,336
1,Malacostraca,Amphipoda,Talitridae,Talitrus saltator,LWS,,AMP|TeSal|LWS|GDUJ01056750.1,CTCACAACGAATCCAACTCATCAGTAATAATGAGGTGGAGGCGCGT...,LPDSGYISTNPFGNFTVVDSAPKEILHMIDPHWYQFPPLNPMWYGL...,337
2,Malacostraca,Amphipoda,Talitridae,Talitrus saltator,MWS1,,AMP|TeSal|MWS|GDUJ01058424.1,TGGAGGGCTTCTTGGGGCCTCCCAGCTGCATTCATGTCCCTAACAC...,SYGSPSSSALTGANQFAFGYPPGVSVVDIVPPHMKDLIHPHWSNFP...,373
3,Malacostraca,Amphipoda,Talitridae,Talitrus saltator,NEUR,,AMP|TeSal|NEUR|GDUJ01037444.1,ATGGGTTAAAAATCAGCGCTGCATTGACGAGAAGTTTGACTCATCT...,VVQTRPIFIISTRFTLHFDTEWLLACMTAGVVALVGNSASVAMFWR...,330
4,Malacostraca,Decapoda,Cambaridae,Procambarus clarkii,LWS,ALJ26467,DEC|Proclar|LWS|ALJ26467,,SWSNQPAMDDYGLPSSNPYGNFTVVDMAPKDILHMIHPHWYQYPPM...,363


In [70]:
palecanda_df_filtered.shape

(84, 10)

In [71]:
len(set(palecanda_df_filtered['Genus species'].to_list()))

15

In [109]:
palecanda_df_filtered2 = palecanda_df_filtered[~palecanda_df_filtered['Protein Sequence'].isin(ncbi_prot_list)]
palecanda_df_filtered2['source'] = 'Palecanda et al 2022'
palecanda_df_filtered2.shape

(84, 11)

In [110]:
palecanda_df_filtered2.to_csv(f'{report_dir2}/crustacean_conundrums/palecanda_2022_filtered_seq_data.csv')
len(set(palecanda_df_filtered2['Protein Sequence'].to_list()))

84

In [111]:
sequence_list += palecanda_df_filtered2['Protein Sequence'].to_list()
acc_list += palecanda_df_filtered2['Transcript ID'].to_list()
db_sp_list += palecanda_df_filtered2['Genus species'].to_list()
prot_descriptions += palecanda_df_filtered2['Opsin Type'].to_list()
source_list += palecanda_df_filtered2['source'].to_list()

### <font color=#c994c7>Ray-Finned Fish Opsin DB: Policarpo et al 2024</font>

In [76]:
ray_finned_fasta_file = f'{report_dir2}/ray_finned_fish/mined_alignments/Complete_opsins.aln.txt'
ray_finned_df = fasta_to_dataframe(ray_finned_fasta_file)

In [77]:
ray_finned_df

Unnamed: 0,species_name,opsin_type,accession,aln_sequence,sequence,seq_length
0,Lepisosteus oculatus,va,CM001408.1-11749388-11761955,MDSFRVSVNG-VV--------------------------YTEAAE-...,MDSFRVSVNGVVYTEAAEILKPGDPFSGPIENIAPWNFKFLAALMF...,386
1,Lepisosteus oculatus,pinopsin,CM001425.1-1263197-1266918,MPILVNSSAA-FP--------------------------L--EKN-...,MPILVNSSAAFPLEKNSTPGPFDGPQWHQAPRSTYLMVAVLMGTVV...,357
2,Lepisosteus oculatus,sws1,CM001411.1-10398717-10404663,MAGTEE--FY-L---------------------------F----E-...,MAGTEEFYLFENISSVGPWDGPQYHIAPKWAFYFQTIFMGLVFFWG...,346
3,Lepisosteus oculatus,rrh,CM001407.1-48235198-48246929,MENFSRIPIT-KV--------------------------LDNETS-...,MENFSRIPITKVLDNETSNLSQAVDTSEHSVFSQTEHNIVAAYLIT...,343
4,Lepisosteus oculatus,opn5,CM001404.1-37427058-37472360,MAQTDNGTSHNVP--------------------------HYL----...,MAQTDNGTSHNVPHYLLSGDPFASKLSKEADFVAAFYICIVGIMST...,352
...,...,...,...,...,...,...
17336,Callorhinchus milii,pinopsin,XP007894735.2,----MDLPTT-SV--------------------------G--IEN-...,MDLPTTSVGIENGTVGAFDGPQWYFAPKSTYMAVATLMGTVVILAS...,352
17337,Carcharodon carcharias,pinopsin,XP041052689.1,----MGFSTG-SI--------------------------T--THN-...,MGFSTGSITTHNITIGAFHGPQWDVAPRSTYMAVAALMGNVVILAT...,352
17338,Callorhinchus milii,rgr,XM007898380.1,MVTSHP----------------------------------------...,MVTSHPGLEGFTDFEVFGLLVEALVGLLLNGLTLLAFYKIKELRTP...,291
17339,Rhincodon typus,rh1,A0A7T8R2L6,MNGTEGENFY-IP--------------------------MSNKTG-...,MNGTEGENFYIPMSNKTGVVRSPFEYPQYYLAEPWKFSLLAAYMFF...,353


In [78]:
ray_finned_df.to_csv(f'{report_dir2}/ray_finned_fish/mined_alignments/extracted_fish_ops.csv')

In [79]:
ray_finned_df_filtered = ray_finned_df[ray_finned_df['species_name'].isin(species_list)].reset_index(drop=True)
ray_finned_df_filtered.shape

(2494, 6)

In [80]:
ray_finned_df_filtered2 = ray_finned_df_filtered[~ray_finned_df_filtered['sequence'].isin(ncbi_prot_list)].reset_index(drop=True)
ray_finned_df_filtered2.shape

(2447, 6)

In [114]:
ray_finned_df_filtered3 = ray_finned_df_filtered2[ray_finned_df_filtered2['seq_length'] > 320].reset_index(drop=True)
ray_finned_df_filtered3['source'] = 'Policarpo et al 2024'
ray_finned_df_filtered3.shape

(2125, 7)

In [115]:
ray_finned_df_filtered3.to_csv(f'{report_dir2}/ray_finned_fish/mined_alignments/extracted_fish_ops_filtered.csv')

In [116]:
sequence_list += ray_finned_df_filtered3['sequence'].to_list()
acc_list += ray_finned_df_filtered3['accession'].to_list()
db_sp_list += ray_finned_df_filtered3['species_name'].to_list()
prot_descriptions += ray_finned_df_filtered3['opsin_type'].to_list()
source_list += ray_finned_df_filtered3['source'].to_list()

In [146]:
gn_list = []
sp_list = []
for sp in db_sp_list:
#    print(sp)
    gn_list.append(sp.split(' ', 1)[0])
    sp_list.append(sp.split(' ', 1)[1])

### <font color=#c994c7>MATEdb2: Martínez-Redondo et. al. 2024</font>
For now I have no pipeline set up to extract sequences from the assembled transcriptomes but this may be coming in the future

In [None]:
matedb2_links_file = f'{report_dir2}/matedb2/Table_S1_cleaned.csv'
matedb2_links_df = pd.read_csv(matedb2_links_file)
matedb2_links_df.head()

In [None]:
matedb2_links_df.shape

In [None]:
matedb2_links_filtered = matedb2_links_df[matedb2_links_df['SCIENTIFIC NAME'].isin(species_list)].reset_index(drop=True)
matedb2_links_filtered.head()

In [None]:
matedb2_links_filtered.shape

In [None]:
matedb2_links_filtered2 = matedb2_links_filtered[~matedb2_links_filtered['SCIENTIFIC NAME'].isin(ncbi_sp_list)]
matedb2_links_filtered2.shape

### <font color=#c994c7>Create Merged Dataframe From All Accessory Opsin Sequence DBs</font>

In [147]:
#make a merged df of all the accessory seq dbs, filter out reedundant datapoints, then append to the end of the NCBI query sheet?
data = {'Accession': acc_list, 'Genus': gn_list, 'Species': sp_list, 'Full_Species': db_sp_list, 'Gene_Description': prot_descriptions, 'Protein' : sequence_list, 'source' : source_list} 
acc_seq_db_df = pd.DataFrame(data)
acc_seq_db_df.head()

Unnamed: 0,Accession,Genus,Species,Full_Species,Gene_Description,Protein,source
0,U3IU26,Anas,platyrhynchos,Anas platyrhynchos,Uncharacterized protein,TQTEHNIVAAYLITAGVISIFSNIVVLGIFVKYKELRTATNAIIIN...,Ramirez et al 2016
1,A0A0R4IEG0,Danio,rerio,Danio rerio,Novopsin-10,SPSADLSIAVFLIITGVVSVFGNGLVLLVYGQRRKKLRAHELMTIN...,Ramirez et al 2016
2,Q80XL3,Mus,musculus,Mus musculus,Rrh protein,SRTEHSVIAAYLIVAGITSILSNVVVLGIFIKYKELRTPTNAVIIN...,Ramirez et al 2016
3,M3YAY1,Mustela,putorius furo,Mustela putorius furo,Uncharacterized protein,SQTEHNIVAAYLITAGIISIFSNLIVLGIFIKYKELRTPTNAIIIN...,Ramirez et al 2016
4,G1PJY3,Myotis,lucifugus,Myotis lucifugus,Uncharacterized protein,SQTEHNIVATYLIMAGMISLLSNIIVLGIFITYKELRTPTNAIIIN...,Ramirez et al 2016


In [129]:
acc_seq_db_df.shape

(2215, 5)

In [148]:
acc_seq_db_df_filtered = acc_seq_db_df.copy()
acc_seq_db_df_filtered.drop_duplicates(subset=['Full_Species', 'Protein'],  keep='first', inplace=True)
acc_seq_db_df_filtered.shape

(2204, 7)

In [132]:
len(set(acc_seq_db_df_filtered['Full_Species'].to_list()))

87

In [149]:
acc_seq_db_df_filtered.to_csv(f'{report_dir2}/vpod_comp_accessory_seq_dbs.csv')

In [144]:
acc_seq_db_df_filtered2 = acc_seq_db_df_filtered[~acc_seq_db_df_filtered['Protein'].isin(ncbi_prot_list)]
acc_seq_db_df_filtered2.shape

(2204, 7)

### <font color=#c994c7>Merge the Formated Accessory Sequence DBs w/the Mined NCBI Data</font>

In [150]:
ncbi_query_df.head()

Unnamed: 0,Accession,Genus,Species,Full_Species,Protein,Gene_Description
0,MN519158.1,Carcharhinus,melanopterus,Carcharhinus melanopterus,MNGTEGENFYVPFSNKTGVVRSPFEYPQYYLAEPWQFSIIAAYVFL...,Carcharhinus melanopterus isolate U16228 green...
1,MN519147.1,Carcharhinus,melanopterus,Carcharhinus melanopterus,MNGTEGENFYVPMSNKTGVVRSPFEYSQHYLAEPWMFSVLTAYMFF...,Carcharhinus melanopterus isolate CL4386 rhodo...
2,XM_022258443.2,Pieris,rapae,Pieris rapae,MMQITKIILILIPIIVPCDNSVTNELDIKCVKKSVLTKVYCTNLVR...,"PREDICTED: Pieris rapae opsin, ultraviolet-sen..."
3,AB208675.1,Pieris,rapae,Pieris rapae,MFDTVNATADGGAIAYAFKMVSSEVQENMLGFNIPPEHQDLVHEHW...,"Pieris rapae PrB mRNA for opsin, complete cds"
4,AB208674.1,Pieris,rapae,Pieris rapae,MELNYTAGDPIAFPFKMVSGEVQQHMLGWNIPAEHQGLVHEHWRQF...,"Pieris rapae PrV mRNA for opsin, complete cds"


In [151]:
acc_seq_db_df_filtered.drop(columns='source', inplace =True)
acc_seq_db_df_filtered.head()

Unnamed: 0,Accession,Genus,Species,Full_Species,Gene_Description,Protein
0,U3IU26,Anas,platyrhynchos,Anas platyrhynchos,Uncharacterized protein,TQTEHNIVAAYLITAGVISIFSNIVVLGIFVKYKELRTATNAIIIN...
1,A0A0R4IEG0,Danio,rerio,Danio rerio,Novopsin-10,SPSADLSIAVFLIITGVVSVFGNGLVLLVYGQRRKKLRAHELMTIN...
2,Q80XL3,Mus,musculus,Mus musculus,Rrh protein,SRTEHSVIAAYLIVAGITSILSNVVVLGIFIKYKELRTPTNAVIIN...
3,M3YAY1,Mustela,putorius furo,Mustela putorius furo,Uncharacterized protein,SQTEHNIVAAYLITAGIISIFSNLIVLGIFIKYKELRTPTNAIIIN...
4,G1PJY3,Myotis,lucifugus,Myotis lucifugus,Uncharacterized protein,SQTEHNIVATYLIMAGMISLLSNIIVLGIFITYKELRTPTNAIIIN...


In [None]:
final_query_df = pd.concat([ncbi_query_df, acc_seq_db_df_filtered]).reset_index(drop = True)
final_query_df.to_csv(f'{query_report_dir}/ncbi_q_merged_w_acc_seq_db.csv')
#final_query_df.to_csv('mnm_data/mnm_on_mnm_on_all_dbs_2024-12-10_23-31-03/ncbi_q_merged_w_acc_seq_db.csv')
final_query_df.head()

Unnamed: 0,Accession,Genus,Species,Full_Species,Protein,Gene_Description
0,MN519158.1,Carcharhinus,melanopterus,Carcharhinus melanopterus,MNGTEGENFYVPFSNKTGVVRSPFEYPQYYLAEPWQFSIIAAYVFL...,Carcharhinus melanopterus isolate U16228 green...
1,MN519147.1,Carcharhinus,melanopterus,Carcharhinus melanopterus,MNGTEGENFYVPMSNKTGVVRSPFEYSQHYLAEPWMFSVLTAYMFF...,Carcharhinus melanopterus isolate CL4386 rhodo...
2,XM_022258443.2,Pieris,rapae,Pieris rapae,MMQITKIILILIPIIVPCDNSVTNELDIKCVKKSVLTKVYCTNLVR...,"PREDICTED: Pieris rapae opsin, ultraviolet-sen..."
3,AB208675.1,Pieris,rapae,Pieris rapae,MFDTVNATADGGAIAYAFKMVSSEVQENMLGFNIPPEHQDLVHEHW...,"Pieris rapae PrB mRNA for opsin, complete cds"
4,AB208674.1,Pieris,rapae,Pieris rapae,MELNYTAGDPIAFPFKMVSGEVQQHMLGWNIPAEHQGLVHEHWRQF...,"Pieris rapae PrV mRNA for opsin, complete cds"


In [155]:
final_query_df.shape

(4452, 6)

In [156]:
fasta_file = f'mnm_data/mnm_on_mnm_on_all_dbs_2024-12-10_23-31-03/mined_and_acc_seqs.fasta'
with open(fasta_file, 'w') as f:
    for id, seq in zip(final_query_df['Accession'], final_query_df['Protein']):
        f.write(f'>{id}\n{seq}\n')

## <font color=#c994c7>Part 2 Objective</font> - Match sequence to it's closest MSP value based on OPTICS predictions

We'll need to...

- Query OPTICS with all the sequence data [bootstrap enabled]
- Extract Predictions
- Match to closest MSP value species-by-species [will need a list of the unique species names] // Match to MaxId as the foreign key

In [1]:
import os
import time 
import datetime
import warnings
import pandas as pd
from deepBreaks.preprocessing import read_data
from mnm_scripts.mine_n_match_functions_old import mine_n_match, post_process_matching, get_prots_from_acc

warnings.filterwarnings("ignore")
warnings.simplefilter("ignore")

In [None]:
report_dir = 'mnm_data/mnm_on_mnm_on_all_dbs_2024-12-10_23-31-03' #re-define the report directory if needed
ncbi_q_file = f'./{report_dir}/mnm_on_all_dbs_ncbi_q_data_cleaned.csv'
ncbi = pd.read_csv(ncbi_q_file)
ncbi.head()

In [None]:
ncbi.shape

In [None]:
# Example use for OPTICS
# python optics_predictions.py -in msp_mined_seqs.fasta -rd mined_msp_seqs -out mined_seq_predictions.tsv -m wildtype -e aa_prop -b True -ir msp_mined_seq_blastp_report.tsv -r bovine -s False -bsv msp_bs_viz.pdf
pred_dir = 'e:/safra/Documents/GitHub/optics/prediction_outputs/mnm_opsins/optics_on_all_dbs_2024-12-12_11-49-37'
optics_pred_file = f'{pred_dir}/all_dbs_predictions.tsv'
optics = pd.read_csv(optics_pred_file, sep='\t')

optics.head()

In [None]:
optics.shape

In [None]:
source_file = './data_sources/lmax/cleaned_vpod_comp_accessory_dbs_2024-12-17_14-17-46.csv'
comp_db = pd.read_csv(source_file)
comp_db.head()

In [None]:
final_err_filtered_df = mine_n_match(report_dir, source_file, ncbi_q_file, optics_pred_file, out='vpod_acc_dbs', err_filter = 15)

In [None]:
final_err_filtered_df.head()

In [None]:
final_err_filtered_df.shape