# <font color=#c51b8a>VPOD 'Mine-n-Match':</font>
## <font color=#c994c7>Part 1 Objective</font> - Use Species Names from Microspectrophotemetry Data Sheet to Query NCBI for All Related Opsin Sequences  

In [2]:
import os
import re
import datetime
import time
import pandas as pd
from deepBreaks.preprocessing import read_data
from Bio import Entrez, SeqIO

### <font color=#c994c7>Load data-table with all of the species and sequence data</font> 

In [11]:
scp_file = './longcore_data/AnimalPhotopigmentsV1_1.csv'
scp_df = read_data(scp_file, seq_type = None, is_main=False)

In [12]:
scp_df.head()

Unnamed: 0_level_0,Phylum,Class,Order,Family,Full_Species,Type,Wavelength,Band,Oil,Nocturnal Activity,Diurnal Activity,Source
max_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,Arthropoda,Arachnida,Aranae,Araneidae,Argiope amoena,,360,,,Y,Y,Yamashita & Tateda 1978
1,Arthropoda,Arachnida,Aranae,Araneidae,Argiope amoena,,490,,,Y,Y,Yamashita & Tateda 1978
2,Arthropoda,Arachnida,Aranae,Araneidae,Argiope amoena,,540,,,Y,Y,Yamashita & Tateda 1978
3,Arthropoda,Arachnida,Aranae,Araneidae,Argiope bruennichi,,360,,,Y,Y,Yamashita & Tateda 1978
4,Arthropoda,Arachnida,Aranae,Araneidae,Argiope bruennichi,,490,,,Y,Y,Yamashita & Tateda 1978


### <font color=#c994c7>In this case our dataframe has the full species name in one column so we can create a list directly and filter to create a list of all unique species names</font> 

In [14]:
species_list = scp_df['Full_Species'].to_list()
len(species_list)

980

In [15]:
unique_species_list = list(set(species_list)) 
len(unique_species_list)

322

In [18]:
scp_query_list = []
for species in unique_species_list:
    NCBI_seq = ncbi_fetch("sethfrazer@ucsb.edu", 
                    f"{species}[ORGN] AND (opsin[All Fields] AND complete[All Fields] AND cds[All Fields] NOT voucher)")
    scp_query_list.append(NCBI_seq)
#Find the field for taxonomy name
    

{'Count': '0', 'RetMax': '0', 'RetStart': '0', 'IdList': [], 'TranslationSet': [{'From': 'Ptychozoon lionotum[ORGN]', 'To': '"Ptychozoon lionotum"[Organism]'}], 'TranslationStack': [{'Term': '"Ptychozoon lionotum"[Organism]', 'Field': 'Organism', 'Count': '22', 'Explode': 'Y'}, {'Term': 'opsin[All Fields]', 'Field': 'All Fields', 'Count': '45822', 'Explode': 'N'}, {'Term': 'complete[All Fields]', 'Field': 'All Fields', 'Count': '64650156', 'Explode': 'N'}, 'AND', {'Term': 'cds[All Fields]', 'Field': 'All Fields', 'Count': '190106406', 'Explode': 'N'}, 'AND', {'Term': 'voucher[All Fields]', 'Field': 'All Fields', 'Count': '12834040', 'Explode': 'N'}, 'NOT', 'GROUP', 'AND'], 'QueryTranslation': '"Ptychozoon lionotum"[Organism] AND (opsin[All Fields] AND complete[All Fields] AND cds[All Fields] NOT voucher[All Fields])'}
{'Count': '0', 'RetMax': '0', 'RetStart': '0', 'IdList': [], 'TranslationSet': [{'From': 'Bombus morio[ORGN]', 'To': '"Bombus morio"[Organism]'}], 'TranslationStack': [{'

In [41]:
# create empty lists
Accession = []
DNA = []
Genus = []
Species = []
gene_des = []
version = []
Protein = []
full_sp_names = []
# loop through the result list obtained from the NCBI search
# may take over 10 minutes
for query in scp_query_list:
    for seq in query:
        # get genus nd speceis name seperately
        spe_name = seq.annotations["organism"]
        g_s_name = spe_name.split()

        # get and append protein sequence
        if seq.features:
            for feature in seq.features:
                if feature.type == "CDS":
                    if "translation" in feature.qualifiers.keys():
                        pro_seq = feature.qualifiers['translation'][0]
                    

        # attached them to lists
        Accession.append(str(seq.name))
        Genus.append(str(g_s_name[0]))
        Species.append(str(g_s_name[1]))
        full_sp_names.append(str(g_s_name[0]) + ' ' + str(g_s_name[1]))
        gene_des.append(str(seq.description))
        version.append(str(seq.id))
        Protein.append(str(pro_seq))



In [43]:
# create a dataframe for the information
ncbi_q_op = pd.DataFrame(
    {'Accession': version,
     'Genus': Genus,
     'Species': Species,
     'Full_Species': full_sp_names,
     'Protein': Protein,
     'Gene_Description': gene_des
    })

In [44]:
ncbi_q_op.head()

Unnamed: 0,Accession,Genus,Species,full_sp_name,Protein,gene_des
0,NC_083357.1,Meriones,unguiculatus,Meriones_unguiculatus,MCDMGGLDNLIANTAYLQARKSGDVDTKDMQKRRKNINLPKVEECV...,Meriones unguiculatus strain TT.TT164.6M chrom...
1,NC_083359.1,Meriones,unguiculatus,Meriones_unguiculatus,MCDMGGLDNLIANTAYLQARKSGDVDTKDMQKRRKNINLPKVEECV...,Meriones unguiculatus strain TT.TT164.6M chrom...
2,NC_083363.1,Meriones,unguiculatus,Meriones_unguiculatus,MCDMGGLDNLIANTAYLQARKSGDVDTKDMQKRRKNINLPKVEECV...,Meriones unguiculatus strain TT.TT164.6M chrom...
3,NC_083368.1,Meriones,unguiculatus,Meriones_unguiculatus,MCDMGGLDNLIANTAYLQARKSGDVDTKDMQKRRKNINLPKVEECV...,Meriones unguiculatus strain TT.TT164.6M chrom...
4,NC_083369.1,Meriones,unguiculatus,Meriones_unguiculatus,MCDMGGLDNLIANTAYLQARKSGDVDTKDMQKRRKNINLPKVEECV...,Meriones unguiculatus strain TT.TT164.6M chrom...


In [49]:
mnm_job_label = 'longcore_ncbi_data'
dt_label = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
report_dir = f'mnm_on_{mnm_job_label}_{dt_label}'
os.makedirs(report_dir)
ncbi_q_op.to_csv(path_or_buf=f"./{report_dir}/ncbi_longcore_ops_query.csv", index=False)

In [50]:
ncbi_q_op.shape

(441, 6)

## <font color=#c994c7>Part 2 and Part 3 Objective</font> - Clean-up NCBI data and format it to query OPTICS to obtain lmax predictions then match the sequence to it's closest MSP value based on those predictions

We'll need to...

- Format all sequences to a FASTA file from the existing df
- Query OPTICS with all the sequence data [bootstrap enabled]
- Extract Predictions
- Match to closest MSP value species-by-species [will need a list of the unique species names] // Match to MaxId as the foreign key

In [47]:
import os
import time 
import datetime
import warnings
import pandas as pd
from deepBreaks.preprocessing import read_data

warnings.filterwarnings("ignore")
warnings.simplefilter("ignore")

## <font color=#c994c7>Part 2: Clean-up NCBI Data and Format for OPTICS predictions</font>

In [62]:
# Loading mined seq data and cleaning it of all redundant entries (cases where Species and Sequence are redundant - we keep cases where identical sequence is present in different species) 
#report_dir = 'mnm_on_msp_data_16_10_2024' #re-define the report directory if needed
ncbi_query_file = f"./{report_dir}/ncbi_longcore_ops_query.csv"
ncbi_data = read_data(ncbi_query_file, seq_type = None, is_main=False)
#ncbi_data['full_species'] = ncbi_data['Genus'] + '_' + ncbi_data['Species']
ncbi_data_filtered = ncbi_data.copy().drop_duplicates(subset=['Full_Species', 'Protein'])
ncbi_data_filtered.to_csv(path_or_buf=f"./{report_dir}/filtered_ncbi_longcore_ops_query2.csv", index=True)
ncbi_data_filtered.head()

Unnamed: 0_level_0,Genus,Species,Full_Species,Protein,gene_des
Accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NC_083357.1,Meriones,unguiculatus,Meriones_unguiculatus,MCDMGGLDNLIANTAYLQARKSGDVDTKDMQKRRKNINLPKVEECV...,Meriones unguiculatus strain TT.TT164.6M chrom...
MT024769.1,Meriones,unguiculatus,Meriones_unguiculatus,MAQRLTGEQTLDSYEESTHASIFTYTNSNSTRGPFEGPNYHIAPRW...,Meriones unguiculatus middle/long wavelength-s...
LC573540.1,Ampelophaga,rubiginosa,Ampelophaga_rubiginosa,MANQSDDHYYGAHYEALKSAGPVEMLGDGLTGDDLAAIPEHWLSYP...,Ampelophaga rubiginosa Aru1 UV mRNA for ultrav...
LC573530.1,Ampelophaga,rubiginosa,Ampelophaga_rubiginosa,MATNFTQELYEIGPMAYPLKMISKEVAEHMLGWNIPEEHQDLVHEH...,Ampelophaga rubiginosa Aru1 SW mRNA for short ...
LC573520.1,Ampelophaga,rubiginosa,Ampelophaga_rubiginosa,MDPGPGLAALQAWGGQVAAYGASNQTVVDKVPPDMMHMIDPHWYQF...,Ampelophaga rubiginosa Aru1 LW mRNA for long w...


In [63]:
ncbi_data_filtered.shape

(301, 5)

In [53]:
fasta_file = f'{report_dir}/ncbi_mined_longcore_seqs.fasta'
with open(fasta_file, 'w') as f:
    for id, seq in zip(ncbi_data_filtered.index.to_list(), ncbi_data_filtered['Protein']):
        f.write(f'>{id}\n{seq}\n')

## <font color=#c994c7>Part 3: Match sequences to closest MSP entry based on species and OPTICS predictions</font>

In [77]:
#report_dir = 'mnm_on_msp_data_16_10_2024' #re-define the report directory if needed
filtered_ncbi_query_file = f'./{report_dir}/filtered_ncbi_longcore_ops_query.csv'
ncbi_data_filtered = read_data(filtered_ncbi_query_file, seq_type = None, is_main=False)

In [78]:
# Example use for OPTICS
# python optics_predictions.py -in msp_mined_seqs.fasta -rd mined_msp_seqs -out mined_seq_predictions.tsv -m wildtype -e aa_prop -b True -ir msp_mined_seq_blastp_report.tsv -r bovine -s False -bsv msp_bs_viz.pdf
predFileData = f'./{report_dir}/optics_on_mined_longcore_seqs_2024-10-17_21-14-10/longcore_predictions.tsv'
pred_df = read_data(predFileData, seq_type = None, is_main=False)
pred_df.head()

Unnamed: 0_level_0,Single_Prediction,Prediction_Means,Prediction_Medians,Prediction_Lower_Bounds,Prediction_Upper_Bounds,Std_Deviation,%Identity_Nearest_VPOD_Sequence,Sequence_Length,Lmax_Hex_Color
Names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
NC_083357.1,475.4,475.0,475.5,445.0,503.4,15.5,blastp unsuccessful,559,#00c0ff
MT024769.1,507.9,511.8,512.5,494.3,529.5,8.8,93.036,372,#0eff00
LC573540.1,358.1,375.0,368.1,346.5,418.5,21.9,66.092,390,#610061
LC573530.1,432.8,434.0,434.7,415.9,448.2,8.8,84.906,397,#2600f1
LC573520.1,533.0,535.5,533.2,527.2,554.5,7.4,94.695,390,#72ff00


In [79]:
mnm_merged_df = pd.merge(ncbi_data_filtered, pred_df, left_index=True, right_index=True)
mnm_merged_df.head()

Unnamed: 0,Genus,Species,Full_Species,Protein,gene_des,Single_Prediction,Prediction_Means,Prediction_Medians,Prediction_Lower_Bounds,Prediction_Upper_Bounds,Std_Deviation,%Identity_Nearest_VPOD_Sequence,Sequence_Length,Lmax_Hex_Color
NC_083357.1,Meriones,unguiculatus,Meriones unguiculatus,MCDMGGLDNLIANTAYLQARKSGDVDTKDMQKRRKNINLPKVEECV...,Meriones unguiculatus strain TT.TT164.6M chrom...,475.4,475.0,475.5,445.0,503.4,15.5,blastp unsuccessful,559,#00c0ff
MT024769.1,Meriones,unguiculatus,Meriones unguiculatus,MAQRLTGEQTLDSYEESTHASIFTYTNSNSTRGPFEGPNYHIAPRW...,Meriones unguiculatus middle/long wavelength-s...,507.9,511.8,512.5,494.3,529.5,8.8,93.036,372,#0eff00
LC573540.1,Ampelophaga,rubiginosa,Ampelophaga rubiginosa,MANQSDDHYYGAHYEALKSAGPVEMLGDGLTGDDLAAIPEHWLSYP...,Ampelophaga rubiginosa Aru1 UV mRNA for ultrav...,358.1,375.0,368.1,346.5,418.5,21.9,66.092,390,#610061
LC573530.1,Ampelophaga,rubiginosa,Ampelophaga rubiginosa,MATNFTQELYEIGPMAYPLKMISKEVAEHMLGWNIPEEHQDLVHEH...,Ampelophaga rubiginosa Aru1 SW mRNA for short ...,432.8,434.0,434.7,415.9,448.2,8.8,84.906,397,#2600f1
LC573520.1,Ampelophaga,rubiginosa,Ampelophaga rubiginosa,MDPGPGLAALQAWGGQVAAYGASNQTVVDKVPPDMMHMIDPHWYQF...,Ampelophaga rubiginosa Aru1 LW mRNA for long w...,533.0,535.5,533.2,527.2,554.5,7.4,94.695,390,#72ff00


In [83]:
scp_file = f'./{report_dir}/longcore_data/AnimalPhotopigmentsV1_1.csv'
scp_df = read_data(scp_file, seq_type = None, is_main=False)
gn_list = []
sp_list = []
og_sp_list = scp_df['Full_Species'].to_list()
for sp in og_sp_list:
    gn_list.append(sp.split(' ')[0])
    sp_list.append(sp.split(' ')[1])
scp_df['Genus'] = gn_list
scp_df['Species'] = sp_list
scp_df.head()

Unnamed: 0_level_0,Phylum,Class,Order,Family,Full_Species,Type,LambdaMax,Band,Oil,Nocturnal Activity,Diurnal Activity,Source,Genus,Species
max_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,Arthropoda,Arachnida,Aranae,Araneidae,Argiope amoena,,360,,,Y,Y,Yamashita & Tateda 1978,Argiope,amoena
1,Arthropoda,Arachnida,Aranae,Araneidae,Argiope amoena,,490,,,Y,Y,Yamashita & Tateda 1978,Argiope,amoena
2,Arthropoda,Arachnida,Aranae,Araneidae,Argiope amoena,,540,,,Y,Y,Yamashita & Tateda 1978,Argiope,amoena
3,Arthropoda,Arachnida,Aranae,Araneidae,Argiope bruennichi,,360,,,Y,Y,Yamashita & Tateda 1978,Argiope,bruennichi
4,Arthropoda,Arachnida,Aranae,Araneidae,Argiope bruennichi,,490,,,Y,Y,Yamashita & Tateda 1978,Argiope,bruennichi


In [81]:
# Get unique species from predictions
unique_species = list(mnm_merged_df['Full_Species'].unique())
len(unique_species)

72

In [84]:
# Initialize a list to store the matched results
matched_results = []
i = 0
# Iterate through each species
for species in unique_species:
  scp_df_copy = scp_df.copy()
  # Filter predictions and measurements for the current species
  species_predictions = mnm_merged_df[mnm_merged_df['Full_Species'] == species]
  species_measurements = scp_df_copy[scp_df_copy['Full_Species'] == species]
  if species_measurements.shape[0] == 0:
    i+=1
    print(f'\nThis species is missing a match: {species}\n')
  else:
      #print(species)
    # Iterate through each prediction for the current species
    for _, prediction_row in species_predictions.iterrows():
      prediction_value = prediction_row['Prediction_Means']
      try:
        accession = prediction_row['Accession']
      except:
        accession = prediction_row.name

      # Calculate absolute differences between the prediction and all measurements for the species
      species_measurements.loc[:, 'abs_diff'] = (species_measurements['LambdaMax'] - prediction_value).abs()
      # Find the closest measurement (handling ties)
      closest_measurement_row = species_measurements.sort_values('abs_diff').iloc[0]
      min_diff = closest_measurement_row['abs_diff']
      
      existing_match_index = next((i for i, match in enumerate(matched_results) if match['Accession'] == accession), None)

      if (existing_match_index is not None): 
        if (min_diff < matched_results[existing_match_index]['abs_diff']):
          # If it exists, compare the absolute differences and keep the better match
          matched_results[existing_match_index].update({
              'prediction_value': prediction_value,
              'closest_measurement': closest_measurement_row['LambdaMax'],
              'abs_diff': min_diff,
              'max_id': closest_measurement_row.name
          })
      else:
          # If it doesn't exist, add the new match
          matched_results.append({
              'Accession': accession,
              'Full_Species': species,
              'prediction_value': prediction_value,
              'closest_measurement': closest_measurement_row['LambdaMax'],
              'abs_diff': min_diff,
              'max_id': closest_measurement_row.name
          })
        
# Create a new dataframe from the matched results
matched_df = pd.DataFrame(matched_results)

print(f'There were {i} unmatched species')
print(matched_df.head())


This species is missing a match: Ictidomys tridecemlineatus


This species is missing a match: synthetic construct


This species is missing a match: Tharsalea rubidus


This species is missing a match: Gekko gecko


This species is missing a match: Callithrix jacchus


This species is missing a match: Tharsalea nivalis


This species is missing a match: Canis lupus


This species is missing a match: Byasa alcinous


This species is missing a match: Tharsalea heteronea


This species is missing a match: Aquarana catesbeiana

There were 10 unmatched species
     Accession            Full_Species  prediction_value  closest_measurement  \
0  NC_083357.1   Meriones unguiculatus             475.0                  493   
1   MT024769.1   Meriones unguiculatus             511.8                  493   
2   LC573540.1  Ampelophaga rubiginosa             375.0                  460   
3   LC573530.1  Ampelophaga rubiginosa             434.0                  460   
4   LC573520.1  Ampelophaga rub

In [85]:
matched_df.shape

(271, 6)

In [86]:
matched_df.to_csv(path_or_buf=f"./{report_dir}/mnm_unfiltered_match_results.csv", index=True)

In [87]:
len(matched_df['Accession'].unique())

271

In [88]:
len(matched_df['max_id'].unique())

134

In [89]:
#Match %Idenitities to Accessions
iden_list = []
prot_des_list = []
aa_seq_list = []
genus_list = []
species_list = []
for _, d in matched_df.iterrows():
    #print(d)
    acc = d['Accession']
    iden_list.append(mnm_merged_df.loc[acc]['%Identity_Nearest_VPOD_Sequence'])
    prot_des_list.append(mnm_merged_df.loc[acc]['Gene_Description'])
    aa_seq_list.append(mnm_merged_df.loc[acc]['Protein'])
    genus_list.append(mnm_merged_df.loc[acc]['Genus'])
    species_list.append(mnm_merged_df.loc[acc]['Species'])

matched_df['%Identity_Nearest_VPOD_Sequence'] = iden_list
matched_df['Gene_Description'] = prot_des_list
matched_df['Protein'] = aa_seq_list
matched_df['Genus'] = genus_list
matched_df['Species'] = species_list
matched_df = matched_df.reindex(columns=['Accession','Genus','Species','%Identity_Nearest_VPOD_Sequence','prediction_value','closest_measurement','abs_diff','max_id','Protein','Gene_Description','Notes'])

In [90]:
matched_df.head()

Unnamed: 0,Accession,Genus,Species,%Identity_Nearest_VPOD_Sequence,prediction_value,closest_measurement,abs_diff,max_id,Protein,Gene_Description,Notes
0,NC_083357.1,Meriones,unguiculatus,blastp unsuccessful,475.0,493,18.0,643,MCDMGGLDNLIANTAYLQARKSGDVDTKDMQKRRKNINLPKVEECV...,Meriones unguiculatus strain TT.TT164.6M chrom...,
1,MT024769.1,Meriones,unguiculatus,93.036,511.8,493,18.8,643,MAQRLTGEQTLDSYEESTHASIFTYTNSNSTRGPFEGPNYHIAPRW...,Meriones unguiculatus middle/long wavelength-s...,
2,LC573540.1,Ampelophaga,rubiginosa,66.092,375.0,460,85.0,354,MANQSDDHYYGAHYEALKSAGPVEMLGDGLTGDDLAAIPEHWLSYP...,Ampelophaga rubiginosa Aru1 UV mRNA for ultrav...,
3,LC573530.1,Ampelophaga,rubiginosa,84.906,434.0,460,26.0,354,MATNFTQELYEIGPMAYPLKMISKEVAEHMLGWNIPEEHQDLVHEH...,Ampelophaga rubiginosa Aru1 SW mRNA for short ...,
4,LC573520.1,Ampelophaga,rubiginosa,94.695,535.5,540,4.5,355,MDPGPGLAALQAWGGQVAAYGASNQTVVDKVPPDMMHMIDPHWYQF...,Ampelophaga rubiginosa Aru1 LW mRNA for long w...,


Once we have the dataframe of matched seqs - each accession is matched to an id - but some IDs may have several matches... 
At this point we do another 2 step filter

1. Filter by abs. error - keeping the accession with the lowest abs. error
2. If there is a tie - then filter by percent identitiy to sequences in VPOD 

In [91]:
import pandas as pd

# Group by 'max_id' and count unique accessions
grouped_counts = matched_df.groupby('max_id')['Accession'].nunique()

# Filter groups with more than one unique accession
duplicate_max_id_groups = grouped_counts[grouped_counts > 1]

if not duplicate_max_id_groups.empty:
    filtered_results = []

    # Iterate through each max_id with duplicates
    for max_id in duplicate_max_id_groups.index:
        # Filter rows with the current max_id
        duplicates = matched_df[matched_df['max_id'] == max_id]

        # Sort by abs_diff (ascending) and then percent_identity (descending)
        duplicates = duplicates.sort_values(['abs_diff', 'Accession'], ascending=[True, False])
        duplicates = duplicates.sort_values(['abs_diff', '%Identity_Nearest_VPOD_Sequence'], ascending=[True, False])

        # Keep only the first row (lowest abs_diff, highest percent_identity if tied)
        filtered_results.append(duplicates.iloc[0])

    # Combine filtered results with non-duplicate rows
    non_duplicates = matched_df[~matched_df['max_id'].isin(duplicate_max_id_groups.index)]
    final_filtered_df = pd.concat([pd.DataFrame(filtered_results), non_duplicates], ignore_index=True)
    final_filtered_df = final_filtered_df.sort_values(['abs_diff', '%Identity_Nearest_VPOD_Sequence'], ascending=[True, False])
    final_filtered_df.reset_index(drop=True, inplace=True)
    final_filtered_df.index.name = 'mnm_id'

else:
    final_filtered_df = matched_df 

final_filtered_df.head()

Unnamed: 0_level_0,Accession,Genus,Species,%Identity_Nearest_VPOD_Sequence,prediction_value,closest_measurement,abs_diff,max_id,Protein,Gene_Description,Notes
mnm_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,AY745192.1,Setonix,brachyurus,86.686,538.0,538,0.0,544,MTQAWDPAGFLAWRRDENEETTRASLFVYTNSNNTKGPFEGPNYHI...,Setonix brachyurus long-wave sensitive opsin m...,
1,MH011574.1,Arizona,elegans,92.055,538.1,538,0.1,857,MTEAWNVAVFAARRHHDDEDTTKESLFVYTNSNNTRDPFEGPNYHI...,Arizona elegans long-wavelength sensitive cone...,
2,M92038.1,Gallus,gallus,98.592,506.2,506,0.2,708,MNGTEGINFYVPMSNKTGVVRSPFEYPQYYLAEPWKYRLVCCYIFF...,"Gallus gallus green sensitive cone opsin mRNA,...",
3,KU324006.1,Arizona,elegans,98.58,483.8,484,0.2,855,MNGTEGLNFYIPMSNKTGIVRSPYEYPQYYLADPWKYSALAAYMFL...,"Arizona elegans rod visual pigment (RHO) gene,...",
4,KU645259.1,Hemidactylus,frenatus,82.421,364.8,365,0.2,876,MSGEEDFYLFANISKVGPFEGPQYHIAPMWAFYFQTAFMGFVFFAG...,"Hemidactylus frenatus SWS1 opsin gene, complet...",


In [92]:
final_filtered_df.shape

(134, 11)

In [93]:
final_filtered_df.to_csv(path_or_buf=f"./{report_dir}/mnm_results_id_filtered.csv", index=True)

In [97]:
#note that prediction values from optics are taken from the mean of the bootstrap predictions
final_err_filtered_df = final_filtered_df[final_filtered_df['abs_diff'] <= 15]
final_err_filtered_df = final_err_filtered_df[final_err_filtered_df['%Identity_Nearest_VPOD_Sequence'] != 'blastp unsuccessful']
final_err_filtered_df = final_err_filtered_df[final_err_filtered_df['%Identity_Nearest_VPOD_Sequence'] != 100.000]

In [98]:
final_err_filtered_df.shape

(81, 11)

In [96]:
final_err_filtered_df.to_csv(path_or_buf=f"./{report_dir}/mnm_final_results_err_filtered.csv", index=True)

## Clean-up post processing! 

- Upload mnm data to VPOD sheets
- If scp data came from a different database, also upload that to VPOD sheets

In [107]:
mnm_file = f"./mine_n_match_raw.tsv"
mnm_data = read_data(mnm_file, seq_type = None, is_main=False)

In [108]:
mnm_data.shape

(225, 12)

In [109]:
mnm_data.head()

Unnamed: 0_level_0,Accession,Genus,Species,%Identity_Nearest_VPOD_Sequence,prediction_value,closest_measurement,abs_diff,max_id,lngcor_id,Protein,Gene_Description,Notes
mnm_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,DQ168661.1,Poecilia,reticulata,98.034,560.0,560.0,0.0,1274.0,,MAEEWGKQVFAARRHEDTTRGAAFTYTNSNHTKDPFEGPNYHIAPR...,Poecilia reticulata LWS_QUEm5_L06 long wave-se...,
1,DQ088628.1,Melanochromis,vermivorus,93.838,555.0,555.0,0.0,389.0,,MAEEWGKQSFAARRYHEDSTRGSAFAYTNSNNTRDPFEGPNYHIAP...,Melanochromis vermivorus putative red sensitiv...,
2,L11865.1,Carassius,auratus,90.544,509.0,509.0,0.0,1349.0,,MNGTEGKNFYVPMSNRTGLVRSPFEYPQYYLAEPWQFKILALYLFF...,"Carassius auratus green cone opsin mRNA, compl...",
3,AY745192.1,Setonix,brachyurus,86.686,538.0,538.0,0.0,1475.0,,MTQAWDPAGFLAWRRDENEETTRASLFVYTNSNNTKGPFEGPNYHI...,Setonix brachyurus long-wave sensitive opsin m...,
4,MW219662.1,Myripristis,berndti,98.281,495.1,495.0,0.1,67.0,,MNGTEGPYFYIPMSNATGVVRSPYEYPQYYLVYPAAFAVLGAYMFF...,"Myripristis berndti rhodopsin RH1 mRNA, comple...",


In [132]:
# Sort the dataframe by `abs_diff` in ascending order
mnm_data = mnm_data.sort_values('abs_diff')

# Drop duplicate `Accession` values, keeping only the first (lowest) `abs_diff` value
mnm_data_unique = mnm_data.drop_duplicates(subset='Accession', keep='first')
mnm_data_unique.reset_index(inplace=True, drop=True)
mnm_data_unique.index.name = 'mnm_id'


In [133]:
mnm_data_unique.head()

Unnamed: 0_level_0,Accession,Genus,Species,%Identity_Nearest_VPOD_Sequence,prediction_value,closest_measurement,abs_diff,max_id,lngcor_id,Protein,Gene_Description,Notes
mnm_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,DQ168661.1,Poecilia,reticulata,98.034,560.0,560.0,0.0,1274.0,,MAEEWGKQVFAARRHEDTTRGAAFTYTNSNHTKDPFEGPNYHIAPR...,Poecilia reticulata LWS_QUEm5_L06 long wave-se...,
1,AY745192.1,Setonix,brachyurus,86.686,538.0,538.0,0.0,1475.0,,MTQAWDPAGFLAWRRDENEETTRASLFVYTNSNNTKGPFEGPNYHI...,Setonix brachyurus long-wave sensitive opsin m...,
2,DQ088628.1,Melanochromis,vermivorus,93.838,555.0,555.0,0.0,389.0,,MAEEWGKQSFAARRYHEDSTRGSAFAYTNSNNTRDPFEGPNYHIAP...,Melanochromis vermivorus putative red sensitiv...,
3,L11865.1,Carassius,auratus,90.544,509.0,509.0,0.0,1349.0,,MNGTEGKNFYVPMSNRTGLVRSPFEYPQYYLAEPWQFKILALYLFF...,"Carassius auratus green cone opsin mRNA, compl...",
4,AF425072.1,Oncorhynchus,mykiss,85.876,497.9,498.0,0.1,1208.0,,MNGTEGPDFYVPMSNATGIVRNPYEYPQYYLVSPAAYSLMAAYMFF...,"Oncorhynchus mykiss RH1 opsin mRNA, complete cds",


In [134]:
mnm_data_unique.shape

(211, 12)

In [140]:
mnm_data_unique.to_csv(path_or_buf=f"./mine_n_match_curated.csv", index=True)

In [135]:
mnm_duplicates = mnm_data[mnm_data.duplicated(subset=['Accession'], keep=False)]
mnm_duplicates.head()

Unnamed: 0_level_0,Accession,Genus,Species,%Identity_Nearest_VPOD_Sequence,prediction_value,closest_measurement,abs_diff,max_id,lngcor_id,Protein,Gene_Description,Notes
mnm_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3,AY745192.1,Setonix,brachyurus,86.686,538.0,538.0,0.0,1475.0,,MTQAWDPAGFLAWRRDENEETTRASLFVYTNSNNTKGPFEGPNYHI...,Setonix brachyurus long-wave sensitive opsin m...,
144,AY745192.1,Setonix,brachyurus,86.686,538.0,538.0,0.0,,544.0,MTQAWDPAGFLAWRRDENEETTRASLFVYTNSNNTKGPFEGPNYHI...,Setonix brachyurus long-wave sensitive opsin m...,
10,KY368274.1,Coccinella,septempunctata,77.926,520.2,520.0,0.2,1568.0,,MMGEPSFPWSVHRSGGFGGNLTVVDKVLPDMLHMVHPHWYQFPPMN...,Coccinella septempunctata long wavelength sens...,
149,KY368274.1,Coccinella,septempunctata,77.926,520.2,520.0,0.2,,33.0,MMGEPSFPWSVHRSGGFGGNLTVVDKVLPDMLHMVHPHWYQFPPMN...,Coccinella septempunctata long wavelength sens...,
13,MK209489.1,Carollia,perspicillata,97.126,500.4,500.0,0.4,1482.0,,MNGTEGPNFYVPFSNKTGIVRSPFEYPQYYLAEPWQFSMLAAYMFL...,Carollia perspicillata isolate PE070 rhodopsin...,


In [136]:
mnm_duplicates.shape

(28, 12)

In [139]:
mnm_duplicates.to_csv(path_or_buf=f"./mine_n_match_duplicates.csv", index=True)