# <font color=#c51b8a>VPOD 'Mine-n-Match':</font>
## <font color=#c994c7>Part 1 Objective</font> - Use Species Names from Microspectrophotemetry Data Sheet to Query NCBI for All Related Opsin Sequences  

In [2]:
import os
import re
import datetime
import time
import pandas as pd
from deepBreaks.preprocessing import read_data
from Bio import Entrez, SeqIO

### <font color=#c994c7>Load data-table with all of the species and sequence data</font> 

In [11]:
scp_file = './longcore_data/AnimalPhotopigmentsV1_1.csv'
scp_df = read_data(scp_file, seq_type = None, is_main=False)

In [None]:
scp_df.head()

### <font color=#c994c7>In this case our dataframe has the full species name in one column so we can create a list directly and filter to create a list of all unique species names</font> 

In [None]:
species_list = scp_df['Full_Species'].to_list()
len(species_list)

In [None]:
unique_species_list = list(set(species_list)) 
len(unique_species_list)

In [None]:
#ncbi query here

## <font color=#c994c7>Part 3 Objective</font> - Match sequence to it's closest MSP value based on OPTICS predictions

We'll need to...

- Query OPTICS with all the sequence data [bootstrap enabled]
- Extract Predictions
- Match to closest MSP value species-by-species [will need a list of the unique species names] // Match to MaxId as the foreign key

In [1]:
import os
import time 
import datetime
import warnings
import pandas as pd
from deepBreaks.preprocessing import read_data

warnings.filterwarnings("ignore")
warnings.simplefilter("ignore")

In [None]:
report_dir = 'mnm_on_longcore_ncbi_data_2024-10-17' #re-define the report directory if needed
filtered_ncbi_query_file = f'./{report_dir}/filtered_ncbi_longcore_ops_query.csv'

In [None]:
# Example use for OPTICS
# python optics_predictions.py -in msp_mined_seqs.fasta -rd mined_msp_seqs -out mined_seq_predictions.tsv -m wildtype -e aa_prop -b True -ir msp_mined_seq_blastp_report.tsv -r bovine -s False -bsv msp_bs_viz.pdf
predFileData = f'./{report_dir}/optics_on_mined_longcore_seqs_2024-10-17_21-14-10/longcore_predictions.tsv'

In [None]:
scp_file = f'./{report_dir}/longcore_data/AnimalPhotopigmentsV1_1.csv'

## Clean-up post processing! 

- Upload mnm data to VPOD sheets
- If scp data came from a different database, also upload that to VPOD sheets

In [107]:
mnm_file = f"./mine_n_match_raw.tsv"

In [None]:
#mnm_post_process

In [1]:
from Bio import Entrez
import random

Entrez.email = "sethfrazer@ucsb.edu"  # Always provide your email
db = "protein"
term = "protein NOT opsin NOT rhodopsin NOT photoreceptor" 
retmax = 10000  # Retrieve a large number of IDs initially

# Search for protein sequences
handle = Entrez.esearch(db=db, term=term, retmax=retmax)
record = Entrez.read(handle)
handle.close()
id_list = record["IdList"]


In [2]:
id_list

['2863134003', '2863134001', '2863133999', '2428747370', '2428746565', '2428746372', '2072406527', '1844084142', '1844084133', '1840884937', '1383482577', '985482462', '930841418', '930841416', '506327998', '475507357', '312261246', '312176391', '312176378', '312147307', '307938287', '284005427', '284005395', '284005393', '283549164', '283483986', '198278525', '193211614', '130506116', '126723535', '126723515', '126723334', '126723330', '126723275', '125628638', '118150894', '116004085', '94966759', '14149657', '8393516', '2863133995', '2863133993', '2863133991', '2863133989', '2863133987', '2863133985', '2863133983', '2863133981', '2863133979', '2863133977', '2863133975', '2863133973', '2863133971', '2863133969', '2863133967', '2863133965', '2863133963', '2863133961', '2863133959', '2863133957', '2863133955', '2863133953', '2863133951', '2863133949', '2863133947', '2863133945', '2863133943', '2863133941', '2863133939', '2863133937', '2863133935', '2863133933', '2863133931', '286313392

In [3]:

# Randomly select 1000 IDs
random_ids = random.sample(id_list, 1000)

# Fetch sequences for the selected IDs
handle = Entrez.efetch(db=db, id=random_ids, rettype="fasta", retmode="text")
sequences = handle.read()
handle.close()

# Save or process the sequences
with open("random_proteins.fasta", "w") as f:
    f.write(sequences)

In [52]:
import requests
from requests.adapters import HTTPAdapter, Retry
import re


In [61]:
# Define the UniProt API endpoint
base_url = "https://rest.uniprot.org/uniprotkb/search"

# Define the search query for opsins
query = "NOT (keyword:opsin OR keyword:rhodopsin OR keyword:OPS OR keyword:OPSB) AND reviewed:true"

# Initialize a set to store unique sequences
unique_sequences = set()

# Initialize the cursor
cursor = None

# Loop until all results are fetched
# Define the parameters for the API request
params = {
    "query": query,
    "format": "fasta",
    "size": 500,  # Fetch in batches of 500
    "cursor": cursor  # Use the cursor for pagination
}

# Send the API request
ex_response = requests.get(base_url, params=params)

In [62]:

re_next_link = re.compile(r'<(.+)>; rel="next"')
retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))
all_sequences = []


In [None]:
def get_next_link(headers):
    if "Link" in headers:
        match = re_next_link.match(headers["Link"])
        if match:
            return match.group(1)

def get_batch(batch_url):
    while batch_url:
        response = session.get(batch_url)
        response.raise_for_status()
        total = response.headers["x-total-results"]
        yield response, total
        batch_url = get_next_link(response.headers)


In [63]:

url = ex_response.url
interactions = {}
desire = 5000
while len(all_sequences) <= desire:
    for batch, total in get_batch(url):
        sequences = batch.text.split(">")[1:]
            #primaryAccession, interactsWith = line.split('\t')
            #interactions[primaryAccession] = len(interactsWith.split(';')) if interactsWith else 0

            # Add the sequences to the list
        all_sequences.extend(sequences)

        print(f'{len(all_sequences)} / {total}')

500 / 572619
1003 / 572619
1503 / 572619
2003 / 572619
2503 / 572619
3003 / 572619
3503 / 572619
4003 / 572619
4503 / 572619
5003 / 572619
5504 / 572619
6004 / 572619


KeyboardInterrupt: 

In [64]:
len(all_sequences)

6004

In [None]:
unique_sequences = set(all_sequences)
unique_sequences=list(unique_sequences)

In [None]:
len(unique_sequences)

In [None]:
# Randomly select 1000 sequences from the unique list
if len(unique_sequences) >= 2000:
    random_sequences = random.sample(unique_sequences, 2000)
else:
    random_sequences = unique_sequences  # Take all unique sequences if less than 1000

# Save the sequences to a file
with open("random_proteins_uniprot.fasta", "w") as f:
    
    #f.write(">" + ">".join(random_sequences)
    for entry in random_sequences:
        name,sequence = entry.split('\n',1)
        f.write(f'>{name.strip().replace("|","_").replace(" ","_").replace("/","")[0:30]}\n')
        f.write(f'{sequence}')