In [None]:
# For error analysis

In [1]:
import pandas as  pd

In [2]:
test_df = pd.read_csv("test_dataset.csv")
target_df = pd.read_csv("Target_ID/no_split.csv")   

In [3]:
target_df.head()

Unnamed: 0,SMILES,target_id,Kd
0,BrCCOc1ccc2nc3ccc(=O)cc3oc2c1,P05067,6.03292
1,Br[Se]c1ccccc1,O75936,4.958607
2,Brc1[nH]c2cccc3[C@H]4C[C@H](CN[C@@H]4Cc1c23)C(...,P28646,6.769551
3,Brc1[nH]c2cccc3[C@H]4C[C@H](CN[C@@H]4Cc1c23)C(...,P30680,5.100015
4,Brc1c(Br)c(Br)c2[nH]cnc2c1Br,P16083,7.742321


In [4]:
test_df.head()

Unnamed: 0,SMILES,target_sequence,Kd
0,CSCC[C@H](NC(=O)[C@H](CC(C)C)NC(=O)CNC(=O)[C@@...,MGACDIVTEANISSDIDSNATGVTAFSMPGWQLALWATAYLALVLV...,5.230032
1,O\N=C1\c2ccccc2-c2nc3ccccc3nc12,MELRVGNRYRLGRKIGSGSFGDIYLGTDIAAGEEVAIKLECVKTKH...,6.420216
2,CC(C)C[C@H](NC(=O)[C@H](C)NC(=O)C[C@H](O)[C@H]...,MQPSSLLPLALCLLAAPASALVRIPLHKFTSIRRTMSEVGGSVEDL...,11.430041
3,Cc1n[nH]c2ccc(cc12)-c1cncc(OC[C@@H](N)Cc2ccccc...,MESEDLSGRELTIDSIMNKVRDIKNKFKNEDLTDELSLNKISADTT...,5.0
4,CC(C)(C)NS(=O)(=O)c1ccc(-c2sc(nc2CC2CCCCC2)C(=...,MDRAPQRQHRASRELLAAKKTHTSQIEVIPCKICGDKSSGIHYGVI...,7.920819


In [5]:
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# Function to fetch FASTA from UniProt ID
def fetch_fasta(uniprot_id):
    if pd.isna(uniprot_id):
        return None
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
    try:
        response = requests.get(url, timeout=5)
        if response.status_code == 200:
            fasta = response.text
            # Remove header (first line) and join sequence lines
            sequence = ''.join(fasta.split('\n')[1:]).strip()
            return sequence
    except:
        return None
    return None

# Function to fetch all FASTA sequences in parallel
def fetch_fasta_parallel(df, id_column='target_id', output_column='target_sequence', max_workers=20):
    df[output_column] = None  # Create empty column

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(fetch_fasta, row[id_column]): idx
            for idx, row in df.iterrows()
        }

        for future in tqdm(as_completed(futures), total=len(futures), desc="Fetching FASTA sequences"):
            idx = futures[future]
            try:
                df.at[idx, output_column] = future.result()
            except Exception as e:
                df.at[idx, output_column] = None

    return df

# Assuming you already have a dataframe 'df'
# Run parallel FASTA fetching
df = fetch_fasta_parallel(target_df, id_column='target_id', output_column='target_sequence', max_workers=20)
# Show rows 0–50 (or first 5 rows if you prefer)
print(df[['target_id', 'target_sequence']].head(5))


Fetching FASTA sequences: 100%|██████████| 61256/61256 [43:40<00:00, 23.37it/s]


  target_id                                    target_sequence
0    P05067  MLPGLALLLLAAWTARALEVPTDGNAGLLAEPQIAMFCGRLNMHMN...
1    O75936  MACTIQKAEALDGAHLMQILWYDEEESLYPAVWLRDNCPCSDCYLD...
2    P28646  MFPNGTAPSPTSSPSSSPGGCGEGVCSRGPGSGAADGMEEPGRNSS...
3    P30680  MELTSEQFNGSQVWIPSPFDLNGSLGPSNGSNQTEPYYDMTSNAVL...
4    P16083  MAGKKVLIVYAHQEPKSFNGSLKNVAVDELSRQGCTVTVSDLYAMN...


In [7]:
filtered_ref_df = target_df.merge(
    test_df[['SMILES', 'target_sequence']],
    on=['SMILES', 'target_sequence'],
    how='inner'
)

In [8]:
filtered_ref_df.head()

Unnamed: 0,SMILES,target_id,Kd,target_sequence
0,Brc1c[nH]c(c1)C(=O)NCc1cccc(Cn2cnc3ccccc23)c1,P14902,6.259637,MAHAMENSWTISKEYHIDEEVGFALPNPQENLPDFYNDWMFIAKHL...
1,Brc1ccc(cc1)C(CCNC(=N)NCCCc1cnc[nH]1)c1ccccn1,P31389,6.919987,MSFLPGMTPVTLSNFSWALEDRMLEGNSTTTPTRQLMPLVVVLSSV...
2,Brc1ccc(cc1)C1CC2CCC1N2,P43144,10.346787,MNRPHSCLSFCWMYFAASGIRAVETANGKYAQKLFSDLFEDYSSAL...
3,Brc1ccc(cc1)[C@@H]1[C@H]2CN(C(=O)C[C@@H]2CS[C@...,Q00987,5.705534,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...
4,Brc1ccc2CCCC(Nc3nc4ccccc4[nH]3)c2c1,P70605,7.958607,MDTSGHFHDSGVGDLDEDPKCPCPSSGDEQQQQQQPPPPSAPPAVP...


In [9]:
len(filtered_ref_df)

6115

In [10]:
filtered_ref_df.to_csv("test_ID_dataset.csv", index=False)