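Deployment data preparation: fetch the UniProt sequence for each unique target ID and save the integer-encoded sequences to CSV.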

In [1]:
import pandas as pd
import numpy as np
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

In [2]:
# Read the source table; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='Target_ID/no_split.csv')

In [3]:
df.head()

Unnamed: 0,SMILES,target_id,Kd
0,BrCCOc1ccc2nc3ccc(=O)cc3oc2c1,P05067,6.03292
1,Br[Se]c1ccccc1,O75936,4.958607
2,Brc1[nH]c2cccc3[C@H]4C[C@H](CN[C@@H]4Cc1c23)C(...,P28646,6.769551
3,Brc1[nH]c2cccc3[C@H]4C[C@H](CN[C@@H]4Cc1c23)C(...,P30680,5.100015
4,Brc1c(Br)c(Br)c2[nH]cnc2c1Br,P16083,7.742321


In [4]:
# Drop the ligand SMILES and Kd columns; no later cell in this notebook reads
# them. errors='ignore' lets the cell be re-run on an already-trimmed frame
# instead of raising KeyError.
df = df.drop(columns=['SMILES', 'Kd'], errors='ignore')

In [5]:
len(df)

61256

In [6]:
df.head()

Unnamed: 0,target_id
0,P05067
1,O75936
2,P28646
3,P30680
4,P16083


In [12]:
# One row per distinct target: keep the first occurrence of each target_id.
df = df.drop_duplicates(subset=['target_id'], keep='first')

In [13]:
len(df)

2809

In [None]:
# Keep only rows whose species is Homo sapiens, ignoring letter case.
# NOTE(review): no visible cell adds a 'species_name' column to df, and
# df_human is not referenced by any later cell — confirm where the column
# comes from and whether df is meant to be replaced by this subset.
df_human = df.loc[df['species_name'].str.lower().eq('homo sapiens')]

print(f"Filtered rows: {len(df_human)}")


Filtered rows: 1321


In [28]:
df.head()

Unnamed: 0,target_id
0,P05067
1,O75936
4,P16083
6,P14902
7,P28482


In [29]:
# Function to fetch FASTA from UniProt ID
def fetch_fasta(uniprot_id):
    """Download the amino-acid sequence for one UniProt accession.

    Parameters
    ----------
    uniprot_id : str or missing value
        Accession inserted into the UniProt REST URL. Values that
        ``pd.isna`` reports as missing are not looked up.

    Returns
    -------
    str or None
        The sequence with the FASTA header line removed and all line breaks
        stripped. ``None`` when the ID is missing, the request fails, the
        status code is not 200, or the response contains no sequence lines.
    """
    if pd.isna(uniprot_id):
        return None
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
    try:
        response = requests.get(url, timeout=5)
    except requests.RequestException:
        # Best-effort lookup: a network failure (timeout, connection error, ...)
        # is reported as "no sequence". Only request errors are swallowed, so
        # KeyboardInterrupt and programming errors still surface.
        return None
    if response.status_code != 200:
        return None
    # Drop the header (first line). splitlines() also handles "\r\n" endings,
    # which a plain split('\n') would leave embedded in the sequence.
    body_lines = response.text.splitlines()[1:]
    sequence = ''.join(line.strip() for line in body_lines)
    # A header-only response carries no sequence; report it as missing.
    return sequence or None

# Function to fetch all FASTA sequences in parallel
def fetch_fasta_parallel(df, id_column='target_id', output_column='target_sequence', max_workers=20):
    """Fill ``df[output_column]`` with the sequence fetched for each ID.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the IDs. It is modified in place (the output column is
        added or overwritten) and also returned.
    id_column : str
        Column whose values are passed to ``fetch_fasta``.
    output_column : str
        Column that receives the fetched sequences.
    max_workers : int
        Size of the thread pool used for the requests.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. Rows whose ID is missing, or whose fetch
        returned ``None`` or raised, hold ``None`` in ``output_column``.
    """
    # Request each distinct, non-missing ID once; repeated IDs share the result.
    unique_ids = df[id_column].dropna().unique()
    sequences = {}

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(fetch_fasta, uid): uid for uid in unique_ids}

        for future in tqdm(as_completed(futures), total=len(futures), desc="Fetching FASTA sequences"):
            uid = futures[future]
            try:
                sequences[uid] = future.result()
            except Exception:
                # One failed lookup must not abort the whole batch.
                sequences[uid] = None

    # Write results back by position rather than by index label, so a frame
    # with a non-unique index is filled correctly. dtype=object keeps None as
    # the missing marker.
    df[output_column] = pd.Series(
        [None if pd.isna(uid) else sequences.get(uid) for uid in df[id_column]],
        index=df.index,
        dtype=object,
    )

    return df

# Populate df['target_sequence'] for every target_id using 20 worker threads
df = fetch_fasta_parallel(
    df,
    id_column='target_id',
    output_column='target_sequence',
    max_workers=20,
)

# Print the first five ID/sequence pairs as a quick sanity check
print(df.loc[:, ['target_id', 'target_sequence']].head(5))

Fetching FASTA sequences: 100%|██████████| 1321/1321 [00:57<00:00, 23.15it/s]

  target_id                                    target_sequence
0    P05067  MLPGLALLLLAAWTARALEVPTDGNAGLLAEPQIAMFCGRLNMHMN...
1    O75936  MACTIQKAEALDGAHLMQILWYDEEESLYPAVWLRDNCPCSDCYLD...
4    P16083  MAGKKVLIVYAHQEPKSFNGSLKNVAVDELSRQGCTVTVSDLYAMN...
6    P14902  MAHAMENSWTISKEYHIDEEVGFALPNPQENLPDFYNDWMFIAKHL...
7    P28482  MAAAAAAGAGPEMVRGQVFDVGPRYTNLSYIGEGAYGMVCSAYDNV...





In [None]:
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import pandas as pd

# Function to compute protein descriptors
def compute_protein_descriptors(seq):
    """Compute five ProteinAnalysis descriptors for one sequence.

    Parameters
    ----------
    seq
        Sequence handed to ``ProteinAnalysis``.

    Returns
    -------
    list or None
        ``[molecular_weight, aromaticity, instability_index, gravy,
        isoelectric_point]`` in that order, or ``None`` when the analysis
        raises for this input.
    """
    try:
        analyzed_seq = ProteinAnalysis(seq)
        return [
            analyzed_seq.molecular_weight(),
            analyzed_seq.aromaticity(),
            analyzed_seq.instability_index(),
            analyzed_seq.gravy(),
            analyzed_seq.isoelectric_point()
        ]
    except Exception:
        # Best-effort: a sequence the analysis cannot handle yields None.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit stop a long .apply() run.
        return None

# Compute descriptors for every sequence in df['target_sequence'];
# sequences the analysis could not handle come back as None and are dropped.
prot_desc = df['target_sequence'].apply(compute_protein_descriptors)
prot_desc = prot_desc.dropna()  # remove failed ones
# Carry prot_desc's index over to the descriptor table so each descriptor row
# stays tied to the df row it was computed from, instead of relying on
# positional order alone.
prot_desc_df = pd.DataFrame(
    prot_desc.tolist(),
    index=prot_desc.index,
    columns=['Prot_MW', 'Aromaticity', 'Instability', 'Hydropathy', 'IsoelectricPoint'],
)


In [None]:
# Append the descriptor columns to the rows they were computed from.
# The guard makes this cell safe to re-run: after the first run df carries a
# fresh 0..n-1 index, so selecting by prot_desc.index a second time would pick
# the wrong rows or raise KeyError.
if not set(prot_desc_df.columns).issubset(df.columns):
    df = pd.concat(
        [
            # Rows with a successful descriptor computation, renumbered 0..n-1
            df.loc[prot_desc.index].reset_index(drop=True),
            # Same renumbering, so the two frames line up row for row
            prot_desc_df.reset_index(drop=True),
        ],
        axis=1,
    )


In [30]:
df.head()

Unnamed: 0,target_id,target_sequence
0,P05067,MLPGLALLLLAAWTARALEVPTDGNAGLLAEPQIAMFCGRLNMHMN...
1,O75936,MACTIQKAEALDGAHLMQILWYDEEESLYPAVWLRDNCPCSDCYLD...
4,P16083,MAGKKVLIVYAHQEPKSFNGSLKNVAVDELSRQGCTVTVSDLYAMN...
6,P14902,MAHAMENSWTISKEYHIDEEVGFALPNPQENLPDFYNDWMFIAKHL...
7,P28482,MAAAAAAGAGPEMVRGQVFDVGPRYTNLSYIGEGAYGMVCSAYDNV...


In [31]:
df.isna().sum()

target_id          0
target_sequence    0
dtype: int64

In [32]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [33]:
# 4. Encode protein sequences
# The 20 letters below get codes 1..20 in the order listed; any other
# character encodes as 0.
aa_list = 'ACDEFGHIKLMNPQRSTVWY'
aa_to_int = dict(zip(aa_list, range(1, len(aa_list) + 1)))

def encode_protein(seq):
    """Translate a sequence into a list of integer codes, one per character."""
    codes = []
    for residue in seq:
        codes.append(aa_to_int.get(residue, 0))
    return codes

# Store the integer encoding of each sequence in a new column.
df['protein_encoded'] = df['target_sequence'].map(encode_protein)

In [34]:
df.head()

Unnamed: 0,target_id,target_sequence,protein_encoded
0,P05067,MLPGLALLLLAAWTARALEVPTDGNAGLLAEPQIAMFCGRLNMHMN...,"[11, 10, 13, 6, 10, 1, 10, 10, 10, 10, 1, 1, 1..."
1,O75936,MACTIQKAEALDGAHLMQILWYDEEESLYPAVWLRDNCPCSDCYLD...,"[11, 1, 2, 17, 8, 14, 9, 1, 4, 1, 10, 3, 6, 1,..."
4,P16083,MAGKKVLIVYAHQEPKSFNGSLKNVAVDELSRQGCTVTVSDLYAMN...,"[11, 1, 6, 9, 9, 18, 10, 8, 18, 20, 1, 7, 14, ..."
6,P14902,MAHAMENSWTISKEYHIDEEVGFALPNPQENLPDFYNDWMFIAKHL...,"[11, 1, 7, 1, 11, 4, 12, 16, 19, 17, 8, 16, 9,..."
7,P28482,MAAAAAAGAGPEMVRGQVFDVGPRYTNLSYIGEGAYGMVCSAYDNV...,"[11, 1, 1, 1, 1, 1, 1, 6, 1, 6, 13, 4, 11, 18,..."


In [None]:
len(df)

In [35]:
# Write the final table to disk without the DataFrame index.
# NOTE(review): the filename spells "protien" — confirm what reads this file
# before renaming it.
df.to_csv(path_or_buf='encoded_protien.csv', index=False)