# List of the 20 standard amino acids found in peptides



In [117]:
#!pip install tabulate
from tabulate import tabulate

# Property table for the 20 standard amino acids.
# Each row is [one-letter code, three-letter code, full name, hydrogen bonding,
# sulfur, resonance structure]; the last three entries are 0/1 flags.
# NOTE(review): the flags are hand-entered (e.g. Arg has resonance 0 while Cys
# has resonance 1) -- confirm them against the intended chemical definition.
amino_acids = [
    [code, abbreviation, name, h_bonding, sulfur, resonance]
    for code, abbreviation, name, h_bonding, sulfur, resonance in (
        ("A", "Ala", "Alanine", 0, 0, 0),
        ("R", "Arg", "Arginine", 1, 0, 0),
        ("N", "Asn", "Asparagine", 1, 0, 1),
        ("D", "Asp", "Aspartic acid", 1, 0, 1),
        ("C", "Cys", "Cysteine", 0, 1, 1),
        ("E", "Glu", "Glutamic acid", 1, 0, 1),
        ("Q", "Gln", "Glutamine", 1, 0, 1),
        ("G", "Gly", "Glycine", 0, 0, 0),
        ("H", "His", "Histidine", 1, 0, 1),
        ("I", "Ile", "Isoleucine", 0, 0, 0),
        ("L", "Leu", "Leucine", 0, 0, 0),
        ("K", "Lys", "Lysine", 1, 0, 0),
        ("M", "Met", "Methionine", 0, 1, 0),
        ("F", "Phe", "Phenylalanine", 0, 0, 1),
        ("P", "Pro", "Proline", 0, 0, 0),
        ("S", "Ser", "Serine", 1, 0, 0),
        ("T", "Thr", "Threonine", 1, 0, 0),
        ("W", "Trp", "Tryptophan", 1, 0, 1),
        ("Y", "Tyr", "Tyrosine", 1, 0, 1),
        ("V", "Val", "Valine", 0, 0, 0),
    )
]

# Render the amino-acid property rows as a boxed text table.
print(
    tabulate(
        amino_acids,
        headers=[
            "One-Letter Code",
            "Three-Letter Code",
            "Name",
            "Hydrogen Bonding",
            "Sulfur",
            "Resonance Structure",
        ],
        tablefmt="fancy_grid",
    )
)

╒═══════════════════╤═════════════════════╤═══════════════╤════════════════════╤══════════╤═══════════════════════╕
│ One-Letter Code   │ Three-Letter Code   │ Name          │   Hydrogen Bonding │   Sulfur │   Resonance Structure │
╞═══════════════════╪═════════════════════╪═══════════════╪════════════════════╪══════════╪═══════════════════════╡
│ A                 │ Ala                 │ Alanine       │                  0 │        0 │                     0 │
├───────────────────┼─────────────────────┼───────────────┼────────────────────┼──────────┼───────────────────────┤
│ R                 │ Arg                 │ Arginine      │                  1 │        0 │                     0 │
├───────────────────┼─────────────────────┼───────────────┼────────────────────┼──────────┼───────────────────────┤
│ N                 │ Asn                 │ Asparagine    │                  1 │        0 │                     1 │
├───────────────────┼─────────────────────┼───────────────┼─────────────

In [5]:
import pandas as pd

# Load the peptide-amphiphile database and pull out the peptide sequences.
file_path = 'Modified_PA_Database.csv'
df = pd.read_csv(file_path)

# Select the sequence column by its label rather than by position 5, so the
# cell keeps working if columns are added or reordered in the CSV (later cells
# already address this column as "Pep Seq").
peptide_seq = df['Pep Seq']

# Quick look at both ends of the column.
print(peptide_seq.head(10))
print(peptide_seq.tail(10))

0    GG
1    GG
2    AG
3    AG
4    AA
5    AA
6    LG
7    LG
8    FG
9    FG
Name: Pep Seq, dtype: object
2539    ATMEAHHVWL
2540    TALLFEEDGY
2541    TIRCGYCSQQ
2542    MQFDYHSDVN
2543    TQHPTGVRED
2544    NRFYHFGYLW
2545    LVWGKNKPII
2546    WGHDELRVAL
2547    VWGCYKISWL
2548    IWEARQPGEL
Name: Pep Seq, dtype: object


In [8]:
#!pip install biopython
from Bio.SeqUtils import ProtParam

def calculate_hydrophobicity(sequence):
    """Return the GRAVY (grand average of hydropathy) score of a peptide.

    Parameters
    ----------
    sequence : str
        Peptide sequence in one-letter amino-acid codes.

    Returns
    -------
    float
        The value reported by Biopython's ``ProteinAnalysis.gravy()``.
    """
    return ProtParam.ProteinAnalysis(sequence).gravy()

# Compute the GRAVY score once per peptide sequence.
hydrophobicities = peptide_seq.apply(calculate_hydrophobicity)

# Store the scores as a COLUMN. `df.loc['Hydrophobicity'] = ...` would address
# a row label instead: it appends an extra row named 'Hydrophobicity' whose
# cells are all NaN (the Series index does not match any column name), and
# that junk row is then written back to the CSV.
df['Hydrophobicity'] = hydrophobicities

# Persist the new column into the same CSV the data was loaded from.
df.to_csv(file_path, index=False)

# Spot-check the result at the start and in the middle of the table.
print(df[['Pep Seq', 'Hydrophobicity']].head(10))
print(df.iloc[569:579][['Pep Seq', 'Hydrophobicity']])

  Pep Seq  Hydrophobicity
0      GG            -0.4
1      GG            -0.4
2      AG             0.7
3      AG             0.7
4      AA             1.8
5      AA             1.8
6      LG             1.7
7      LG             1.7
8      FG             1.2
9      FG             1.2
             Pep Seq  Hydrophobicity
569         GANPNAAG       -0.500000
570    AAAAGGGEIKVAV        1.023077
571    AAAAGGGEIKVAV        1.023077
572           KKLLAK       -0.383333
573       RGDSKKLLAK       -1.150000
574  AAAAGGGLRKKLGKA       -0.080000
575  AAAAGGGLLGARKKK       -0.080000
576          AAAAAAD        1.042857
577          AAAAAAK        0.985714
578         GAAVILRR        0.837500


![Amino Acids Image](https://www.reagent.co.uk/wp-content/uploads/comm/n-/common-amino-acids.webp)

In [120]:
# Placeholder rows for the hydrogen-bond summary table: one row per amino acid,
# with the five descriptive cells still set to the literal string "None".
data = [
    [amino_acid_label] + ["None"] * 5
    for amino_acid_label in (
        "Alanine (A)",
        "Arginine (R)",
        "Asparagine (N)",
        "Aspartic Acid (D)",
        "Cysteine (C)",
        "Glutamine (Q)",
        "Glutamic Acid (E)",
        "Glycine (G)",
        "Histidine (H)",
        "Isoleucine (I)",
        "Leucine (L)",
        "Lysine (K)",
        "Methionine (M)",
        "Phenylalanine (F)",
        "Proline (P)",
        "Serine (S)",
        "Threonine (T)",
        "Tryptophan (W)",
        "Tyrosine (Y)",
        "Valine (V)",
    )
]

# Column titles for the hydrogen-bond summary table.
headers = [
    "Amino Acid",
    "Functional Group",
    "Donors (Hydrogen Bonding)",
    "Acceptors (Hydrogen Bonding)",
    "Notes",
    "Total H-Bond",
]

# Render the rows once, keep the rendered text in `table`, then show it.
table = tabulate(data, headers=headers, tablefmt="fancy_grid")
print(table)



╒═══════════════════╤════════════════════╤═════════════════════════════╤════════════════════════════════╤═════════╤════════════════╕
│ Amino Acid        │ Functional Group   │ Donors (Hydrogen Bonding)   │ Acceptors (Hydrogen Bonding)   │ Notes   │ Total H-Bond   │
╞═══════════════════╪════════════════════╪═════════════════════════════╪════════════════════════════════╪═════════╪════════════════╡
│ Alanine (A)       │ None               │ None                        │ None                           │ None    │ None           │
├───────────────────┼────────────────────┼─────────────────────────────┼────────────────────────────────┼─────────┼────────────────┤
│ Arginine (R)      │ None               │ None                        │ None                           │ None    │ None           │
├───────────────────┼────────────────────┼─────────────────────────────┼────────────────────────────────┼─────────┼────────────────┤
│ Asparagine (N)    │ None               │ None                      

In [9]:
import pandas as pd
import requests

def get_logP(smiles):
    """Look up the XLogP value of a molecule through PubChem PUG REST.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule. Anything that is not a non-empty
        string (e.g. the float NaN pandas uses for an empty cell) is treated
        as "nothing to look up".

    Returns
    -------
    The ``XLogP`` entry of the first returned property record, or ``None``
    when the input is missing, PubChem does not answer with HTTP 200, or the
    record carries no XLogP value.

    Raises
    ------
    requests.RequestException
        On connection problems or when the request exceeds the timeout.
    """
    # Missing cells must not be sent to PubChem as the literal text "nan".
    if not isinstance(smiles, str) or not smiles.strip():
        return None

    url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/property/XLogP/JSON"
    # Pass the SMILES as a URL argument so it is percent-encoded: characters
    # such as '#', '/', '\\', '+' and '%' are legal in SMILES but would corrupt
    # the request if pasted into the URL path. The timeout keeps a stalled
    # connection from hanging the whole column fill.
    response = requests.get(url, params={"smiles": smiles}, timeout=30)

    if response.status_code != 200:
        return None

    data = response.json()
    return data["PropertyTable"]["Properties"][0].get("XLogP", None)

df = pd.read_csv("Modified_PA_Database.csv")

# Only rows from label 779 onward are looked up; earlier rows keep whatever
# "N-Term LogP" value is already stored in the file.
pending_smiles = df.loc[779:, "N-Term SMILES"]

# Many rows share the same N-terminal group, so query PubChem once per
# distinct SMILES instead of once per row, then broadcast the answers back.
# Rows without a SMILES are skipped and end up with a missing LogP.
logp_by_smiles = {
    smiles: get_logP(smiles) for smiles in pending_smiles.dropna().unique()
}
df.loc[779:, "N-Term LogP"] = pending_smiles.map(logp_by_smiles)

# Persist the filled column and show the first rows as a sanity check.
df.to_csv("Modified_PA_Database.csv", index=False)
print(df.loc[:29, ["N-Term SMILES", "N-Term LogP"]])

                                        N-Term SMILES  N-Term LogP
0                 OC(OCC1C2=C(C3=C1C=CC=C3)C=CC=C2)=O          3.3
1                 OC(OCC1C2=C(C3=C1C=CC=C3)C=CC=C2)=O          3.3
2                 OC(OCC1C2=C(C3=C1C=CC=C3)C=CC=C2)=O          3.3
3                 OC(OCC1C2=C(C3=C1C=CC=C3)C=CC=C2)=O          3.3
4                 OC(OCC1C2=C(C3=C1C=CC=C3)C=CC=C2)=O          3.3
5                 OC(OCC1C2=C(C3=C1C=CC=C3)C=CC=C2)=O          3.3
6                 OC(OCC1C2=C(C3=C1C=CC=C3)C=CC=C2)=O          3.3
7                 OC(OCC1C2=C(C3=C1C=CC=C3)C=CC=C2)=O          3.3
8                 OC(OCC1C2=C(C3=C1C=CC=C3)C=CC=C2)=O          3.3
9                 OC(OCC1C2=C(C3=C1C=CC=C3)C=CC=C2)=O          3.3
10                OC(OCC1C2=C(C3=C1C=CC=C3)C=CC=C2)=O          3.3
11                OC(OCC1C2=C(C3=C1C=CC=C3)C=CC=C2)=O          3.3
12                OC(OCC1C2=C(C3=C1C=CC=C3)C=CC=C2)=O          3.3
13                              CCCCCCCCCCCCCCCC(O)=O         

In [None]:
import pandas as pd

# Sanity-check the "Pep Seq" column: missing cells, non-string cells, and
# sequences containing anything other than the 20 standard one-letter codes.
file_path = "Modified_PA_Database.csv"
df = pd.read_csv(file_path)

# Rows whose sequence cell is empty.
missing_values = df[df["Pep Seq"].isna()]
missing_value_positions = missing_values.index.tolist()

# Rows whose sequence cell is not a Python string (NaN is a float, so missing
# cells are counted here as well).
non_string_values = df[~df["Pep Seq"].apply(lambda x: isinstance(x, str))]
non_string_positions = non_string_values.index.tolist()

# Rows whose upper-cased sequence is not made up solely of valid residues.
# na=False makes missing / non-string cells count as invalid too.
valid_aas = "ACDEFGHIKLMNPQRSTVWY"
invalid_sequences = df[~df["Pep Seq"].str.upper().str.match(f"^[{valid_aas}]+$", na=False)]
invalid_positions = invalid_sequences.index.tolist()

print(" Data Check for 'Pep Seq' Column:")
print("-------------------------------------")
print(f" Missing Values: {len(missing_value_positions)} positions: {missing_value_positions}")
print(f" Non-String Values: {len(non_string_positions)} positions: {non_string_positions}")
print(f" Sequences with Invalid Characters: {len(invalid_positions)} positions: {invalid_positions}")

# Show the offending values at the positions that were actually detected,
# instead of a hard-coded row/column pair that only fits one version of the
# file. Missing and non-string rows are already part of invalid_positions.
for position in invalid_positions:
    problematic_value = df.loc[position, "Pep Seq"]
    print(f"Value at row {position}: {problematic_value}")

 Data Check for 'Pep Seq' Column:
-------------------------------------
 Missing Values: 1 positions: [2549]
 Non-String Values: 1 positions: [2549]
 Sequences with Invalid Characters: 1 positions: [2549]
Value at row 2549: nan


In [None]:
#!pip install transformers
import pandas as pd
import torch
from transformers import BertModel, BertTokenizer
import numpy as np
from tqdm import tqdm

# Load the ProtBERT tokenizer and encoder from the Hugging Face hub.
model_name = "Rostlab/prot_bert"

# do_lower_case=False keeps the residue letters in the case they are given.
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=False)

# .eval() switches the module to inference mode and returns the module itself.
model = BertModel.from_pretrained(model_name).eval()

# Cap the number of CPU threads PyTorch uses for intra-op parallelism.
torch.set_num_threads(4)

def get_embedding(sequence):
    """Embed one peptide with ProtBERT and mean-pool over its tokens.

    Parameters
    ----------
    sequence : str
        Peptide sequence in one-letter amino-acid codes.

    Returns
    -------
    numpy.ndarray
        The token-averaged last hidden state, with a leading batch axis of
        length 1 (one row per call).
        NOTE(review): the mean runs over every token the tokenizer emits,
        which presumably includes its special start/end tokens -- confirm
        that this pooling is intended.
    """
    import re

    # The tokenizer is loaded with do_lower_case=False, so lower-case residues
    # would not match the upper-case vocabulary; normalise the case here
    # because the notebook's validation step is case-insensitive.
    # The rare residues U, Z, O and B are mapped to X, following the
    # preprocessing shown on the Rostlab/prot_bert model card.
    sequence = re.sub(r"[UZOB]", "X", sequence.upper())

    # ProtBERT expects residues separated by single spaces.
    sequence = " ".join(sequence)
    tokens = tokenizer(sequence, return_tensors="pt", padding=True, truncation=True, max_length=1024)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        output = model(**tokens)

    # Average along the token axis (dim=1) to get one vector per sequence.
    return output.last_hidden_state.mean(dim=1).cpu().numpy()


file_path = "Modified_PA_Database.csv"
df = pd.read_csv(file_path)

# Drop every row that has no peptide sequence instead of one hard-coded row
# label, and renumber the rows so they line up one-to-one with the embedding
# matrix built below.
df_cleaned = df.dropna(subset=["Pep Seq"]).reset_index(drop=True)

# One embedding (a single-row array) per remaining sequence; tqdm wraps the
# iterable directly and shows the same progress bar.
all_embeddings = [
    get_embedding(seq)
    for seq in tqdm(df_cleaned["Pep Seq"], desc="Processing Sequences")
]

# Stack into a (n_sequences, embedding_dim) matrix. The column count is taken
# from the matrix itself rather than assuming a width of 1024.
embedding_matrix = np.vstack(all_embeddings)
embedding_df = pd.DataFrame(
    embedding_matrix,
    columns=[f"Embedding_{i}" for i in range(embedding_matrix.shape[1])],
    # Share df_cleaned's index explicitly: concat(axis=1) aligns on index
    # labels, so mismatched labels would produce NaN-padded rows.
    index=df_cleaned.index,
)

df_cleaned = pd.concat([df_cleaned, embedding_df], axis=1)

df_cleaned.to_csv("Modified_PA_Database_with_Embeddings.csv", index=False)

Processing Sequences: 100%|██████████| 2549/2549 [05:48<00:00,  7.32it/s]


In [32]:
import pandas as pd

# Reload the table that already contains the ProtBERT embedding columns.
file_path = "Modified_PA_Database_with_Embeddings.csv"
# NOTE(review): low_memory=False is presumably there to avoid mixed-dtype
# warnings on this wide file -- confirm.
df = pd.read_csv(file_path, low_memory=False)

def categorize_nano_type(nano_type):
    """Collapse a free-text nanostructure label into 'fiber', 'micelle' or 'none'.

    The value is converted to a lower-case string and searched for the
    keywords in priority order: 'fiber' first, then 'micelle'. Anything that
    contains neither keyword (including missing values) maps to 'none'.
    """
    label = str(nano_type).lower()
    for keyword in ("fiber", "micelle"):
        if keyword in label:
            return keyword
    return "none"

# Replace each raw nanostructure label with its coarse category. The column
# name really does end with a comma, matching the CSV header.
df['Nano Type,'] = df['Nano Type,'].map(categorize_nano_type)

# Write the recoded table back over the file it was loaded from.
df.to_csv(file_path, index=False)

In [33]:
# Report dtype and non-null counts for the two numeric condition columns.
for column in ('pH', 'PA Conc'):
    print(f"\nInfo for '{column}' column:")
    df[column].info()


Info for 'pH' column:
<class 'pandas.core.series.Series'>
RangeIndex: 2549 entries, 0 to 2548
Series name: pH
Non-Null Count  Dtype  
--------------  -----  
2549 non-null   float64
dtypes: float64(1)
memory usage: 20.0 KB

Info for 'PA Conc' column:
<class 'pandas.core.series.Series'>
RangeIndex: 2549 entries, 0 to 2548
Series name: PA Conc
Non-Null Count  Dtype  
--------------  -----  
2517 non-null   float64
dtypes: float64(1)
memory usage: 20.0 KB


In [50]:
import pandas as pd

file_path = "Modified_PA_Database_with_Embeddings.csv"
df = pd.read_csv(file_path, low_memory=False)

# The fill value is the median of 'PA Conc' over the first 580 rows only.
first_580_PA_Conc = df['PA Conc'].iloc[:580]
median_PA_Conc = first_580_PA_Conc.median()
print(f"Median value used for replacing missing 'PA Conc' values: {median_PA_Conc}")

# Record which rows are about to be filled (row numbers reported 1-based),
# then fill every missing value in one vectorised step.
replacement_log = [
    {'Row': idx + 1, 'Column': 'PA Conc', 'Replaced By': median_PA_Conc}
    for idx, value in df['PA Conc'].items()
    if pd.isna(value)
]
df['PA Conc'] = df['PA Conc'].fillna(median_PA_Conc)

print("Replacement Log:")
for log in replacement_log:
    print(f"Row {log['Row']} in 'PA Conc' was replaced with {log['Replaced By']}")

print("\nInfo after filling missing values:")
df[['PA Conc']].info()

# Save back over the file that was loaded at the top of the cell.
df.to_csv(file_path, index=False)


Median value used for replacing missing 'PA Conc' values: 7.399045646
Replacement Log:

Info after filling missing values:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2549 entries, 0 to 2548
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   PA Conc  2549 non-null   float64
dtypes: float64(1)
memory usage: 20.0 KB
