In [None]:
# Mount Google Drive into the Colab filesystem so the data files under
# /content/drive can be read by the cells below.
from google.colab import drive
# force_remount=True asks for a fresh mount even if the drive is already mounted.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


<h2><b> We create train and test datasets containing the SELFIES representations of the small molecules. These datasets are intended for a future development of the project. </b></h2>

In [None]:
import pandas as pd

# Location of the merged binding-affinity table on the mounted Drive.
MERGED_SEQUENCE_CSV = '/content/drive/MyDrive/LLama2HealthCareChatBot-master/data/BindingAffinity_data/merged_sequence.csv'

# Read the whole table into memory, then preview the first five rows.
binding_affinity_processed = pd.read_csv(MERGED_SEQUENCE_CSV)
binding_affinity_processed.head(5)

Unnamed: 0,pKd_[M],Kd,Canonical SMILE,Label,Textual Description,Sequence
0,9.337242,4.6e-10,CC1=CC=C(C=C1)CNS(=O)(=O)C2=CC=C(S2)S(=O)(=O)N,High,Compound with SMILES sequence of CC1=CC=C(C=C1...,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...
1,9.309804,4.9e-10,COC1=CC=C(C=C1)CNS(=O)(=O)C2=CC=C(S2)S(=O)(=O)N,High,Compound with SMILES sequence of COC1=CC=C(C=C...,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...
2,9.080922,8.3e-10,C1=CSC(=C1)CNS(=O)(=O)C2=CC=C(S2)S(=O)(=O)N,High,Compound with SMILES sequence of C1=CSC(=C1)CN...,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...
3,9.69897,2e-10,C1C(C2=C(SC(=C2)S(=O)(=O)N)S(=O)(=O)N1CC3=CC=C...,High,Compound with SMILES sequence of C1C(C2=C(SC(=...,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...
4,9.79588,1.6e-10,COC1=CC=C(C=C1)N2CC(C3=C(S2(=O)=O)SC(=C3)S(=O)...,High,Compound with SMILES sequence of COC1=CC=C(C=C...,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...


In [None]:
!pip install selfies



In [None]:
import selfies as sf

def create_selfies_list(dataset, max_conversions):
    """Convert the 'Canonical SMILE' column of ``dataset`` into SELFIES strings.

    Each SMILES string is encoded with ``sf.encoder`` and then decoded again
    with ``sf.decoder`` as a round-trip validity check.

    Args:
        dataset: Any mapping-like object (e.g. a pandas DataFrame) whose
            'Canonical SMILE' entry is an iterable of SMILES strings.
        max_conversions: Stop after this many rows have been processed
            (useful for quick tests). A value that never equals a row
            position (e.g. ``None`` or a number >= the row count) converts
            every row.

    Returns:
        A list with one entry per processed row, in row order. An entry is
        the SELFIES string, or ``None`` when encoding or the round-trip
        decoding failed for that row. Keeping a placeholder for failed rows
        means the list stays aligned with the rows of ``dataset``.

    Warns:
        UserWarning: once, after the loop, if any row could not be converted.
    """
    import warnings  # local import so the function does not depend on other cells

    smiles_list = list(dataset['Canonical SMILE'])

    smiles_sf_list = []
    failed_positions = []

    for position, smiles in enumerate(smiles_list):
        if position == max_conversions:  # for testing
            break
        try:
            smiles_sf = sf.encoder(smiles)
            sf.decoder(smiles_sf)  # round-trip check; the decoded value itself is not needed
        except (sf.EncoderError, sf.DecoderError):
            # Never reuse the value from a previous row: record an explicit gap instead.
            smiles_sf = None
            failed_positions.append(position)

        smiles_sf_list.append(smiles_sf)

    if failed_positions:
        warnings.warn(
            "SELFIES conversion failed for %d of %d rows (first positions: %s); "
            "those entries are None."
            % (len(failed_positions), len(smiles_sf_list), failed_positions[:10]),
            stacklevel=2,
        )

    return smiles_sf_list

In [None]:
# Convert every row: the limit equals the number of rows in the frame, so the
# early-exit check inside create_selfies_list is never triggered.
selfies_list = create_selfies_list(binding_affinity_processed, len(binding_affinity_processed))

In [None]:
# Attach the converted strings as a new 'SELFIES' column. This modifies the
# frame in place and requires the list to have exactly one entry per row.
binding_affinity_processed['SELFIES'] = selfies_list

# Preview the first five rows to check the new column.
binding_affinity_processed.head(5)

Unnamed: 0,pKd_[M],Kd,Canonical SMILE,Label,Textual Description,Sequence,SELFIES
0,9.337242,4.6e-10,CC1=CC=C(C=C1)CNS(=O)(=O)C2=CC=C(S2)S(=O)(=O)N,High,Compound with SMILES sequence of CC1=CC=C(C=C1...,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,[C][C][=C][C][=C][Branch1][Branch1][C][=C][Rin...
1,9.309804,4.9e-10,COC1=CC=C(C=C1)CNS(=O)(=O)C2=CC=C(S2)S(=O)(=O)N,High,Compound with SMILES sequence of COC1=CC=C(C=C...,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,[C][O][C][=C][C][=C][Branch1][Branch1][C][=C][...
2,9.080922,8.3e-10,C1=CSC(=C1)CNS(=O)(=O)C2=CC=C(S2)S(=O)(=O)N,High,Compound with SMILES sequence of C1=CSC(=C1)CN...,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,[C][=C][S][C][=Branch1][Ring2][=C][Ring1][Bran...
3,9.69897,2e-10,C1C(C2=C(SC(=C2)S(=O)(=O)N)S(=O)(=O)N1CC3=CC=C...,High,Compound with SMILES sequence of C1C(C2=C(SC(=...,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,[C][C][Branch2][Ring2][=Branch1][C][=C][Branch...
4,9.79588,1.6e-10,COC1=CC=C(C=C1)N2CC(C3=C(S2(=O)=O)SC(=C3)S(=O)...,High,Compound with SMILES sequence of COC1=CC=C(C=C...,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,[C][O][C][=C][C][=C][Branch1][Branch1][C][=C][...


In [None]:
from sklearn.model_selection import train_test_split

# Everything except the target and the affinity/description columns is kept as a feature.
NON_FEATURE_COLUMNS = ['Label', 'pKd_[M]', 'Kd', 'Textual Description']
X = binding_affinity_processed.drop(columns=NON_FEATURE_COLUMNS)
y = binding_affinity_processed['Label']

# 70% train / 30% test, with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Report the size of each piece of the split.
for split_name, split_part in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"{split_name} shape:", split_part.shape)

# Re-attach the labels to their features so each split can be stored as one table.
train_data = pd.concat([X_train, y_train], axis="columns")
test_data = pd.concat([X_test, y_test], axis="columns")

X_train shape: (4037, 3)
y_train shape: (4037,)
X_test shape: (1731, 3)
y_test shape: (1731,)


In [None]:
# Write each split to its own CSV file (header row kept, index column omitted).
for csv_name, split_frame in (
    ('binding_affinity_train.csv', train_data),
    ('binding_affinity_test.csv', test_data),
):
    split_frame.to_csv(csv_name, index=False, header=True)

In [None]:
# https://github.com/XinhaoLi74 -> use for SELFIES