<a href="https://colab.research.google.com/github/Vas1l1sa/SWW_ML_project_AI-aptamers/blob/main/02_%D0%94%D0%B5%D1%81%D0%BA%D1%80%D0%B8%D0%BF%D1%82%D0%BE%D1%80%D1%8B_%D0%B4%D0%BB%D1%8FML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Для построения ML-модели нам необходимо преобразовать категориальные данные в числовые так, чтобы числовые признаки максимально полно отражали свойства исходных данных. Так как в рамках этой задачи мы работаем с химико-биологическими данными, будем извлекать различные физико-химические характеристики наших молекул при помощи специализированных библиотек — PyBioMed и RDKit.**

In [None]:
 # Install the third-party packages imported further down in this notebook (PyBioMed and RDKit).
 !pip install pybiomed
 !pip install rdkit



In [None]:
# NOTE(review): the import cell takes Pydna from the PyBioMed package
# (`from PyBioMed import Pydna`); confirm that this separate `Pydna` install is actually needed.
!pip install Pydna



In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")
from PyBioMed import Pydna
from rdkit import Chem
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Mount Google Drive at /content/drive; the CSV files read and written by this
# notebook are addressed through paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the filtered dataset from the mounted Drive and preview its first rows.
input_csv_path = '/content/drive/MyDrive/SWW/df_filtered.csv'
df = pd.read_csv(input_csv_path)
df.head()

Unnamed: 0,aptamer,smiles,pKd
0,CCTGGGGGAGTATTGCGGAGGAAGG,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,6.798603
1,CTCAGTTCGGGACGACGGCAAGGTAACGTATGGGACCTTGGCACGA...,CN1COCN(CC2=CN=C(Cl)S2)\C1=N/[N+]([O-])=O,6.262569
2,TAGGGAAGAGAAGGACATATGATCTGCGTTTATCTCCGCTCGTTAA...,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...,6.970616
3,TAGGGAAGAGAAGGACATATGATGTCGCGCCAGCCTTCTGCGTTGA...,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...,6.478862
4,TAGGGAAGAGAAGGACATATGATAGTTTGAGACCTTTGCAGTCTTT...,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...,7.744727


**Напишем функции для создания колонок с k-мерными дескрипторами (k = 2 и k = 3) для наших аптамеров при помощи библиотеки PyBioMed**

In [None]:
def calculate_k2(i):
    """Return the PyBioMed k-mer descriptors of sequence ``i`` for k=2.

    Thin wrapper around ``Pydna.GetKmer`` so it can be passed to
    ``Series.apply``. The result is whatever ``GetKmer`` returns
    (presumably a mapping of k-mer name to count; confirm against the
    PyBioMed documentation).
    """
    kmer_descriptors = Pydna.GetKmer(i, k=2)
    return kmer_descriptors

def calculate_k3(i):
    """Return the PyBioMed k-mer descriptors of sequence ``i`` for k=3.

    Thin wrapper around ``Pydna.GetKmer`` so it can be passed to
    ``Series.apply``. The result is whatever ``GetKmer`` returns
    (presumably a mapping of k-mer name to count; confirm against the
    PyBioMed documentation).
    """
    kmer_descriptors = Pydna.GetKmer(i, k=3)
    return kmer_descriptors

# Build one descriptor table per k-mer size directly from the aptamer sequences.
# Each per-row result is expanded into columns, and every column is prefixed
# with the name of the descriptor set it came from.
k2_df = df['aptamer'].apply(calculate_k2).apply(pd.Series).add_prefix('k2_descriptors_')
k3_df = df['aptamer'].apply(calculate_k3).apply(pd.Series).add_prefix('k3_descriptors_')

# Drop any k-mer columns left over from a previous run of this cell, so that
# re-running it replaces them instead of appending a duplicate set.
kmer_prefixes = ('k2_descriptors', 'k3_descriptors')
stale_columns = [col for col in df.columns if str(col).startswith(kmer_prefixes)]

# No temporary columns and no in-place mutation: df is rebound to a new frame.
df = pd.concat([df.drop(columns=stale_columns), k2_df, k3_df], axis=1)

df

{'AA': 1, 'AC': 0, 'AG': 3, 'AT': 1, 'CA': 0, 'CC': 1, 'CG': 1, 'CT': 1, 'GA': 3, 'GC': 1, 'GG': 7, 'GT': 1, 'TA': 1, 'TC': 0, 'TG': 2, 'TT': 1}
{'AA': 3, 'AC': 5, 'AG': 3, 'AT': 1, 'CA': 3, 'CC': 3, 'CG': 6, 'CT': 2, 'GA': 5, 'GC': 2, 'GG': 7, 'GT': 4, 'TA': 2, 'TC': 3, 'TG': 2, 'TT': 2}
{'AA': 4, 'AC': 7, 'AG': 6, 'AT': 8, 'CA': 5, 'CC': 2, 'CG': 3, 'CT': 7, 'GA': 9, 'GC': 3, 'GG': 3, 'GT': 3, 'TA': 8, 'TC': 5, 'TG': 6, 'TT': 6}
{'AA': 4, 'AC': 6, 'AG': 7, 'AT': 5, 'CA': 4, 'CC': 5, 'CG': 4, 'CT': 6, 'GA': 10, 'GC': 5, 'GG': 3, 'GT': 4, 'TA': 5, 'TC': 3, 'TG': 8, 'TT': 6}
{'AA': 5, 'AC': 8, 'AG': 9, 'AT': 6, 'CA': 7, 'CC': 2, 'CG': 1, 'CT': 4, 'GA': 11, 'GC': 1, 'GG': 3, 'GT': 4, 'TA': 6, 'TC': 3, 'TG': 6, 'TT': 9}
{'AA': 5, 'AC': 7, 'AG': 7, 'AT': 6, 'CA': 6, 'CC': 4, 'CG': 2, 'CT': 4, 'GA': 10, 'GC': 2, 'GG': 4, 'GT': 5, 'TA': 5, 'TC': 3, 'TG': 8, 'TT': 7}
{'AA': 4, 'AC': 5, 'AG': 7, 'AT': 7, 'CA': 3, 'CC': 1, 'CG': 4, 'CT': 3, 'GA': 12, 'GC': 2, 'GG': 10, 'GT': 6, 'TA': 5, 'TC': 3

Unnamed: 0,aptamer,smiles,pKd,k2_descriptors_Kmer_1,k2_descriptors_Kmer_2,k2_descriptors_Kmer_3,k2_descriptors_Kmer_4,k2_descriptors_Kmer_5,k2_descriptors_Kmer_6,k2_descriptors_Kmer_7,...,k3_descriptors_Kmer_55,k3_descriptors_Kmer_56,k3_descriptors_Kmer_57,k3_descriptors_Kmer_58,k3_descriptors_Kmer_59,k3_descriptors_Kmer_60,k3_descriptors_Kmer_61,k3_descriptors_Kmer_62,k3_descriptors_Kmer_63,k3_descriptors_Kmer_64
0,CCTGGGGGAGTATTGCGGAGGAAGG,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,6.798603,1,0,3,1,0,1,1,...,0,0,0,1,1,0,0,0,1,0
1,CTCAGTTCGGGACGACGGCAAGGTAACGTATGGGACCTTGGCACGA...,CN1COCN(CC2=CN=C(Cl)S2)\C1=N/[N+]([O-])=O,6.262569,3,5,3,1,3,3,6,...,1,0,0,0,2,0,0,1,1,0
2,TAGGGAAGAGAAGGACATATGATCTGCGTTTATCTCCGCTCGTTAA...,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...,6.970616,4,7,6,8,5,2,3,...,1,3,5,1,0,0,2,1,2,1
3,TAGGGAAGAGAAGGACATATGATGTCGCGCCAGCCTTCTGCGTTGA...,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...,6.478862,4,6,7,5,4,5,4,...,1,2,6,1,0,1,1,2,3,0
4,TAGGGAAGAGAAGGACATATGATAGTTTGAGACCTTTGCAGTCTTT...,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...,7.744727,5,8,9,6,7,2,1,...,0,1,5,1,0,0,0,2,4,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421,ATACCAGCTTATTCAATTCAGGAAGACAACTCCGACTAGAATTGAT...,CC1([C@@H](N2[C@H](S1)[C@@H](C2=O)NC(=O)[C@@H]...,7.259637,10,4,8,10,6,2,3,...,1,2,1,2,1,1,1,3,1,0
422,TCCAGCACTCCACGCATAACGAATTGTGCTCAATGCGCCCCTGCAG...,C1=CC=C(C(=C1)C(C2=CC=C(C=C2)Cl)C(Cl)(Cl)Cl)Cl,6.384787,6,4,2,6,6,5,6,...,0,0,2,5,1,3,1,0,2,1
423,GCTGTGTGACTCCTGCAAACTACTGGTGTTCCTTGCCCTGTTTCAA...,C[C@@H]1C[C@@H]([C@@H]2[C@H](C[C@H]([C@@](O2)(...,7.248336,5,3,1,3,3,6,2,...,1,2,1,3,1,8,0,2,2,1
424,CGTTAGACG,CC1=CC(=NC(=N1)NS(=O)(=O)C2=CC=C(C=C2)N)C,7.609065,0,1,1,0,0,0,2,...,0,0,0,0,0,0,1,0,0,0


**Напишем функцию для создания колонок с дескрипторами для молекул, заданных в колонке `smiles`, при помощи библиотеки RDKit**

In [None]:
def calculate_descriptors(smiles):
    """Compute every descriptor in ``Descriptors.descList`` for one SMILES string.

    Args:
        smiles: SMILES string passed to ``Chem.MolFromSmiles``.

    Returns:
        A dict mapping each descriptor name to the value its function returns
        for the parsed molecule, or ``None`` when ``Chem.MolFromSmiles``
        returns ``None`` for the input.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        # descList is iterated as (name, function) pairs; evaluate each one.
        return {name: func(mol) for name, func in Descriptors.descList}
    return None

# One descriptor record (dict) per row, computed from the SMILES column.
df_descriptors = df['smiles'].apply(calculate_descriptors)

# calculate_descriptors returns None when a SMILES string cannot be parsed.
# A None among dict records breaks DataFrame construction, so substitute an
# empty record: that row then simply gets NaN in every descriptor column.
descriptor_records = [record if record is not None else {} for record in df_descriptors]
descriptors_df = pd.DataFrame(descriptor_records, index=df.index)

# Drop descriptor columns left over from a previous run of this cell, so that
# re-running it replaces them instead of appending a duplicate set.
df = pd.concat(
    [df.drop(columns=descriptors_df.columns, errors='ignore'), descriptors_df],
    axis=1,
)

df

Unnamed: 0,aptamer,smiles,pKd,k2_descriptors_Kmer_1,k2_descriptors_Kmer_2,k2_descriptors_Kmer_3,k2_descriptors_Kmer_4,k2_descriptors_Kmer_5,k2_descriptors_Kmer_6,k2_descriptors_Kmer_7,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,CCTGGGGGAGTATTGCGGAGGAAGG,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)OP(=O)(O...,6.798603,1,0,3,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,CTCAGTTCGGGACGACGGCAAGGTAACGTATGGGACCTTGGCACGA...,CN1COCN(CC2=CN=C(Cl)S2)\C1=N/[N+]([O-])=O,6.262569,3,5,3,1,3,3,6,...,0,0,0,0,0,1,0,0,0,0
2,TAGGGAAGAGAAGGACATATGATCTGCGTTTATCTCCGCTCGTTAA...,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...,6.970616,4,7,6,8,5,2,3,...,0,0,0,0,0,0,0,0,0,0
3,TAGGGAAGAGAAGGACATATGATGTCGCGCCAGCCTTCTGCGTTGA...,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...,6.478862,4,6,7,5,4,5,4,...,0,0,0,0,0,0,0,0,0,0
4,TAGGGAAGAGAAGGACATATGATAGTTTGAGACCTTTGCAGTCTTT...,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...,7.744727,5,8,9,6,7,2,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421,ATACCAGCTTATTCAATTCAGGAAGACAACTCCGACTAGAATTGAT...,CC1([C@@H](N2[C@H](S1)[C@@H](C2=O)NC(=O)[C@@H]...,7.259637,10,4,8,10,6,2,3,...,1,0,0,0,0,0,0,0,0,2
422,TCCAGCACTCCACGCATAACGAATTGTGCTCAATGCGCCCCTGCAG...,C1=CC=C(C(=C1)C(C2=CC=C(C=C2)Cl)C(Cl)(Cl)Cl)Cl,6.384787,6,4,2,6,6,5,6,...,0,0,0,0,0,0,0,0,0,0
423,GCTGTGTGACTCCTGCAAACTACTGGTGTTCCTTGCCCTGTTTCAA...,C[C@@H]1C[C@@H]([C@@H]2[C@H](C[C@H]([C@@](O2)(...,7.248336,5,3,1,3,3,6,2,...,0,0,0,0,0,0,0,0,0,0
424,CGTTAGACG,CC1=CC(=NC(=N1)NS(=O)(=O)C2=CC=C(C=C2)N)C,7.609065,0,1,1,0,0,0,2,...,0,1,0,0,0,0,0,0,0,0


**Посмотрим, нет ли в нашем получившемся датафрейме NaN-значений**

In [None]:
# Count the NaN values in every column, then show only the columns that have at least one.
NaN_values = df.isna().sum()
NaN_values.loc[NaN_values.gt(0)]


Unnamed: 0,0
BCUT2D_MWHI,13
BCUT2D_MWLOW,13
BCUT2D_CHGHI,13
BCUT2D_CHGLO,13
BCUT2D_LOGPHI,13
BCUT2D_LOGPLOW,13
BCUT2D_MRHI,13
BCUT2D_MRLOW,13


In [None]:
# Write the frame with all descriptor columns to Drive, without the row index.
output_csv_path = '/content/drive/MyDrive/SWW/аптамеры_df_descr.csv'
df.to_csv(output_csv_path, index=False)
