In [12]:
import pandas as pd
import numpy as np
import math

from tqdm import tqdm
import time

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.autograd import Variable

import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import confusion_matrix

In [13]:
RANDOM_SEED = 13022022

In [14]:
new_df = pd.read_csv('df_first_draft.csv')

In [15]:
new_df.sample()

Unnamed: 0,Smiles,Active,train,len_smiles,C,O,c,1,2,[,...,7,8,e,A,K,M,g,i,L,9
3030,COc1cn(-c2ccc(-n3cccn3)cc2F)nc(-c2ccnn2-c2cccc...,0.0,1,53,1,2,22,2,6,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
train_new = new_df[new_df['train'] == 1]
test_new =new_df[new_df['train'] == 0]

In [17]:
X = train_new.drop(['Active', 'train', 'Smiles'], axis = 1)
y = train_new['Active']

In [18]:
X_test = test_new.drop(['Active', 'train', 'Smiles'], axis = 1)

In [19]:
X.sample()

Unnamed: 0,len_smiles,C,O,c,1,2,[,n,H,],...,7,8,e,A,K,M,g,i,L,9
358,40,13,0,11,2,2,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
features = np.array(X)

In [21]:
test_features = np.array(X_test)

In [22]:
x_test = torch.tensor(test_features)
x_test = x_test.type(torch.FloatTensor)
x_test.size()

torch.Size([1614, 46])

In [23]:
x_train = torch.tensor(features)
x_train = x_train.type(torch.FloatTensor)
x_train.size()

torch.Size([5557, 46])

In [24]:
x_labels = torch.tensor(y)
x_labels = x_labels.type(torch.FloatTensor)
x_labels.size()

torch.Size([5557])

In [221]:
nX = 46
nH1 = 23
nH2 = 5
nY = 1
 
class TwoLayersNet(nn.Module):
    def __init__(self, nX, nH1, nH2, nY):        
        super(TwoLayersNet, self).__init__()     # конструктор предка с этим именем
         
        self.fc1 = nn.Linear(nX, nH1)             # создаём параметры модели
        self.fc2 = nn.Linear(nH1, nH2)             # создаём параметры модели
        self.fc3 = nn.Linear(nH2, nY)             # в полносвязных слоях
          
    def forward(self, x):                        # задаётся прямой проход
        x = self.fc1(x)                          # выход первого слоя
        x = nn.Sigmoid()(x)                      # пропускаем через Sigmoid
        x = self.fc2(x)                          # выход второго слоя
        x = nn.Sigmoid()(x)                      # пропускаем через сигмоид 
        x = self.fc3(x)                          # выход второго слоя
        x = nn.Sigmoid()(x)

        return x
          
model = TwoLayersNet(nX, nH1, nH2, nY)

In [222]:
model = TwoLayersNet(nX, nH1, nH2, nY)                           # экземпляр сети        
 
loss      = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(),          # параметры модели
                            lr=0.5, momentum=0.8) 

In [223]:
def fit(model, x_train,x_labels, batch_size=100, train=True):    
    model.train(train)                                 # важно для Dropout, BatchNorm
    sumL, sumA, numB = 0, 0, int( len(x_train)/batch_size )  # ошибка, точность, батчей
       
    for i in range(0, numB*batch_size, batch_size):          
        xb = x_train[i: i+batch_size]                          # текущий батч,
        yb = x_labels[i: i+batch_size]                          # X,Y - torch тензоры
              
        y = model(xb)[:,0]                                    # прямое распространение

        L = loss(y, yb)                                  # вычисляем ошибку
  
        if train:                                        # в режиме обучения
            optimizer.zero_grad()                        # обнуляем градиенты        
            L.backward()                                 # вычисляем градиенты            
            optimizer.step()                             # подправляем параметры
                                     
        sumL += L.item()                                 # суммарная ошибка (item из графа)
        sumA += (y.round() == yb).float().mean()         # точность определения класса
         
    return sumL/numB,  sumA/numB 

In [224]:
print( "before:      loss: %.4f accuracy: %.4f" %  fit(model, x_train,x_labels, train=False) )
 
epochs = 1                                            # число эпох
for epoch in tqdm(range(epochs)):                              # эпоха - проход по всем примерам
    L,A = fit(model, x_train, x_labels)                               # одна эпоха
     
    if epoch % 500 == 0 or epoch == epochs-1:                 
        print(f'epoch: {epoch:5d} loss: {L:.4f} accuracy: {A:.4f}' ) 

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 12.46it/s]

before:      loss: 0.6447 accuracy: 0.9629
epoch:     0 loss: 0.1783 accuracy: 0.9629





In [225]:
y_pred = model(x_test)

In [226]:
y_pred = y_pred[:,0]

In [227]:
y_pred

tensor([0.0446, 0.0448, 0.0447,  ..., 0.0446, 0.0448, 0.0449],
       grad_fn=<SelectBackward0>)

In [164]:
test = pd.read_csv('Task/test.csv')
test.drop(['Unnamed: 0'], axis = 1, inplace = True)

In [228]:
test['Active'] = y_pred.detach().numpy()

In [229]:
test.sample()

Unnamed: 0,Smiles,Active
293,CCC[C@@H]1C[C@@H](C(=O)N[C@H]([C@H](C)Cl)[C@H]...,0.044618


In [231]:
test['Active'] = test['Active'].apply(lambda x: x*10000-446)

In [216]:
test['Active'] = test['Active'].apply(lambda x: int(abs(round(x, 0))))

In [210]:
test['Active'] = test['Active'].apply(lambda x: 1 if x > 1 else x)

In [230]:
test['Active'].value_counts()

0.044614    10
0.044614     7
0.044616     3
0.044671     3
0.044614     3
            ..
0.044639     1
0.044751     1
0.044474     1
0.044639     1
0.044678     1
Name: Active, Length: 1520, dtype: int64

In [212]:
test.sample(5)

Unnamed: 0,Smiles,Active
1377,NC(N)=NS(=O)(=O)c1ccc(N)cc1,1
1409,CN(C)CC[C@@H](c1ccc(Cl)cc1)c1ccccn1.O=C(O)/C=C...,0
563,C/C(=C(\CCOP(=O)(O)O)SC(=O)c1ccccc1)N(C=O)Cc1c...,0
1571,CC[C@@H](NC(=O)c1cc(C(=O)N2CCC[C@@H]2C)n2c1COC...,0
1552,CC(C)(C)c1cc(C(C)(C)C)c2c(c1)C(O)(C(F)(F)F)C(=...,1


In [213]:
test.to_csv('test2.csv', index=False)