In [352]:
import pandas as pd
import numpy as np
import math

from tqdm import tqdm
import time

from sklearn import model_selection, datasets, metrics, tree 

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.autograd import Variable

import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import confusion_matrix

In [353]:
RANDOM_SEED = 2022022

In [354]:
test = pd.read_csv('Task/test.csv')
test.drop(['Unnamed: 0'], axis = 1, inplace = True)

In [355]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1614 entries, 0 to 1613
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Smiles  1614 non-null   object
dtypes: object(1)
memory usage: 12.7+ KB


In [356]:
test.sample(3)

Unnamed: 0,Smiles
174,CN1C(=O)C[C@@H](C(=O)N[C@@H](Cc2c[nH]cn2)C(=O)...
986,CCN(C/C=C/C#CC(C)(C)C)Cc1cccc(OCc2cc(-c3ccsc3)...
1212,CC(C)OC(=O)C(C)NP(=O)(COC(C)Cn1cnc2c(N)ncnc21)...


In [357]:
train = pd.read_csv('Task/train.csv')
train.drop(['Unnamed: 0'], axis = 1, inplace = True)

In [358]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5557 entries, 0 to 5556
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Smiles  5557 non-null   object
 1   Active  5557 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 49.0+ KB


In [359]:
train.sample(3)

Unnamed: 0,Smiles,Active
4215,COc1cc2ncnc(Nc3ccc(F)c(Cl)c3)c2cc1NC(=O)/C=C/C...,False
1741,Cc1cccc(Nc2cc(Cl)nc(SCC(=O)O)n2)c1C,False
439,NC[C@H]1O[C@H](O[C@H]2C(O[C@@H]3O[C@H](CO)[C@@...,False


In [360]:
list(train['Active'].unique())

[False, True]

In [361]:
list_active = {False: 0, True: 1}

In [362]:
train['Active'] = train['Active'].map(list_active)

In [363]:
train.sample()

Unnamed: 0,Smiles,Active
3057,CSCC[C@H](NC(=O)[C@@H](N)Cc1ccccc1)C(=O)N[C@@H...,0


In [364]:
#train['text'] = train['Smiles'].apply(lambda x: list(x))

In [365]:
#train.set_index(['Smiles'], inplace = True)

In [366]:
train.sample()

Unnamed: 0,Smiles,Active
202,CC(C)(C)c1ccc(O)cc1,0


In [367]:
train['train'] = 1
test['train'] = 0

In [368]:
df = pd.concat([train, test], ignore_index=True)

In [369]:
from collections import Counter

atoms = Counter()
for i in df['Smiles'].unique():
    for j in i:
        atoms[j] += 1

In [370]:
atoms_voc = {}
cont = 0
for i in atoms.keys():
    new_value = len(atoms.keys()) - cont
    atoms_voc[i] = new_value
    cont += 1

In [371]:
atoms_voc

{'C': 45,
 'O': 44,
 'c': 43,
 '1': 42,
 '2': 41,
 '[': 40,
 'n': 39,
 'H': 38,
 ']': 37,
 '(': 36,
 'N': 35,
 ')': 34,
 '@': 33,
 '.': 32,
 'l': 31,
 '=': 30,
 'S': 29,
 '3': 28,
 '4': 27,
 'F': 26,
 '-': 25,
 's': 24,
 '/': 23,
 '5': 22,
 'o': 21,
 'a': 20,
 '+': 19,
 '#': 18,
 'I': 17,
 'P': 16,
 'B': 15,
 'r': 14,
 '\\': 13,
 'Z': 12,
 '6': 11,
 '7': 10,
 '8': 9,
 'e': 8,
 'A': 7,
 'K': 6,
 'M': 5,
 'g': 4,
 'i': 3,
 'L': 2,
 '9': 1}

In [372]:
atoms_voc['9']

1

In [381]:
df['text'] = df['Smiles'].apply(lambda x: list(x))

In [384]:
df.sample()

Unnamed: 0,Smiles,Active,train,text
2232,CCCC1(CCC)CCC2(CCN(CCCN(CC)CC)C2)CC1.Cl.Cl,0.0,1,"[C, C, C, C, 1, (, C, C, C, ), C, C, C, 2, (, ..."


In [386]:
for i in tqdm(df['Smiles']):
    for k, v in atoms_voc.items():
        df[df['Smiles'] == i]['text'] = df[df['Smiles'] == i]['text'].replace(k,v)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[df['Smiles'] == i]['text'] = df[df['Smiles'] == i]['text'].replace(k,v)
100%|██████████████████████████████████████████████████████████████████████████████| 7171/7171 [24:35<00:00,  4.86it/s]


In [387]:
df.sample()

Unnamed: 0,Smiles,Active,train,text
1049,CC#C[C@@H](CC(=O)O)c1ccc(OCc2cccc(-c3ccc(C(F)(...,0.0,1,"[C, C, #, C, [, C, @, @, H, ], (, C, C, (, =, ..."


In [191]:
from torchtext.data.functional import simple_space_split
from torchtext.data.functional import numericalize_tokens_from_iterator                                     

In [207]:
ids_iter = numericalize_tokens_from_iterator(atoms_voc, simple_space_split(str(df[df['Smiles'] == 'CCOC(=O)c1ccc(OCCC2CCN(c3ccc(C)nn3)CC2)cc1']['Smiles'])))
stroka = []
for ids in ids_iter:
    print(ids)
    stroka.append([num for num in ids])
    
print(stroka)

<generator object numericalize_tokens_from_iterator.<locals>.<genexpr> at 0x00000289211DEC80>
<generator object numericalize_tokens_from_iterator.<locals>.<genexpr> at 0x000002892156E5F0>
<generator object numericalize_tokens_from_iterator.<locals>.<genexpr> at 0x00000289211DEC80>
<generator object numericalize_tokens_from_iterator.<locals>.<genexpr> at 0x000002892156E5F0>
<generator object numericalize_tokens_from_iterator.<locals>.<genexpr> at 0x00000289211DEC80>
<generator object numericalize_tokens_from_iterator.<locals>.<genexpr> at 0x000002892156E5F0>
<generator object numericalize_tokens_from_iterator.<locals>.<genexpr> at 0x00000289211DEC80>
<generator object numericalize_tokens_from_iterator.<locals>.<genexpr> at 0x000002892156E5F0>
<generator object numericalize_tokens_from_iterator.<locals>.<genexpr> at 0x00000289211DEC80>
<generator object numericalize_tokens_from_iterator.<locals>.<genexpr> at 0x000002892156E5F0>
<generator object numericalize_tokens_from_iterator.<locals>

KeyError: 'm'

In [187]:
for i in df['Smiles']:
    df[df['Smiles'] == i] = numericalize_tokens_from_iterator(atoms_voc, 
                                                              simple_space_split(df[df['Smiles'] == i]['text']))

TypeError: object of type 'generator' has no len()

In [127]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

#vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
#vocab.set_default_index(vocab["<unk>"])

TypeError: 'DataFrame' object is not callable

In [None]:
for i in tqdm(range(len(new_df[4]))):
    new_df.iloc[i] = new_df.iloc[i].map(atoms)

In [25]:
df['len'].unique().max()

707

In [26]:
new_df = pd.DataFrame()
for i in tqdm(range(len(df['1']))):
    e = pd.DataFrame(df.loc[i].explode())
    e.reset_index(drop=True, inplace=True)
    e = e.T
    new_df = pd.concat([new_df, e], ignore_index = True)

100%|██████████████████████████████████████████████████████████████████████████████| 7171/7171 [08:03<00:00, 14.84it/s]


In [27]:
new_df.sample(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,701,702,703,704,705,706,707,708,709,710
3797,CCCn1c(=O)[nH]c2nc(-c3ccc(S(=O)(=O)N4CCN(c5ccc...,0.0,1,72,C,C,C,n,1,c,...,,,,,,,,,,
4736,C=CC(=O)Nc1cccc(Nc2nc(Nc3ccc(NC4CN(CCF)C4)cc3O...,0.0,1,63,C,=,C,C,(,=,...,,,,,,,,,,
4898,Cn1cc(-c2ccncc2)c(-c2ccc(OCc3ccc4ccccc4n3)cc2)n1,0.0,1,48,C,n,1,c,c,(,...,,,,,,,,,,
4824,Cc1cc2c(N3CCCC3)nc(N3CCCC3)nc2n1C,0.0,1,33,C,c,1,c,c,2,...,,,,,,,,,,
5751,CCOc1cc(C=O)ccc1O,,0,17,C,C,O,c,1,c,...,,,,,,,,,,


In [28]:
new_df.fillna(0, inplace = True)

In [29]:
new_df.rename(columns = {0: 'Smiles', 1: 'Active', 2: 'train', 3: 'len'}, inplace = True)
new_df = new_df.astype({"Active": "Int64"})

In [30]:
new_df.set_index(['Smiles', 'Active', 'train', 'len'], inplace = True)

In [31]:
new_df.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,4,5,6,7,8,9,10,11,12,13,...,701,702,703,704,705,706,707,708,709,710
Smiles,Active,train,len,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
C#Cc1cccc(Nc2ncnc3cc4c(cc23)OCCOCCOCCO4)c1,0,1,42,C,#,C,c,1,c,c,c,c,(,...,0,0,0,0,0,0,0,0,0,0
COc1cccc(-c2ccc(NC(=O)C3=C(C(=O)O)CCC3)c(F)c2)c1,0,1,48,C,O,c,1,c,c,c,c,(,-,...,0,0,0,0,0,0,0,0,0,0
CNCc1ccc(-c2[nH]c3cc(F)cc4c3c2CCNC4=O)cc1.O=P(O)(O)O,1,1,52,C,N,C,c,1,c,c,c,(,-,...,0,0,0,0,0,0,0,0,0,0
CC1(C)OC(=O)NC1=O,0,0,17,C,C,1,(,C,),O,C,(,=,...,0,0,0,0,0,0,0,0,0,0
COc1cccc(/C=C2\SC(=O)NC2=O)c1,0,1,29,C,O,c,1,c,c,c,c,(,/,...,0,0,0,0,0,0,0,0,0,0


In [32]:
from tqdm import tqdm
import time

for i in tqdm(range(len(new_df[4]))):
    new_df.iloc[i] = new_df.iloc[i].map(atoms)

100%|█████████████████████████████████████████████████████████████████████████████| 7171/7171 [00:12<00:00, 585.88it/s]


In [33]:
new_df.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,4,5,6,7,8,9,10,11,12,13,...,701,702,703,704,705,706,707,708,709,710
Smiles,Active,train,len,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
CN1CCc2nc(C(=O)N[C@@H]3C[C@@H](C(=O)N(C)C)CC[C@@H]3NC(=O)C(=O)Nc3ccc(Cl)cn3)sc2C1,0,1,81,45,35,42,45,45,43,41,39,43,36,...,0,0,0,0,0,0,0,0,0,0
CCCCC/C=C/C/C=C/C/C=C/C/C=C/CCCC(=O)O,0,1,37,45,45,45,45,45,23,45,30,45,23,...,0,0,0,0,0,0,0,0,0,0
OCCc1ccc(O)cc1,0,1,14,44,45,45,43,42,43,43,43,36,44,...,0,0,0,0,0,0,0,0,0,0
CC(C)(C)C(O)C(Oc1ccc(Cl)cc1)n1cncn1,0,1,35,45,45,36,45,34,36,45,34,45,36,...,0,0,0,0,0,0,0,0,0,0
O=C(C(c1ccccc1)c1ccccc1)N1CCN(CC(O)COc2cccc3ncccc23)CC1.O=C(O)/C=C/C(=O)O,0,1,73,44,30,45,36,45,36,43,42,43,43,...,0,0,0,0,0,0,0,0,0,0


In [35]:
new_df.reset_index(inplace = True)
new_df.set_index(['Smiles'], inplace = True)

In [36]:
new_df.sample()

Unnamed: 0_level_0,Active,train,len,4,5,6,7,8,9,10,...,701,702,703,704,705,706,707,708,709,710
Smiles,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CCCCCCCCCCCCCCCC[n+]1ccn(Cc2ccccc2)c1C.[I-],1,1,43,45,45,45,45,45,45,45,...,0,0,0,0,0,0,0,0,0,0


In [37]:
train_new = new_df[new_df['train'] == 1]
test_new =new_df[new_df['train'] == 0]

In [38]:
X = train_new.drop(['Active', 'train'], axis = 1)
y = train_new['Active']

In [39]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5557 entries, COc1ccc2[nH]cc(CCN)c2c1 to CCCCNc1ccc(C(=O)OCCN(C)C)cc1.Cl
Columns: 708 entries, len to 710
dtypes: int64(1), object(707)
memory usage: 30.1+ MB


In [42]:
batch_size = 100
train_loader = torch.utils.data.DataLoader(train_new, batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(test_new, batch_size=batch_size)

In [49]:
def output_label(Active):
    output_mapping = {
        0: "False",
        1: "True",
           }
    input = (Active.item() if type(Active) == torch.Tensor else Active)
    return output_mapping[input]

In [50]:
class GlobalAi(nn.Module):
    
    def __init__(self):
        super(GlobalAi, self).__init__()
        
        self.layer1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        
        self.layer2 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        
        self.fc1 = nn.Linear(in_features=64*6*6, out_features=200)
        self.drop = nn.Dropout2d(0.25)
        self.fc2 = nn.Linear(in_features=200, out_features=120)
        self.fc3 = nn.Linear(in_features=120, out_features=10)
        
    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.view(out.size(0), -1)
        out = self.fc1(out)
        out = self.drop(out)
        out = self.fc2(out)
        out = self.fc3(out)
        
        return out

In [51]:
model = GlobalAi()
#model.to(device)

error = nn.CrossEntropyLoss()

learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
print(model)

GlobalAi(
  (layer1): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (layer2): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc1): Linear(in_features=2304, out_features=200, bias=True)
  (drop): Dropout2d(p=0.25, inplace=False)
  (fc2): Linear(in_features=200, out_features=120, bias=True)
  (fc3): Linear(in_features=120, out_features=10, bias=True)
)


In [47]:
num_epochs = 5
count = 0
# Lists for visualization of loss and accuracy 
loss_list = []
iteration_list = []
accuracy_list = []

# Lists for knowing classwise accuracy
predictions_list = []
labels_list = []

for epoch in range(num_epochs):
    for Active in train_loader:
        # Transfering images and labels to GPU if available
        #images, labels = images.to(device), labels.to(device)
        #train = images
        
        # Forward pass 
        outputs = model(train)
        loss = error(outputs, Active)
        
        # Initializing a gradient as 0 so there is no mixing of gradient among the batches
        optimizer.zero_grad()
        
        #Propagating the error backward
        loss.backward()
        
        # Optimizing the parameters
        optimizer.step()
    
        count += 1
    
    # Testing the model
    
        if not (count % 50):    # It's same as "if count % 50 == 0"
            total = 0
            correct = 0
        
            for images, labels in test_loader:
                images, labels = images.to(device), labels.to(device)
                labels_list.append(labels)
            
                test = images
            
                outputs = model(test)
            
                predictions = torch.max(outputs, 1)[1].to(device)
                predictions_list.append(predictions)
                correct += (predictions == labels).sum()
            
                total += len(labels)
            
            accuracy = correct * 100 / total
            loss_list.append(loss.data)
            iteration_list.append(count)
            accuracy_list.append(accuracy)
        
        if not (count % 500):
            print(f"Epoch: {epoch}, Iteration: {count}, Loss: {loss.data}, Accuracy: {accuracy:4.2f}")

KeyError: 0

In [582]:
test_new_data = test_new.drop(['Active','train'], axis = 1) 

In [583]:
predictions = clf.predict(test_new_data)

In [584]:
test_new_data['Active'] = predictions

In [585]:
test_new_data.sample(5)

Unnamed: 0_level_0,len,4,5,6,7,8,9,10,11,12,...,702,703,704,705,706,707,708,709,710,Active
Smiles,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CNC(=O)c1ccccc1Nc1nc(Nc2ccc(N3CCOCC3)cc2OC)ncc1Cl,49,45,35,45,36,30,44,34,43,42,...,0,0,0,0,0,0,0,0,0,0
O=S(=O)(N[C@H]1CC[C@@](c2cc(F)ccc2F)(S(=O)(=O)c2ccc(Cl)cc2)CC1)C(F)(F)F,71,44,30,29,36,30,44,34,36,35,...,0,0,0,0,0,0,0,0,0,0
CC1Nc2ccc(Cl)cc2S(=O)(=O)N1,27,45,45,42,35,43,41,43,43,43,...,0,0,0,0,0,0,0,0,0,0
CC(C)(O)CC(=O)O.CC(C)(O)CC(=O)O,31,45,45,36,45,34,36,44,34,45,...,0,0,0,0,0,0,0,0,0,0
O=C(CBr)N1CC([N+](=O)[O-])([N+](=O)[O-])C1,42,44,30,45,36,45,15,14,34,35,...,0,0,0,0,0,0,0,0,0,0


In [586]:
test_new_data.reset_index(inplace = True)

In [587]:
test_new_data.sample(5)

Unnamed: 0,Smiles,len,4,5,6,7,8,9,10,11,...,702,703,704,705,706,707,708,709,710,Active
1550,NS(=O)(=O)Oc1ccc2c3c(c(=O)oc2c1)CCCCC3,38,35,29,36,30,44,34,36,30,...,0,0,0,0,0,0,0,0,0,0
634,COc1ccc(C(=O)CCC(=O)O)c2ccccc12,31,45,44,43,42,43,43,43,36,...,0,0,0,0,0,0,0,0,0,0
1363,OC(CN1CCC(Cc2ccc(F)cc2)CC1)c1ccc(Cl)cc1,39,44,45,36,45,35,42,45,45,...,0,0,0,0,0,0,0,0,0,0
628,O=c1cc[nH]c(=O)[nH]1,20,44,30,43,42,43,43,40,39,...,0,0,0,0,0,0,0,0,0,0
1179,CSc1nc2ccc3nc(NC(=O)C(c4ccccc4)c4ccccc4)sc3c2s1,47,45,29,43,42,39,43,41,43,...,0,0,0,0,0,0,0,0,0,0


In [588]:
submission = test_new_data[['Smiles', 'Active']]

In [589]:
submission.sample(4)

Unnamed: 0,Smiles,Active
391,NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)([O-])OP(...,0
348,C[C@H](CCCC(C)(C)O)[C@H]1CC[C@H]2[C@@H]3CC=C4C...,0
1139,C[C@]12C[C@H](O)[C@H]3[C@@H](CCC4=CC(=O)CC[C@@...,0
409,COc1cc(C#N)ccc1S(=O)(=O)Nc1ccc2c(c1)cc(C)c(=O)n2C,0


submission['Active'] = submission['Active'].apply(lambda x: int(x))

In [590]:
submission.to_csv('submission_2.csv', index=False)