In [90]:
import pandas as pd
import numpy as np
import math

from tqdm import tqdm
import time

from sklearn import model_selection, datasets, metrics, tree 

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.autograd import Variable

import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import confusion_matrix

In [91]:
RANDOM_SEED = 2022022

In [92]:
test = pd.read_csv('Task/test.csv')
test.drop(['Unnamed: 0'], axis = 1, inplace = True)

In [93]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1614 entries, 0 to 1613
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Smiles  1614 non-null   object
dtypes: object(1)
memory usage: 12.7+ KB


In [94]:
test.sample(3)

Unnamed: 0,Smiles
250,C=C1/C(=C\C=C2/CCC[C@@]3(C)[C@H]2CC[C@@H]3[C@H...
326,FC(F)(F)c1c(N2CCC(c3ccccc3)CC2)ccn2c(CC3CC3)nnc12
1346,O=C(C[S+]([O-])Cc1ccco1)NC/C=C\COc1cc(CN2CCCCC...


In [95]:
train = pd.read_csv('Task/train.csv')
train.drop(['Unnamed: 0'], axis = 1, inplace = True)

In [96]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5557 entries, 0 to 5556
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Smiles  5557 non-null   object
 1   Active  5557 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 49.0+ KB


In [97]:
train.sample(3)

Unnamed: 0,Smiles,Active
4468,CC(=O)OC1CN2CCC1CC2,False
4372,Cl.Nc1c2c(nc3ccccc13)CCCC2,False
47,CC[C@H](CC[C@@H](C)[C@H]1CC[C@H]2[C@@H]3CC=C4C...,False


In [98]:
list(train['Active'].unique())

[False, True]

In [99]:
list_active = {False: 0, True: 1}

In [100]:
train['Active'] = train['Active'].map(list_active)

In [101]:
train.sample()

Unnamed: 0,Smiles,Active
914,Nc1nc(Nc2ccc3c(c2)CC[C@@H](N2CCCC2)CC3)nn1-c1c...,0


In [102]:
#train['text'] = train['Smiles'].apply(lambda x: list(x))

In [103]:
#train.set_index(['Smiles'], inplace = True)

In [104]:
train.sample()

Unnamed: 0,Smiles,Active
1069,Nc1nc(O)c2ncn(COC(CO)CO)c2n1,0


In [105]:
train['train'] = 1
test['train'] = 0

In [106]:
df = pd.concat([train, test], ignore_index=True)

In [107]:
from collections import Counter

atoms = Counter()
for i in df['Smiles'].unique():
    for j in i:
        atoms[j] += 1

In [108]:
atoms

Counter({'C': 67272,
         'O': 23771,
         'c': 70791,
         '1': 19434,
         '2': 14166,
         '[': 11223,
         'n': 7749,
         'H': 8613,
         ']': 11223,
         '(': 34223,
         'N': 12096,
         ')': 34223,
         '@': 13220,
         '.': 1245,
         'l': 2149,
         '=': 13520,
         'S': 1647,
         '3': 6745,
         '4': 2446,
         'F': 2484,
         '-': 2347,
         's': 551,
         '/': 1517,
         '5': 650,
         'o': 479,
         'a': 161,
         '+': 669,
         '#': 390,
         'I': 174,
         'P': 192,
         'B': 299,
         'r': 281,
         '\\': 373,
         'Z': 6,
         '6': 142,
         '7': 42,
         '8': 22,
         'e': 3,
         'A': 25,
         'K': 11,
         'M': 4,
         'g': 5,
         'i': 10,
         'L': 4,
         '9': 2})

In [109]:
atoms_voc = {}
cont = 0
for i in atoms.keys():
    new_value = len(atoms.keys())+10 - cont
    atoms_voc[i] = new_value
    cont += 1

In [110]:
atoms_voc

{'C': 55,
 'O': 54,
 'c': 53,
 '1': 52,
 '2': 51,
 '[': 50,
 'n': 49,
 'H': 48,
 ']': 47,
 '(': 46,
 'N': 45,
 ')': 44,
 '@': 43,
 '.': 42,
 'l': 41,
 '=': 40,
 'S': 39,
 '3': 38,
 '4': 37,
 'F': 36,
 '-': 35,
 's': 34,
 '/': 33,
 '5': 32,
 'o': 31,
 'a': 30,
 '+': 29,
 '#': 28,
 'I': 27,
 'P': 26,
 'B': 25,
 'r': 24,
 '\\': 23,
 'Z': 22,
 '6': 21,
 '7': 20,
 '8': 19,
 'e': 18,
 'A': 17,
 'K': 16,
 'M': 15,
 'g': 14,
 'i': 13,
 'L': 12,
 '9': 11}

In [111]:
num_list = {}
for i in range(1,10):
    num_list[str(i)] = i

In [112]:
num_list

{'1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7, '8': 8, '9': 9}

In [113]:
for i in range(1,10):
    del atoms_voc[str(i)]

In [114]:
atoms_voc

{'C': 55,
 'O': 54,
 'c': 53,
 '[': 50,
 'n': 49,
 'H': 48,
 ']': 47,
 '(': 46,
 'N': 45,
 ')': 44,
 '@': 43,
 '.': 42,
 'l': 41,
 '=': 40,
 'S': 39,
 'F': 36,
 '-': 35,
 's': 34,
 '/': 33,
 'o': 31,
 'a': 30,
 '+': 29,
 '#': 28,
 'I': 27,
 'P': 26,
 'B': 25,
 'r': 24,
 '\\': 23,
 'Z': 22,
 'e': 18,
 'A': 17,
 'K': 16,
 'M': 15,
 'g': 14,
 'i': 13,
 'L': 12}

In [115]:
df['text'] = df['Smiles'].apply(lambda x: str(x))

In [116]:
df.sample()

Unnamed: 0,Smiles,Active,train,text
2029,CN(C)[C@@H]1C(O)=C(C(N)=O)C(=O)[C@@]2(O)C(O)=C...,0.0,1,CN(C)[C@@H]1C(O)=C(C(N)=O)C(=O)[C@@]2(O)C(O)=C...


In [117]:
for k,v in num_list.items():
    v = str(v)
    df['text'] = df['text'].str.replace(k,v+',')

In [118]:
for k,v in atoms_voc.items():
    v = str(v)
    df['text'] = df['text'].str.replace(k,v+',')

In [119]:
df.sample()

Unnamed: 0,Smiles,Active,train,text
6148,c1coc(CNc2ncnc3[nH]cnc23)c1,,0,"53,1,53,31,53,46,55,45,53,2,49,53,49,53,3,50,4..."


In [120]:
df['text'] = df['text'].apply(lambda x: x[:-1])

In [121]:
df[df.index == 5134]

Unnamed: 0,Smiles,Active,train,text
5134,OCCCCO,0.0,1,545555555554


In [122]:
df['Active'] = df['Active'].fillna(0)

In [123]:
df['Active'] = df['Active'].apply(lambda x: int(x))

In [124]:
df.info()                              

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7171 entries, 0 to 7170
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Smiles  7171 non-null   object
 1   Active  7171 non-null   int64 
 2   train   7171 non-null   int64 
 3   text    7171 non-null   object
dtypes: int64(2), object(2)
memory usage: 224.2+ KB


In [125]:
df['text'] = df['text'].apply(lambda x: x.split(','))

In [126]:
df.sample()

Unnamed: 0,Smiles,Active,train,text
3258,O=C1c2ccccc2C(=O)c2oc(-c3ccccc3)nc21,0,1,"[54, 40, 55, 1, 53, 2, 53, 53, 53, 53, 53, 2, ..."


In [127]:
df[df.index == 5134]

Unnamed: 0,Smiles,Active,train,text
5134,OCCCCO,0,1,"[54, 55, 55, 55, 55, 54]"


In [128]:
df['text'] = df['text'].apply(lambda x: [int(i) for i in x])

In [129]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7171 entries, 0 to 7170
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Smiles  7171 non-null   object
 1   Active  7171 non-null   int64 
 2   train   7171 non-null   int64 
 3   text    7171 non-null   object
dtypes: int64(2), object(2)
memory usage: 224.2+ KB


In [130]:
new_df = df.copy()

In [131]:
new_df.reset_index(inplace = True)
new_df.set_index(['Smiles'], inplace = True)

In [132]:
new_df.sample()

Unnamed: 0_level_0,index,Active,train,text
Smiles,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
O=C(O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,1312,0,1,"[54, 40, 55, 46, 54, 44, 55, 46, 36, 44, 46, 3..."


In [133]:
new_df.drop(['index'], axis = 1, inplace = True)

In [134]:
train_new = new_df[new_df['train'] == 1]
test_new =new_df[new_df['train'] == 0]

In [135]:
X = train_new.drop(['Active', 'train'], axis = 1)
y = train_new['Active']

In [137]:
X.sample()

Unnamed: 0_level_0,text
Smiles,Unnamed: 1_level_1
O=C1CC(c2c[nH]c3ccc(F)cc23)C(=O)N1,"[54, 40, 55, 1, 55, 55, 46, 53, 2, 53, 50, 49,..."


In [89]:
batch_size = 100
train_loader = torch.utils.data.DataLoader(X, batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size)

TypeError: expected str, bytes or os.PathLike object, not DataFrame