In [408]:
import pandas as pd
import numpy as np
import math

from tqdm import tqdm
import time

from sklearn import model_selection, datasets, metrics, tree 

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.autograd import Variable

import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import confusion_matrix

In [409]:
# Notebook-wide seed constant (no visible cell passes it to a sampler or model yet).
RANDOM_SEED = 2_022_022

In [410]:
# Read the test molecules and discard the 'Unnamed: 0' column that comes with the CSV.
test = pd.read_csv('Task/test.csv').drop(columns=['Unnamed: 0'])

In [411]:
# Summarize the test frame: row count, columns, non-null counts and dtypes.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1614 entries, 0 to 1613
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Smiles  1614 non-null   object
dtypes: object(1)
memory usage: 12.7+ KB


In [412]:
# Peek at three random test rows (no random_state is passed, so the rows shown vary between runs).
test.sample(3)

Unnamed: 0,Smiles
382,CCc1cc(Nc2nccc(-c3c(-c4ccc(OC)c(C(=O)Nc5c(F)cc...
835,CN1C(=O)N(C)C2C1N(C)C(=O)N2C
1435,O=C1C=C2C=C[C@@H]3C[C@@]2(O1)[C@H]1CCCCN31


In [413]:
# Read the labeled training molecules and discard the 'Unnamed: 0' column that comes with the CSV.
train = pd.read_csv('Task/train.csv').drop(columns=['Unnamed: 0'])

In [414]:
# Summarize the training frame: row count, columns, non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5557 entries, 0 to 5556
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Smiles  5557 non-null   object
 1   Active  5557 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 49.0+ KB


In [415]:
# Peek at three random training rows (unseeded, so the rows shown vary between runs).
train.sample(3)

Unnamed: 0,Smiles,Active
3515,CC[N+](CC)(CCNC(=O)C(=O)NCC[N+](CC)(CC)Cc1cccc...,False
4922,CCn1cc(C(=O)O)c(=O)c2cnc(N3CCCC3)nc21,False
4323,CNC(=O)c1ccccc1Sc1ccc2c(/C=C/c3ccccn3)n[nH]c2c1,False


In [416]:
# List the distinct label values present in the training set.
list(train['Active'].unique())

[False, True]

In [417]:
# Lookup table (a dict, despite the name) from the boolean label to its integer code.
list_active = {flag: int(flag) for flag in (False, True)}

In [418]:
# Replace the boolean activity label with its integer code from list_active.
train = train.assign(Active=train['Active'].map(list_active))

In [419]:
# Spot-check one random row to confirm the label is now an integer.
train.sample()

Unnamed: 0,Smiles,Active
5216,CCc1ccc(C(=O)C(C)CN2CCCCC2)cc1.Cl,0


In [420]:
#train['text'] = train['Smiles'].apply(lambda x: list(x))

In [421]:
#train.set_index(['Smiles'], inplace = True)

In [422]:
# Show one random training row (unseeded, so the row shown varies between runs).
train.sample()

Unnamed: 0,Smiles,Active
2963,CN(C)c1nc(N(C)C)nc(N(C)C)n1,0


In [423]:
# Tag every row with its origin (1 = train, 0 = test) so the two sets can be
# separated again after they are concatenated.
train = train.assign(train=1)
test = test.assign(train=0)

In [424]:
# Stack train on top of test under a fresh 0..n-1 index so both get encoded
# identically; test has no 'Active' column, so its rows hold NaN there.
df = pd.concat((train, test), axis=0, ignore_index=True)

In [425]:
from collections import Counter

# Count how often each single character occurs across the distinct SMILES strings.
# Counter.update on a string tallies it character by character.
atoms = Counter()
for smiles in df['Smiles'].unique():
    atoms.update(smiles)

In [426]:
# Display the per-character counts.
atoms

Counter({'C': 67272,
         'O': 23771,
         'c': 70791,
         '1': 19434,
         '2': 14166,
         '[': 11223,
         'n': 7749,
         'H': 8613,
         ']': 11223,
         '(': 34223,
         'N': 12096,
         ')': 34223,
         '@': 13220,
         '.': 1245,
         'l': 2149,
         '=': 13520,
         'S': 1647,
         '3': 6745,
         '4': 2446,
         'F': 2484,
         '-': 2347,
         's': 551,
         '/': 1517,
         '5': 650,
         'o': 479,
         'a': 161,
         '+': 669,
         '#': 390,
         'I': 174,
         'P': 192,
         'B': 299,
         'r': 281,
         '\\': 373,
         'Z': 6,
         '6': 142,
         '7': 42,
         '8': 22,
         'e': 3,
         'A': 25,
         'K': 11,
         'M': 4,
         'g': 5,
         'i': 10,
         'L': 4,
         '9': 2})

In [427]:
# Give every character an integer code, counting down from len(atoms) + 10 in the
# order the characters were first seen; the lowest code is therefore 11, which
# stays above the single-digit codes 1-9 used in num_list.
highest_code = len(atoms) + 10
atoms_voc = {
    symbol: highest_code - position
    for position, symbol in enumerate(atoms)
}

In [428]:
# Display the character -> code vocabulary.
atoms_voc

{'C': 55,
 'O': 54,
 'c': 53,
 '1': 52,
 '2': 51,
 '[': 50,
 'n': 49,
 'H': 48,
 ']': 47,
 '(': 46,
 'N': 45,
 ')': 44,
 '@': 43,
 '.': 42,
 'l': 41,
 '=': 40,
 'S': 39,
 '3': 38,
 '4': 37,
 'F': 36,
 '-': 35,
 's': 34,
 '/': 33,
 '5': 32,
 'o': 31,
 'a': 30,
 '+': 29,
 '#': 28,
 'I': 27,
 'P': 26,
 'B': 25,
 'r': 24,
 '\\': 23,
 'Z': 22,
 '6': 21,
 '7': 20,
 '8': 19,
 'e': 18,
 'A': 17,
 'K': 16,
 'M': 15,
 'g': 14,
 'i': 13,
 'L': 12,
 '9': 11}

In [429]:
# Map each digit character '1'..'9' to its own integer value (a dict, despite the name).
num_list = {str(digit): digit for digit in range(1, 10)}

In [430]:
# Display the digit -> code mapping.
num_list

{'1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7, '8': 8, '9': 9}

In [431]:
# Digit characters are encoded through num_list, so remove them from the symbol
# vocabulary. pop(..., None) is used instead of `del` so the cell can be re-run
# and does not raise KeyError when a digit never occurs in the corpus.
for digit in map(str, range(1, 10)):
    atoms_voc.pop(digit, None)

In [432]:
# Display the vocabulary again, now without the digit characters.
atoms_voc

{'C': 55,
 'O': 54,
 'c': 53,
 '[': 50,
 'n': 49,
 'H': 48,
 ']': 47,
 '(': 46,
 'N': 45,
 ')': 44,
 '@': 43,
 '.': 42,
 'l': 41,
 '=': 40,
 'S': 39,
 'F': 36,
 '-': 35,
 's': 34,
 '/': 33,
 'o': 31,
 'a': 30,
 '+': 29,
 '#': 28,
 'I': 27,
 'P': 26,
 'B': 25,
 'r': 24,
 '\\': 23,
 'Z': 22,
 'e': 18,
 'A': 17,
 'K': 16,
 'M': 15,
 'g': 14,
 'i': 13,
 'L': 12}

In [433]:
# Start the encoded column as a plain-string copy of the SMILES column.
df['text'] = df['Smiles'].map(str)

In [434]:
# Show one random row of the combined frame (unseeded, so the row shown varies between runs).
df.sample()

Unnamed: 0,Smiles,Active,train,text
3626,CC(C)C[C@H](NP(=O)(O)O[C@@H]1O[C@@H](C)[C@H](O...,0.0,1,CC(C)C[C@H](NP(=O)(O)O[C@@H]1O[C@@H](C)[C@H](O...


In [435]:
# Rewrite every digit character as "<code>," so the string can later be split on commas.
# Digits are handled before the symbols because the symbol codes themselves contain digits.
for digit_char, digit_code in num_list.items():
    df['text'] = df['text'].str.replace(digit_char, f'{digit_code},')

In [436]:
# Rewrite every remaining character as "<code>,".
# Several keys ('[', ']', '(', ')', '.', '+', '\\') are regex metacharacters, and
# the default of `regex` in Series.str.replace differs between pandas versions
# (True before 2.0, False from 2.0 on). Pinning regex=False makes the literal
# substitution explicit and avoids the version-dependent behaviour and warnings.
for symbol, code in atoms_voc.items():
    df['text'] = df['text'].str.replace(symbol, f'{code},', regex=False)

In [437]:
# Spot-check one random row to see the comma-separated code string in 'text'.
df.sample()

Unnamed: 0,Smiles,Active,train,text
1570,COCCCOc1cc(C[C@@H](C[C@H](N)[C@@H](O)C[C@H](C(...,0.0,1,"55,54,55,55,55,54,53,1,53,53,46,55,50,55,43,43..."


In [438]:
# Rows without a label (the test rows, which have no 'Active' value) get the
# placeholder 0 so the column can be cast to integers.
df.loc[df['Active'].isna(), 'Active'] = 0

In [439]:
# Convert each label value to a Python int, element by element.
df['Active'] = df['Active'].map(int)

In [440]:
# Confirm 'Active' is now a fully populated int64 column.
df.info()                              

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7171 entries, 0 to 7170
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Smiles  7171 non-null   object
 1   Active  7171 non-null   int64 
 2   train   7171 non-null   int64 
 3   text    7171 non-null   object
dtypes: int64(2), object(2)
memory usage: 224.2+ KB


In [441]:
# Break each comma-separated code string into a list of code strings.
# Every code was written with a trailing comma, so the last item is an empty string.
df['text'] = df['text'].str.split(',')

In [446]:
# Remove the empty string that split(',') leaves after the final comma.
# The slice is applied only when the last token really is '', so re-running this
# cell (or running it after the int conversion) can no longer drop a real token.
df['text'] = df['text'].apply(
    lambda tokens: tokens[:-1] if tokens and tokens[-1] == '' else tokens
)

In [447]:
# Spot-check one random row to see 'text' as a list of codes.
df.sample()

Unnamed: 0,Smiles,Active,train,text
5394,Nc1ccn([C@H]2CO[C@@H](CO)S2)c(=O)n1,0,1,"[45, 53, 1, 53, 53, 49, 46, 50, 55, 43, 48, 47..."


In [448]:
# Show the row whose index label is 5134 to compare its SMILES with its encoding.
df[df.index == 5134]

Unnamed: 0,Smiles,Active,train,text
5134,OCCCCO,0,1,"[54, 55, 55, 55, 55, 54]"


In [449]:
# Convert every code in each row's list from string to int.
df['text'] = df['text'].apply(lambda tokens: list(map(int, tokens)))

In [450]:
# Re-check the frame summary after the encoding steps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7171 entries, 0 to 7170
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Smiles  7171 non-null   object
 1   Active  7171 non-null   int64 
 2   train   7171 non-null   int64 
 3   text    7171 non-null   object
dtypes: int64(2), object(2)
memory usage: 224.2+ KB


In [451]:
# Work on an independent copy so the re-indexing below leaves df untouched.
new_df = df.copy(deep=True)

In [452]:
# Move the current index into a regular 'index' column, then index the frame by
# the SMILES string instead.
new_df = new_df.reset_index().set_index(['Smiles'])

In [455]:
# Show one random row of the SMILES-indexed frame (unseeded).
new_df.sample()

Unnamed: 0_level_0,Active,train,text
Smiles,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
COc1cc(OC)c(OC)cc1C=O,0,1,"[55, 54, 53, 1, 53, 53, 46, 54, 55, 44, 53, 46..."


In [454]:
# Discard the 'index' column that reset_index() created.
new_df = new_df.drop(columns=['index'])

In [456]:
# Split the encoded frame back into its train and test parts using the origin flag.
train_new = new_df.loc[new_df['train'].eq(1)]
test_new = new_df.loc[new_df['train'].eq(0)]

In [457]:
# Separate the predictors (everything except the label and the origin flag) from the label.
X = train_new.drop(columns=['Active', 'train'])
y = train_new.loc[:, 'Active']

In [463]:
# Show one random row of the predictor frame (unseeded).
X.sample()

Unnamed: 0_level_0,text
Smiles,Unnamed: 1_level_1
O=C1NC(=O)c2c1c1c3ccccc3[nH]c1c1[nH]c3ccccc3c21,"[54, 40, 55, 1, 45, 55, 46, 40, 54, 44, 53, 2,..."


In [464]:
# Pull the per-molecule code lists out as a NumPy object array (one list per element;
# the lists have different lengths, so the array stays 1-D).
features = X['text'].to_numpy(copy=True)

In [465]:
# Display the array of encoded sequences.
features

array([list([55, 54, 53, 1, 53, 53, 53, 2, 50, 49, 48, 47, 53, 53, 46, 55, 55, 45, 44, 53, 2, 53, 1]),
       list([55, 55, 55, 45, 1, 55, 55, 55, 50, 55, 43, 48, 47, 46, 53, 2, 53, 53, 53, 53, 46, 54, 44, 53, 2, 44, 55, 1, 42, 55, 41]),
       list([54, 40, 55, 46, 45, 54, 44, 53, 1, 53, 49, 53, 46, 45, 2, 55, 55, 45, 46, 39, 46, 40, 54, 44, 46, 40, 54, 44, 53, 3, 53, 53, 53, 4, 53, 53, 53, 53, 53, 4, 53, 3, 44, 55, 55, 2, 44, 49, 53, 1]),
       ...,
       list([54, 53, 1, 53, 53, 53, 53, 2, 53, 53, 53, 49, 53, 1, 2]),
       list([54, 55, 46, 53, 1, 53, 53, 53, 46, 35, 53, 2, 53, 53, 53, 46, 55, 45, 3, 55, 55, 45, 46, 55, 53, 4, 53, 53, 49, 53, 53, 4, 44, 55, 55, 3, 44, 53, 53, 2, 44, 53, 46, 36, 44, 53, 1, 44, 46, 55, 46, 36, 44, 46, 36, 44, 36, 44, 55, 46, 36, 44, 46, 36, 44, 36]),
       list([55, 55, 55, 55, 45, 53, 1, 53, 53, 53, 46, 55, 46, 40, 54, 44, 54, 55, 55, 45, 46, 55, 44, 55, 44, 53, 53, 1, 42, 55, 41])],
      dtype=object)

In [461]:
import tensorflow as tf

AttributeError: type object 'h5py.h5.H5PYConfig' has no attribute '__reduce_cython__'

In [None]:
# Keras MNIST template; it does not use the SMILES data from the cells above.
# NOTE(review): the recorded `import tensorflow as tf` cell failed with an
# AttributeError, so this cell cannot run until that environment issue is fixed.

# `mnist` was referenced without ever being bound; take it from the tf namespace.
mnist = tf.keras.datasets.mnist

# Load the train/test split and divide by 255.0 to rescale the inputs.
(x_train, y_train),(x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

# Flatten each 28x28 input, one hidden ReLU layer with dropout, 10-way softmax output.
model = tf.keras.models.Sequential([
  tf.keras.layers.Flatten(input_shape=(28, 28)),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(10, activation='softmax')
])

# Sparse categorical cross-entropy: the labels are passed as given, not one-hot encoded here.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train for five epochs, then report loss and accuracy on the held-out split.
model.fit(x_train, y_train, epochs=5)
model.evaluate(x_test, y_test)

In [582]:
# Keep only the predictor columns of the test part (drop the placeholder label and the origin flag).
test_new_data = test_new.drop(columns=['Active', 'train'])

In [583]:
# Predict activity labels for the test rows.
# NOTE(review): `clf` is not defined in any visible cell, and the recorded output
# below shows columns ('len', '4', '5', ...) that no visible cell creates —
# presumably this relies on cells missing from this export; confirm before re-running.
predictions = clf.predict(test_new_data)

In [584]:
# Attach the predicted labels to the test rows as the 'Active' column.
test_new_data = test_new_data.assign(Active=predictions)

In [585]:
# Peek at five random test rows together with their predicted label (unseeded).
test_new_data.sample(5)

Unnamed: 0_level_0,len,4,5,6,7,8,9,10,11,12,...,702,703,704,705,706,707,708,709,710,Active
Smiles,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CNC(=O)c1ccccc1Nc1nc(Nc2ccc(N3CCOCC3)cc2OC)ncc1Cl,49,45,35,45,36,30,44,34,43,42,...,0,0,0,0,0,0,0,0,0,0
O=S(=O)(N[C@H]1CC[C@@](c2cc(F)ccc2F)(S(=O)(=O)c2ccc(Cl)cc2)CC1)C(F)(F)F,71,44,30,29,36,30,44,34,36,35,...,0,0,0,0,0,0,0,0,0,0
CC1Nc2ccc(Cl)cc2S(=O)(=O)N1,27,45,45,42,35,43,41,43,43,43,...,0,0,0,0,0,0,0,0,0,0
CC(C)(O)CC(=O)O.CC(C)(O)CC(=O)O,31,45,45,36,45,34,36,44,34,45,...,0,0,0,0,0,0,0,0,0,0
O=C(CBr)N1CC([N+](=O)[O-])([N+](=O)[O-])C1,42,44,30,45,36,45,15,14,34,35,...,0,0,0,0,0,0,0,0,0,0


In [586]:
# Turn the SMILES index back into an ordinary 'Smiles' column for the submission file.
test_new_data = test_new_data.reset_index()

In [587]:
# Peek at five random rows after the index reset; 'Smiles' is a column again (unseeded).
test_new_data.sample(5)

Unnamed: 0,Smiles,len,4,5,6,7,8,9,10,11,...,702,703,704,705,706,707,708,709,710,Active
1550,NS(=O)(=O)Oc1ccc2c3c(c(=O)oc2c1)CCCCC3,38,35,29,36,30,44,34,36,30,...,0,0,0,0,0,0,0,0,0,0
634,COc1ccc(C(=O)CCC(=O)O)c2ccccc12,31,45,44,43,42,43,43,43,36,...,0,0,0,0,0,0,0,0,0,0
1363,OC(CN1CCC(Cc2ccc(F)cc2)CC1)c1ccc(Cl)cc1,39,44,45,36,45,35,42,45,45,...,0,0,0,0,0,0,0,0,0,0
628,O=c1cc[nH]c(=O)[nH]1,20,44,30,43,42,43,43,40,39,...,0,0,0,0,0,0,0,0,0,0
1179,CSc1nc2ccc3nc(NC(=O)C(c4ccccc4)c4ccccc4)sc3c2s1,47,45,29,43,42,39,43,41,43,...,0,0,0,0,0,0,0,0,0,0


In [588]:
# Keep only the two columns needed for the submission.
# .copy() makes this an independent frame, so the later assignment to
# submission['Active'] does not trigger pandas' SettingWithCopyWarning.
submission = test_new_data[['Smiles', 'Active']].copy()

In [589]:
# Peek at four random submission rows (unseeded).
submission.sample(4)

Unnamed: 0,Smiles,Active
391,NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)([O-])OP(...,0
348,C[C@H](CCCC(C)(C)O)[C@H]1CC[C@H]2[C@@H]3CC=C4C...,0
1139,C[C@]12C[C@H](O)[C@H]3[C@@H](CCC4=CC(=O)CC[C@@...,0
409,COc1cc(C#N)ccc1S(=O)(=O)Nc1ccc2c(c1)cc(C)c(=O)n2C,0


# Convert each predicted label to a Python int before writing the file.
submission['Active'] = submission['Active'].map(int)

In [590]:
# Write the submission as CSV; index=False leaves the row index out of the file.
submission.to_csv('submission_2.csv', index=False)