In [525]:
import pandas as pd
import numpy as np
import math

from tqdm import tqdm
import time

In [526]:
RANDOM_SEED = 13022022

In [527]:
test = pd.read_csv('Task/test.csv')
test.drop(['Unnamed: 0'], axis = 1, inplace = True)

In [528]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1614 entries, 0 to 1613
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Smiles  1614 non-null   object
dtypes: object(1)
memory usage: 12.7+ KB


In [529]:
test.sample(3)

Unnamed: 0,Smiles
172,Nc1ncc2c(n1)CC(c1ccco1)CC2=O
684,COc1ccc(S(=O)(=O)N(C(C)=O)c2ccccc2/C=C/c2cc[n+...
479,CC(=O)C1C(=O)C=C(C)OC1=O


In [530]:
train = pd.read_csv('Task/train.csv')
train.drop(['Unnamed: 0'], axis = 1, inplace = True)

In [531]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5557 entries, 0 to 5556
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Smiles  5557 non-null   object
 1   Active  5557 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 49.0+ KB


In [532]:
train.sample(3)

Unnamed: 0,Smiles,Active
1472,CCC1(C2=NCCN2)Cc2ccccc2O1.Cl,False
3365,N#Cc1cc2nc(O)c(O)nc2cc1[N+](=O)[O-],False
3833,C[C@@]1(O)[C@H](O)[C@@H](CO)O[C@H]1n1ccc(N)nc1=O,False


In [533]:
list(train['Active'].unique())

[False, True]

In [534]:
list_active = {False: 0, True: 1}

In [535]:
train['Active'] = train['Active'].map(list_active)

In [536]:
train.sample()

Unnamed: 0,Smiles,Active
3424,COCC(=O)NC/C=C/c1ccc2ncnc(Nc3ccc(Oc4ccc(C)nc4)...,0


In [537]:
train['train'] = 1
test['train'] = 0

In [538]:
df = pd.concat([train, test], ignore_index=True)

In [539]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7171 entries, 0 to 7170
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Smiles  7171 non-null   object 
 1   Active  5557 non-null   float64
 2   train   7171 non-null   int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 168.2+ KB


In [540]:
df['train'].unique()

array([1, 0], dtype=int64)

In [541]:
df[df['train'] == 1]['Active'].value_counts()

0.00    5351
1.00     206
Name: Active, dtype: int64

In [542]:
from collections import Counter

atoms = Counter()
for i in df['Smiles'].unique():
    for j in i:
        atoms[j] += 1

In [543]:
len(atoms.keys())

45

In [544]:
cont = 0
for i in atoms.keys():
    new_value = len(atoms.keys()) - cont
    atoms[i] = new_value
    cont += 1

In [545]:
atoms

Counter({'C': 45,
         'O': 44,
         'c': 43,
         '1': 42,
         '2': 41,
         '[': 40,
         'n': 39,
         'H': 38,
         ']': 37,
         '(': 36,
         'N': 35,
         ')': 34,
         '@': 33,
         '.': 32,
         'l': 31,
         '=': 30,
         'S': 29,
         '3': 28,
         '4': 27,
         'F': 26,
         '-': 25,
         's': 24,
         '/': 23,
         '5': 22,
         'o': 21,
         'a': 20,
         '+': 19,
         '#': 18,
         'I': 17,
         'P': 16,
         'B': 15,
         'r': 14,
         '\\': 13,
         'Z': 12,
         '6': 11,
         '7': 10,
         '8': 9,
         'e': 8,
         'A': 7,
         'K': 6,
         'M': 5,
         'g': 4,
         'i': 3,
         'L': 2,
         '9': 1})

In [546]:
df['len'] = df['Smiles'].apply(lambda x: len(x))

In [547]:
df['1'] = df['Smiles'].apply(lambda x: list(x))

In [548]:
df.sample(3)

Unnamed: 0,Smiles,Active,train,len,1
812,C[N+]1(C)CCC(OC(=O)C(c2ccccc2)c2ccccc2)CC1.[I-],0.0,1,47,"[C, [, N, +, ], 1, (, C, ), C, C, C, (, O, C, ..."
1595,C1=C2CCCN[C@H]2[C@@H]2C[C@H]1[C@H]1CCCCN1C2,0.0,1,43,"[C, 1, =, C, 2, C, C, C, N, [, C, @, H, ], 2, ..."
2212,Cc1cc(N)c2ccccc2[n+]1CCCCCCCCCC[n+]1c(C)cc(N)c...,0.0,1,66,"[C, c, 1, c, c, (, N, ), c, 2, c, c, c, c, c, ..."


In [549]:
df['len'].unique().max()

707

In [550]:
new_df = pd.DataFrame()
for i in tqdm(range(len(df['1']))):
    e = pd.DataFrame(df.loc[i].explode())
    e.reset_index(drop=True, inplace=True)
    e = e.T
    new_df = pd.concat([new_df, e], ignore_index = True)

100%|██████████████████████████████████████████████████████████████████████████████| 7171/7171 [06:28<00:00, 18.44it/s]


In [551]:
new_df.sample(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,701,702,703,704,705,706,707,708,709,710
6640,COc1ccc(/C=C2\CCCN=C2c2cccnc2)c(OC)c1,,0,37,C,O,c,1,c,c,...,,,,,,,,,,
3015,C=CC[N@@+]12CC[C@@]34c5ccccc5N5/C=C6/[C@H]7C[C...,0.0,1,155,C,=,C,C,[,N,...,,,,,,,,,,
3395,C[C@@]1(C(=O)O)NCCc2cc(O)c(O)cc21,0.0,1,33,C,[,C,@,@,],...,,,,,,,,,,
7012,CS(=O)(=O)O.OC(CCN1CCCCC1)(c1ccccc1)c1ccccc1,,0,44,C,S,(,=,O,),...,,,,,,,,,,
3750,FC(F)(F)c1cccc(Nc2nc(-c3ccccc3)nc3ncccc23)c1,0.0,1,44,F,C,(,F,),(,...,,,,,,,,,,


In [552]:
new_df.fillna(0, inplace = True)

In [553]:
new_df.rename(columns = {0: 'Smiles', 1: 'Active', 2: 'train', 3: 'len'}, inplace = True)
new_df = new_df.astype({"Active": "Int64"})

In [554]:
new_df.set_index(['Smiles', 'Active', 'train', 'len'], inplace = True)

In [555]:
new_df.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,4,5,6,7,8,9,10,11,12,13,...,701,702,703,704,705,706,707,708,709,710
Smiles,Active,train,len,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
C=C(C)c1cccc(C(C)(C)NC(=O)Nc2ccc3c(c2)OCCO3)c1,0,1,46,C,=,C,(,C,),c,1,c,c,...,0,0,0,0,0,0,0,0,0,0
CC(=O)Nc1ccc(S(=O)(=O)c2ccc(NC(C)=O)cc2)cc1,0,0,43,C,C,(,=,O,),N,c,1,c,...,0,0,0,0,0,0,0,0,0,0
CCn1c(C(=O)N(C2CC2)C2CC2)cc2c3c(ncn3C)c(Nc3cc(C)n(C)n3)nc21,0,1,59,C,C,n,1,c,(,C,(,=,O,...,0,0,0,0,0,0,0,0,0,0
CCc1cccc(N(C)C(=N)Nc2cccc3ccccc23)c1.Cl,0,1,39,C,C,c,1,c,c,c,c,(,N,...,0,0,0,0,0,0,0,0,0,0
CCCCCCCCc1ccc(CCC(N)(CO)CO)cc1.Cl,0,1,33,C,C,C,C,C,C,C,C,c,1,...,0,0,0,0,0,0,0,0,0,0


In [556]:
from tqdm import tqdm
import time

for i in tqdm(range(len(new_df[4]))):
    new_df.iloc[i] = new_df.iloc[i].map(atoms)

100%|█████████████████████████████████████████████████████████████████████████████| 7171/7171 [00:11<00:00, 611.91it/s]


In [557]:
new_df.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,4,5,6,7,8,9,10,11,12,13,...,701,702,703,704,705,706,707,708,709,710
Smiles,Active,train,len,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
Cc1cc(C(C)(C)CC(C)(C)C)ccc1OCCOCC[N+](C)(C)Cc1ccccc1.[Cl-],0,1,58,45,43,42,43,43,36,45,36,45,34,...,0,0,0,0,0,0,0,0,0,0
CCn1c(=O)/c(=C2\Sc3ccccc3N2C)s/c1=C\c1cccc[n+]1CC.[Cl-],0,0,55,45,45,39,42,43,36,30,44,34,23,...,0,0,0,0,0,0,0,0,0,0
COc1cccc(C(=O)Nc2ncc(Cc3cccc(C(F)(F)F)c3)s2)c1,0,1,46,45,44,43,42,43,43,43,43,36,45,...,0,0,0,0,0,0,0,0,0,0
C[C@@H](NCCCc1cccc(C(F)(F)F)c1)c1cccc2ccccc12,0,0,45,45,40,45,33,33,38,37,36,35,45,...,0,0,0,0,0,0,0,0,0,0
C/C(=C\C(=O)Nc1ccccc1C(=O)O)c1ccc2ccccc2c1,0,0,42,45,23,45,36,30,45,13,45,36,30,...,0,0,0,0,0,0,0,0,0,0


In [558]:
new_df.reset_index(inplace = True)
new_df.set_index(['Smiles'], inplace = True)

In [559]:
new_df.sample()

Unnamed: 0_level_0,Active,train,len,4,5,6,7,8,9,10,...,701,702,703,704,705,706,707,708,709,710
Smiles,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cc1cc(O)c(C(=O)N[C@@H](C(=O)N[C@@H]2C(=O)N3C(C(=O)[O-])=C(CSc4nnnn4C)CS[C@H]23)c2ccc(O)cc2)cn1.[Na+],0,0,100,45,43,42,43,43,36,44,...,0,0,0,0,0,0,0,0,0,0


In [560]:
#new_df = new_df.astype({"Active": "Int64"})

In [561]:
train_new = new_df[new_df['train'] == 1]
test_new =new_df[new_df['train'] == 0]

In [562]:
X = train_new.drop(['Active', 'train'], axis = 1)
y = train_new['Active']

In [563]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5557 entries, COc1ccc2[nH]cc(CCN)c2c1 to CCCCNc1ccc(C(=O)OCCN(C)C)cc1.Cl
Columns: 708 entries, len to 710
dtypes: int64(1), object(707)
memory usage: 30.1+ MB


In [564]:
from sklearn import model_selection, datasets, metrics, tree 
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

In [565]:
train_data, test_data, train_labels, test_labels = model_selection.train_test_split(X, y, 
                                                                                    test_size=0.30, random_state=RANDOM_SEED)

In [566]:
from lazypredict.Supervised import LazyClassifier

In [567]:
clf_lc = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf_lc.fit(train_data, test_data, train_labels, test_labels)

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [02:22<00:00,  4.91s/it]


In [568]:
print(models)

Empty DataFrame
Columns: [Accuracy, Balanced Accuracy, ROC AUC, F1 Score, Time Taken]
Index: []


In [569]:
clf = tree.DecisionTreeClassifier()
clf.fit(train_data, train_labels)

DecisionTreeClassifier()

In [570]:
predictions_clf = clf.predict(test_data)
metrics.f1_score(test_labels, predictions_clf)

0.08333333333333333

In [571]:
from sklearn.model_selection import GridSearchCV

parameters = {'max_depth':range(3,20)}
clf = GridSearchCV(tree.DecisionTreeClassifier(), parameters, n_jobs=4)
clf.fit(X=train_data, y=train_labels)
tree_model = clf.best_estimator_
print (clf.best_score_, clf.best_params_)

0.9614299940778089 {'max_depth': 4}


In [572]:
clf_4 = tree.DecisionTreeClassifier(max_depth = 4)
clf_4.fit(train_data, train_labels)

DecisionTreeClassifier(max_depth=4)

In [573]:
predictions_clf = clf_4.predict(test_data)
metrics.f1_score(test_labels, predictions_clf)

0.0

In [574]:
bagging_trees = BaggingClassifier(clf, n_estimators = 150)
bagging_trees.fit(train_data,train_labels)
predictions_bg = bagging_trees.predict(test_data)

KeyboardInterrupt: 

In [None]:
metrics.f1_score(test_labels, predictions_bg)

In [575]:
from sklearn.neighbors import NearestCentroid

In [576]:
NC_model = NearestCentroid()

In [577]:
train_data, test_data, train_labels, test_labels
NC_model.fit(train_data, train_labels)
predictions_clf_tree = NC_model.predict(test_data)
metrics.f1_score(test_labels, predictions_clf_tree)

0.08366013071895426

In [578]:
from sklearn.ensemble import RandomForestClassifier

clf_rf = RandomForestClassifier()


In [579]:
clf_rf.fit(train_data, train_labels)
predictions_clf_rf = clf_rf.predict(test_data)
metrics.f1_score(test_labels, predictions_clf_rf)

0.0

In [582]:
test_new_data = test_new.drop(['Active','train'], axis = 1) 

In [583]:
predictions = clf.predict(test_new_data)

In [584]:
test_new_data['Active'] = predictions

In [585]:
test_new_data.sample(5)

Unnamed: 0_level_0,len,4,5,6,7,8,9,10,11,12,...,702,703,704,705,706,707,708,709,710,Active
Smiles,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CNC(=O)c1ccccc1Nc1nc(Nc2ccc(N3CCOCC3)cc2OC)ncc1Cl,49,45,35,45,36,30,44,34,43,42,...,0,0,0,0,0,0,0,0,0,0
O=S(=O)(N[C@H]1CC[C@@](c2cc(F)ccc2F)(S(=O)(=O)c2ccc(Cl)cc2)CC1)C(F)(F)F,71,44,30,29,36,30,44,34,36,35,...,0,0,0,0,0,0,0,0,0,0
CC1Nc2ccc(Cl)cc2S(=O)(=O)N1,27,45,45,42,35,43,41,43,43,43,...,0,0,0,0,0,0,0,0,0,0
CC(C)(O)CC(=O)O.CC(C)(O)CC(=O)O,31,45,45,36,45,34,36,44,34,45,...,0,0,0,0,0,0,0,0,0,0
O=C(CBr)N1CC([N+](=O)[O-])([N+](=O)[O-])C1,42,44,30,45,36,45,15,14,34,35,...,0,0,0,0,0,0,0,0,0,0


In [586]:
test_new_data.reset_index(inplace = True)

In [587]:
test_new_data.sample(5)

Unnamed: 0,Smiles,len,4,5,6,7,8,9,10,11,...,702,703,704,705,706,707,708,709,710,Active
1550,NS(=O)(=O)Oc1ccc2c3c(c(=O)oc2c1)CCCCC3,38,35,29,36,30,44,34,36,30,...,0,0,0,0,0,0,0,0,0,0
634,COc1ccc(C(=O)CCC(=O)O)c2ccccc12,31,45,44,43,42,43,43,43,36,...,0,0,0,0,0,0,0,0,0,0
1363,OC(CN1CCC(Cc2ccc(F)cc2)CC1)c1ccc(Cl)cc1,39,44,45,36,45,35,42,45,45,...,0,0,0,0,0,0,0,0,0,0
628,O=c1cc[nH]c(=O)[nH]1,20,44,30,43,42,43,43,40,39,...,0,0,0,0,0,0,0,0,0,0
1179,CSc1nc2ccc3nc(NC(=O)C(c4ccccc4)c4ccccc4)sc3c2s1,47,45,29,43,42,39,43,41,43,...,0,0,0,0,0,0,0,0,0,0


In [588]:
submission = test_new_data[['Smiles', 'Active']]

In [589]:
submission.sample(4)

Unnamed: 0,Smiles,Active
391,NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)([O-])OP(...,0
348,C[C@H](CCCC(C)(C)O)[C@H]1CC[C@H]2[C@@H]3CC=C4C...,0
1139,C[C@]12C[C@H](O)[C@H]3[C@@H](CCC4=CC(=O)CC[C@@...,0
409,COc1cc(C#N)ccc1S(=O)(=O)Nc1ccc2c(c1)cc(C)c(=O)n2C,0


submission['Active'] = submission['Active'].apply(lambda x: int(x))

In [590]:
submission.to_csv('submission_2.csv', index=False)