In [474]:
import pandas as pd
import numpy as np
import math

from tqdm import tqdm
import time

In [475]:
RANDOM_SEED = 13022022

In [476]:
test = pd.read_csv('Task/test.csv')
test.drop(['Unnamed: 0'], axis = 1, inplace = True)

In [477]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1614 entries, 0 to 1613
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Smiles  1614 non-null   object
dtypes: object(1)
memory usage: 12.7+ KB


In [478]:
test.sample(3)

Unnamed: 0,Smiles
1555,Clc1cccc(Nc2ncnc3sccc23)c1
1213,Cc1ccc(Oc2nc3cc(-c4ccc5c(ccn5C)c4)c(Cl)cc3[nH]...
643,CC[C@@]1(O)C(=O)OCc2c1cc1n(c2=O)Cc2c-1nc1cc(F)...


In [479]:
train = pd.read_csv('Task/train.csv')
train.drop(['Unnamed: 0'], axis = 1, inplace = True)

In [480]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5557 entries, 0 to 5556
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Smiles  5557 non-null   object
 1   Active  5557 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 49.0+ KB


In [481]:
train.sample(3)

Unnamed: 0,Smiles,Active
3970,CC[C@@H](N)CNc1ccnc(-c2cc(-c3cnn(C)c3)ccc2O)n1,False
1642,CC(C)(C=O)Cc1cc(C(C)(C)C)c(O)c(C(C)(C)C)c1,False
5256,NC(Cc1cc(O)c(O)cc1O)C(=O)O,False


In [482]:
list(train['Active'].unique())

[False, True]

In [483]:
list_active = {False: 0, True: 1}

In [484]:
train['Active'] = train['Active'].map(list_active)

In [485]:
train.sample()

Unnamed: 0,Smiles,Active
624,NCCS(=O)(=O)O,0


In [486]:
train['train'] = 1
test['train'] = 0

In [487]:
df = pd.concat([train, test], ignore_index=True)

In [488]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7171 entries, 0 to 7170
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Smiles  7171 non-null   object 
 1   Active  5557 non-null   float64
 2   train   7171 non-null   int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 168.2+ KB


In [489]:
df['train'].unique()

array([1, 0], dtype=int64)

In [490]:
df[df['train'] == 1]['Active'].value_counts()

0.00    5351
1.00     206
Name: Active, dtype: int64

In [491]:
from collections import Counter

atoms = Counter()
for i in df['Smiles'].unique():
    for j in i:
        atoms[j] += 1

In [492]:
len(atoms.keys())

45

In [493]:
cont = 0
for i in atoms.keys():
    new_value = len(atoms.keys()) - cont
    atoms[i] = new_value
    cont += 1

In [494]:
atoms

Counter({'C': 45,
         'O': 44,
         'c': 43,
         '1': 42,
         '2': 41,
         '[': 40,
         'n': 39,
         'H': 38,
         ']': 37,
         '(': 36,
         'N': 35,
         ')': 34,
         '@': 33,
         '.': 32,
         'l': 31,
         '=': 30,
         'S': 29,
         '3': 28,
         '4': 27,
         'F': 26,
         '-': 25,
         's': 24,
         '/': 23,
         '5': 22,
         'o': 21,
         'a': 20,
         '+': 19,
         '#': 18,
         'I': 17,
         'P': 16,
         'B': 15,
         'r': 14,
         '\\': 13,
         'Z': 12,
         '6': 11,
         '7': 10,
         '8': 9,
         'e': 8,
         'A': 7,
         'K': 6,
         'M': 5,
         'g': 4,
         'i': 3,
         'L': 2,
         '9': 1})

In [495]:
df['len'] = df['Smiles'].apply(lambda x: len(x))

In [496]:
df['1'] = df['Smiles'].apply(lambda x: list(x))

In [497]:
df.sample(3)

Unnamed: 0,Smiles,Active,train,len,1
4023,O=[N+]([O-])O[C@H]1CO[C@H]2[C@@H]1OC[C@H]2O[N+...,0.0,1,55,"[O, =, [, N, +, ], (, [, O, -, ], ), O, [, C, ..."
4256,NCCCCCC(=O)O,0.0,1,12,"[N, C, C, C, C, C, C, (, =, O, ), O]"
6929,C=CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@...,,0,59,"[C, =, C, C, [, C, @, ], 1, (, O, ), C, C, [, ..."


In [498]:
df['len'].unique().max()

707

In [499]:
new_df = pd.DataFrame()
for i in tqdm(range(len(df['1']))):
    e = pd.DataFrame(df.loc[i].explode())
    e.reset_index(drop=True, inplace=True)
    e = e.T
    new_df = pd.concat([new_df, e], ignore_index = True)

100%|██████████████████████████████████████████████████████████████████████████████| 7171/7171 [06:28<00:00, 18.46it/s]


In [500]:
new_df.sample(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,701,702,703,704,705,706,707,708,709,710
2117,O=[N+]([O-])C(Br)(CO)CO,1.0,1,23,O,=,[,N,+,],...,,,,,,,,,,
1077,CCCCCCCCCCCCCCCC[N+](C)(C)Cc1ccccc1.O.[Cl-],0.0,1,43,C,C,C,C,C,C,...,,,,,,,,,,
5022,NC(=O)CNCCC(c1ccccc1)c1ccccc1,0.0,1,29,N,C,(,=,O,),...,,,,,,,,,,
334,CCOC(=O)[C@H]1O[C@@H]1C(=O)N[C@@H](CC(C)C)C(=O...,0.0,1,55,C,C,O,C,(,=,...,,,,,,,,,,
6040,Cn1nnc2c(C(N)=O)ncn2c1=O,,0,24,C,n,1,n,n,c,...,,,,,,,,,,


In [501]:
new_df.fillna(0, inplace = True)

In [502]:
new_df.rename(columns = {0: 'Smiles', 1: 'Active', 2: 'train', 3: 'len'}, inplace = True)
new_df = new_df.astype({"Active": "Int64"})

In [503]:
new_df.set_index(['Smiles', 'Active', 'train', 'len'], inplace = True)

In [504]:
new_df.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,4,5,6,7,8,9,10,11,12,13,...,701,702,703,704,705,706,707,708,709,710
Smiles,Active,train,len,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
CNC(=O)c1cc(Oc2ccc(NC(=O)Nc3ccc(Cl)c(C(F)(F)F)c3)cc2)ccn1.Cc1ccc(S(=O)(=O)O)cc1,0,1,79,C,N,C,(,=,O,),c,1,c,...,0,0,0,0,0,0,0,0,0,0
Cn1cc(C[C@@H](N)C(=O)O)c2ccccc21,0,0,32,C,n,1,c,c,(,C,[,C,@,...,0,0,0,0,0,0,0,0,0,0
CCCNC(C)(C)COC(=O)c1ccccc1.Cl,0,1,29,C,C,C,N,C,(,C,),(,C,...,0,0,0,0,0,0,0,0,0,0
O=C1NC(=O)/C(=C\c2c[nH]c(=O)c(-c3ccc(N4CCNCC4)nc3)c2)S1,0,1,55,O,=,C,1,N,C,(,=,O,),...,0,0,0,0,0,0,0,0,0,0
Cc1cc(C)c(/C=C2\C(=O)Nc3ccccc32)[nH]1,0,1,37,C,c,1,c,c,(,C,),c,(,...,0,0,0,0,0,0,0,0,0,0


In [505]:
from tqdm import tqdm
import time

for i in tqdm(range(len(new_df[4]))):
    new_df.iloc[i] = new_df.iloc[i].map(atoms)

100%|█████████████████████████████████████████████████████████████████████████████| 7171/7171 [00:12<00:00, 576.88it/s]


In [506]:
new_df.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,4,5,6,7,8,9,10,11,12,13,...,701,702,703,704,705,706,707,708,709,710
Smiles,Active,train,len,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
O=C(CC(c1ccccc1)c1ccccc1)N1CCN(C(c2ccccc2)c2ccccc2)CC1,0,0,54,44,30,45,36,45,45,36,43,42,43,...,0,0,0,0,0,0,0,0,0,0
CN(CCCN1c2ccccc2CCc2ccccc21)CC(=O)c1ccc(Cl)cc1,0,0,46,45,35,36,45,45,45,35,42,43,41,...,0,0,0,0,0,0,0,0,0,0
C[C@@H]1O[C@@H](O[C@@H]2[C@@H](O)[C@H](OCCc3ccc(O)c(O)c3)O[C@H](CO)[C@H]2OC(=O)/C=C/c2ccc(O)c(O)c2)[C@H](O)[C@H](O)[C@H]1O,1,1,122,45,40,45,33,33,38,37,42,44,40,...,0,0,0,0,0,0,0,0,0,0
C[C@@H]1O[C@@H](O[C@H]2[C@H](O)[C@@H](O[C@@H]3O[C@@H](C)[C@H](O)[C@@H](O)[C@H]3O)[C@H](O[C@H]3CC[C@@]4(C)C(=CC[C@H]5[C@@H]6C[C@@H]7O[C@]8(CC[C@@H](C)CO8)[C@@H](C)[C@@H]7[C@@]6(C)CC[C@@H]54)C3)O[C@@H]2CO)[C@H](O)[C@H](O)[C@H]1O,0,0,226,45,40,45,33,33,38,37,42,44,40,...,0,0,0,0,0,0,0,0,0,0
c1ccc2c(CCNc3ccc(Nc4ccncc4)cc3)c[nH]c2c1,0,1,40,43,42,43,43,43,41,43,36,45,45,...,0,0,0,0,0,0,0,0,0,0


In [507]:
new_df.reset_index(inplace = True)
new_df.set_index(['Smiles'], inplace = True)

In [508]:
new_df.sample()

Unnamed: 0_level_0,Active,train,len,4,5,6,7,8,9,10,...,701,702,703,704,705,706,707,708,709,710
Smiles,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CC(C)[C@H](N)C(=O)OCCOCn1cnc2c(=O)nc(N)[nH]c21,0,1,46,45,45,36,45,34,40,45,...,0,0,0,0,0,0,0,0,0,0


In [509]:
#new_df = new_df.astype({"Active": "Int64"})

In [510]:
train_new = new_df[new_df['train'] == 1]
test_new =new_df[new_df['train'] == 0]

In [511]:
X = train_new.drop(['Active', 'train'], axis = 1)
y = train_new['Active']

In [512]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5557 entries, COc1ccc2[nH]cc(CCN)c2c1 to CCCCNc1ccc(C(=O)OCCN(C)C)cc1.Cl
Columns: 708 entries, len to 710
dtypes: int64(1), object(707)
memory usage: 30.1+ MB


In [513]:
from sklearn import model_selection, datasets, metrics, tree 
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

In [514]:
train_data, test_data, train_labels, test_labels = model_selection.train_test_split(X, y, 
                                                                                    test_size=0.30, random_state=RANDOM_SEED)

In [515]:
from lazypredict.Supervised import LazyClassifier

In [516]:
clf_lc = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf_lc.fit(train_data, test_data, train_labels, test_labels)

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [02:22<00:00,  4.91s/it]


In [517]:
print(models)

Empty DataFrame
Columns: [Accuracy, Balanced Accuracy, ROC AUC, F1 Score, Time Taken]
Index: []


In [518]:
clf = tree.DecisionTreeClassifier()
clf.fit(train_data, train_labels)

DecisionTreeClassifier()

In [519]:
predictions_clf = clf.predict(test_data)
metrics.f1_score(test_labels, predictions_clf)

0.0875912408759124

In [520]:
from sklearn.model_selection import GridSearchCV

parameters = {'max_depth':range(3,20)}
clf = GridSearchCV(tree.DecisionTreeClassifier(), parameters, n_jobs=4)
clf.fit(X=train_data, y=train_labels)
tree_model = clf.best_estimator_
print (clf.best_score_, clf.best_params_)

0.9624582717127705 {'max_depth': 4}


In [522]:
clf_4 = tree.DecisionTreeClassifier(max_depth = 4)
clf_4.fit(train_data, train_labels)

DecisionTreeClassifier(max_depth=4)

In [523]:
predictions_clf = clf_4.predict(test_data)
metrics.f1_score(test_labels, predictions_clf)

0.0

In [None]:
bagging_trees = BaggingClassifier(clf, n_estimators = 150)
bagging_trees.fit(train_data,train_labels)
predictions_bg = bagging_trees.predict(test_data)

In [None]:
metrics.f1_score(test_labels, predictions_bg)

In [None]:
from sklearn.neighbors import NearestCentroid

In [None]:
NC_model = NearestCentroid()

In [None]:
train_data, test_data, train_labels, test_labels
NC_model.fit(train_data, train_labels)
predictions_clf_tree = NC_model.predict(test_data)
metrics.f1_score(test_labels, predictions_clf_tree)

In [None]:
from sklearn.ensemble import RandomForestClassifier

clf_rf = RandomForestClassifier()


In [None]:
clf_rf.fit(train_data, train_labels)
predictions_clf_rf = clf_rf.predict(test_data)
metrics.f1_score(test_labels, predictions_clf_rf)

In [None]:
test_new_data = test_new.drop(['Active'], axis = 1) 

In [None]:
predictions = clf.predict(test_new_data)

In [None]:
test_new_data['Active'] = predictions

In [None]:
test_new_data.sample(5)

In [None]:
test_new_data.reset_index(inplace = True)

In [None]:
test_new_data.sample(5)

In [None]:
submission = test_new_data[['Smiles', 'Active']]

In [None]:
submission.sample(4)

submission['Active'] = submission['Active'].apply(lambda x: int(x))

In [None]:
submission.to_csv('submission_2.csv', index=False)