In [1]:
%%capture
# Install RDKit from the `rdkit` conda channel (-y skips the confirmation prompt);
# %%capture above hides the installer's output.
# NOTE(review): the package version is not pinned, so re-runs may pick up a different build.
!conda install -y -c rdkit rdkit

In [2]:
%%writefile normalize_inchis.py

from tqdm import tqdm
from rdkit import Chem
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')
from pathlib import Path

def normalize_inchi(inchi):
    """Round-trip an InChI string through RDKit and report how it fared.

    Args:
        inchi: Candidate InChI string.

    Returns:
        A ``(inchi_string, status)`` tuple where ``status`` is:

        * ``0`` - the string could not be parsed or re-serialised; the input
          is returned unchanged.
        * ``1`` - the round-trip reproduced the input exactly.
        * ``2`` - the round-trip produced a different string, which is returned.
    """
    try:
        mol = Chem.MolFromInchi(inchi)
    except Exception:
        # A parser exception must count as "could not parse". Leaving `mol`
        # unassigned here would raise UnboundLocalError on the check below.
        mol = None
    if mol is None:
        return inchi, 0

    try:
        new_inchi = Chem.MolToInchi(mol)
    except Exception:
        return inchi, 0
    if not new_inchi:
        # An empty/None serialisation is not a usable prediction; keep the input.
        return inchi, 0

    if new_inchi == inchi:
        return new_inchi, 1
    return new_inchi, 2
        
submission_name = './final_sub.csv'
norm_path = Path('./submission.csv')

# Do the job
# Every line already present in the output (header included) counts as done,
# so a re-run after a crash continues where the previous run stopped.
N = norm_path.read_text().count('\n') if norm_path.exists() else 0
print(N, 'number of predictions already normalized')

write_mode = 'w' if N == 0 else 'a'

# Context managers close both files and the progress bar even when a
# Python-level exception interrupts the loop. The output stays line-buffered
# (buffering=1) so each finished row is flushed as soon as it is written.
with open(submission_name, 'r') as r, \
        open(str(norm_path), write_mode, buffering=1) as w, \
        tqdm() as pbar:
    # Skip the input lines that already have a counterpart in the output.
    for _ in range(N):
        r.readline()

    # This line is the header on a fresh start, or the row being processed when
    # the previous run died; either way it is copied through verbatim (not
    # normalized), so a row that killed the process is not attempted again.
    line = r.readline()
    w.write(line)

    # Exactly one output line is written per input line. The resume logic above
    # relies on that 1:1 correspondence, so no row is ever skipped here.
    for line in r:
        # Drop only the line terminator. Slicing off the last character would
        # eat real data when the final line has no trailing newline.
        row = line.rstrip('\n')
        image_id, _, rest = row.partition(',')

        # Candidates after the first are stored with a leading '|', so once the
        # CSV quotes are removed the candidates are separated by ',|'.
        all_inchi = rest.replace('"', '').split(',|')

        normed = []
        scores = []
        final_inchi = None
        for candidate in all_inchi:
            inchi_norm, score = normalize_inchi(candidate)
            normed.append(inchi_norm)
            scores.append(score)
            if score == 1:
                # Already in normalized form: best possible outcome, stop here.
                final_inchi = inchi_norm
                break

        if final_inchi is None:
            # Otherwise take the first candidate that could at least be
            # normalized (score 2), keeping the order of preference; if there
            # is none, give up and fall back to the first candidate as-is.
            final_inchi = next(
                (cand for cand, score in zip(normed, scores) if score == 2),
                normed[0],
            )

        w.write(f'{image_id},"{final_inchi}"\n')
        pbar.update(1)

Writing normalize_inchis.py


# Concatenate the Sharded Predictions

In [3]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm 
# Load the version22 submission; the next cell uses its rows to fill in
# image_ids that have no beam-searched prediction.
sample_sub = pd.read_csv('../input/version22/submission.csv')

In [4]:
# Beam-searched predictions (presumably covering only part of the image_ids,
# hence the padding below).
csv = pd.read_csv('../input/beamsearched/submission.csv')
ids_exist = csv.image_id.values

# Rows of `sample_sub` whose image_id has no beam-searched prediction.
not_in = sample_sub.loc[~sample_sub.image_id.isin(ids_exist)]

# Beam-searched rows first, followed by the rows taken from `sample_sub`
# (fill with the best, not beam-searched, submission).
to_be_df = pd.DataFrame({
    'image_id': csv.image_id.tolist() + not_in['image_id'].tolist(),
    'InChI': csv.InChI.tolist() + not_in['InChI'].tolist(),
})

# Sort and save
to_be_df = to_be_df.sort_values('image_id').reset_index(drop=True)
to_be_df.to_csv('./padded_submission.csv', index=False)

In [5]:
import pandas as pd
# Not used below; kept so `sample_sub` stays defined when this cell is run on its own.
sample_sub = pd.read_csv('../input/version22/submission.csv')
final_prediction_paths = ['./padded_submission.csv', '../input/version22/submission.csv', '../input/version16/submission.csv', '../input/pseudolabels-bms/concat_submission.csv'] # Ordered in the preference you want

# Build one row per image_id with one candidate column per prediction file:
# InChI_1 holds the most preferred prediction, InChI_2.. the fallbacks.
CSV = None
for cur_num, path in enumerate(final_prediction_paths, start=1):
    df = pd.read_csv(path)
    column = f'InChI_{cur_num}'
    if CSV is None:
        # The first file defines the set of rows; its prediction becomes InChI_1.
        CSV = df.rename(columns={'InChI': column})
    else:
        # Look predictions up by image_id. Plain column assignment would align
        # on the positional index and silently attach predictions to the wrong
        # image whenever a file has a different row order or length.
        by_id = df.drop_duplicates('image_id').set_index('image_id')['InChI']
        aligned = CSV['image_id'].map(by_id)
        # Fallback candidates carry a leading '|' so normalize_inchis.py can
        # split the candidates on ',|'. A missing prediction becomes a bare '|'
        # so the separator survives instead of turning into an empty CSV field.
        CSV[column] = '|' + aligned.fillna('')
CSV = CSV.sort_values(by = 'image_id').reset_index(drop = True)
CSV.to_csv("final_sub.csv", index = False)


# Normalize

In [6]:
# Re-run the normalizer until it exits with status 0. Each run counts the lines
# already in ./submission.csv and resumes after them, so a crashed run (see the
# segmentation faults in the output below) continues instead of starting over.
# NOTE(review): any persistent non-zero exit (e.g. a missing input file) makes this loop forever.
!while [ 1 ]; do python normalize_inchis.py && break; done

0 number of predictions already normalized
7425it [00:09, 798.91it/s]/bin/bash: line 1:  9555 Segmentation fault      (core dumped) python normalize_inchis.py
7459 number of predictions already normalized
74556it [01:13, 1218.51it/s]/bin/bash: line 1:  9557 Segmentation fault      (core dumped) python normalize_inchis.py
82127 number of predictions already normalized
54466it [00:52, 1157.60it/s]/bin/bash: line 1:  9559 Segmentation fault      (core dumped) python normalize_inchis.py
136650 number of predictions already normalized
15006it [00:12, 1186.00it/s]/bin/bash: line 1:  9561 Segmentation fault      (core dumped) python normalize_inchis.py
151661 number of predictions already normalized
2054it [00:01, 1196.17it/s]/bin/bash: line 1:  9563 Segmentation fault      (core dumped) python normalize_inchis.py
153801 number of predictions already normalized
45885it [00:38, 1158.79it/s]/bin/bash: line 1:  9565 Segmentation fault      (core dumped) python normalize_inchis.py
199773 number o

# Post-process for the final submission (InChI_1)

In [7]:
# The normalizer output keeps the multi-column header of final_sub.csv, while
# normalized rows carry only two fields and passed-through rows carry all of
# them. Reading everything as str skips dtype inference on those ragged
# columns (presumably the source of the mixed-type warning this cell printed)
# and keeps image_ids as text.
csv = pd.read_csv("./submission.csv", dtype=str)

# The following cell overwrites ./submission.csv with an `InChI` column, so
# accept that layout too; this keeps the cell re-runnable.
inchi_col = 'InChI_1' if 'InChI_1' in csv.columns else 'InChI'
new_df = pd.DataFrame({'image_id': csv.image_id, 'InChI': csv[inchi_col]})

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
# Overwrite ./submission.csv (the normalizer output read in the previous cell)
# with the image_id / InChI frame `new_df`, without the index column.
new_df.to_csv("./submission.csv", index = False)

In [9]:
# Sanity check: re-read the saved file and display its first rows.
pd.read_csv('./submission.csv').head()

Unnamed: 0,image_id,InChI
0,00000d2a601c,InChI=1S/C10H14BrN5S/c1-6-10(11)9(16(3)14-6)4-...
1,00001f7fc849,InChI=1S/C14H18ClN3/c1-2-7-16-9-13-10-17-14(18...
2,000037687605,InChI=1S/C16H13BrN2O/c1-11(20)12-6-7-13(9-18)1...
3,00004b6d55b6,"InChI=1S/C14H19FN4O/c1-14(2,3)12-13(16)17-18-1..."
4,00004df0fe53,InChI=1S/C9H12O2/c1-4-5-2-6-7(3-5)11-9(10)8(4)...
