## download original database 

In [1]:
! git clone https://github.com/MassBank/MassBank-data.git

fatal: destination path 'MassBank-data' already exists and is not an empty directory.


In [1]:
import os
from glob import glob
import sys
from multiprocessing import Pool
import numpy as np
import pickle

from tqdm import tqdm

In [2]:
vender_list = glob("./MassBank-data/*")
file_list = []
for ven in tqdm(vender_list):
    file_list.extend([os.path.abspath(one) for one in glob(ven+"/*.txt")])

print("number of file list",len(file_list))

100%|██████████| 45/45 [00:00<00:00, 67.20it/s]

('number of file list', 56270)





In [3]:
NUMBET_OF_MP_PROCESSES = None # Number of processes use to parse and expand data.

AUTHORS = "AUTHORS"
AC_INSTRUMENT = "AC$INSTRUMENT"
AC_INSTRUMENT_TYPE = "AC$INSTRUMENT_TYPE"
CH_NAME = "CH$NAME"
CH_SMILES = "CH$SMILES"
AC_MASS_SPECTROETRY = "AC$MASS_SPECTROMETRY"
MS_TYPE = "MS_TYPE"
ION_MODE = "ION_MODE"
IONIZATION_ENERGY = "IONIZATION_ENERGY"

def analyze(args):
    num,path = args
    #param init 
    auth = None
    inst = None
    inst_type = None
    name = None
    smiles = None
    ms_type = None
    ion_mode = None
    ret = {}
    with open(path) as f:
        temp = f.read().split("\n")
        for num,line in enumerate(temp):
            row = line.split(": ",1)
            if row[0] == AUTHORS:
                ret["authors"]= row[1]
            elif row[0] == AC_INSTRUMENT:
                ret["instrument"] = row[1]
            elif row[0] == AC_INSTRUMENT_TYPE:
                ret["instrument_type"] = row[1]
            elif row[0] == CH_NAME:
                ret["name"] = row[1]
            elif row[0] == CH_SMILES:
                ret["smiles"] = row[1]
            elif row[0] == AC_MASS_SPECTROETRY:
                label,_type = row[1].split(" ",1)
                if label == MS_TYPE:
                    ret["ms_type"] = _type
                elif label == ION_MODE:
                    ret["ion_mode"] = _type
                elif label == IONIZATION_ENERGY:
                    ret["ionization_energy"] = _type
            elif row[0] == "PK$PEAK":
                peak_start = num+1
        peak_x,peak_y = zip(*[one.split(" ")[2:4] for one in temp[peak_start:-1] if one != "//"])
        ret["peak_x"] = np.array(peak_x,dtype=np.float32)
        ret["peak_y"] = np.array(peak_y,dtype=np.float32)
    ret["path"] = path
    return ret

p = Pool(processes=NUMBET_OF_MP_PROCESSES)
ret = p.map(analyze,enumerate(file_list))
p.close()
p.join()

print("number of expand data:",len(ret))

import pickle
with open("massbank.pkl","wb") as f:
    pickle.dump(ret,f,protocol=2)

def is_select(one):
    return one["smiles"]!="N/A"
    
with open("massbank_smiles_list.txt","w") as f:
    f.write("\n".join([one["smiles"] for one in ret if is_select(one)]))

('number of expand data:', 56270)


In [25]:
from rdkit import Chem
from multiprocessing import Pool

zinc_path = "../data/zinc/all.txt"
massbank_path = "./massbank_smiles_list.txt"

print("get zinc list")
with open(zinc_path) as f:
    zinc_mols = [Chem.MolFromSmiles(one.strip()) for one in f.readlines() if one != "\n"]
print("Number of zinc dataset: %d" % (len(zinc_mols)))
    
print("get massbank list")
with open(massbank_path) as f:
    massbank_mols = [Chem.MolFromSmiles(one.strip()) for one in f.readlines() if one != "\n"]
print("Number of massbank dataset: %d" % (len(massbank_mols)))

smiles_list = [Chem.MolToSmiles(one) for one in zinc_mols]
smiles_list.extend([Chem.MolToSmiles(one) for one in massbank_mols])

def check_comflict(mol):
    global smiles_list
    flag = [True for one in smiles_list if mol == one]
    return len(flag) != 1

print("Check Conflict")
p = Pool()
ret = p.map(check_comflict,smiles_list)
p.close()
p.join()
result = [one for i,(one,flag) in enumerate(zip(smiles_list,ret)) if not flag]
print("Number of Conflicted :%d " %(len(result)))

del zinc_mols,massbank_mols,ret,p

get zinc list
Number of zinc dataset: 249456
get massbank list
Number of massbank dataset: 55278
Check Conflict
Number of Conflicted :256521 


In [26]:
print len(result)

256521


In [27]:
with open("all_smiles.txt","w") as f:
    f.write("\n".join(result))

In [14]:
mol1 = Chem.MolFromSmiles("CCC(C)C")
mol2 = Chem.MolFromSmiles("CC(C)CC")
print Chem.MolToSmiles(mol1)
print Chem.MolToSmiles(mol2)
mol1 == mol2

CCC(C)C
CCC(C)C


False