In [1]:
import os
import sys
import numpy as np
from tqdm.notebook import tqdm
from matchms.importing import load_from_json
import tensorflow as tf
from tensorflow.keras.utils import to_categorical

path_root = os.path.dirname(os.getcwd())
path_data = os.path.join(os.path.dirname(path_root), "Data")  # add your local data folder here

## Load spectra 
- imported, cleaned by matchms + Pubchem lookup
- processed (intensity normalization, peak removal outside 10.0-1000.0 Da, minimum 5 peaks)
- remove spectra without InChIKey + smiles/inchi

In [2]:
import pickle

outfile = os.path.join(path_data, 'GNPS_all', 'ALL_GNPS_210409_positive_processed_annotated.pickle')
with open(outfile, 'rb') as file:
    spectrums = pickle.load(file)

print("number of spectra:", len(spectrums))

number of spectra: 250371


In [3]:
def count_annotations(spectrums):
    inchi_lst = []
    smiles_lst = []
    inchikey_lst = []
    for i, spec in enumerate(spectrums):
        inchi_lst.append(spec.get("inchi"))
        smiles_lst.append(spec.get("smiles"))
        inchikey = spec.get("inchikey")
        if inchikey is None:
            inchikey = spec.get("inchikey_inchi")
        inchikey_lst.append(inchikey)

    inchi_count = sum([1 for x in inchi_lst if x])
    smiles_count = sum([1 for x in smiles_lst if x])
    inchikey_count = sum([1 for x in inchikey_lst if x])
    print("Inchis:", inchi_count, "--", len(set(inchi_lst)), "unique")
    print("Smiles:", smiles_count, "--", len(set(smiles_lst)), "unique")
    print("Inchikeys:", inchikey_count, "--", 
          len(set([x[:14] for x in inchikey_lst if x])), "unique (first 14 characters)")

In [4]:
count_annotations(spectrums)

Inchis: 250371 -- 20395 unique
Smiles: 250371 -- 26760 unique
Inchikeys: 250371 -- 17106 unique (first 14 characters)


## Create reference scores (Tanimoto)
- Check better alternatives?

In [5]:
from collections import Counter 
  
def most_frequent(List): 
    occurence_count = Counter(List) 
    return occurence_count.most_common(1)[0][0] 

In [6]:
inchikeys_list = []
for s in spectrums:
    inchikeys_list.append(s.get("inchikey"))

inchikeys14_array = np.array([x[:14] for x in inchikeys_list])

In [7]:
inchikeys14_unique = list({x[:14] for x in inchikeys_list})
len(inchikeys14_unique)

17106

In [9]:
inchikey14 = inchikeys14_unique[2]
print(inchikey14)

idx = np.where(inchikeys14_array == inchikey14)[0]
for i in idx:
    print(spectrums[i].get("smiles") + "\n")

print("most frequent:", most_frequent([spectrums[i].get("smiles") for i in idx]))

HJBWJAPEBGSQPR
COc1ccc(/C=C/C(=O)O)cc1OC

COC1=CC=C(\C=C\C(O)=O)C=C1OC

COC1=C(C=C(C=C1)/C=C/C(=O)O)OC

COC1=C(C=C(C=C1)/C=C/C(=O)O)OC

COc(c1)c(OC)cc(C=CC(O)=O)c1

COc(c1)c(OC)cc(C=CC(O)=O)c1

COc(c1)c(OC)cc(C=CC(O)=O)c1

COc(c1)c(OC)cc(C=CC(O)=O)c1

COc(c1)c(OC)cc(C=CC(O)=O)c1

COC1=C(OC)C=C(\C=C\C(O)=O)C=C1

COC1=C(OC)C=C(\C=C\C(O)=O)C=C1

COC1=C(OC)C=C(\C=C\C(O)=O)C=C1

COC1=C(OC)C=C(\C=C\C(O)=O)C=C1

COC1=C(OC)C=C(\C=C\C(O)=O)C=C1

COC1=C(OC)C=C(\C=C\C(O)=O)C=C1

COC1=C(OC)C=C(\C=C\C(O)=O)C=C1

COC1=C(OC)C=C(C=CC(O)=O)C=C1

COC1=C(OC)C=C(\C=C\C(O)=O)C=C1

COC1=C(OC)C=C(\C=C\C(O)=O)C=C1

COC1=C(OC)C=C(\C=C\C(O)=O)C=C1

COC1=C(C=C(C=C1)C=CC(=O)O)OC

COC1=C(C=C(C=C1)C=CC(=O)O)OC

COC1=C(C=C(C=C1)C=CC(=O)O)OC

COC1=C(C=C(C=C1)C=CC(=O)O)OC

most frequent: COC1=C(OC)C=C(\C=C\C(O)=O)C=C1


In [10]:
inchi_list = []
for s in spectrums:
    inchi_list.append(s.get("inchi"))

inchi_array = np.array(inchi_list)

In [11]:
inchi_mapping = []
ID_mapping = []

for inchikey14 in inchikeys14_unique:
    idx = np.where(inchikeys14_array == inchikey14)[0]
    
    inchi = most_frequent([spectrums[i].get("inchi") for i in idx])
    inchi_mapping.append(inchi)
    ID = idx[np.where(inchi_array[idx] == inchi)[0][0]]
    ID_mapping.append(ID)

In [12]:
import pandas as pd
metadata = pd.DataFrame(list(zip(inchikeys14_unique, inchi_mapping, ID_mapping)), columns=["inchikey", "inchi", "ID"])
metadata.head()

Unnamed: 0,inchikey,inchi,ID
0,STZYTFJPGGDRJD,InChI=1S/C30H47NO4S/c1-7-28(4)16-24(35-25(33)1...,9914
1,SWTDXMBCOHIACK,InChI=1S/C27H32O7/c1-25(2)19(12-20(28)32-5)27(...,89952
2,HJBWJAPEBGSQPR,InChI=1S/C11H12O4/c1-14-9-5-3-8(4-6-11(12)13)7...,11055
3,VQNWOYVWHDVFJY,InChI=1S/C40H64N6O6/c1-14-15-16-18-21-28(8)37(...,29825
4,WFDXOXNFNRHQEC,InChI=1S/C22H17N3O5/c1-27-13-17(22(26)28-2)16-...,21467


In [15]:
spectrums[11055].get("inchikey")

'HJBWJAPEBGSQPR-GQCTYLIASA-N'

In [13]:
metadata.to_csv("metadata_AllInchikeys_data210409_positive.csv")

In [16]:
metadata = pd.read_csv("metadata_AllInchikeys_data210409_positive.csv")
metadata.head()

Unnamed: 0.1,Unnamed: 0,inchikey,inchi,ID
0,0,STZYTFJPGGDRJD,InChI=1S/C30H47NO4S/c1-7-28(4)16-24(35-25(33)1...,9914
1,1,SWTDXMBCOHIACK,InChI=1S/C27H32O7/c1-25(2)19(12-20(28)32-5)27(...,89952
2,2,HJBWJAPEBGSQPR,InChI=1S/C11H12O4/c1-14-9-5-3-8(4-6-11(12)13)7...,11055
3,3,VQNWOYVWHDVFJY,InChI=1S/C40H64N6O6/c1-14-15-16-18-21-28(8)37(...,29825
4,4,WFDXOXNFNRHQEC,InChI=1S/C22H17N3O5/c1-27-13-17(22(26)28-2)16-...,21467


In [14]:
metadata.ID.values.shape

(17106,)

## Add fingerprints (where necessary)

In [17]:
from matchms.filtering.add_fingerprint import add_fingerprint
from tqdm.notebook import tqdm

for i in tqdm(metadata.ID.values):
    spectrums[i] = add_fingerprint(spectrums[i],
                                   fingerprint_type="daylight", nbits=2048)

  0%|          | 0/17106 [00:00<?, ?it/s]

In [18]:
for i in tqdm(metadata.ID.values):
    if np.any(np.isnan(spectrums[i].get("fingerprint"))):
        print(i)

  0%|          | 0/17106 [00:00<?, ?it/s]

In [19]:
from matchms.similarity import FingerprintSimilarity
import time

spectrums_represent = [spectrums[i] for i in metadata.ID.values]

similarity_measure = FingerprintSimilarity(similarity_measure="jaccard")
tstart = time.time()
scores_mol_similarity = similarity_measure.matrix(spectrums_represent, spectrums_represent)
tend = time.time()
print(f"Calculation took {tend-tstart:.2f} s.")

Calculation took 1376.32 s.


In [20]:
filename = os.path.join(path_data, "similarities_ALL_GNPS_210409_positive_daylight2048_jaccard.npy")
np.save(filename, scores_mol_similarity)

In [21]:
scores_mol_similarity.shape

(17106, 17106)

In [22]:
tanimoto_df = pd.DataFrame(scores_mol_similarity, columns=metadata.inchikey.values, index=metadata.inchikey.values)
tanimoto_df.head()

Unnamed: 0,STZYTFJPGGDRJD,SWTDXMBCOHIACK,HJBWJAPEBGSQPR,VQNWOYVWHDVFJY,WFDXOXNFNRHQEC,GZLIPAFSJXROEC,YTZSBJLNMIQROD,FOULCGVQZYQEQM,BPSJMBKZSUTYNF,PZJVSTTVMXPZCJ,...,VLSRUFWCGBMYDJ,SXXHPCVDFDABHW,MRHAPHFJBAUDTR,ZYCWGZVLCXRARB,CGUNOWXWUXNOPE,MGRVRXRGTBOSHW,WELCNKRQSNXMDQ,XFANDVLPSBUGKD,NDTYTMIUWGWIMO,OAUIRSVJXOFAOO
STZYTFJPGGDRJD,1.0,0.377404,0.124056,0.269856,0.233411,0.258824,0.354221,0.227136,0.277537,0.316971,...,0.112832,0.17795,0.277099,0.308905,0.341988,0.017219,0.274643,0.298611,0.138404,0.270531
SWTDXMBCOHIACK,0.377404,1.0,0.158318,0.274962,0.29724,0.324305,0.372534,0.216172,0.274742,0.3513,...,0.140399,0.18438,0.325533,0.353607,0.387618,0.014911,0.364401,0.371447,0.153314,0.366841
HJBWJAPEBGSQPR,0.124056,0.158318,1.0,0.159198,0.209205,0.200203,0.133133,0.106944,0.201604,0.157643,...,0.753012,0.152395,0.180529,0.18828,0.147473,0.005848,0.199245,0.234842,0.074725,0.180833
VQNWOYVWHDVFJY,0.269856,0.274962,0.159198,1.0,0.270677,0.261011,0.218063,0.237487,0.228497,0.471683,...,0.149272,0.216159,0.591716,0.370402,0.27924,0.014306,0.288416,0.314224,0.100515,0.26204
WFDXOXNFNRHQEC,0.233411,0.29724,0.209205,0.270677,1.0,0.329466,0.273837,0.184561,0.258525,0.355102,...,0.19403,0.212011,0.330603,0.31287,0.318026,0.017301,0.319708,0.357045,0.083333,0.351071


In [23]:
filename = os.path.join(path_data, "ALL_GNPS_210409_positive_tanimoto_scores.pickle")
tanimoto_df.to_pickle(filename)