In [31]:
import matplotlib.pyplot as plt
import os
from matchms import Spectrum
from matchms.similarity import CosineGreedy
import numpy as np
from numpy.linalg import norm 
import pandas as pd
from tqdm import tqdm
from rdkit.Chem import AllChem
from rdkit import Chem

In [2]:
def load_spectra(path):
    return {f.rsplit('.',1)[0]: np.load(os.path.join(path,f)) for f in os.listdir(path) if f.endswith('.npy')}

In [3]:
spectral_similiarty_embedding_dict = {}
spectral_similiarty_embedding_dict['0.6'] = load_spectra('./spectral_similarity/0.6/embedded_spectra_test/test_rows_spectral/')
spectral_similiarty_embedding_dict['0.7'] = load_spectra('./spectral_similarity/0.7/embedded_spectra_test/test_rows_spectral/')
spectral_similiarty_embedding_dict['0.8'] = load_spectra('./spectral_similarity/0.8/embedded_spectra_test/test_rows_spectral/')
spectral_similiarty_embedding_dict['0.9'] = load_spectra('./spectral_similarity/0.9/embedded_spectra_test/test_rows_spectral/')

In [4]:
structural_similiarty_embedding_dict = {}
structural_similiarty_embedding_dict['0.6'] = load_spectra('./structural_similarity/0.6/embedded_spectra_test/test_rows_structural/')
structural_similiarty_embedding_dict['0.7'] = load_spectra('./structural_similarity/0.7/embedded_spectra_test/test_rows_structural/')
structural_similiarty_embedding_dict['0.8'] = load_spectra('./structural_similarity/0.8/embedded_spectra_test/test_rows_structural/')
structural_similiarty_embedding_dict['0.9'] = load_spectra('./structural_similarity/0.9/embedded_spectra_test/test_rows_structural/')

In [34]:
def get_smiles(path, ids):
    df = pd.read_csv(path)
    output_dict = {}
    for id in tqdm(ids):
        row = df.loc[df['spectrum_id'] == id]
        assert len(row) == 1
        smiles = row['Smiles'].values[0]
        print(row)
        mol = Chem.MolFromSmiles(smiles)
        daylight_2048 = AllChem.GetMorganFingerprintAsBitVect(AllChem.MolFromSmiles(mol), 2, nBits=2048)
        daylight_4096 = AllChem.GetMorganFingerprintAsBitVect(AllChem.MolFromSmiles(mol), 2, nBits=4096)
        output_dict[id] = {'smiles': row['Smiles'].values[0], 
                          'Morgan_2048_2': row['Morgan_2048_2'].values[0],
                          'Morgan_2048_3': row['Morgan_2048_3'].values[0],
                          'Morgan_4096_2': row['Morgan_4096_2'].values[0],
                          'Morgan_4096_3': row['Morgan_4096_3'].values[0],
                          'Daylight_2048': daylight_2048,
                          'Daylight_4096': daylight_4096,}
    return output_dict
        

spectral_similarity_smiles_dict   = {}
structural_similarity_smiles_dict = {}

all_keys = list(set(list(spectral_similiarty_embedding_dict['0.6'].keys()) + list(spectral_similiarty_embedding_dict['0.7'].keys()) + list(spectral_similiarty_embedding_dict['0.8'].keys()) + list(spectral_similiarty_embedding_dict['0.9'].keys())))
spectral_similarity_smiles_dict = get_smiles('../../../data/structural_similarity/test_rows_spectral.csv', list(all_keys))

# spectral_similarity_smiles_dict = {key: }

  0%|          | 0/29732 [00:00<?, ?it/s]

        scan         spectrum_id  collision_energy   Adduct Compound_Source  \
3665  386852  CCMSLIB00004707203               NaN  [M+K]1+        isolated   

                                           Compund_Name  Precursor_MZ  \
3665  2-[6-[2-[(2E,6E,10E)-12-[4-[3,4-dihydroxy-6-me...       1185.49   

      ExactMass  Charge  Ion_Mode  ... msManufacturer msMassAnalyzer  \
3665        NaN       1  positive  ...            NaN           ftms   

     msIonisation msDissociationMethod GNPS_library_membership  \
3665          NaN                  NaN                    MONA   

     ppmBetweenExpAndThMass Morgan_2048_2 Morgan_4096_2  Morgan_2048_3  \
3665                    NaN           NaN           NaN            NaN   

     Morgan_4096_3  
3665           NaN  

[1 rows x 23 columns]





TypeError: No registered converter was able to produce a C++ rvalue of type std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > from this Python object of type float

: 

# Plot Structural Similarity Against MatchMS Similarity

In [8]:
def cos(spec1, spec2):
    return np.dot(spec1,spec2)/(norm(spec1)*norm(spec2))

In [10]:
score = cos(structural_similiarty_embedding_dict['0.6']['CCMSLIB00006380444'], structural_similiarty_embedding_dict['0.6']['CCMSLIB00005721329'])
score

0.08168274755267121

In [None]:
linalg.norm

<function numpy.linalg.norm(x, ord=None, axis=None, keepdims=False)>