In [12]:
import pathlib
import time

import numpy as np
import pandas as pd
import pubchempy as pcp
from jcamp import JCAMP_reader

In [5]:
# Folder that holds the spectrum data files
p_temp = pathlib.Path('data') / 'nist-mass-spectrum'

In [6]:
# Read every spectrum file in the folder with JCAMP_reader (one dict per
# file) and collect the results into a single DataFrame.
# Path.iterdir() yields entries in arbitrary order, so the paths are sorted to
# make the row order reproducible; sub-directories are skipped because they
# are not spectrum files.
spectrum_paths=sorted(path for path in p_temp.iterdir() if path.is_file())
spectrum_list=[JCAMP_reader(path) for path in spectrum_paths]
spectrum_df=pd.DataFrame(spectrum_list)

In [7]:
# Take the CAS registry number of each spectrum, search PubChem for it and
# merge the first hit into the spectrum row.
# NOTE: this is slow (one request per spectrum plus a 1 s pause), and PubChem
# should not be queried too heavily.
comp_num=len(spectrum_df)
comp_spectrum_list=[]
for i in range(comp_num):
    # Wait 1 second between requests
    time.sleep(1)
    print(i)
    row=spectrum_df.iloc[i,:]
    comp_list=pcp.get_compounds(row["cas registry no"], 'name')
    if len(comp_list)==0:
        # No PubChem hit: keep the bare spectrum row and move on. Without the
        # `continue`, comp_list[0] below would raise IndexError.
        comp_spectrum_list.append(row)
        continue
    # Concatenate the spectrum data with the PubChem data
    comp_spectrum_list.append(
        pd.concat([row,comp_list[0].to_series()]))
    

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14


In [8]:
# Turn the collected rows into one DataFrame labelled 0 .. comp_num-1,
# then display its "x" column.
df=pd.DataFrame(data=comp_spectrum_list,index=[*range(comp_num)])
df["x"]

0     [14.0, 15.0, 19.0, 25.0, 26.0, 27.0, 28.0, 29....
1     [19.0, 20.0, 24.0, 25.0, 31.0, 41.0, 43.0, 50....
2     [13.0, 14.0, 25.0, 26.0, 31.0, 32.0, 33.0, 34....
3     [2.0, 12.0, 13.0, 14.0, 15.0, 16.0, 19.0, 20.0...
4     [19.0, 20.0, 24.0, 25.0, 26.0, 27.0, 31.0, 32....
5     [12.0, 13.0, 14.0, 15.0, 31.0, 32.0, 33.0, 34....
6     [25.0, 26.0, 31.0, 33.0, 34.0, 44.0, 45.0, 46....
7                  [12.0, 13.0, 14.0, 15.0, 16.0, 17.0]
8     [2.0, 12.0, 13.0, 14.0, 15.0, 16.0, 24.0, 25.0...
9     [12.0, 13.0, 14.0, 19.0, 20.0, 31.0, 32.0, 33....
10    [2.0, 12.0, 13.0, 14.0, 15.0, 16.0, 19.0, 20.0...
11    [12.0, 13.0, 19.0, 20.0, 31.0, 32.0, 39.0, 50....
12     [19.0, 31.0, 32.0, 50.0, 51.0, 69.0, 70.0, 88.0]
13    [12.0, 13.0, 19.0, 24.0, 31.0, 32.0, 43.0, 50....
14    [25.0, 26.0, 29.0, 31.0, 32.0, 33.0, 34.0, 35....
Name: x, dtype: object

In [14]:
# Save the merged table to disk. The output folder is created first so the
# write does not fail on a fresh checkout where it does not exist yet.
out_path=pathlib.Path("ProcessedData/nist_ms_pcp.pkl")
out_path.parent.mkdir(parents=True,exist_ok=True)
df.to_pickle(out_path)

In [33]:

# Fetch the first PubChem match for every spectrum's CAS registry number.
# NOTE: despite its name, comp_dict is a list with one Series per spectrum row.
comp_num=len(spectrum_df)
comp_dict=[]
for _, row in spectrum_df.iterrows():
    # Pause 1 second between requests to avoid flooding PubChem
    time.sleep(1)
    cas_no=row['cas registry no']
    comp_list=pcp.get_compounds(cas_no, 'name')
    if comp_list:
        comp_dict.append(comp_list[0].to_series())
    else:
        # No PubChem hit: store an empty placeholder so the entries stay
        # aligned with the rows of spectrum_df instead of raising IndexError.
        comp_dict.append(pd.Series(dtype=object))

In [59]:
# Scratch cell: append the spectrum row at position 4 to the collected list.
# NOTE(review): this mutates comp_spectrum_list; every run adds one more entry
# on top of those produced by the lookup loop.
comp_spectrum_list.append(spectrum_df.iloc[4])

In [64]:
# Scratch cell: display the first element of comp_spectrum_list.
comp_spectrum_list[0]

pandas.core.series.Series

In [66]:
# Scratch cell: show the integers 0..4 as a list.
list(range(5))

[0, 1, 2, 3, 4]

Unnamed: 0,title,jcamp-dx,data type,owner,cas registry no,$nist mass spec no,molform,mw,$nist source,xunits,...,pharmacophore_features_3d,record,rotatable_bond_count,shape_fingerprint_3d,shape_selfoverlap_3d,tpsa,undefined_atom_stereo_count,undefined_bond_stereo_count,volume_3d,xlogp
0,"Ethane, fluoro-",4.24,MASS SPECTRUM,NIST Mass Spectrometry Data Center,353-36-6,18890,C2 H5 F,48,MSDC,M/Z,...,,"{'id': {'id': {'cid': 9620}}, 'atoms': {'aid':...",0.0,,,0.0,0.0,0.0,,0.9
1,"Ethane, pentafluoro-",4.24,MASS SPECTRUM,NIST Mass Spectrometry Data Center,354-33-6,142366,C2 H F5,120,MSDC,M/Z,...,,"{'id': {'id': {'cid': 9633}}, 'atoms': {'aid':...",0.0,,,0.0,0.0,0.0,,2.2
2,"Ethane, 1,1,2,2-tetrafluoro-",4.24,MASS SPECTRUM,NIST Mass Spectrometry Data Center,359-35-3,1233,C2 H2 F4,102,MSDC,M/Z,...,,"{'id': {'id': {'cid': 9667}}, 'atoms': {'aid':...",1.0,,,0.0,0.0,0.0,,1.9
3,"1,1,2-Trifluoroethane",4.24,MASS SPECTRUM,NIST Mass Spectrometry Data Center,430-66-0,142370,C2 H3 F3,84,MSDC,M/Z,...,,,,,,,,,,


In [69]:
# Display the column labels of the merged table.
df.columns

Index(['title', 'jcamp-dx', 'data type', 'owner', 'cas registry no',
       '$nist mass spec no', 'molform', 'mw', '$nist source', 'xunits',
       'yunits', 'xfactor', 'yfactor', 'firstx', 'lastx', 'firsty', 'maxx',
       'minx', 'maxy', 'miny', 'npoints', 'peak table', 'end', 'x', 'y',
       'filename', 'origin', 'atom_stereo_count', 'atoms', 'bond_stereo_count',
       'bonds', 'cactvs_fingerprint', 'canonical_smiles', 'charge', 'cid',
       'complexity', 'conformer_id_3d', 'conformer_rmsd_3d', 'coordinate_type',
       'covalent_unit_count', 'defined_atom_stereo_count',
       'defined_bond_stereo_count', 'effective_rotor_count_3d', 'elements',
       'exact_mass', 'feature_selfoverlap_3d', 'fingerprint',
       'h_bond_acceptor_count', 'h_bond_donor_count', 'heavy_atom_count',
       'inchi', 'inchikey', 'isomeric_smiles', 'isotope_atom_count',
       'iupac_name', 'mmff94_energy_3d', 'mmff94_partial_charges_3d',
       'molecular_formula', 'molecular_weight', 'monoisotopic_mas

title                                                   Ethane, fluoro-
jcamp-dx                                                           4.24
data type                                                 MASS SPECTRUM
owner                                NIST Mass Spectrometry Data Center
cas registry no                                                353-36-6
$nist mass spec no                                                18890
molform                                                         C2 H5 F
mw                                                                   48
$nist source                                                       MSDC
xunits                                                              M/Z
yunits                                               RELATIVE INTENSITY
xfactor                                                               1
yfactor                                                               1
firstx                                                          

In [44]:
# Query PubChem by name with a string that is presumably not a real compound,
# to see what get_compounds returns when nothing matches.
# NOTE(review): this overwrites the comp_list used by other cells.
comp_list=pcp.get_compounds('faoijfeoiwaofjewoi', 'name')

In [50]:
# Print 'yes' when the lookup result contains no entries.
if not len(comp_list):
    print('yes')

yes


In [42]:
# Scratch cell: preview the concatenation of a spectrum row with the first
# PubChem result.
# NOTE(review): relies on `row` and a non-empty `comp_list` left over from
# other cells; comp_list[0] raises IndexError if the last lookup found nothing.
pd.concat([row,comp_list[0].to_series()])

title                                                  Norflurane
jcamp-dx                                                     4.24
data type                                           MASS SPECTRUM
owner                          NIST Mass Spectrometry Data Center
cas registry no                                          811-97-2
                                              ...                
tpsa                                                            0
undefined_atom_stereo_count                                     0
undefined_bond_stereo_count                                     0
volume_3d                                                    None
xlogp                                                         1.7
Length: 71, dtype: object

In [38]:
# Scratch cell: display the current spectrum row.
row

title                                                        Norflurane
jcamp-dx                                                           4.24
data type                                                 MASS SPECTRUM
owner                                NIST Mass Spectrometry Data Center
cas registry no                                                811-97-2
$nist mass spec no                                               298494
molform                                                        C2 H2 F4
mw                                                                  102
$nist source                                                       MSDC
xunits                                                              M/Z
yunits                                               RELATIVE INTENSITY
xfactor                                                               1
yfactor                                                               1
firstx                                                          

In [39]:
# Scratch cell: display the first PubChem result as a Series.
comp_list[0].to_series()

atom_stereo_count                                                              0
atoms                          [{'aid': 1, 'number': 9, 'element': 'F', 'x': ...
bond_stereo_count                                                              0
bonds                          [{'aid1': 1, 'aid2': 5, 'order': 1}, {'aid1': ...
cactvs_fingerprint             0000000001000000000000011100000000000000000000...
canonical_smiles                                                    C(C(F)(F)F)F
charge                                                                         0
cid                                                                        13129
complexity                                                                  35.3
conformer_id_3d                                                             None
conformer_rmsd_3d                                                           None
coordinate_type                                                               2d
covalent_unit_count         

In [27]:
# Scratch cell: query PubChem by name for '50-78-2' (presumably a CAS
# registry number, given its format).
comp_list=pcp.get_compounds('50-78-2', 'name')

In [32]:
# Scratch cell: display the first PubChem result as a Series.
comp_list[0].to_series()

atom_stereo_count                                                              0
atoms                          [{'aid': 1, 'number': 8, 'element': 'O', 'x': ...
bond_stereo_count                                                              0
bonds                          [{'aid1': 1, 'aid2': 5, 'order': 1}, {'aid1': ...
cactvs_fingerprint             1100000001110000001110000000000000000000000000...
canonical_smiles                                        CC(=O)OC1=CC=CC=C1C(=O)O
charge                                                                         0
cid                                                                         2244
complexity                                                                   212
conformer_id_3d                                                             None
conformer_rmsd_3d                                                           None
coordinate_type                                                               2d
covalent_unit_count         

In [25]:
# Scratch cell: display comp_dict wrapped in a one-element list.
[comp_dict]

['353-36-6',
 '354-33-6',
 '359-35-3',
 '420-46-2',
 '430-66-0',
 '593-53-3',
 '624-72-6',
 '74-82-8',
 '74-84-0',
 '75-10-5',
 '75-37-6',
 '75-46-7',
 '75-73-0',
 '76-16-4',
 '811-97-2']