### 1. Download Infrared Spectra

In [1]:
import os

import nistchempy as nist
import pandas as pd
# Load the table returned by nistchempy and preview the rows labelled 0 to 5
# (label-based .loc slicing includes the end label) for the first seven columns.
df = nist.get_all_data()
preview_columns = df.columns[:7]
df.loc[:5, preview_columns]

Unnamed: 0,ID,name,formula,mol_weight,inchi,inchi_key,cas_rn
0,B100,iron oxide anion,FeO-,71.845,,,
1,B1000,AsF3..Cl anion,AsClF3-,167.37,,,
2,B1000000,AgH2-,AgH2-,109.8846,,,
3,B1000001,HAg(H2),AgH3,110.892,,,
4,B1000002,AgNO+,AgNO+,137.8738,,,
5,B1000003,AgNO,AgNO,137.8743,,,


In [2]:
# Look up a single compound by its ID and display its InChI string.
X = nist.Compound('C85018')
X.inchi

'InChI=1S/C14H10/c1-3-7-13-11(5-1)9-10-12-6-2-4-8-14(12)13/h1-10H'

In [2]:
# Keep the IDs of the rows that have an InChI and are flagged in the `cIR` column
# (presumably "an IR spectrum is available" - confirm against nistchempy docs).
has_inchi = df.inchi.notna()
IDs = df.ID[has_inchi & df.cIR]
print(len(IDs))

15891


In [1]:
# Download the IR spectra of every selected compound from NIST into ./IR.
# NOTE(review): nothing in the loop catches exceptions, so one failing
# compound stops the whole download.
ir_dir = './IR'
if not os.path.exists(ir_dir):
    os.makedirs(ir_dir)

for compound_id in IDs:
    compound = nist.Compound(compound_id)
    compound.get_ir_spectra()
    print(compound.IR)
    compound.save_ir_spectra(ir_dir)

### 2. Filter Out the IR Spectra

#### 2.1. Find Out the Empty Spectral Files

In [None]:
# Directory the spectra were saved to.
PATH = './IR'
# List the entries directly inside PATH (os.listdir does not recurse).
files = os.listdir(PATH)
print(len(files))

9494


In [None]:
# Some spectrum files contain no data points; collect their names.
empty = []

for file in files:
    with open(os.path.join(PATH, file), 'r') as f:
        for line in f:
            if not line.startswith('##NPOINTS='):
                continue

            # Parse the whole declared count instead of looking only at its
            # first character: ' 0' is then recognised, '05' is not mistaken
            # for zero, and a bare '##NPOINTS=' cannot raise IndexError.
            declared = line[len('##NPOINTS='):].strip()
            try:
                n_points = float(declared)
            except ValueError:
                n_points = None  # unreadable count: do not treat as empty

            if n_points == 0:
                empty.append(file)
            # Only the first ##NPOINTS= line of a file is considered.
            break

In [None]:
# Tag every empty file with a leading '0_' so it can be reviewed and then
# deleted by hand.
print(len(empty))
for name in empty:
    print(name)
    source = os.path.join(PATH, name)
    target = os.path.join(PATH, '0_' + name)
    os.rename(source, target)

#### 2.2. Distinguish the Sampling State of the IR Spectra

In [None]:
# Tally the sampling state declared on each spectrum's ##STATE= line and
# remember which files belong to which state.
state = {'GAS':0, 'VAPOR':0, 'SOLID':0, 'LIQUID':0, 'SOLUTION':0}

gas = []
vapor = []
solid = []
liquid = []
solution = []

# Labels are tried in this order; the first one the value starts with wins.
files_by_state = {
    'GAS': gas,
    'VAPOR': vapor,
    'SOLID': solid,
    'LIQUID': liquid,
    'SOLUTION': solution,
}

# List the directory again instead of reusing `files`: the empty spectra were
# renamed with a '0_' prefix (and may have been deleted) after `files` was
# built, so the old names can no longer be opened.
for file in os.listdir(PATH):
    if file.startswith('0_'):
        continue  # empty spectrum awaiting manual deletion

    with open(os.path.join(PATH, file), 'r') as f:
        for line in f:
            if not line.startswith('##STATE='):
                continue

            # Normalise case and surrounding whitespace so that e.g. 'Gas'
            # or ' gas' is classified like 'GAS'.
            value = line[len('##STATE='):].strip().upper()
            for label, members in files_by_state.items():
                if value.startswith(label):
                    state[label] += 1
                    members.append(file)
                    break
            # Only the first ##STATE= line of a file is considered.
            break

In [None]:
# Display the number of spectra counted for each sampling state.
state

{'GAS': 9494, 'VAPOR': 70, 'SOLID': 4959, 'LIQUID': 1386, 'SOLUTION': 2284}

In [None]:
# Target folders, one per sampling state.
gasDIR = './IR_gas'
vaporDIR = './IR_vapor'
solidDIR = './IR_solid'
liquidDIR = './IR_liquid'
solutionDIR = './IR_solution'

In [None]:
# Move the spectra into one folder per sampling state.
# One loop replaces five copy-pasted stanzas; the banner text printed for each
# state is unchanged.
for label, target_dir, members in (
    ('GAS', gasDIR, gas),
    ('VAPOR', vaporDIR, vapor),
    ('SOLID', solidDIR, solid),
    ('LIQUID', liquidDIR, liquid),
    ('SOLUTION', solutionDIR, solution),
):
    print('-----------' + label + '------------')
    # exist_ok avoids the separate "does it exist?" check.
    os.makedirs(target_dir, exist_ok=True)
    for file in members:
        os.rename(os.path.join(PATH, file), os.path.join(target_dir, file))

In [None]:
import os

# Continue the analysis with the gas-phase IR spectra only.
gasFiles = os.listdir(gasDIR)
len(gasFiles)

9494

#### 2.3. Distinguish the XUNITS/YUNITS of the IR Spectra

In [None]:
# Count how often each ##YUNITS= value occurs among the gas-phase spectra.
yUnits = {}

for file in gasFiles:
    with open(os.path.join(gasDIR, file), 'r') as f:
        for line in f:
            if line.startswith('##YUNITS='):
                unit = line[9:]  # kept exactly as read, trailing newline included
                yUnits[unit] = yUnits.get(unit, 0) + 1
                break  # only the first ##YUNITS= line of a file counts

In [None]:
# Display the Y-unit tally.
yUnits

{'TRANSMITTANCE\n': 221,
 'ABSORBANCE\n': 8273,
 '(micromol/mol)-1m-1 (base 10)\n': 1000}

In [None]:
# Count how often each ##XUNITS= value occurs among the gas-phase spectra.
xUnits = {}

for file in gasFiles:
    with open(os.path.join(gasDIR, file), 'r') as f:
        # First ##XUNITS= line of the file, or None when there is none.
        header = next((line for line in f if line[:9] == '##XUNITS='), None)

    if header is not None:
        unit = header[9:]  # kept exactly as read, trailing newline included
        xUnits[unit] = xUnits.get(unit, 0) + 1

In [None]:
# Display the X-unit tally.
xUnits

{'1/CM\n': 8477, 'cm-1\n': 1000, 'MICROMETERS\n': 17}

In [None]:
# Collect every gas-phase spectrum whose Y axis is absorbance and whose X axis
# is not given in micrometers.
avai = []

for file in gasFiles:
    x_unit = None
    y_unit = None

    with open(os.path.join(gasDIR, file), 'r') as f:
        for line in f:
            # Record the first occurrence of each label. Reading both before
            # deciding makes the result independent of which label comes
            # first in the file.
            if x_unit is None and line.startswith('##XUNITS='):
                x_unit = line[9:].strip()
            elif y_unit is None and line.startswith('##YUNITS='):
                y_unit = line[9:].strip()

            if x_unit is not None and y_unit is not None:
                break

    # Values are compared after strip(), so trailing spaces or a missing
    # final newline no longer prevent a match.
    if y_unit == 'ABSORBANCE' and x_unit != 'MICROMETERS':
        avai.append(file)

In [None]:
# Number of spectra selected above.
len(avai)

8273

In [None]:
# Tag the selected spectra with a leading 'ab_'.
for file in avai:
    # Skip names that already carry the tag: when the notebook is re-run on a
    # directory processed earlier, a second prefix ('ab_ab_...') would shift
    # the '_'-separated fields that later cells read.
    if file.startswith('ab_'):
        continue
    os.rename(os.path.join(gasDIR, file), os.path.join(gasDIR, 'ab_' + file))

In [None]:
files = os.listdir('./IR_gas')

# Break every file name into fields on '_' and '.', then count the names whose
# first field is 'ab' and whose fourth field is '0'.
name_fields = (file.replace('.', '_').split('_') for file in files)
i = sum(1 for fields in name_fields if fields[0] == 'ab' and fields[3] == '0')

print(i)  # The check shows the absorbance IR spectra all belong to different compounds, i.e. their CAS IDs are all distinct

8273


### 3. Download Corresponding InChI

In [3]:
# Fetch the InChI of each molecule that has a gas-phase absorbance IR spectrum.

import os

DIR = './IR_gas'
files = os.listdir(DIR)

In [6]:
import pandas as pd

# Table with one (ID, InChI) row per compound; starts out empty.
inchi_pd = pd.DataFrame(columns=['ID', 'InChi'])
# Path of the CSV file the table is saved to.
inchi_file = './dataset/inchi.csv'

In [None]:
# Look up the InChI of every compound that has an 'ab'-tagged spectrum file.
# Rows are gathered in a list and the table is built once at the end: this
# avoids growing the DataFrame row by row and guarantees that a re-run never
# keeps rows left over from an earlier, longer run.
# NOTE(review): the table from nist.get_all_data() also has an `inchi` column;
# presumably a lookup there could replace one request per compound - confirm.
records = []

for file in files:
    if file[:2] == 'ab':  # YUNITS=ABSORBANCE
        f_id = file.split('_')[1]  # second '_'-separated field of the name

        com = nist.Compound(f_id)
        records.append((f_id, com.inchi))

inchi_pd = pd.DataFrame(records, columns=['ID', 'InChi'])

In [None]:
# Make sure the output folder exists before writing: to_csv does not create
# missing directories and would fail after all lookups have been done.
out_dir = os.path.dirname(inchi_file)
if out_dir:  # empty when inchi_file has no directory component
    os.makedirs(out_dir, exist_ok=True)

inchi_pd.to_csv(inchi_file, index=False)