In [55]:
# from tkinter import Image
from tqdm import tqdm
from datetime import datetime
import openpyxl
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.styles import Alignment
from openpyxl.drawing.image import Image
import pandas as pd
import os
from rdkit import Chem
from rdkit.Chem import Draw
from pandas.core.indexes.base import InvalidIndexError

In [56]:
def get_deprecated_organ_names():
    """
        返回冗余器官名的替换映射表
    """
    deprecated_organ_names = dict()
    deprecated_organ_names['adrenalgland'] = 'adrenal'
    deprecated_organ_names['adrenal gland'] = 'adrenal'
    deprecated_organ_names['adrenalglad'] = 'adrenal'
    deprecated_organ_names['adrenay'] = 'adrenal'
    deprecated_organ_names['adrenays'] = 'adrenal'
    deprecated_organ_names['adrenals'] = 'adrenal'

    deprecated_organ_names['bladder/urine'] = 'bladder'
    deprecated_organ_names['braint'] = 'brain'
    deprecated_organ_names['bran'] = 'brain'
    deprecated_organ_names['brainstem'] = 'brain'
    deprecated_organ_names['brown fat'] = 'fat'
    deprecated_organ_names['brownfat'] = 'fat'
    deprecated_organ_names['bones'] = 'bone'
    deprecated_organ_names['bone marrow'] = 'bone'
    deprecated_organ_names['bonemarrow'] = 'bone'

    deprecated_organ_names['caudalcortex'] = 'brain'
    deprecated_organ_names['cerebellam'] = 'brain'
    deprecated_organ_names['cerebellun'] = 'brain'
    deprecated_organ_names['cerebellum'] = 'brain'
    deprecated_organ_names['cerebellurn'] = 'brain'
    deprecated_organ_names['cerebeum'] = 'brain'
    deprecated_organ_names['cerebrum'] = 'brain'
    deprecated_organ_names['cerebralcortex'] = 'brain'
    deprecated_organ_names['cerebralcoex'] = 'brain'
    deprecated_organ_names['cerebrum(left)'] = 'brain'
    deprecated_organ_names['cerebrum(right)'] = 'brain'
    deprecated_organ_names['cortex'] = 'brain'
    deprecated_organ_names['cortex(left)'] = 'brain'
    deprecated_organ_names['cortex(right)'] = 'brain'
    deprecated_organ_names['cranium'] = 'bone'

    deprecated_organ_names['diencephalon'] = 'brain'
    
    deprecated_organ_names['eyes'] = 'eye'

    deprecated_organ_names['faeces'] = 'feces'
    deprecated_organ_names['frontal cortex'] = 'brain'
    deprecated_organ_names['frontalcortex'] = 'brain'
    deprecated_organ_names['frontalccortex'] = 'brain'
    deprecated_organ_names['fatsubcutaneous'] = 'fat'
    deprecated_organ_names['fat,subcutan'] = 'fat'
    deprecated_organ_names['femur'] = 'bone'
    deprecated_organ_names['femurs'] = 'bone'

    deprecated_organ_names['gallbladder'] = 'bladder'

    deprecated_organ_names['hiwocampus'] = 'brain'
    deprecated_organ_names['hoart'] = 'heart'
    deprecated_organ_names['hear'] = 'heart'
    deprecated_organ_names['hippocampus'] = 'brain'
    deprecated_organ_names['hypothalamus'] = 'brain'
    deprecated_organ_names['hard.'] = 'hardergland'
    deprecated_organ_names['harderiangland'] = 'hardergland'

    deprecated_organ_names['iiver'] = 'liver'
    deprecated_organ_names['intestines'] = 'intestine'

    deprecated_organ_names['kidneys'] = 'kidney'
    deprecated_organ_names['ktdney'] = 'kidney'
    deprecated_organ_names['kidners'] = 'kidney'
    deprecated_organ_names['kidncy'] = 'kidney'

    deprecated_organ_names['lungs'] = 'lung'
    deprecated_organ_names['lurg'] = 'lung'
    deprecated_organ_names['lver'] = 'liver'
    deprecated_organ_names['l-intestine'] = 'large intestine'
    deprecated_organ_names['l.intest'] = 'large intestine'
    deprecated_organ_names['large'] = 'large intestine'
    deprecated_organ_names['largeintestine'] = 'large intestine'
    
    deprecated_organ_names['muscles'] = 'muscle'
    deprecated_organ_names['musclo'] = 'muscle'
    deprecated_organ_names['mwclc'] = 'muscle'
    deprecated_organ_names['midbrain'] = 'brain'

    deprecated_organ_names['medalla'] = 'medulla'
    deprecated_organ_names['modulla'] = 'medulla'
    
    deprecated_organ_names['gonads'] = 'gonad'

    deprecated_organ_names['ovaries'] = 'ovary'
    deprecated_organ_names['occipitalcortex'] = 'brain'
    deprecated_organ_names['olfac.tub.'] = 'olfactorybulb'

    deprecated_organ_names['pans'] = 'pancreas'
    deprecated_organ_names['parictalcortex'] = 'brain'
    deprecated_organ_names['pinealbod'] = 'brain'
    deprecated_organ_names['pituitary'] = 'brain'
    deprecated_organ_names['pituitarygland'] = 'brain'
    deprecated_organ_names['plasma'] = 'blood'
    
    deprecated_organ_names['restbrain'] = 'brain'
    deprecated_organ_names['restofbrain'] = 'brain'

    deprecated_organ_names['s-intestine'] = 'small intestine'
    deprecated_organ_names['s.intest'] = 'small intestine'
    deprecated_organ_names['s.intestine'] = 'small intestine'
    deprecated_organ_names['small'] = 'small intestine'
    deprecated_organ_names['smalintestine'] = 'small intestine'
    deprecated_organ_names['smallintestine'] = 'small intestine'
    deprecated_organ_names['smallinterstine'] = 'small intestine'
    deprecated_organ_names['smallintestines'] = 'small intestine'
    deprecated_organ_names['smallintestme'] = 'small intestine'
    deprecated_organ_names['salivaryglands'] = 'salivarygland'
    deprecated_organ_names['salivary gland'] = 'salivarygland'
    deprecated_organ_names['splee'] = 'spleen'
    deprecated_organ_names['skull'] = 'bone'
    deprecated_organ_names['stomachb'] = 'stomach'
    deprecated_organ_names['stomachc'] = 'stomach'
    deprecated_organ_names['striatum'] = 'brain'
    deprecated_organ_names['stratum'] = 'brain'
    deprecated_organ_names['striaturn'] = 'brain'
    deprecated_organ_names['striatam'] = 'brain'
    deprecated_organ_names['splee'] = 'spleen'
        
    deprecated_organ_names['testes'] = 'testis'
    deprecated_organ_names['testicle'] = 'testis'
    deprecated_organ_names['thalamas'] = 'thalamus'
    deprecated_organ_names['thyroidc'] = 'thyroid'
    deprecated_organ_names['thyroidgland'] = 'thyroid'

    deprecated_organ_names['urineb'] = 'bladder'

    deprecated_organ_names['whitefat'] = 'fat'
    deprecated_organ_names['wholebrain'] = 'brain'
       
    return deprecated_organ_names

def get_removed_organ_names():
    removed_organ_names = list()
    removed_organ_names.append('')
    removed_organ_names.append(' ')
    removed_organ_names.append('c6tumor')
    removed_organ_names.append('caecum')
    removed_organ_names.append('carcass')
    removed_organ_names.append('cartilage')
    removed_organ_names.append('caudate')
    removed_organ_names.append('cervicalcord')
    removed_organ_names.append('coecum')
    removed_organ_names.append('duodenum')
    removed_organ_names.append('erythrocyte')
    removed_organ_names.append('esophagus')
    removed_organ_names.append('fat,mesentrial')
    removed_organ_names.append('inferiorcolliculus')
    removed_organ_names.append('oesophagus')
    removed_organ_names.append('superiorcolliculus')
    removed_organ_names.append('trachea')
    removed_organ_names.append('tumor')
    removed_organ_names.append('tumor:bloodratio')
    removed_organ_names.append('tumor:lungratio')
    removed_organ_names.append('tumor:muscleratio')
    removed_organ_names.append('tumour')
    removed_organ_names.append('uterus/blood')
    removed_organ_names.append('uterus/muscle')
    removed_organ_names.append('uterus-to-blood')
    removed_organ_names.append('uterus-to-muscle')
    removed_organ_names.append('xenograftcontrol')
    removed_organ_names.append('xenografttk+')
    return removed_organ_names

In [57]:
# 保存化合物名与其mol文件路径以及图片路径的映射，优化制表的遍历效率
compound_name2file_map = dict()
compound_name2img_map = dict()
"""替换器官名映射表"""
deprecated_organ_names = get_deprecated_organ_names()
# 无用器官名列表
removed_organ_names = get_removed_organ_names()

# base_dir = os.getcwd()
# print("base dir: ", base_dir)

# 数据集的mol文件路径
mol_files = []
data_path = "./data"
# data_path = "./test"
if not os.path.exists(data_path):
    os.makedirs(data_path)

# 文件夹1-img用于存放数据集化合物图片
savepic = "./img"
if not os.path.exists(savepic):
    os.makedirs(savepic)
# 数据集汇总表存放路径
cur_time = datetime.now().strftime("%Y%m%d")
folder_path = f"./result/{cur_time}"
excel_path = f"{folder_path}/数据表汇总.xlsx"

data_list = os.listdir(data_path)
for file in data_list:
    if file.endswith(".mol"):
        mol_file = os.path.join(data_path, file)
        compound_name = os.path.splitext(file)[0]
        compound_name2file_map[compound_name] = mol_file
        mol_files.append(mol_file)
# print(compound_name2file_map)

if not os.path.exists(folder_path):
    os.makedirs(folder_path)
if not os.path.exists(excel_path):
    wkc = openpyxl.Workbook(excel_path)
    wkc.save(excel_path)

# # 数据集汇总表存放路径
# filepath = "./数据表汇总测试10-8.xlsx"
# # 若路径下无汇总表则新建
# if not os.path.exists(filepath):
#     wkc = openpyxl.Workbook(filepath)
#     wkc.save(filepath)
# # 打开数据汇总表
# wbc = openpyxl.load_workbook(filepath)
# # 操作当前相应的表
# wsc = wbc.active
# 记录出错的文件
errorfile = []

# 分散数据集中对于时间间隔一栏（首栏）的叫法不一，会影响数据的收集，用列表记录用于函数处理
denied_interval_markers = ['', 'time', 'organ', 'time(min)', 'tissue', 'organs', 'tissues', '% id/organ', '%id/organ', 'organ distribution', 'organdistribution', 'regin', 'organ（body）']

In [58]:
def init_workbook_dataframe():
    """
        初始化带全部列头的dataframe, 用于将读取到的excel数据填入其中

        Return:
            包含所有列头的空DataFrame
    """
    
    organ_lists = ['blood', 'brain', 'brain/blooda', 'brain:bloodratio','heart', 'heart/blood',

                    'abdominalaorta', 'adrenal', 'amygdala', 'aorta', 'at',

                    'bile', 'bladder', 'bone', 'bow', 

                    'colon', 'corpusstriatum',
                    
                    'eye',

                    'fat', 'feces',
                    # gonad hard. 
                    'git', 'gitract', 'gonad', 'gut', 'hardergland',

                    'intestine',

                    'kidney',

                    'large intestine', 'lean', 'liver', 'llmg', 'lung',
                    
                    'marrow', 'medulla', 'muscle',

                    'olfactorybulb', 'ovary',

                    'pancreas', 'pocs', 'pons', 'pons-medulla', 'prostate', 

                    'region', 'remainder',

                    'small intestine', 'salivarygland', 'septum', 'skin', 'spleen', 'stiatum', 'stomach', 'submandibular', 
                    
                    'tail', 'testis', 'thalamus', 'thymus', 'thyroid', 'trachea', 'tissue',
                    
                    'urinarybladder', 'urine', 'uterus'
                ]

    time_intervals = [0.25, 0.5, 1, 2, 5, 7, 10, 12, 13, 15, 20, 30, 
                    40, 45, 55, 60, 70, 75, 80, 90, 100, 105, 110, 120, 
                    125, 150, 180, 220, 240, 300, 330, 360, 440, 
                    480, 550, 720, 1080, 1440, 2120, 2880]
    headers = ['Compound index']
    for organ in organ_lists:
        for time in time_intervals:
            if str(time).find("*") != -1:
                headers.append(organ + " mean" + str(time)[:-1] + "min*")
                headers.append(organ + " sd" + str(time)[:-1] + "min*")
            else:
                headers.append(organ + " mean" + str(time) + "min")
                headers.append(organ + " sd" + str(time) + "min")
    df = pd.DataFrame(columns=headers)
    return df


In [59]:
def get_DataFrame_from_workbook(workbook):
	"""
		用于从excel文件中读取药物的各种数据, 将数据进行处理并打包成DataFrame输入到总表中

		Args:
			workbook: excel文件的文件路径
		Return:
			处理所有workbook内数据的DataFrame
	"""
	try:
		temp_wb = openpyxl.load_workbook(workbook)
	except FileNotFoundError as e:
		print(e)
		return None
	temp_ws = temp_wb.active
	compound_index = os.path.splitext(os.path.split(workbook)[-1])[0]
	sheet_data = dict()
	# 判断处理的是否是第一行列表头
	is_header_row = True
	# 保存列表头数据（时间点数据）的列表
	time_headers = []

	for row in temp_ws.rows:
		# 记录第一行的列头（保存着时间点数据）
		if is_header_row:
			for cell in row:
				if cell.value is not None:
					time_header = str(cell.value).strip().replace(
					    " ", "").replace("\n", "").lower()
					# 判断列头是否是被拒绝接受的，是则跳过不处理，否则为正常的时间点数据
					if time_header not in denied_interval_markers:
						# 修正由于OCR识别问题导致的字符错误
						time_header = time_header.replace('mim', 'min').replace('minb', 'min').replace('minc', 'min').replace(
						    'miu', 'min').replace('meanm', 'mean').replace('sem', 'sd').replace('se', 'sd').replace('mn', 'min')
					else:
						continue
					# 存在部分时间点数据缺少时间单位，默认附上min
					if not time_header.endswith("min") and not time_header.endswith("h"):
						time_header = time_header + "min"
					# 存在部分时间点数据的单位是小时，转换为分钟
					if time_header[-1] == 'h':
						try:
							index = time_header.find('mean')
							if index != -1:
								index = index + 4
							else:
								index = time_header.find('sd')
								if index != -1:
									index = index + 2
							if index != -1:
								hour = int(time_header[index:-1])
								time_header = time_header[:index] + str(hour * 60) + 'min'
							else:
								print("Data missed in {}: {}".format(compound_index, time_header))
								continue
						except ValueError as e:
							print(e)
							print(compound_index)
							print(time_header)
							print()
						# 存在部分时序列头的时间数字缺失，输出错误的数据并防止输入到总数据集中
					if time_header != 'sdmin' and time_header != 'meanmin':
						time_headers.append(time_header)
					else:
						print("Invalid header format of {}: {}".format(compound_index, time_header))

			# 部分数据文件中的数据并非从第一行开始，通过判断列表的长度可以充当跳过前面空行的作用
			if len(time_headers) > 0:
				is_header_row = False
				# 试图找出错误的时间列头的列表
				if str(time_headers[0]).find('mean') == -1 and str(time_headers[0]).find('sd') == -1:
					print("Wrong headers of {}: {}".format(compound_index, time_headers))
		# 带数值的列表数据
		else:
			# 将一行的数据先置入列表中，再根据行名与行数据切割成字典形式并保存到sheet_data，这一步等同于将行转换为列
			temp_list = []
			for cell in row:
				if cell.value is not None:
					temp_list.append(str(cell.value).strip().replace(" ", "").replace("\n", ""))
			if len(temp_list) > 0:
				organ_name = str(temp_list[0]).lower()
				sheet_data[organ_name] = temp_list[1:]

	# 将时间表头与器官名进行组合，用于置入DataFrame成为新的表头
	organs = list(sheet_data.keys())
	extended_headers = ['Compound index']
	try:
		for organ in organs:
			for time_header in time_headers:
				# 若器官名是被取消的，跳过
				if organ not in removed_organ_names:
					# 若器官名是需要被替换的，替换
					if deprecated_organ_names.get(organ) is not None:
						organ = deprecated_organ_names.get(organ)
					extended_headers.append(str.lower(" ".join([str(organ), str(time_header)])))
	except Exception as e:
		print(e)
		print("Problem compound index: ", compound_index)
		print("Organ: ", organ)
		print("time_header: ", time_header)
		print("extended_headers: ", extended_headers)
		print()
	# 设置DataFrame并写入化合物编号
	df = pd.DataFrame(columns=extended_headers)
	df[extended_headers[0]] = [compound_index]

	# 遍历表内其他数据，并写入到对应的列中
	for organ_name, organ_data in sheet_data.items():
		# 跳过organ数据
		# if organ_name == organs[0]:
		# 	continue
		if organ_name not in removed_organ_names:
			if deprecated_organ_names.get(organ_name) is not None:
				organ_name = deprecated_organ_names.get(organ_name)
			cur = 0
			try:
				for data in organ_data:
					time_header = str.lower(' '.join([str(organ_name), str(time_headers[cur])]))
					df[time_header] = [data]
					cur = cur + 1
			except IndexError as e:
				print("Sheet data: ", sheet_data)
				print("Organs list: ", organs)
				print("Headers list: ", time_headers)
				print("Problem organ name:", organ_name)
				print("Problem organ data:", data)
				print("Cursor index: ", cur)
				print("Problem compound index: ", compound_index)
				print(e)
				print()
				continue
	return df

In [60]:
"""
    把mol文件生成的化合物图片保存到文件夹1-img中
"""
# 读取数据集文件
for mol_file in tqdm(mol_files):
    try:
        # 确认文件后缀名是否为mol
        split_path = os.path.splitext(mol_file)
        if split_path[-1] == '.mol':
            # 筛选路径，获得文件名
            compound_name = os.path.split(split_path[0])[-1]
            mol = Chem.MolFromMolFile(mol_file)
            # Draw.MolToImage(mol, size=(120, 120), kekulize=True)
            img_path = savepic + '/' + compound_name + '.png'
            Draw.MolToFile(mol, img_path, size=(120, 120))
            
            compound_name2img_map[compound_name] = img_path
    except (FileNotFoundError, OSError) as e:
        print(e)
        errorfile.append(mol_file)
# print(compound_name2img_map)

 13%|█▎        | 109/832 [00:00<00:04, 167.69it/s]

Bad input file ./data\1069A-16β-F-DHT;18F-2.mol
Bad input file ./data\1069A-16β-F-Mib;18F-6.mol
Bad input file ./data\1069A-16β-F-MNT;18F-7.mol
Bad input file ./data\1069A-16β-F-MNT;18F-9.mol
Bad input file ./data\1069A-16β-F-T;18F-4.mol


 81%|████████  | 671/832 [00:04<00:01, 122.25it/s]

Bad input file ./data\1620A-18F-ﬂuoroisonicotinic.mol


 88%|████████▊ | 728/832 [00:05<00:00, 162.61it/s]

Bad input file ./data\1672C-18F-β-CFT-FP.mol


100%|██████████| 832/832 [00:06<00:00, 130.90it/s]


In [61]:
# 读取excel文件并写入excel表格
# main_df = pd.DataFrame()
main_df = init_workbook_dataframe()

for compound_name, compound_file in tqdm(compound_name2file_map.items()):
    if compound_name is not None:
        # 获得mol文件对应的excel文件并读取数据
        compound_excel_name = compound_file.replace("mol", "xlsx")
        df = get_DataFrame_from_workbook(compound_excel_name)

        if df is not None:
            try:
                # 抛弃index来让这两个dataframe可以合并
                # main_df = main_df.reset_index(drop=True)
                # df = df.reset_index(drop=True)
                main_df = pd.concat([main_df, df], axis=0, join='inner')
                # main_df = main_df.append(df)
            except (InvalidIndexError) as e:
                # print(main_df)
                # print(df)
                print(compound_file)
                print(e)



  2%|▏         | 20/832 [00:01<00:48, 16.89it/s]

./data\1031-18F5.mol
Reindexing only valid with uniquely valued Index objects


  3%|▎         | 22/832 [00:01<00:48, 16.60it/s]

./data\1037B-18F3.mol
Reindexing only valid with uniquely valued Index objects


  3%|▎         | 26/832 [00:01<01:18, 10.33it/s]

./data\1038-18F1.mol
Reindexing only valid with uniquely valued Index objects


  5%|▌         | 42/832 [00:03<01:52,  7.05it/s]

./data\1051-18F4.mol
Reindexing only valid with uniquely valued Index objects


  5%|▌         | 45/832 [00:04<01:44,  7.50it/s]

./data\1056-11C4.mol
Reindexing only valid with uniquely valued Index objects
./data\1056-18F2.mol
Reindexing only valid with uniquely valued Index objects


  6%|▌         | 48/832 [00:04<01:21,  9.57it/s]

./data\1056-18F3.mol
Reindexing only valid with uniquely valued Index objects


  6%|▋         | 54/832 [00:04<01:03, 12.22it/s]

./data\1063-18F-2a.mol
Reindexing only valid with uniquely valued Index objects
./data\1063-18F-2b.mol
Reindexing only valid with uniquely valued Index objects


  7%|▋         | 58/832 [00:05<01:10, 10.96it/s]

./data\1063-18F-2c.mol
Reindexing only valid with uniquely valued Index objects
./data\1063-FES.mol
Reindexing only valid with uniquely valued Index objects


 13%|█▎        | 105/832 [00:09<00:48, 15.07it/s]

./data\1077-18F-2.mol
Reindexing only valid with uniquely valued Index objects


 13%|█▎        | 109/832 [00:09<00:56, 12.76it/s]

./data\1081-18F-1.mol
Reindexing only valid with uniquely valued Index objects
./data\1081-18F-2.mol
Reindexing only valid with uniquely valued Index objects
./data\1081-18F-3.mol
Reindexing only valid with uniquely valued Index objects


 14%|█▎        | 113/832 [00:09<00:54, 13.10it/s]

./data\1081-18F-4.mol
Reindexing only valid with uniquely valued Index objects
./data\1082A-18F28.mol
Reindexing only valid with uniquely valued Index objects
./data\1082A-18F29.mol
Reindexing only valid with uniquely valued Index objects
./data\1082A-18F30.mol
Reindexing only valid with uniquely valued Index objects


 14%|█▍        | 117/832 [00:10<00:50, 14.23it/s]

./data\1082A-18F31.mol
Reindexing only valid with uniquely valued Index objects
./data\1082A-18F40.mol
Reindexing only valid with uniquely valued Index objects
./data\1082A-18F41.mol
Reindexing only valid with uniquely valued Index objects
./data\1082A-18F42.mol
Reindexing only valid with uniquely valued Index objects


 14%|█▍        | 119/832 [00:10<00:52, 13.53it/s]

./data\1083-18F-1.mol
Reindexing only valid with uniquely valued Index objects
./data\1083-18F-2.mol
Reindexing only valid with uniquely valued Index objects
./data\1083-18F-3.mol
Reindexing only valid with uniquely valued Index objects


 15%|█▍        | 123/832 [00:10<00:57, 12.29it/s]

./data\1083-18F-4.mol
Reindexing only valid with uniquely valued Index objects


 16%|█▌        | 131/832 [00:11<01:32,  7.61it/s]

./data\1093-18F19.mol
Reindexing only valid with uniquely valued Index objects
./data\1093-18F4.mol
Reindexing only valid with uniquely valued Index objects


 16%|█▌        | 135/832 [00:12<01:22,  8.44it/s]

./data\1095-18F6d.mol
Reindexing only valid with uniquely valued Index objects


 19%|█▉        | 159/832 [00:13<00:48, 13.90it/s]

./data\1133-18F1a.mol
Reindexing only valid with uniquely valued Index objects
./data\1133-18F3.mol
Reindexing only valid with uniquely valued Index objects
./data\1133-18F4.mol
Reindexing only valid with uniquely valued Index objects


 20%|██        | 167/832 [00:14<00:49, 13.50it/s]

./data\1136-18F10.mol
Reindexing only valid with uniquely valued Index objects


 20%|██        | 169/832 [00:14<01:12,  9.14it/s]

./data\1136-18F11.mol
Reindexing only valid with uniquely valued Index objects
./data\1136-18F15.mol
Reindexing only valid with uniquely valued Index objects


 21%|██        | 171/832 [00:15<01:26,  7.65it/s]

./data\1136-18F2.mol
Reindexing only valid with uniquely valued Index objects
./data\1136-18F5.mol
Reindexing only valid with uniquely valued Index objects


 21%|██        | 175/832 [00:15<01:26,  7.59it/s]

./data\1141A-18F2a.mol
Reindexing only valid with uniquely valued Index objects
./data\1141A-18F2b.mol
Reindexing only valid with uniquely valued Index objects


 21%|██▏       | 177/832 [00:15<01:27,  7.47it/s]

./data\1141A-18F2c.mol
Reindexing only valid with uniquely valued Index objects
./data\1141A-18F2d.mol
Reindexing only valid with uniquely valued Index objects


 22%|██▏       | 179/832 [00:16<01:27,  7.49it/s]

./data\1141A-18F2e.mol
Reindexing only valid with uniquely valued Index objects
./data\1141A-18F2f.mol
Reindexing only valid with uniquely valued Index objects


 23%|██▎       | 190/832 [00:17<00:46, 13.74it/s]

./data\1149-18F22.mol
Reindexing only valid with uniquely valued Index objects


 24%|██▍       | 198/832 [00:17<00:44, 14.31it/s]

./data\1163-15a.mol
Reindexing only valid with uniquely valued Index objects
./data\1163-15b.mol
Reindexing only valid with uniquely valued Index objects
./data\1163-15c.mol
Reindexing only valid with uniquely valued Index objects


 25%|██▍       | 206/832 [00:18<00:58, 10.77it/s]

./data\1170A-18FRU52461.mol
Reindexing only valid with uniquely valued Index objects
./data\1170B-18F10.mol
Reindexing only valid with uniquely valued Index objects


 25%|██▌       | 208/832 [00:18<01:03,  9.81it/s]

./data\1170B-18F11.mol
Reindexing only valid with uniquely valued Index objects
./data\1170B-18F15.mol
Reindexing only valid with uniquely valued Index objects


 25%|██▌       | 210/832 [00:18<01:26,  7.21it/s]

./data\1170B-18F2.mol
Reindexing only valid with uniquely valued Index objects
./data\1170B-18F5.mol
Reindexing only valid with uniquely valued Index objects


 26%|██▋       | 220/832 [00:19<00:45, 13.57it/s]

./data\1182-18F1a.mol
Reindexing only valid with uniquely valued Index objects


 27%|██▋       | 224/832 [00:20<00:48, 12.59it/s]

./data\1187A-18F5a.mol
Reindexing only valid with uniquely valued Index objects
./data\1187A-18F5b.mol
Reindexing only valid with uniquely valued Index objects
./data\1187A-18F5c.mol
Reindexing only valid with uniquely valued Index objects


 28%|██▊       | 229/832 [00:20<00:50, 11.98it/s]

./data\1189C-18F6.mol
Reindexing only valid with uniquely valued Index objects
./data\1194A-42.mol
Reindexing only valid with uniquely valued Index objects


 28%|██▊       | 231/832 [00:20<00:52, 11.49it/s]

./data\1194A-43.mol
Reindexing only valid with uniquely valued Index objects


 29%|██▉       | 244/832 [00:21<00:36, 16.20it/s]

./data\1202-15a.mol
Reindexing only valid with uniquely valued Index objects
./data\1202-15b.mol
Reindexing only valid with uniquely valued Index objects
./data\1202-15c.mol
Reindexing only valid with uniquely valued Index objects


 30%|██▉       | 248/832 [00:21<00:43, 13.44it/s]

./data\1205-18F2a.mol
Reindexing only valid with uniquely valued Index objects
./data\1205-18F3a.mol
Reindexing only valid with uniquely valued Index objects


 30%|███       | 252/832 [00:22<00:56, 10.22it/s]

./data\1205-18F4a.mol
Reindexing only valid with uniquely valued Index objects
./data\1209A-18F-2a.mol
Reindexing only valid with uniquely valued Index objects
./data\1209A-18F-2b.mol
Reindexing only valid with uniquely valued Index objects


 31%|███       | 254/832 [00:22<01:17,  7.47it/s]

./data\1209A-18F-2c.mol
Reindexing only valid with uniquely valued Index objects


 31%|███       | 255/832 [00:23<01:41,  5.70it/s]

./data\1209A-18F-2d.mol
Reindexing only valid with uniquely valued Index objects


 31%|███       | 257/832 [00:23<01:43,  5.58it/s]

./data\1209A-18F-2e.mol
Reindexing only valid with uniquely valued Index objects
./data\1209A-18F-2f.mol
Reindexing only valid with uniquely valued Index objects


 31%|███▏      | 260/832 [00:23<01:12,  7.90it/s]

./data\1210-18F 4-PBFPB.mol
Reindexing only valid with uniquely valued Index objects


 33%|███▎      | 272/832 [00:24<00:50, 11.17it/s]

./data\1230A-18F-FU.mol
Reindexing only valid with uniquely valued Index objects


 35%|███▍      | 290/832 [00:26<00:43, 12.57it/s]

./data\1248-11CPIB.mol
Reindexing only valid with uniquely valued Index objects
./data\1248-18F3a.mol
Reindexing only valid with uniquely valued Index objects


 35%|███▌      | 292/832 [00:26<00:51, 10.58it/s]

./data\1248-18F3b.mol
Reindexing only valid with uniquely valued Index objects
./data\1248-18F5.mol
Reindexing only valid with uniquely valued Index objects


 35%|███▌      | 294/832 [00:26<01:05,  8.16it/s]

./data\1251-11CPIB.mol
Reindexing only valid with uniquely valued Index objects
./data\1251-18F2.mol
Reindexing only valid with uniquely valued Index objects


 35%|███▌      | 295/832 [00:26<01:16,  6.98it/s]

./data\1251-18F3.mol
Reindexing only valid with uniquely valued Index objects
./data\1251-18F6.mol
Reindexing only valid with uniquely valued Index objects


 36%|███▌      | 298/832 [00:27<01:27,  6.11it/s]

./data\1251-18FKS28.mol
Reindexing only valid with uniquely valued Index objects
./data\1253A-18FDFA.mol
Reindexing only valid with uniquely valued Index objects


 37%|███▋      | 310/832 [00:28<00:34, 15.03it/s]

./data\1262D-4-18F-FMR.mol
Reindexing only valid with uniquely valued Index objects
./data\1262D-6-18F-FMR.mol
Reindexing only valid with uniquely valued Index objects


 38%|███▊      | 318/832 [00:28<00:43, 11.77it/s]

./data\1268-18F-IV.mol
Reindexing only valid with uniquely valued Index objects


 41%|████      | 339/832 [00:30<00:38, 12.73it/s]

./data\1314-FAGal.mol
Reindexing only valid with uniquely valued Index objects
./data\1314-FAGlu.mol
Reindexing only valid with uniquely valued Index objects
./data\1314-FAMan.mol
Reindexing only valid with uniquely valued Index objects


 41%|████      | 341/832 [00:31<01:23,  5.88it/s]

./data\1315-18FFAG.mol
Reindexing only valid with uniquely valued Index objects


 44%|████▍     | 368/832 [00:34<00:52,  8.77it/s]

./data\1344-18F-FPP.mol
Reindexing only valid with uniquely valued Index objects


 45%|████▍     | 371/832 [00:34<01:03,  7.27it/s]

./data\1352-18FFPND.mol
Reindexing only valid with uniquely valued Index objects


 45%|████▍     | 372/832 [00:35<01:31,  5.00it/s]

./data\1354-18F2.mol
Reindexing only valid with uniquely valued Index objects


 45%|████▍     | 373/832 [00:36<02:22,  3.22it/s]

./data\1354-18F3.mol
Reindexing only valid with uniquely valued Index objects


 45%|████▌     | 377/832 [00:36<01:29,  5.10it/s]

./data\1354-18F4.mol
Reindexing only valid with uniquely valued Index objects


 46%|████▌     | 379/832 [00:36<01:16,  5.92it/s]

./data\1358-11C-AMP.mol
Reindexing only valid with uniquely valued Index objects
./data\1358-11C-BMP.mol
Reindexing only valid with uniquely valued Index objects


 46%|████▌     | 382/832 [00:37<00:53,  8.36it/s]

./data\1358-18F-BMP.mol
Reindexing only valid with uniquely valued Index objects


 47%|████▋     | 390/832 [00:37<00:47,  9.32it/s]

./data\1384-5a.mol
Reindexing only valid with uniquely valued Index objects
./data\1384-5b.mol
Reindexing only valid with uniquely valued Index objects
./data\1384-5c.mol
Reindexing only valid with uniquely valued Index objects


 49%|████▉     | 411/832 [00:40<01:23,  5.02it/s]

./data\1406-18FDOA.mol
Reindexing only valid with uniquely valued Index objects


 50%|████▉     | 413/832 [00:41<01:15,  5.53it/s]

./data\1413A-18FO-PEt-PD3.mol
Reindexing only valid with uniquely valued Index objects


 53%|█████▎    | 443/832 [00:43<00:32, 12.00it/s]

./data\1428D-cis-4-FCWAY.mol
Reindexing only valid with uniquely valued Index objects
./data\1428D-trans-4-FCWAY.mol
Reindexing only valid with uniquely valued Index objects


 54%|█████▍    | 448/832 [00:44<01:14,  5.16it/s]

./data\1432-2S-18F1.mol
Reindexing only valid with uniquely valued Index objects


 55%|█████▌    | 458/832 [00:45<00:41,  9.05it/s]

./data\1440-18F-JNJ41510417.mol
Reindexing only valid with uniquely valued Index objects


 56%|█████▌    | 464/832 [00:46<00:42,  8.61it/s]

./data\1444E-F18FDMPPF.mol
Reindexing only valid with uniquely valued Index objects


 57%|█████▋    | 471/832 [00:47<00:42,  8.41it/s]

./data\1446-125I4.mol
Reindexing only valid with uniquely valued Index objects
./data\1446-125I6.mol
Reindexing only valid with uniquely valued Index objects


 57%|█████▋    | 473/832 [00:47<00:45,  7.84it/s]

./data\1447-18F1.mol
Reindexing only valid with uniquely valued Index objects
./data\1450-18FT807.mol
Reindexing only valid with uniquely valued Index objects


 57%|█████▋    | 475/832 [00:48<00:53,  6.73it/s]

./data\1460-18a.mol
Reindexing only valid with uniquely valued Index objects


 57%|█████▋    | 476/832 [00:48<01:16,  4.65it/s]

./data\1460-18b.mol
Reindexing only valid with uniquely valued Index objects


 57%|█████▋    | 477/832 [00:48<01:29,  3.97it/s]

./data\1460-18c.mol
Reindexing only valid with uniquely valued Index objects


 58%|█████▊    | 479/832 [00:49<01:12,  4.87it/s]

./data\1460-18F16.mol
Reindexing only valid with uniquely valued Index objects


 58%|█████▊    | 484/832 [00:49<00:47,  7.27it/s]

./data\1470-18FAZ11637326.mol
Reindexing only valid with uniquely valued Index objects


 59%|█████▉    | 495/832 [00:50<00:27, 12.40it/s]

./data\1489-18FDCFBC.mol
Reindexing only valid with uniquely valued Index objects


 60%|█████▉    | 499/832 [00:51<00:49,  6.75it/s]

./data\1490-18F3.mol
Reindexing only valid with uniquely valued Index objects


 60%|██████    | 501/832 [00:51<00:44,  7.46it/s]

./data\1494-p-18FDMPPF.mol
Reindexing only valid with uniquely valued Index objects


 61%|██████    | 504/832 [00:52<00:55,  5.88it/s]

./data\1494-p18FDMPPF.mol
Reindexing only valid with uniquely valued Index objects


  warn(msg)
 62%|██████▏   | 515/832 [00:53<00:35,  8.95it/s]

./data\1505A-FFMMT.mol
Reindexing only valid with uniquely valued Index objects
./data\1509B-11C-pipzA-4.mol
Reindexing only valid with uniquely valued Index objects


 62%|██████▏   | 517/832 [00:54<00:41,  7.67it/s]

./data\1509B-18F-FA-4.mol
Reindexing only valid with uniquely valued Index objects
./data\1510B-18F-FAA Ester.mol
Reindexing only valid with uniquely valued Index objects


 62%|██████▎   | 520/832 [00:54<00:38,  8.20it/s]

./data\1510B-18F-FAA.mol
Reindexing only valid with uniquely valued Index objects


 64%|██████▎   | 529/832 [00:55<00:38,  7.86it/s]

./data\1516A-18FFAG.mol
Reindexing only valid with uniquely valued Index objects


 65%|██████▍   | 537/832 [00:56<00:26, 10.96it/s]

./data\1531-18FFBFPA.mol
Reindexing only valid with uniquely valued Index objects
./data\1534-123IIBPPA.mol
Reindexing only valid with uniquely valued Index objects


 65%|██████▍   | 539/832 [00:56<00:42,  6.90it/s]

./data\1534-18FFBPPA.mol
Reindexing only valid with uniquely valued Index objects


 67%|██████▋   | 558/832 [00:58<00:22, 12.08it/s]

./data\1554-18FFDE.mol
Reindexing only valid with uniquely valued Index objects
./data\1554-18FFDP.mol
Reindexing only valid with uniquely valued Index objects


 69%|██████▉   | 576/832 [00:59<00:17, 14.25it/s]

./data\1564-18FE-TCP.mol
Reindexing only valid with uniquely valued Index objects


 72%|███████▏  | 603/832 [01:07<00:19, 11.65it/s]

./data\1589B-11C-SA4503.mol
Reindexing only valid with uniquely valued Index objects
./data\1589B-18F-FE-SA5845.mol
Reindexing only valid with uniquely valued Index objects


 74%|███████▎  | 612/832 [01:09<00:24,  9.03it/s]

./data\1594A-18FFETNIM.mol
Reindexing only valid with uniquely valued Index objects


 74%|███████▍  | 618/832 [01:09<00:20, 10.42it/s]

./data\1597A-18FFEtP4A.mol
Reindexing only valid with uniquely valued Index objects
./data\1597A-18FFEtP4OH.mol
Reindexing only valid with uniquely valued Index objects


 75%|███████▍  | 620/832 [01:10<00:24,  8.71it/s]

./data\1598-FECT.mol
Reindexing only valid with uniquely valued Index objects
./data\1598-FETT.mol
Reindexing only valid with uniquely valued Index objects


 75%|███████▍  | 622/832 [01:10<00:27,  7.51it/s]

./data\1598-RTI-31.mol
Reindexing only valid with uniquely valued Index objects
./data\1598-RTI-32.mol
Reindexing only valid with uniquely valued Index objects


 78%|███████▊  | 649/832 [01:11<00:11, 16.35it/s]

./data\1619A-18F-NCFHEB.mol
Reindexing only valid with uniquely valued Index objects


 78%|███████▊  | 651/832 [01:12<00:14, 12.49it/s]

./data\1623-11C-CGP-12388.mol
Reindexing only valid with uniquely valued Index objects


 81%|████████▏ | 678/832 [01:14<00:09, 16.64it/s]

./data\1644-11CYM-09151-2.mol
Reindexing only valid with uniquely valued Index objects
./data\1644-18F10b.mol
Reindexing only valid with uniquely valued Index objects
./data\1644-18F11b.mol
Reindexing only valid with uniquely valued Index objects


 82%|████████▏ | 680/832 [01:14<00:14, 10.84it/s]

./data\1645-18F-fluoroproxyfan.mol
Reindexing only valid with uniquely valued Index objects


 82%|████████▏ | 682/832 [01:14<00:16,  9.28it/s]

./data\1651A-18F-fluspidine.mol
Reindexing only valid with uniquely valued Index objects


 82%|████████▏ | 686/832 [01:15<00:17,  8.59it/s]

./data\1651B-18F1.mol
Reindexing only valid with uniquely valued Index objects


 83%|████████▎ | 692/832 [01:15<00:14,  9.45it/s]

./data\1658C-18F-FME-MCN.mol
Reindexing only valid with uniquely valued Index objects
./data\1659B-18F-FP-TZTP.mol
Reindexing only valid with uniquely valued Index objects
./data\1660A-18F-2a.mol
Reindexing only valid with uniquely valued Index objects


 83%|████████▎ | 694/832 [01:16<00:20,  6.70it/s]

./data\1660A-18F-2b.mol
Reindexing only valid with uniquely valued Index objects


 84%|████████▎ | 695/832 [01:16<00:22,  6.08it/s]

./data\1660A-18F-2c.mol
Reindexing only valid with uniquely valued Index objects


 84%|████████▍ | 697/832 [01:16<00:22,  6.03it/s]

./data\1660A-18F-2d.mol
Reindexing only valid with uniquely valued Index objects
./data\1660A-18F-2e.mol
Reindexing only valid with uniquely valued Index objects


 84%|████████▍ | 699/832 [01:17<00:21,  6.30it/s]

./data\1660A-18F-2f.mol
Reindexing only valid with uniquely valued Index objects
./data\1661B-18FFMNP.mol
Reindexing only valid with uniquely valued Index objects


 86%|████████▌ | 713/832 [01:18<00:12,  9.61it/s]

./data\1672C-18F-β-CFT-FP.mol
Reindexing only valid with uniquely valued Index objects


 86%|████████▌ | 717/832 [01:18<00:13,  8.71it/s]

./data\1675C-18F-FP-DTBZ.mol
Reindexing only valid with uniquely valued Index objects
./data\1675D-18FAV-133.mol
Reindexing only valid with uniquely valued Index objects
./data\1679B-18F-1.mol
Reindexing only valid with uniquely valued Index objects


 87%|████████▋ | 721/832 [01:19<00:11, 10.02it/s]

./data\1681-18F-FPCT.mol
Reindexing only valid with uniquely valued Index objects


 88%|████████▊ | 731/832 [01:20<00:11,  9.11it/s]

./data\1707-18F7a.mol
Reindexing only valid with uniquely valued Index objects


 88%|████████▊ | 735/832 [01:20<00:09,  9.74it/s]

./data\1712-18F-RR-2.mol
Reindexing only valid with uniquely valued Index objects
./data\1712-18F-RR-FQNPe.mol
Reindexing only valid with uniquely valued Index objects


 89%|████████▊ | 738/832 [01:21<00:11,  8.29it/s]

./data\1712-18F-RS-2.mol
Reindexing only valid with uniquely valued Index objects
./data\1712-18F-RS-FQNPe.mol
Reindexing only valid with uniquely valued Index objects


 89%|████████▉ | 743/832 [01:21<00:08, 10.35it/s]

./data\1717-18F-FTFMPP.mol
Reindexing only valid with uniquely valued Index objects


 90%|█████████ | 751/832 [01:22<00:05, 13.86it/s]

./data\1729-11C-HC3.mol
Reindexing only valid with uniquely valued Index objects
./data\1729-18F-HC3.mol
Reindexing only valid with uniquely valued Index objects
./data\1736D-123I-MIBG.mol
Reindexing only valid with uniquely valued Index objects


 91%|█████████ | 755/832 [01:22<00:08,  9.11it/s]

./data\1739A-18FFE@CIT.mol
Reindexing only valid with uniquely valued Index objects


 91%|█████████ | 757/832 [01:23<00:09,  7.70it/s]

./data\1739B-18FMCL-322.mol
Reindexing only valid with uniquely valued Index objects


 94%|█████████▍| 785/832 [01:25<00:02, 17.42it/s]

./data\1757A-NCQ115.mol
Reindexing only valid with uniquely valued Index objects


 95%|█████████▍| 789/832 [01:25<00:02, 16.23it/s]

./data\1764-18F-Nifrolidine.mol
Reindexing only valid with uniquely valued Index objects
./data\1769A-18FNS10743.mol
Reindexing only valid with uniquely valued Index objects


 95%|█████████▌| 793/832 [01:25<00:03,  9.94it/s]

./data\1770-18FNS14490.mol
Reindexing only valid with uniquely valued Index objects


 97%|█████████▋| 804/832 [01:26<00:01, 15.43it/s]

./data\1781-18FPF-9811.mol
Reindexing only valid with uniquely valued Index objects


 98%|█████████▊| 816/832 [01:27<00:01, 10.21it/s]

./data\1812-18FSpiro-FBT.mol
Reindexing only valid with uniquely valued Index objects


 98%|█████████▊| 818/832 [01:28<00:02,  6.69it/s]

./data\1814-18FSR144385.mol
Reindexing only valid with uniquely valued Index objects
./data\1814-18FSR147963.mol
Reindexing only valid with uniquely valued Index objects


100%|█████████▉| 830/832 [01:28<00:00, 13.71it/s]

./data\1827A-18F7a.mol
Reindexing only valid with uniquely valued Index objects


100%|██████████| 832/832 [01:29<00:00,  9.29it/s]

./data\1827B-18F1.mol
Reindexing only valid with uniquely valued Index objects





In [62]:
main_df = pd.DataFrame.dropna(main_df, axis=1, how='all')
main_df.insert(loc=1, column='Compound structure', value="")
main_df.insert(loc=1, column='SMILES', value="")
main_df.to_excel(excel_path, index=False, engine='openpyxl', encoding='utf-8')

In [63]:
"""
    使用openpyxl打开excel文件并进行设定
"""
# 打开数据汇总表
wbc = openpyxl.load_workbook(excel_path)
# 操作当前相应的表
wsc = wbc.active

# 调整列宽
wsc.column_dimensions['A'].width = 25
wsc.column_dimensions['B'].width = 50
wsc.row_dimensions[1].height = 30
SMILES_column = 2

alignment = Alignment(horizontal='left', vertical='center')

for col in tqdm(wsc.columns):
    for cell in col:
        cell.alignment = alignment
# wbc.save(excel_path)

3it [00:00, 96.06it/s]


In [64]:
"""
    插入SMILES
"""
# 读取smiles文件并写入excel表格
row = 2
for compound_name_cell in tqdm(wsc['A']):
    compound_file_name = compound_name2file_map.get(compound_name_cell.value)
    if compound_file_name is not None:
        try:
            writer = Chem.MolFromMolFile(compound_file_name)
            SMILES = Chem.MolToSmiles(writer)
        except OSError as e:
            print(e)
            row = row + 1
            continue
        # 把对应文件的smiles字符串填写到对应行的第四列
        wsc.cell(row, SMILES_column).value = SMILES
        wsc.cell(row, SMILES_column).alignment = alignment
        row = row + 1
# wbc.save(excel_path)

 35%|███▍      | 224/646 [00:00<00:00, 790.01it/s]

Bad input file ./data\1069A-16β-F-DHT;18F-2.mol
Bad input file ./data\1069A-16β-F-Mib;18F-6.mol
Bad input file ./data\1069A-16β-F-MNT;18F-7.mol
Bad input file ./data\1069A-16β-F-MNT;18F-9.mol
Bad input file ./data\1069A-16β-F-T;18F-4.mol


100%|██████████| 646/646 [00:00<00:00, 866.06it/s]

Bad input file ./data\1620A-18F-ﬂuoroisonicotinic.mol





In [65]:
"""图片写入到汇总表"""
# 记录当前操作的行数
row = 2
# 对map长度的计数器，防止map内数据已经使用完的情况下程序还在对excel进行行遍历
count = 0
map_length = len(compound_name2img_map)
# 调整列宽
wsc.column_dimensions['C'].width = 20


# from PIL import Image as PImage
# 读取A列的化合物名
try:
    for compound_name_cell in tqdm(wsc['A']):
        # if count == map_length:
        #     break
        compound_name = compound_name_cell.value
        # 跳过第一行
        if compound_name == '文献编号' or compound_name == '化合物编号':
            continue
        img_path = compound_name2img_map.get(compound_name)
        if img_path is not None:
            img = Image(img_path)
            # img = PImage.open(img_path).resize((120, 120))

            # 图片只保存在C列，只对C列每一行进行操作
            wsc.add_image(img, 'C' + str(row))
            # 调整行高
            wsc.row_dimensions[row].height = 96
            row = row + 1
            count = count + 1
except UnboundLocalError as e:
    print(e)
finally:
    wbc.save(excel_path)

    # # 读取savepic文件夹下的文件名
    # for img_name in os.listdir(savepic):
    #     img_compound_name = os.path.splitext(img_name)[0]
    #     if compound_name_cell.value == img_compound_name:
    #         img = Image(savepic + '/' + img_name)
    #         # 图片只保存在C列，只对C列每一行进行操作
    #         wsc.add_image(img, 'C' + str(row))
    #         row = row + 1
    #         # 调整列宽
    #         wsc.column_dimensions['C'].width = 28
    #         # 调整行高
    #         wsc.row_dimensions[row].height = 101
    #         wbc.save(filepath)
    #         break

100%|██████████| 646/646 [00:00<00:00, 888.25it/s] 


In [66]:
print(errorfile)

['./data\\1069A-16β-F-DHT;18F-2.mol', './data\\1069A-16β-F-Mib;18F-6.mol', './data\\1069A-16β-F-MNT;18F-7.mol', './data\\1069A-16β-F-MNT;18F-9.mol', './data\\1069A-16β-F-T;18F-4.mol', './data\\1620A-18F-ﬂuoroisonicotinic.mol', './data\\1672C-18F-β-CFT-FP.mol']
