In [1]:
from indigo import Indigo
indigo = Indigo()

import pandas as pd
from pathlib import Path
import os
import warnings
import json
import re
warnings.filterwarnings('ignore')
from tqdm import tqdm
import numpy as np
from drfp import DrfpEncoder
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 5000)
pd.set_option('display.max_rows', 1500)

In [None]:
#target structure
reaction_dict = {
    'doi': '', #дои статьи, откуда оно взялось
    'reaction_rxn': '', #реакция в формате rxn
    'products': [{'molfile': '',
                  'amount': 0, #in mmol
                  'weight': 0, #in g
                  'vol': 0, #in mL
                  'yield': 0 #in %
                 }], #список словарей для нескольких продуктов
    'reagents': [{'molfile': '',
                  'role': '', #reactant, reagent, catalyst
                  'amount': 0, #in mmol
                  'weight': 0, #in g
                  'vol': 0, #in mL
                 }],
    'scale': 0, # = максимальный amount среди продуктов
    'temperature': [0], #темпеатура в градусах Цельсия. Список так как может быть несколько температур
    'time': [0], #время реакции в часах. Список так как может быть несколько этапов
    'misc': [''], #список строк со всякими остальными условиями '20 mA', '450 nm 400 W'
    'solvent': [{'solvent_name': '', 'solvent_vol': 0}], #список словарей, где указаны объемы и названия растворителей
    'protocol': '' #сюда тупо строкой падает методика
}

In [2]:
df = pd.read_csv('/media/oleg/second_ssd/reactions_from_reaxys/total_rxn_df.csv', sep = '\t')

In [3]:
df.columns

Index(['Reaction ID', 'Reaction: Links to Reaxys', 'Data Count',
       'Number of Reaction Details', 'Reaction Rank', 'Record Type',
       'Reactant', 'Product', 'Bin', 'Reaction',
       'Reaction Details: Reaction Classification', 'Example label',
       'Example title', 'Fulltext of reaction', 'Number of Reaction Steps',
       'Multi-step Scheme', 'Multi-step Details', 'Number of Stages',
       'Solid Phase', 'Time (Reaction Details) [h]',
       'Temperature (Reaction Details) [C]',
       'Pressure (Reaction Details) [Torr]', 'pH-Value (Reaction Details)',
       'Other Conditions', 'Reaction Type', 'Subject Studied',
       'Prototype Reaction', 'Named Reaction',
       'Type of reaction description (Reaction Details)', 'Location',
       'Comment (Reaction Details)', 'Product.1', 'Yield', 'Yield (numerical)',
       'Yield (optical)', 'Stage Reactant', 'Reagent', 'Catalyst',
       'Solvent (Reaction Details)', 'References', 'Links to Reaxys'],
      dtype='object')

In [4]:
cols_to_del = ['Reaction ID', 'Reaction: Links to Reaxys', 'Data Count', 'Bin', 'Example label',
       'Example title', 'Reaction Type', 'Subject Studied',
       'Prototype Reaction', 'Named Reaction', 'Type of reaction description (Reaction Details)', 'Location',
       'Comment (Reaction Details)', 'Links to Reaxys' ]
df.drop(cols_to_del, axis = 1, inplace = True)

In [5]:
df.shape

(8370933, 27)

In [6]:
df['References'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 8370933 entries, 0 to 8370932
Series name: References
Non-Null Count    Dtype 
--------------    ----- 
4549717 non-null  object
dtypes: object(1)
memory usage: 63.9+ MB


In [7]:
df = df[~df['References'].isna()]

In [8]:
df.shape

(4549717, 27)

In [9]:
df

Unnamed: 0,Number of Reaction Details,Reaction Rank,Record Type,Reactant,Product,Reaction,Reaction Details: Reaction Classification,Fulltext of reaction,Number of Reaction Steps,Multi-step Scheme,Multi-step Details,Number of Stages,Solid Phase,Time (Reaction Details) [h],Temperature (Reaction Details) [C],Pressure (Reaction Details) [Torr],pH-Value (Reaction Details),Other Conditions,Product.1,Yield,Yield (numerical),Yield (optical),Stage Reactant,Reagent,Catalyst,Solvent (Reaction Details),References
0,1.0,81.0,full reaction; has preparation,2-Nitrobenzenesulfonamide; (S)-tert-butyl (4-cyclopropyl-4-hydroxy-2-methylenebutyl) carbonate,(S)-N-(4-cyclopropyl-4-hydroxy-2-methylenebutyl)-2-nitrobenzenesulfonamide,NS(=O)(=O)C1=C(C=CC=C1)N(=O)=O.CC(C)(C)OC(=O)OCC(=C)C[C@H](O)C1CC1>>O[C@@H](CC(=C)CNS(=O)(=O)C1=C(C=CC=C1)N(=O)=O)C1CC1,Preparation,,1.0,,,,,24,25,,,Inert atmosphere,(S)-N-(4-cyclopropyl-4-hydroxy-2-methylenebutyl)-2-nitrobenzenesulfonamide,73 percent,73,,,tetrakis(triphenylphosphine) palladium(0),,tetrahydrofuran,"Article; Luo, Guoshun; Xiang, Ming; Krische, Michael J.; Organic Letters; vol. 21; 8; (2019); p. 2493 - 2497;"
1,1.0,86.0,full reaction; has preparation,methyl 4-benzamido-2-hydroxybenzoate,4-benzoylamino-2-hydroxybenzoic acid,COC(=O)C1=C(O)C=C(NC(=O)C2=CC=CC=C2)C=C1>>OC(=O)C1=C(O)C=C(NC(=O)C2=CC=CC=C2)C=C1,Preparation,"To a solution of compound 4h (90 mg, 0.33 mmol) in THF(2 mL) was added aqueous NaOH (1.5 M, 8 mL). The mixturewas stirred at 50C for 2 h. After being cooled to room temperature,the solution was acidified with 2 M HCl to pH = 3-4.The precipitation was collected to give the compound 4h(62 mg, 73%). mp 263.1-264.7C; 1H-NMR (400 MHz, DMSOd6)delta: 10.48 (s, 1H), 8.05-7.89 (m, 2H), 7.76 (d, J = 8.7 Hz, 1H),7.67-7.44 (m, 4H), 7.34 (dd, J = 8.7, 2.0 Hz, 1H); 13C-NMR(101 MHz, DMSO-d6) delta: 172.11, 166.64, 162.44, 146.08, 135.06,132.40, 131.32, 128.93, 128.30, 111.59, 108.51, 107.38; ESI-MSm/z: 280.0563 (Calcd for C14H11NO4 [M + Na]+: 280.0580).",1.0,,,,,2,50,,,,4-benzoylamino-2-hydroxybenzoic acid,73 percent,73,,,water; sodium hydroxide,,,"Article; Liang, Jie; Tang, Yun-xiang; Tang, Xiang-zheng; Liang, Hua-ju; Gao, Yamin; Fang, Cuiting; Zhang, Tian-yu; Yan, Ming; Chemical and Pharmaceutical Bulletin; vol. 67; 4; (2019); p. 372 - 381;"
2,1.0,90.0,full reaction; has preparation,4-bromo-3-hydroxy-aniline; 4-trifluoromethyl-phenyl acetyl chloride,N-(4-bromo-3-hydroxyphenyl)-4-(trifluoromethyl)benzamide,NC1=CC=C(Br)C(O)=C1.FC(F)(F)C1=CC=C(C=C1)C(Cl)=O>>OC1=C(Br)C=CC(NC(=O)C2=CC=C(C=C2)C(F)(F)F)=C1,Preparation,"General procedure: A solution of BBr3 in dichloromethane (1.0 M, 12 mL,12 mmol) was added slowly to a solution of 4-bromo-3-methoxyaniline (800 mg, 3.96 mmol) in methylene chloride(15 mL) at 0C. The resulting brown solution was warmed toroom temperature and stirred for 24 h. After saturated aqueousNaHCO3 (30 mL) was added at 0C, the solution was extractedwith EtOAc (20 mL × 3). The combined organic layer wasdried with anhydrous Na2SO4, filtered and concentrated invacuum. The residue was purified by flash chromatographyover silica gel (petroleum-EtOAc = 2 : 1) to give 5-amino-2-bromophenol (665 mg, 88%).To a solution of 5-amino-2-bromophenol (55 mg, 0.29 mmol)and triethylamine (53 muL, 0.38 mmol) in tetrahydrofuran (THF)(3 mL) was added slowly benzoyl chloride (0.32 mmol) at 0C.The reaction mixture was then stirred at room temperature for30 min. After the reaction was quenched with water (10 mL),the solution was extracted with EtOAc (10 mL × 2). The combinedorganic layer was dried over anhydrous Na2SO4, filteredand concentrated in vacuum. The residue was purified bycolumn chromatography to afford the product 3 (74 mg, 87%).",1.0,,,,,0.5,0 - 20,,,,N-(4-bromo-3-hydroxyphenyl)-4-(trifluoromethyl)benzamide,73 percent,73,,,triethylamine,,tetrahydrofuran,"Article; Liang, Jie; Tang, Yun-xiang; Tang, Xiang-zheng; Liang, Hua-ju; Gao, Yamin; Fang, Cuiting; Zhang, Tian-yu; Yan, Ming; Chemical and Pharmaceutical Bulletin; vol. 67; 4; (2019); p. 372 - 381;"
3,1.0,90.0,full reaction; has preparation,4-bromo-3-hydroxy-aniline; 2-naphthaloyl chloride,N-(4-bromo-3-hydroxyphenyl)-2-naphthamide,NC1=CC=C(Br)C(O)=C1.ClC(=O)C1=CC2=CC=CC=C2C=C1>>OC1=C(Br)C=CC(NC(=O)C2=CC3=CC=CC=C3C=C2)=C1,Preparation,"General procedure: A solution of BBr3 in dichloromethane (1.0 M, 12 mL,12 mmol) was added slowly to a solution of 4-bromo-3-methoxyaniline (800 mg, 3.96 mmol) in methylene chloride(15 mL) at 0C. The resulting brown solution was warmed toroom temperature and stirred for 24 h. After saturated aqueousNaHCO3 (30 mL) was added at 0C, the solution was extractedwith EtOAc (20 mL × 3). The combined organic layer wasdried with anhydrous Na2SO4, filtered and concentrated invacuum. The residue was purified by flash chromatographyover silica gel (petroleum-EtOAc = 2 : 1) to give 5-amino-2-bromophenol (665 mg, 88%).To a solution of 5-amino-2-bromophenol (55 mg, 0.29 mmol)and triethylamine (53 muL, 0.38 mmol) in tetrahydrofuran (THF)(3 mL) was added slowly benzoyl chloride (0.32 mmol) at 0C.The reaction mixture was then stirred at room temperature for30 min. After the reaction was quenched with water (10 mL),the solution was extracted with EtOAc (10 mL × 2). The combinedorganic layer was dried over anhydrous Na2SO4, filteredand concentrated in vacuum. The residue was purified bycolumn chromatography to afford the product 3 (74 mg, 87%).",1.0,,,,,0.5,0 - 20,,,,N-(4-bromo-3-hydroxyphenyl)-2-naphthamide,73 percent,73,,,triethylamine,,tetrahydrofuran,"Article; Liang, Jie; Tang, Yun-xiang; Tang, Xiang-zheng; Liang, Hua-ju; Gao, Yamin; Fang, Cuiting; Zhang, Tian-yu; Yan, Ming; Chemical and Pharmaceutical Bulletin; vol. 67; 4; (2019); p. 372 - 381;"
4,1.0,66.0,full reaction; has preparation,C56H62O10,C42H50O10,C(OC1=C2CC3=CC=CC4=C3OCCOCCOCCOCCOCCOCCOCCOC3=C(CC1=CC=C2)C=CC=C3CC1=C(OCC2=CC=CC=C2)C(C4)=CC=C1)C1=CC=CC=C1>>OC1=C2CC3=CC=CC4=C3OCCOCCOCCOCCOCCOCCOCCOC3=C(CC1=CC=C2)C=CC=C3CC1=C(O)C(C4)=CC=C1,Preparation,,1.0,,,,,,,,,,C42H50O10,73 percent,73,,,palladium on activated charcoal; hydrogen,,,"Article; Tokunaga, Yuji; Hayakawa, Kentaroh; Miyashita, Junichi; Kawasaki, Tsuneomi; Miyagawa, Shinobu; Tetrahedron Letters; vol. 54; 50; (2013); p. 6829 - 6833;"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8163876,1.0,79.0,full reaction; has preparation,C12H13NO3; thiophenol,"2-methoxy-4-methyl-4-((phenylthio)methyl)isoquinoline-1,3(2H,4H)-dione",CON(C(=O)C(C)=C)C(=O)C1=CC=CC=C1.SC1=CC=CC=C1>>CON1C(=O)C2=C(C=CC=C2)C(C)(CSC2=CC=CC=C2)C1=O,Preparation,,1.0,,,,,12,60,,,Schlenk technique,"2-methoxy-4-methyl-4-((phenylthio)methyl)isoquinoline-1,3(2H,4H)-dione",78 percent,78,,,oxygen; toluene-4-sulfonic acid,,"1,2-dichloro-ethane","Article; Yuan, Yan-Qin; Kumar, Pailla Santhosh; Zhang, Chun-Niu; Yang, Ming-Hua; Guo, Sheng-Rong; Organic and Biomolecular Chemistry; vol. 15; 35; (2017); p. 7330 - 7338;"
8163877,1.0,79.0,full reaction; has preparation,N-methyl-N-methacryloyl-4-methylbenzamide; 4-t-butylbenzenethiol,"4-(((4-(tert-butyl)phenyl)thio)methyl)-2,4,6-trimethylisoquinoline-1,3(2H,4H)-dione",CN(C(=O)C(C)=C)C(=O)C1=CC=C(C)C=C1.CC(C)(C)C1=CC=C(S)C=C1>>CN1C(=O)C2=C(C=C(C)C=C2)C(C)(CSC2=CC=C(C=C2)C(C)(C)C)C1=O,Preparation,,1.0,,,,,12,60,,,Schlenk technique,"4-(((4-(tert-butyl)phenyl)thio)methyl)-2,4,6-trimethylisoquinoline-1,3(2H,4H)-dione",78 percent,78,,,oxygen; toluene-4-sulfonic acid,,"1,2-dichloro-ethane","Article; Yuan, Yan-Qin; Kumar, Pailla Santhosh; Zhang, Chun-Niu; Yang, Ming-Hua; Guo, Sheng-Rong; Organic and Biomolecular Chemistry; vol. 15; 35; (2017); p. 7330 - 7338;"
8163878,1.0,82.0,full reaction; has preparation,sodium cyanide; methanesulfonic acid but-3-ynyl ester,pent-4-ynenitrile,[Na+].[C-]#N.CS(=O)(=O)OCCC#C>>C#CCCC#N,Preparation,,1.0,,,,,1.75,20 - 70,,,,pent-4-ynenitrile,78 percent,78,,,triethylamine,,"N,N-dimethyl-formamide","Article; ?afa?, Peter; Marchalin, ?tefan; ?oral, Michal; Moncol, Jan; Daich, Adam; Organic Letters; vol. 19; 18; (2017); p. 4742 - 4745;"
8163879,1.0,91.0,full reaction; has preparation,6-ethyl-3alpha-hydroxy-7-oxo-5beta-cholan-24-oic acid ethyl ester; chloromethyl methyl ether,6-ethyl-3alpha-methyloxymethyl-7-oxo-5beta-cholan-24-oic acid ethyl ester,[H][C@@]1(CC[C@@]2([H])[C@]3([H])C(=O)C(CC)[C@]4([H])C[C@H](O)CC[C@]4(C)[C@@]3([H])CC[C@]12C)[C@H](C)CCC(=O)OCC.COCCl>>[H][C@@]1(CC[C@@]2([H])[C@]3([H])C(=O)C(CC)[C@]4([H])C[C@@H](CC[C@]4(C)[C@@]3([H])CC[C@]12C)OCOC)[C@H](C)CCC(=O)OCC,Preparation,"The compound represented by the formula (VII) (44.7 g, 0.1 mol) was dissolved in 500 ml of dichloromethane,Diisopropylethylamine (100 ml) was added and chloromethyl methyl ether was added at room temperature(100 ml) was added to the organic layer, and the organic layer was washed with saturated sodium chloride solution (200 ml x), and the organic layer was washed with saturated sodium bicarbonate solution (100 ml) 2), dried over anhydrous sodium sulfate, filtered and concentrated to give the compound represented by the formula (VIII) (38.2 g, yield: 78.0%).",1.0,,,,,24,20,,,,6-ethyl-3alpha-methyloxymethyl-7-oxo-5beta-cholan-24-oic acid ethyl ester,78 percent,78,,,"N-ethyl-N,N-diisopropylamine",,dichloromethane,"Patent; Hefei Nuoruiji Pharmaceutical Technology Co., Ltd.; Wu Yun; (11 pag.)CN106279336; (2017); A;"


In [10]:
def split_ref(string):
    string = string.replace('Article;', '___Article;').replace('Patent;', '___Patent;')
    string = ';'.join([x for x in string.split(';') if len(x.replace(' ', '')) > 0])
    splitted = [x for x in string.split('___') if len(x.replace(' ', '')) > 0]
    splitted.sort(key = len, reverse=True)
    
    cleaned = []
    
    for idx, x in enumerate(splitted):
        included = False
        for y in splitted[:idx]:
            if x in y:
                included = True
                break
        if not included:
            cleaned.append(x)
    
#     if len(cleaned) > 1:
#         cleaned = [x for x in cleaned if '(from Gmelin)' not in x.lower()]
    
    return cleaned

In [11]:
df['num_refs'] = df['References'].apply(lambda x: len(split_ref(x)))

In [12]:
df = df[df['num_refs'] == 1].copy()

In [13]:
df.drop('num_refs', axis = 1, inplace = True)

In [14]:
df

Unnamed: 0,Number of Reaction Details,Reaction Rank,Record Type,Reactant,Product,Reaction,Reaction Details: Reaction Classification,Fulltext of reaction,Number of Reaction Steps,Multi-step Scheme,Multi-step Details,Number of Stages,Solid Phase,Time (Reaction Details) [h],Temperature (Reaction Details) [C],Pressure (Reaction Details) [Torr],pH-Value (Reaction Details),Other Conditions,Product.1,Yield,Yield (numerical),Yield (optical),Stage Reactant,Reagent,Catalyst,Solvent (Reaction Details),References
0,1.0,81.0,full reaction; has preparation,2-Nitrobenzenesulfonamide; (S)-tert-butyl (4-cyclopropyl-4-hydroxy-2-methylenebutyl) carbonate,(S)-N-(4-cyclopropyl-4-hydroxy-2-methylenebutyl)-2-nitrobenzenesulfonamide,NS(=O)(=O)C1=C(C=CC=C1)N(=O)=O.CC(C)(C)OC(=O)OCC(=C)C[C@H](O)C1CC1>>O[C@@H](CC(=C)CNS(=O)(=O)C1=C(C=CC=C1)N(=O)=O)C1CC1,Preparation,,1.0,,,,,24,25,,,Inert atmosphere,(S)-N-(4-cyclopropyl-4-hydroxy-2-methylenebutyl)-2-nitrobenzenesulfonamide,73 percent,73,,,tetrakis(triphenylphosphine) palladium(0),,tetrahydrofuran,"Article; Luo, Guoshun; Xiang, Ming; Krische, Michael J.; Organic Letters; vol. 21; 8; (2019); p. 2493 - 2497;"
1,1.0,86.0,full reaction; has preparation,methyl 4-benzamido-2-hydroxybenzoate,4-benzoylamino-2-hydroxybenzoic acid,COC(=O)C1=C(O)C=C(NC(=O)C2=CC=CC=C2)C=C1>>OC(=O)C1=C(O)C=C(NC(=O)C2=CC=CC=C2)C=C1,Preparation,"To a solution of compound 4h (90 mg, 0.33 mmol) in THF(2 mL) was added aqueous NaOH (1.5 M, 8 mL). The mixturewas stirred at 50C for 2 h. After being cooled to room temperature,the solution was acidified with 2 M HCl to pH = 3-4.The precipitation was collected to give the compound 4h(62 mg, 73%). mp 263.1-264.7C; 1H-NMR (400 MHz, DMSOd6)delta: 10.48 (s, 1H), 8.05-7.89 (m, 2H), 7.76 (d, J = 8.7 Hz, 1H),7.67-7.44 (m, 4H), 7.34 (dd, J = 8.7, 2.0 Hz, 1H); 13C-NMR(101 MHz, DMSO-d6) delta: 172.11, 166.64, 162.44, 146.08, 135.06,132.40, 131.32, 128.93, 128.30, 111.59, 108.51, 107.38; ESI-MSm/z: 280.0563 (Calcd for C14H11NO4 [M + Na]+: 280.0580).",1.0,,,,,2,50,,,,4-benzoylamino-2-hydroxybenzoic acid,73 percent,73,,,water; sodium hydroxide,,,"Article; Liang, Jie; Tang, Yun-xiang; Tang, Xiang-zheng; Liang, Hua-ju; Gao, Yamin; Fang, Cuiting; Zhang, Tian-yu; Yan, Ming; Chemical and Pharmaceutical Bulletin; vol. 67; 4; (2019); p. 372 - 381;"
2,1.0,90.0,full reaction; has preparation,4-bromo-3-hydroxy-aniline; 4-trifluoromethyl-phenyl acetyl chloride,N-(4-bromo-3-hydroxyphenyl)-4-(trifluoromethyl)benzamide,NC1=CC=C(Br)C(O)=C1.FC(F)(F)C1=CC=C(C=C1)C(Cl)=O>>OC1=C(Br)C=CC(NC(=O)C2=CC=C(C=C2)C(F)(F)F)=C1,Preparation,"General procedure: A solution of BBr3 in dichloromethane (1.0 M, 12 mL,12 mmol) was added slowly to a solution of 4-bromo-3-methoxyaniline (800 mg, 3.96 mmol) in methylene chloride(15 mL) at 0C. The resulting brown solution was warmed toroom temperature and stirred for 24 h. After saturated aqueousNaHCO3 (30 mL) was added at 0C, the solution was extractedwith EtOAc (20 mL × 3). The combined organic layer wasdried with anhydrous Na2SO4, filtered and concentrated invacuum. The residue was purified by flash chromatographyover silica gel (petroleum-EtOAc = 2 : 1) to give 5-amino-2-bromophenol (665 mg, 88%).To a solution of 5-amino-2-bromophenol (55 mg, 0.29 mmol)and triethylamine (53 muL, 0.38 mmol) in tetrahydrofuran (THF)(3 mL) was added slowly benzoyl chloride (0.32 mmol) at 0C.The reaction mixture was then stirred at room temperature for30 min. After the reaction was quenched with water (10 mL),the solution was extracted with EtOAc (10 mL × 2). The combinedorganic layer was dried over anhydrous Na2SO4, filteredand concentrated in vacuum. The residue was purified bycolumn chromatography to afford the product 3 (74 mg, 87%).",1.0,,,,,0.5,0 - 20,,,,N-(4-bromo-3-hydroxyphenyl)-4-(trifluoromethyl)benzamide,73 percent,73,,,triethylamine,,tetrahydrofuran,"Article; Liang, Jie; Tang, Yun-xiang; Tang, Xiang-zheng; Liang, Hua-ju; Gao, Yamin; Fang, Cuiting; Zhang, Tian-yu; Yan, Ming; Chemical and Pharmaceutical Bulletin; vol. 67; 4; (2019); p. 372 - 381;"
3,1.0,90.0,full reaction; has preparation,4-bromo-3-hydroxy-aniline; 2-naphthaloyl chloride,N-(4-bromo-3-hydroxyphenyl)-2-naphthamide,NC1=CC=C(Br)C(O)=C1.ClC(=O)C1=CC2=CC=CC=C2C=C1>>OC1=C(Br)C=CC(NC(=O)C2=CC3=CC=CC=C3C=C2)=C1,Preparation,"General procedure: A solution of BBr3 in dichloromethane (1.0 M, 12 mL,12 mmol) was added slowly to a solution of 4-bromo-3-methoxyaniline (800 mg, 3.96 mmol) in methylene chloride(15 mL) at 0C. The resulting brown solution was warmed toroom temperature and stirred for 24 h. After saturated aqueousNaHCO3 (30 mL) was added at 0C, the solution was extractedwith EtOAc (20 mL × 3). The combined organic layer wasdried with anhydrous Na2SO4, filtered and concentrated invacuum. The residue was purified by flash chromatographyover silica gel (petroleum-EtOAc = 2 : 1) to give 5-amino-2-bromophenol (665 mg, 88%).To a solution of 5-amino-2-bromophenol (55 mg, 0.29 mmol)and triethylamine (53 muL, 0.38 mmol) in tetrahydrofuran (THF)(3 mL) was added slowly benzoyl chloride (0.32 mmol) at 0C.The reaction mixture was then stirred at room temperature for30 min. After the reaction was quenched with water (10 mL),the solution was extracted with EtOAc (10 mL × 2). The combinedorganic layer was dried over anhydrous Na2SO4, filteredand concentrated in vacuum. The residue was purified bycolumn chromatography to afford the product 3 (74 mg, 87%).",1.0,,,,,0.5,0 - 20,,,,N-(4-bromo-3-hydroxyphenyl)-2-naphthamide,73 percent,73,,,triethylamine,,tetrahydrofuran,"Article; Liang, Jie; Tang, Yun-xiang; Tang, Xiang-zheng; Liang, Hua-ju; Gao, Yamin; Fang, Cuiting; Zhang, Tian-yu; Yan, Ming; Chemical and Pharmaceutical Bulletin; vol. 67; 4; (2019); p. 372 - 381;"
4,1.0,66.0,full reaction; has preparation,C56H62O10,C42H50O10,C(OC1=C2CC3=CC=CC4=C3OCCOCCOCCOCCOCCOCCOCCOC3=C(CC1=CC=C2)C=CC=C3CC1=C(OCC2=CC=CC=C2)C(C4)=CC=C1)C1=CC=CC=C1>>OC1=C2CC3=CC=CC4=C3OCCOCCOCCOCCOCCOCCOCCOC3=C(CC1=CC=C2)C=CC=C3CC1=C(O)C(C4)=CC=C1,Preparation,,1.0,,,,,,,,,,C42H50O10,73 percent,73,,,palladium on activated charcoal; hydrogen,,,"Article; Tokunaga, Yuji; Hayakawa, Kentaroh; Miyashita, Junichi; Kawasaki, Tsuneomi; Miyagawa, Shinobu; Tetrahedron Letters; vol. 54; 50; (2013); p. 6829 - 6833;"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8163876,1.0,79.0,full reaction; has preparation,C12H13NO3; thiophenol,"2-methoxy-4-methyl-4-((phenylthio)methyl)isoquinoline-1,3(2H,4H)-dione",CON(C(=O)C(C)=C)C(=O)C1=CC=CC=C1.SC1=CC=CC=C1>>CON1C(=O)C2=C(C=CC=C2)C(C)(CSC2=CC=CC=C2)C1=O,Preparation,,1.0,,,,,12,60,,,Schlenk technique,"2-methoxy-4-methyl-4-((phenylthio)methyl)isoquinoline-1,3(2H,4H)-dione",78 percent,78,,,oxygen; toluene-4-sulfonic acid,,"1,2-dichloro-ethane","Article; Yuan, Yan-Qin; Kumar, Pailla Santhosh; Zhang, Chun-Niu; Yang, Ming-Hua; Guo, Sheng-Rong; Organic and Biomolecular Chemistry; vol. 15; 35; (2017); p. 7330 - 7338;"
8163877,1.0,79.0,full reaction; has preparation,N-methyl-N-methacryloyl-4-methylbenzamide; 4-t-butylbenzenethiol,"4-(((4-(tert-butyl)phenyl)thio)methyl)-2,4,6-trimethylisoquinoline-1,3(2H,4H)-dione",CN(C(=O)C(C)=C)C(=O)C1=CC=C(C)C=C1.CC(C)(C)C1=CC=C(S)C=C1>>CN1C(=O)C2=C(C=C(C)C=C2)C(C)(CSC2=CC=C(C=C2)C(C)(C)C)C1=O,Preparation,,1.0,,,,,12,60,,,Schlenk technique,"4-(((4-(tert-butyl)phenyl)thio)methyl)-2,4,6-trimethylisoquinoline-1,3(2H,4H)-dione",78 percent,78,,,oxygen; toluene-4-sulfonic acid,,"1,2-dichloro-ethane","Article; Yuan, Yan-Qin; Kumar, Pailla Santhosh; Zhang, Chun-Niu; Yang, Ming-Hua; Guo, Sheng-Rong; Organic and Biomolecular Chemistry; vol. 15; 35; (2017); p. 7330 - 7338;"
8163878,1.0,82.0,full reaction; has preparation,sodium cyanide; methanesulfonic acid but-3-ynyl ester,pent-4-ynenitrile,[Na+].[C-]#N.CS(=O)(=O)OCCC#C>>C#CCCC#N,Preparation,,1.0,,,,,1.75,20 - 70,,,,pent-4-ynenitrile,78 percent,78,,,triethylamine,,"N,N-dimethyl-formamide","Article; ?afa?, Peter; Marchalin, ?tefan; ?oral, Michal; Moncol, Jan; Daich, Adam; Organic Letters; vol. 19; 18; (2017); p. 4742 - 4745;"
8163879,1.0,91.0,full reaction; has preparation,6-ethyl-3alpha-hydroxy-7-oxo-5beta-cholan-24-oic acid ethyl ester; chloromethyl methyl ether,6-ethyl-3alpha-methyloxymethyl-7-oxo-5beta-cholan-24-oic acid ethyl ester,[H][C@@]1(CC[C@@]2([H])[C@]3([H])C(=O)C(CC)[C@]4([H])C[C@H](O)CC[C@]4(C)[C@@]3([H])CC[C@]12C)[C@H](C)CCC(=O)OCC.COCCl>>[H][C@@]1(CC[C@@]2([H])[C@]3([H])C(=O)C(CC)[C@]4([H])C[C@@H](CC[C@]4(C)[C@@]3([H])CC[C@]12C)OCOC)[C@H](C)CCC(=O)OCC,Preparation,"The compound represented by the formula (VII) (44.7 g, 0.1 mol) was dissolved in 500 ml of dichloromethane,Diisopropylethylamine (100 ml) was added and chloromethyl methyl ether was added at room temperature(100 ml) was added to the organic layer, and the organic layer was washed with saturated sodium chloride solution (200 ml x), and the organic layer was washed with saturated sodium bicarbonate solution (100 ml) 2), dried over anhydrous sodium sulfate, filtered and concentrated to give the compound represented by the formula (VIII) (38.2 g, yield: 78.0%).",1.0,,,,,24,20,,,,6-ethyl-3alpha-methyloxymethyl-7-oxo-5beta-cholan-24-oic acid ethyl ester,78 percent,78,,,"N-ethyl-N,N-diisopropylamine",,dichloromethane,"Patent; Hefei Nuoruiji Pharmaceutical Technology Co., Ltd.; Wu Yun; (11 pag.)CN106279336; (2017); A;"


In [15]:
refs_set = list(set(df['References'].to_list()))

In [16]:
ref_df = pd.DataFrame({'init_ref': refs_set})

In [17]:
ref_df['splitted_ref'] = ref_df['init_ref'].apply(lambda x: x.replace('(from Gmelin)', '').replace('&amp;', '&'))

In [18]:
ref_df

Unnamed: 0,init_ref,splitted_ref
0,"Article; Huang, Qinhua; Johnson, Ted W.; Bailey, Simon; Brooun, Alexei; Bunker, Kevin D.; Burke, Benjamin J.; Collins, Michael R.; Cook, Andrew S.; Cui, J. Jean; Dack, Kevin N.; Deal, Judith G.; Deng, Ya-Li; Dinh, Dac; Engstrom, Lars D.; He, Mingying; Hoffman, Jacqui; Hoffman, Robert L.; Johnson, Patrick S.; Kania, Robert S.; Lam, Hieu; Lam, Justine L.; Le, Phuong T.; Li, Qiuhua; Lingardo, Laura; Liu, Wei; Lu, Melissa West; McTigue, Michele; Palmer, Cynthia L.; Richardson, Paul F.; Sach, Neal W.; Shen, Hong; Smeal, Tod; Smith, Graham L.; Stewart, Albert E.; Timofeevski, Sergei; Tsaparikos, Konstantinos; Wang, Hui; Zhu, Huichun; Zhu, Jinjiang; Zou, Helen Y.; Edwards, Martin P.; Journal of Medicinal Chemistry; vol. 57; 4; (2014); p. 1170 - 1187;","Article; Huang, Qinhua; Johnson, Ted W.; Bailey, Simon; Brooun, Alexei; Bunker, Kevin D.; Burke, Benjamin J.; Collins, Michael R.; Cook, Andrew S.; Cui, J. Jean; Dack, Kevin N.; Deal, Judith G.; Deng, Ya-Li; Dinh, Dac; Engstrom, Lars D.; He, Mingying; Hoffman, Jacqui; Hoffman, Robert L.; Johnson, Patrick S.; Kania, Robert S.; Lam, Hieu; Lam, Justine L.; Le, Phuong T.; Li, Qiuhua; Lingardo, Laura; Liu, Wei; Lu, Melissa West; McTigue, Michele; Palmer, Cynthia L.; Richardson, Paul F.; Sach, Neal W.; Shen, Hong; Smeal, Tod; Smith, Graham L.; Stewart, Albert E.; Timofeevski, Sergei; Tsaparikos, Konstantinos; Wang, Hui; Zhu, Huichun; Zhu, Jinjiang; Zou, Helen Y.; Edwards, Martin P.; Journal of Medicinal Chemistry; vol. 57; 4; (2014); p. 1170 - 1187;"
1,"Article; Blake, Alexander J.; Reid, Gillian; Schroeder, Martin; Journal of the Chemical Society, Dalton Transactions; 12; (1990); p. 3849 - 3856; (from Gmelin)","Article; Blake, Alexander J.; Reid, Gillian; Schroeder, Martin; Journal of the Chemical Society, Dalton Transactions; 12; (1990); p. 3849 - 3856;"
2,"Article; Belyaev, Andrey; Chen, Yi-Ting; Su, Shih-Hao; Tseng, Yu-Jui; Karttunen, Antti J.; Tunik, Sergey P.; Chou, Pi-Tai; Koshevoy, Igor O.; Chemical Communications; vol. 53; 79; (2017); p. 10954 - 10957;","Article; Belyaev, Andrey; Chen, Yi-Ting; Su, Shih-Hao; Tseng, Yu-Jui; Karttunen, Antti J.; Tunik, Sergey P.; Chou, Pi-Tai; Koshevoy, Igor O.; Chemical Communications; vol. 53; 79; (2017); p. 10954 - 10957;"
3,"Article; Shainova, Roza S.; Gomktsyan, Tiruhi A.; Karapetyan, Armen V.; Yengoyan, Aleksandr P.; Journal of Chemical Research; vol. 41; 4; (2017); p. 205 - 209;","Article; Shainova, Roza S.; Gomktsyan, Tiruhi A.; Karapetyan, Armen V.; Yengoyan, Aleksandr P.; Journal of Chemical Research; vol. 41; 4; (2017); p. 205 - 209;"
4,"Article; Arlcl, Muersel; Ye?ilel, Okan Zafer; Ta?, Murat; Crystal Growth and Design; vol. 15; 6; (2015); p. 3024 - 3031;","Article; Arlcl, Muersel; Ye?ilel, Okan Zafer; Ta?, Murat; Crystal Growth and Design; vol. 15; 6; (2015); p. 3024 - 3031;"
...,...,...
633783,"Article; Jopp, Stefan; Ehlers, Peter; Frank, Eva; Mernyak, Erzsebet; Schneider, Gyula; Woelfling, Janos; Villinger, Alexander; Langer, Peter; Synlett; vol. 30; 5; (2019); p. 600 - 604;","Article; Jopp, Stefan; Ehlers, Peter; Frank, Eva; Mernyak, Erzsebet; Schneider, Gyula; Woelfling, Janos; Villinger, Alexander; Langer, Peter; Synlett; vol. 30; 5; (2019); p. 600 - 604;"
633784,"Article; Sugino, Hiroyoshi; Kawai, Hidetoshi; Umehara, Takeshi; Fujiwara, Kenshu; Suzuki, Takanori; Chemistry - A European Journal; vol. 18; 43; (2012); p. 13722 - 13732;","Article; Sugino, Hiroyoshi; Kawai, Hidetoshi; Umehara, Takeshi; Fujiwara, Kenshu; Suzuki, Takanori; Chemistry - A European Journal; vol. 18; 43; (2012); p. 13722 - 13732;"
633785,"Article; Carrilho, Rui M. B.; Costa, Goncalo N.; Neves, Angela C. B.; Pereira, Mariette M.; Grabulosa, Arnald; Bayon, J. Carles; Rocamora, Merce; Muller, Guillermo; European Journal of Inorganic Chemistry; 6; (2014); p. 1034 - 1041;","Article; Carrilho, Rui M. B.; Costa, Goncalo N.; Neves, Angela C. B.; Pereira, Mariette M.; Grabulosa, Arnald; Bayon, J. Carles; Rocamora, Merce; Muller, Guillermo; European Journal of Inorganic Chemistry; 6; (2014); p. 1034 - 1041;"
633786,"Article; Mehrabi, Hossein; Esfandiarpour, Zeinab; Journal of Sulfur Chemistry; vol. 36; 6; (2015); p. 583 - 590;","Article; Mehrabi, Hossein; Esfandiarpour, Zeinab; Journal of Sulfur Chemistry; vol. 36; 6; (2015); p. 583 - 590;"


In [19]:
def get_type(string):
    if string.startswith('Article;'):
        return 'article'
    elif string.startswith('Patent;'):
        return 'patent'
    else:
        return None

In [20]:
ref_df['publ_type'] = ref_df['splitted_ref'].apply(get_type)

In [21]:
ref_df = ref_df[~ref_df['publ_type'].isna()].copy()

In [22]:
ref_df['publ_type'].value_counts()

publ_type
article    485150
patent     143091
Name: count, dtype: int64

In [95]:
ref_df['publ_type'].value_counts()

publ_type
article    485150
patent     143091
Name: count, dtype: int64

In [185]:
article_df = ref_df[ref_df['publ_type'] == 'article']

In [186]:
article_df

Unnamed: 0,init_ref,splitted_ref,publ_type
0,"Article; Kalluvettukuzhy, Neena K.; Thilagar, Pakkirisamy; Organometallics; vol. 36; 14; (2017); p. 2692 - 2701;","Article; Kalluvettukuzhy, Neena K.; Thilagar, Pakkirisamy; Organometallics; vol. 36; 14; (2017); p. 2692 - 2701;",article
1,"Article; Franke, Oliver; Wiesler, Beatrix E.; Lehnert, Nicolai; Naether, Christian; Ksenofontov, Vadim; Neuhausen, Joerg; Tuczek, Felix; Inorganic Chemistry; vol. 41; 13; (2002); p. 3491 - 3499; (from Gmelin)","Article; Franke, Oliver; Wiesler, Beatrix E.; Lehnert, Nicolai; Naether, Christian; Ksenofontov, Vadim; Neuhausen, Joerg; Tuczek, Felix; Inorganic Chemistry; vol. 41; 13; (2002); p. 3491 - 3499;",article
2,"Article; Legenzov, Eric A.; Muralidharan, Sukumaran; Woodcock, Lukas B.; Eaton, Gareth R.; Eaton, Sandra S.; Rosen, Gerald M.; Kao, Joseph P. Y.; Bioconjugate Chemistry; vol. 27; 12; (2016); p. 2923 - 2930;","Article; Legenzov, Eric A.; Muralidharan, Sukumaran; Woodcock, Lukas B.; Eaton, Gareth R.; Eaton, Sandra S.; Rosen, Gerald M.; Kao, Joseph P. Y.; Bioconjugate Chemistry; vol. 27; 12; (2016); p. 2923 - 2930;",article
3,Article; Kudryavtsev; Shulga; Chupakhin; Churakov; Datsuk; Zabolotnev; Zefirov; Russian Chemical Bulletin; vol. 60; 4; (2011); p. 685 - 693;,Article; Kudryavtsev; Shulga; Chupakhin; Churakov; Datsuk; Zabolotnev; Zefirov; Russian Chemical Bulletin; vol. 60; 4; (2011); p. 685 - 693;,article
4,"Article; Zhang, Qi; Bai, Ping; Zheng, Cheng; Cheng, Yao; Wang, Tao; Lu, Xiaoxia; Bioorganic and Medicinal Chemistry; vol. 27; 12; (2019); p. 2387 - 2396;","Article; Zhang, Qi; Bai, Ping; Zheng, Cheng; Cheng, Yao; Wang, Tao; Lu, Xiaoxia; Bioorganic and Medicinal Chemistry; vol. 27; 12; (2019); p. 2387 - 2396;",article
...,...,...,...
633782,"Article; Kobayashi, Kazuhiro; Kozuki, Taketoshi; Fukamachi, Shuhei; Konishi, Hisatoshi; Helvetica Chimica Acta; vol. 93; 10; (2010); p. 2086 - 2093;","Article; Kobayashi, Kazuhiro; Kozuki, Taketoshi; Fukamachi, Shuhei; Konishi, Hisatoshi; Helvetica Chimica Acta; vol. 93; 10; (2010); p. 2086 - 2093;",article
633783,"Article; Jose, Jemini; Sreekanth; John, Athira M.; Basheer, Sabeel M.; Sreeja; Research on Chemical Intermediates; vol. 45; 2; (2019); p. 425 - 435;","Article; Jose, Jemini; Sreekanth; John, Athira M.; Basheer, Sabeel M.; Sreeja; Research on Chemical Intermediates; vol. 45; 2; (2019); p. 425 - 435;",article
633784,Article; Gholivand; Gholami; Tizhoush; Schenk; Fadaei; Bahrami; RSC Advances; vol. 4; 84; (2014); p. 44509 - 44516;,Article; Gholivand; Gholami; Tizhoush; Schenk; Fadaei; Bahrami; RSC Advances; vol. 4; 84; (2014); p. 44509 - 44516;,article
633785,"Article; Zhou, Chengcheng; Xu, Wenhan; Zhang, Pengbo; Jiang, Meijuan; Chen, Yuncong; Kwok, Ryan T. K.; Lee, Michelle M. S.; Shan, Guogang; Qi, Ruilian; Zhou, Xin; Lam, Jacky W. Y.; Wang, Shu; Tang, Ben Zhong; Advanced Functional Materials; vol. 29; 4; (2019);","Article; Zhou, Chengcheng; Xu, Wenhan; Zhang, Pengbo; Jiang, Meijuan; Chen, Yuncong; Kwok, Ryan T. K.; Lee, Michelle M. S.; Shan, Guogang; Qi, Ruilian; Zhou, Xin; Lam, Jacky W. Y.; Wang, Shu; Tang, Ben Zhong; Advanced Functional Materials; vol. 29; 4; (2019);",article


In [None]:
#'Angew. Chem.' если это не первая ссылка в строке - убирать нафиг 

In [23]:
def string2ref(string):
    type_ref = string.split(';')[0]
    ref = [x.strip() for x in string.split(';')[1:] if len(x.strip()) > 0]
    if type_ref == 'Article': 
        ref_dict = {
             'doi': '',
             'volume': '',
             'journal name': '',
             'title': '',
             'authors': '',
             'year': -1,
             'pages': ''}
        rest = []

#         if 'Angew. Chem.' in ref:
#             ref = ref[:ref.index('Angew. Chem.')]
#         if 'Article' in ref:
#             ref = ref[:ref.index('Article')]
#         if 'Patent' in ref:
#             ref = ref[:ref.index('Patent')]
        if 'C. A.' in ref:
            ref = ref[:ref.index('C. A.')]
        if 'C.A.' in ref:
            ref = ref[:ref.index('C.A.')]
        if 'C. I' in ref:
            ref = ref[:ref.index('C. I')]
        if 'C. II' in ref:
            ref = ref[:ref.index('C. II')]
        if 'C.' in ref:
            ref = ref[:ref.index('C.')]            
        for item in ref:
            assigned = False

            year_cands = re.findall(r'\(\d{4}\)', item)
            if len(year_cands) > 0:
                ref_dict['year'] = int(year_cands[0].replace('(', '').replace(')', ''))
                assigned = True
            if not assigned:
                if item.startswith('p.') or 'page' in item:
                    ref_dict['pages'] = item.replace('p.', '').replace(' ', '')
                    assigned = True
            if not assigned:
                if item.replace('-', '').replace('.', '').replace('/', '').replace('+', '').isnumeric():
                    assigned = True #'issue 
            if not assigned:
                if item in ['C', 'SUPPL. 1', 'SUPPL.1', 'PA', 'PB', 'II', '1-2 PART II', 'I', 'PART B',
                           'PART A', 'PC', 'l', 'pt 11']:
                    assigned = True #'some trash 
            if not assigned:
                if 'SPEC. ISS' in item :
                    assigned = True #'some trash                     
            if not assigned:
                if ' PART ' in item :
                    assigned = True #'some trash                       
            if not assigned:
                if 'SUPPL' in item :
                    assigned = True #'some trash     
            if not assigned:
                if item.startswith('vol.'):
                    ref_dict['volume'] = item.replace('vol.', '').replace(' ', '')
                    assigned = True        
            if not assigned:
                if item.startswith('(') and item.endswith(')'):
                    assigned = True #comment

            if not assigned:
                rest.append(item)
        ref_dict['journal name'] = rest[-1]   
        ref_dict['authors'] = ';'.join(rest[:-1])       
        return ref_dict

In [188]:
refs_list = []
for article in tqdm(article_df.to_dict('records')):
    ref_dict = string2ref(article['splitted_ref'])
    if ref_dict is not None:
        article.update(ref_dict)
        refs_list.append(article)


100%|████████████████████████████████| 485150/485150 [00:07<00:00, 69107.33it/s]


In [189]:
len(refs_list)

485150

In [190]:
article_df_new = pd.DataFrame(refs_list)

In [191]:
article_df_new

Unnamed: 0,init_ref,splitted_ref,publ_type,doi,volume,journal name,title,authors,year,pages
0,"Article; Kalluvettukuzhy, Neena K.; Thilagar, Pakkirisamy; Organometallics; vol. 36; 14; (2017); p. 2692 - 2701;","Article; Kalluvettukuzhy, Neena K.; Thilagar, Pakkirisamy; Organometallics; vol. 36; 14; (2017); p. 2692 - 2701;",article,,36,Organometallics,,"Kalluvettukuzhy, Neena K.;Thilagar, Pakkirisamy",2017,2692-2701
1,"Article; Franke, Oliver; Wiesler, Beatrix E.; Lehnert, Nicolai; Naether, Christian; Ksenofontov, Vadim; Neuhausen, Joerg; Tuczek, Felix; Inorganic Chemistry; vol. 41; 13; (2002); p. 3491 - 3499; (from Gmelin)","Article; Franke, Oliver; Wiesler, Beatrix E.; Lehnert, Nicolai; Naether, Christian; Ksenofontov, Vadim; Neuhausen, Joerg; Tuczek, Felix; Inorganic Chemistry; vol. 41; 13; (2002); p. 3491 - 3499;",article,,41,Inorganic Chemistry,,"Franke, Oliver;Wiesler, Beatrix E.;Lehnert, Nicolai;Naether, Christian;Ksenofontov, Vadim;Neuhausen, Joerg;Tuczek, Felix",2002,3491-3499
2,"Article; Legenzov, Eric A.; Muralidharan, Sukumaran; Woodcock, Lukas B.; Eaton, Gareth R.; Eaton, Sandra S.; Rosen, Gerald M.; Kao, Joseph P. Y.; Bioconjugate Chemistry; vol. 27; 12; (2016); p. 2923 - 2930;","Article; Legenzov, Eric A.; Muralidharan, Sukumaran; Woodcock, Lukas B.; Eaton, Gareth R.; Eaton, Sandra S.; Rosen, Gerald M.; Kao, Joseph P. Y.; Bioconjugate Chemistry; vol. 27; 12; (2016); p. 2923 - 2930;",article,,27,Bioconjugate Chemistry,,"Legenzov, Eric A.;Muralidharan, Sukumaran;Woodcock, Lukas B.;Eaton, Gareth R.;Eaton, Sandra S.;Rosen, Gerald M.;Kao, Joseph P. Y.",2016,2923-2930
3,Article; Kudryavtsev; Shulga; Chupakhin; Churakov; Datsuk; Zabolotnev; Zefirov; Russian Chemical Bulletin; vol. 60; 4; (2011); p. 685 - 693;,Article; Kudryavtsev; Shulga; Chupakhin; Churakov; Datsuk; Zabolotnev; Zefirov; Russian Chemical Bulletin; vol. 60; 4; (2011); p. 685 - 693;,article,,60,Russian Chemical Bulletin,,Kudryavtsev;Shulga;Chupakhin;Churakov;Datsuk;Zabolotnev;Zefirov,2011,685-693
4,"Article; Zhang, Qi; Bai, Ping; Zheng, Cheng; Cheng, Yao; Wang, Tao; Lu, Xiaoxia; Bioorganic and Medicinal Chemistry; vol. 27; 12; (2019); p. 2387 - 2396;","Article; Zhang, Qi; Bai, Ping; Zheng, Cheng; Cheng, Yao; Wang, Tao; Lu, Xiaoxia; Bioorganic and Medicinal Chemistry; vol. 27; 12; (2019); p. 2387 - 2396;",article,,27,Bioorganic and Medicinal Chemistry,,"Zhang, Qi;Bai, Ping;Zheng, Cheng;Cheng, Yao;Wang, Tao;Lu, Xiaoxia",2019,2387-2396
...,...,...,...,...,...,...,...,...,...,...
485145,"Article; Kobayashi, Kazuhiro; Kozuki, Taketoshi; Fukamachi, Shuhei; Konishi, Hisatoshi; Helvetica Chimica Acta; vol. 93; 10; (2010); p. 2086 - 2093;","Article; Kobayashi, Kazuhiro; Kozuki, Taketoshi; Fukamachi, Shuhei; Konishi, Hisatoshi; Helvetica Chimica Acta; vol. 93; 10; (2010); p. 2086 - 2093;",article,,93,Helvetica Chimica Acta,,"Kobayashi, Kazuhiro;Kozuki, Taketoshi;Fukamachi, Shuhei;Konishi, Hisatoshi",2010,2086-2093
485146,"Article; Jose, Jemini; Sreekanth; John, Athira M.; Basheer, Sabeel M.; Sreeja; Research on Chemical Intermediates; vol. 45; 2; (2019); p. 425 - 435;","Article; Jose, Jemini; Sreekanth; John, Athira M.; Basheer, Sabeel M.; Sreeja; Research on Chemical Intermediates; vol. 45; 2; (2019); p. 425 - 435;",article,,45,Research on Chemical Intermediates,,"Jose, Jemini;Sreekanth;John, Athira M.;Basheer, Sabeel M.;Sreeja",2019,425-435
485147,Article; Gholivand; Gholami; Tizhoush; Schenk; Fadaei; Bahrami; RSC Advances; vol. 4; 84; (2014); p. 44509 - 44516;,Article; Gholivand; Gholami; Tizhoush; Schenk; Fadaei; Bahrami; RSC Advances; vol. 4; 84; (2014); p. 44509 - 44516;,article,,4,RSC Advances,,Gholivand;Gholami;Tizhoush;Schenk;Fadaei;Bahrami,2014,44509-44516
485148,"Article; Zhou, Chengcheng; Xu, Wenhan; Zhang, Pengbo; Jiang, Meijuan; Chen, Yuncong; Kwok, Ryan T. K.; Lee, Michelle M. S.; Shan, Guogang; Qi, Ruilian; Zhou, Xin; Lam, Jacky W. Y.; Wang, Shu; Tang, Ben Zhong; Advanced Functional Materials; vol. 29; 4; (2019);","Article; Zhou, Chengcheng; Xu, Wenhan; Zhang, Pengbo; Jiang, Meijuan; Chen, Yuncong; Kwok, Ryan T. K.; Lee, Michelle M. S.; Shan, Guogang; Qi, Ruilian; Zhou, Xin; Lam, Jacky W. Y.; Wang, Shu; Tang, Ben Zhong; Advanced Functional Materials; vol. 29; 4; (2019);",article,,29,Advanced Functional Materials,,"Zhou, Chengcheng;Xu, Wenhan;Zhang, Pengbo;Jiang, Meijuan;Chen, Yuncong;Kwok, Ryan T. K.;Lee, Michelle M. S.;Shan, Guogang;Qi, Ruilian;Zhou, Xin;Lam, Jacky W. Y.;Wang, Shu;Tang, Ben Zhong",2019,


In [128]:
article_df_new[article_df_new['journal name'] == 'C. II']

Unnamed: 0,init_ref,splitted_ref,publ_type,doi,volume,journal name,title,authors,year,pages


In [129]:
temp_df = article_df_new['journal name'].value_counts().reset_index()
temp_df[temp_df['count']>3]

Unnamed: 0,journal name,count
0,Journal of the American Chemical Society,22088
1,Organic Letters,20741
2,Chemical Communications,17453
3,Organometallics,16964
4,Journal of Organic Chemistry,16950
5,Inorganic Chemistry,16763
6,Tetrahedron Letters,16180
7,Journal of Organometallic Chemistry,13734
8,Chemistry - A European Journal,13244
9,Tetrahedron,12065


# recognize dois

In [192]:
papers_metadata = {}
for file in tqdm(os.listdir('/media/oleg/hard_for_data/test_folder/refs_dict_by_issn')):
    try:
        with open(os.path.join('/media/oleg/hard_for_data/test_folder/refs_dict_by_issn', file)) as f:
            papers_metadata.update(json.loads(f.read()))
    except Exception as e:
        print(e)
        pass

 94%|████████████████████████████████████▍  | 2409/2575 [02:27<00:12, 13.26it/s]

Expecting ',' delimiter: line 1 column 256922 (char 256921)


100%|███████████████████████████████████████| 2575/2575 [02:34<00:00, 16.70it/s]


In [193]:
len(papers_metadata)

10762916

In [196]:
j_names = list(set(article_df_new['journal name']))

In [197]:
len(j_names)

2540

In [198]:
for j_name in tqdm(j_names):
    mask_journal = article_df_new['journal name'] == j_name
    
    for year in list(set(article_df_new[mask_journal]['year'])):
        mask_year = article_df_new['year'] == year
        
        dict_slice = {k:v for k,v in papers_metadata.items() if (('year' in v and v['year'] == year) and
                                                                 ('journal name' in v and 
                                                                  v['journal name'] == j_name))}
        for idx in article_df_new[mask_journal & mask_year].index:
            pages = article_df_new.loc[idx, 'pages']
            for doi in dict_slice:
                if 'pages' in dict_slice[doi]:
                    if dict_slice[doi]['pages'] == pages:
                        article_df_new.loc[idx, 'doi'] = doi
                        break        
    

100%|█████████████████████████████████████| 2540/2540 [5:00:10<00:00,  7.09s/it]


In [224]:
article_df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 485150 entries, 0 to 485149
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   init_ref      485150 non-null  object
 1   splitted_ref  485150 non-null  object
 2   publ_type     485150 non-null  object
 3   doi           485150 non-null  object
 4   volume        485150 non-null  object
 5   journal name  485150 non-null  object
 6   title         485150 non-null  object
 7   authors       485150 non-null  object
 8   year          485150 non-null  int64 
 9   pages         485150 non-null  object
dtypes: int64(1), object(9)
memory usage: 37.0+ MB


In [223]:
for j_name in tqdm(j_names):
    mask_journal = article_df_new['journal name'] == j_name
    mask_doi = article_df_new['doi'] == ''
    
    for year in list(set(article_df_new[mask_journal&mask_doi]['year'])):
        mask_year = article_df_new['year'] == year
        
        jname_list = list(set([j_name, j_name.replace('The', '').strip(), j_name.replace('and', '&'),
                     j_name.replace('and', '&amp;'), j_name.replace('&amp;', 'and'),
                     j_name.replace('&', 'and'), 'The ' + j_name]))
        dict_slice = {k:v for k,v in papers_metadata.items() if (('year' in v and v['year'] == year) and
                                                                         ('journal name' in v and 
                                                                          v['journal name'] in jname_list))}
        for idx in article_df_new[mask_journal & mask_year].index:
            pages = article_df_new.loc[idx, 'pages']
            for doi in dict_slice:
                if 'pages' in dict_slice[doi]:
                    if dict_slice[doi]['pages'] == pages:
                        article_df_new.loc[idx, 'doi'] = doi
                        break      

100%|█████████████████████████████████████| 2540/2540 [4:25:32<00:00,  6.27s/it]


In [237]:
num_reps = article_df_new['doi'].value_counts().reset_index()
failed_doi = list(num_reps[num_reps['count'] > 1]['doi'])[1:]

In [241]:
mask = article_df_new['doi'].isin(failed_doi)

In [243]:
article_df_new.loc[mask, 'doi'] = ''

In [244]:
article_df_new.loc[mask]

Unnamed: 0,init_ref,splitted_ref,publ_type,doi,volume,journal name,title,authors,year,pages
48,"Article; Deacon, G.B.; Gatehouse, B. M.; Ney, S. C.; Journal of Organometallic Chemistry; vol. 348; (1988); p. 141 - 148; (from Gmelin)","Article; Deacon, G.B.; Gatehouse, B. M.; Ney, S. C.; Journal of Organometallic Chemistry; vol. 348; (1988); p. 141 - 148;",article,,348,Journal of Organometallic Chemistry,,"Deacon, G.B.;Gatehouse, B. M.;Ney, S. C.",1988,141-148
60,"Article; Sun, Jia-Feng; Chen, Fei; Dougan, Brenda A.; Xu, Hui-Jun; Cheng, Yong; Li, Yi-Zhi; Chen, Xue-Tai; Xue, Zi-Ling; Journal of Organometallic Chemistry; vol. 694; 13; (2009); p. 2096 - 2105;","Article; Sun, Jia-Feng; Chen, Fei; Dougan, Brenda A.; Xu, Hui-Jun; Cheng, Yong; Li, Yi-Zhi; Chen, Xue-Tai; Xue, Zi-Ling; Journal of Organometallic Chemistry; vol. 694; 13; (2009); p. 2096 - 2105;",article,,694,Journal of Organometallic Chemistry,,"Sun, Jia-Feng;Chen, Fei;Dougan, Brenda A.;Xu, Hui-Jun;Cheng, Yong;Li, Yi-Zhi;Chen, Xue-Tai;Xue, Zi-Ling",2009,2096-2105
192,"Article; Haque, Rosenani A.; Ghdhayeb, Mohammed Z.; Salman, Abbas Washeel; Budagumpi, Srinivasa; Khadeer Ahamed, Mohamed B.; Abdul Majid, Amin M.S.; Inorganic Chemistry Communications; vol. 22; (2012); p. 113 - 119;","Article; Haque, Rosenani A.; Ghdhayeb, Mohammed Z.; Salman, Abbas Washeel; Budagumpi, Srinivasa; Khadeer Ahamed, Mohamed B.; Abdul Majid, Amin M.S.; Inorganic Chemistry Communications; vol. 22; (2012); p. 113 - 119;",article,,22,Inorganic Chemistry Communications,,"Haque, Rosenani A.;Ghdhayeb, Mohammed Z.;Salman, Abbas Washeel;Budagumpi, Srinivasa;Khadeer Ahamed, Mohamed B.;Abdul Majid, Amin M.S.",2012,113-119
243,"Article; Shipet, William D.; Sorensen, Erik J.; Journal of the American Chemical Society; vol. 128; 21; (2006); p. 7025 - 7035;","Article; Shipet, William D.; Sorensen, Erik J.; Journal of the American Chemical Society; vol. 128; 21; (2006); p. 7025 - 7035;",article,,128,Journal of the American Chemical Society,,"Shipet, William D.;Sorensen, Erik J.",2006,7025-7035
245,"Article; Liu, Min; Yuan, Wen-bing; Zhang, Qi; Yan, Lan; Yang, Ru-dong; Spectrochimica Acta Part A: Molecular and Biomolecular Spectroscopy; vol. 70; 5; (2008); p. 1114 - 1119;","Article; Liu, Min; Yuan, Wen-bing; Zhang, Qi; Yan, Lan; Yang, Ru-dong; Spectrochimica Acta Part A: Molecular and Biomolecular Spectroscopy; vol. 70; 5; (2008); p. 1114 - 1119;",article,,70,Spectrochimica Acta Part A: Molecular and Biomolecular Spectroscopy,,"Liu, Min;Yuan, Wen-bing;Zhang, Qi;Yan, Lan;Yang, Ru-dong",2008,1114-1119
...,...,...,...,...,...,...,...,...,...,...
484924,"Article; Huang, Shu-Yun; Li, Jian-Qiang; Wu, Xiao-Liu; Zhang, Xiao-Min; Luo, Ming-Biao; Luo, Feng; Inorganic Chemistry Communications; vol. 39; (2014); p. 1 - 4;","Article; Huang, Shu-Yun; Li, Jian-Qiang; Wu, Xiao-Liu; Zhang, Xiao-Min; Luo, Ming-Biao; Luo, Feng; Inorganic Chemistry Communications; vol. 39; (2014); p. 1 - 4;",article,,39,Inorganic Chemistry Communications,,"Huang, Shu-Yun;Li, Jian-Qiang;Wu, Xiao-Liu;Zhang, Xiao-Min;Luo, Ming-Biao;Luo, Feng",2014,1-4
484934,"Article; Smith, Jeremy M.; Long, Jeffrey R.; Inorganic Chemistry; vol. 49; 23; (2010); p. 11223 - 11230;","Article; Smith, Jeremy M.; Long, Jeffrey R.; Inorganic Chemistry; vol. 49; 23; (2010); p. 11223 - 11230;",article,,49,Inorganic Chemistry,,"Smith, Jeremy M.;Long, Jeffrey R.",2010,11223-11230
484951,"Article; Abou-Melha, Khlood S.; Spectrochimica Acta Part A: Molecular and Biomolecular Spectroscopy; vol. 70; 1; (2008); p. 162 - 170;","Article; Abou-Melha, Khlood S.; Spectrochimica Acta Part A: Molecular and Biomolecular Spectroscopy; vol. 70; 1; (2008); p. 162 - 170;",article,,70,Spectrochimica Acta Part A: Molecular and Biomolecular Spectroscopy,,"Abou-Melha, Khlood S.",2008,162-170
485098,"Article; Sedaghat, Tahereh; Aminian, Marjan; Bruno, Giuseppe; Amiri Rudbari, Hadi; Journal of Organometallic Chemistry; vol. 737; (2013); p. 26 - 31;","Article; Sedaghat, Tahereh; Aminian, Marjan; Bruno, Giuseppe; Amiri Rudbari, Hadi; Journal of Organometallic Chemistry; vol. 737; (2013); p. 26 - 31;",article,,737,Journal of Organometallic Chemistry,,"Sedaghat, Tahereh;Aminian, Marjan;Bruno, Giuseppe;Amiri Rudbari, Hadi",2013,26-31


In [245]:
article_df_new['doi'].value_counts()

doi
                                 181406
10.1021/acs.organomet.7b00332         1
10.1016/j.bmcl.2007.12.055            1
10.1039/c8sc05573d                    1
10.1016/j.tetlet.2011.10.095          1
                                  ...  
10.1016/j.bmc.2011.11.020             1
10.1055/s-2005-872080                 1
10.17344/acsi.2017.3844               1
10.1080/00397911.2011.565453          1
10.1021/ol7022104                     1
Name: count, Length: 303745, dtype: int64

In [248]:
mask_doi = article_df_new['doi'] == ''
j_names = list(set(article_df_new.loc[mask_doi]['journal name']))
len(j_names)

2494

In [252]:
for j_name in tqdm(j_names):
    mask_journal = article_df_new['journal name'] == j_name
    mask_doi = article_df_new['doi'] == ''
    for year in list(set(article_df_new[mask_journal&mask_doi]['year'])):
        mask_year = article_df_new['year'] == year
        
        jname_list = list(set([j_name, j_name.replace('The', '').strip(), j_name.replace('and', '&'),
                     j_name.replace('and', '&amp;'), j_name.replace('&amp;', 'and'),
                     j_name.replace('&', 'and'), 'The ' + j_name]))
        dict_slice = {k:v for k,v in papers_metadata.items() if (('year' in v and v['year'] == year) and
                                                                         ('journal name' in v and 
                                                                          v['journal name'] in jname_list))}
        if len(dict_slice) > 0:
            for idx in article_df_new[mask_journal & mask_year].index:
                pages = article_df_new.loc[idx, 'pages']
                vol = article_df_new.loc[idx, 'volume']
                for doi in dict_slice:
                    if 'pages' in dict_slice[doi] and 'volume' in dict_slice[doi]:
                        if dict_slice[doi]['pages'] == pages and dict_slice[doi]['volume'] == vol:
                            article_df_new.loc[idx, 'doi'] = doi
                            break      

100%|█████████████████████████████████████| 2494/2494 [6:18:13<00:00,  9.10s/it]


In [254]:
article_df_new['doi'].value_counts()

doi
                                 172635
10.1142/s1088424614500503             3
10.1021/ja104800w                     2
10.1021/j150324a020                   2
10.1016/j.ica.2010.04.027             2
                                  ...  
10.1039/c8ra08260j                    1
10.13005/ojc/350128                   1
10.1021/acs.inorgchem.7b00845         1
10.1021/ja00343a064                   1
10.1021/ol7022104                     1
Name: count, Length: 309582, dtype: int64

In [24]:
article_df_new == pd.read_csv('/media/oleg/second_ssd/reactions_from_reaxys/article_refs.csv', sep = '\t')

NameError: name 'article_df_new' is not defined

In [253]:
# article_df_new.to_csv('/media/oleg/second_ssd/reactions_from_reaxys/article_refs.csv', sep = '\t', index = False)

In [229]:
papers_metadata['10.1016/0022-328x(88)80543-6']

{'doi': '10.1016/0022-328x(88)80543-6',
 'issn': ['0022-328X'],
 'publisher': 'Elsevier BV',
 'url': 'http://dx.doi.org/10.1016/0022-328x(88)80543-6',
 'volume': '339',
 'pages': 'C1-C4',
 'journal name': 'Journal of Organometallic Chemistry',
 'title': 'Synthesis and absolute configuration of optically pure tricarbonyl(2, 4-cycloheptadienonium)iron tetrafluoroborate',
 'authors': 'Morita,Noboru;Asao,Toyonobu;Sotokawa,Hideo;Hatano,Masahiro;Tajiri,Akio',
 'year': 1988}

In [None]:
replace('&amp;', 'and')
replace('The', '').strip()

In [230]:
article_df_new[article_df_new['doi'] == '10.1016/0022-328x(88)80543-6']

Unnamed: 0,init_ref,splitted_ref,publ_type,doi,volume,journal name,title,authors,year,pages
80056,"Article; Fischer, Helmut; Pashalidis, Ioannis; Journal of Organometallic Chemistry; vol. 348; (1988); p. C1 - C4; (from Gmelin)","Article; Fischer, Helmut; Pashalidis, Ioannis; Journal of Organometallic Chemistry; vol. 348; (1988); p. C1 - C4;",article,10.1016/0022-328x(88)80543-6,348,Journal of Organometallic Chemistry,,"Fischer, Helmut;Pashalidis, Ioannis",1988,C1-C4
168574,"Article; Carre, F.H.; Corriu, R. J. P.; Guerin, C.; Henner, B. J. L.; Wong Chi Man, W. W. C.; Journal of Organometallic Chemistry; vol. 347; (1988); p. C1 - C4; (from Gmelin)","Article; Carre, F.H.; Corriu, R. J. P.; Guerin, C.; Henner, B. J. L.; Wong Chi Man, W. W. C.; Journal of Organometallic Chemistry; vol. 347; (1988); p. C1 - C4;",article,10.1016/0022-328x(88)80543-6,347,Journal of Organometallic Chemistry,,"Carre, F.H.;Corriu, R. J. P.;Guerin, C.;Henner, B. J. L.;Wong Chi Man, W. W. C.",1988,C1-C4
259017,"Article; Castel, Annie; Riviere, Pierre; Satge, Jacques; Ko, Young-Hoon; Journal of Organometallic Chemistry; vol. 342; 1; (1988); p. C1 - C4; (from Gmelin)","Article; Castel, Annie; Riviere, Pierre; Satge, Jacques; Ko, Young-Hoon; Journal of Organometallic Chemistry; vol. 342; 1; (1988); p. C1 - C4;",article,10.1016/0022-328x(88)80543-6,342,Journal of Organometallic Chemistry,,"Castel, Annie;Riviere, Pierre;Satge, Jacques;Ko, Young-Hoon",1988,C1-C4
261702,"Article; Okuda, Jun; Journal of Organometallic Chemistry; vol. 353; (1988); p. C1 - C4; (from Gmelin)","Article; Okuda, Jun; Journal of Organometallic Chemistry; vol. 353; (1988); p. C1 - C4;",article,10.1016/0022-328x(88)80543-6,353,Journal of Organometallic Chemistry,,"Okuda, Jun",1988,C1-C4
306068,"Article; Morita, Noboru; Asao, Toyonobu; Sotokawa, Hideo; Hatano, Masahiro; Tajiri, Akio; Journal of Organometallic Chemistry; vol. 339; (1988); p. C1 - C4; (from Gmelin)","Article; Morita, Noboru; Asao, Toyonobu; Sotokawa, Hideo; Hatano, Masahiro; Tajiri, Akio; Journal of Organometallic Chemistry; vol. 339; (1988); p. C1 - C4;",article,10.1016/0022-328x(88)80543-6,339,Journal of Organometallic Chemistry,,"Morita, Noboru;Asao, Toyonobu;Sotokawa, Hideo;Hatano, Masahiro;Tajiri, Akio",1988,C1-C4
380805,"Article; Kuhn, Norbert; Zauder, Edgar; Journal of Organometallic Chemistry; vol. 340; (1988); p. C1 - C4; (from Gmelin)","Article; Kuhn, Norbert; Zauder, Edgar; Journal of Organometallic Chemistry; vol. 340; (1988); p. C1 - C4;",article,10.1016/0022-328x(88)80543-6,340,Journal of Organometallic Chemistry,,"Kuhn, Norbert;Zauder, Edgar",1988,C1-C4


# Patents

In [178]:
patent_df = ref_df[ref_df['publ_type'] == 'patent']
patent_df 

Unnamed: 0,init_ref,splitted_ref,publ_type
17,"Patent; Institut Univ. de Ciencia i Tecnologia, S.A.; EP2452934; (2012); A1;","Patent; Institut Univ. de Ciencia i Tecnologia, S.A.; EP2452934; (2012); A1;",patent
19,Patent; Taiyuan University of Technology; Li Zhanfeng; Lv Xiang; Yuan Shuqing; Ren Jingkun; Hao Yuying; (18 pag.)CN106495975; (2017); A;,Patent; Taiyuan University of Technology; Li Zhanfeng; Lv Xiang; Yuan Shuqing; Ren Jingkun; Hao Yuying; (18 pag.)CN106495975; (2017); A;,patent
23,Patent; F. HOFFMANN-LA ROCHE AG; WO2008/17465; (2008); A1;,Patent; F. HOFFMANN-LA ROCHE AG; WO2008/17465; (2008); A1;,patent
26,Patent; Guizhou Chinese Academy Of Sciences Natural Result Chemical Emphasis Experiment Shi; Liu Sheng; Xu Mei; Guo Lianghua; Jian Yong; Duan Lian; Xie Kaiqiang; Wan Ke; Pan Weidong; (16 pag.)CN110054579; (2019); A;,Patent; Guizhou Chinese Academy Of Sciences Natural Result Chemical Emphasis Experiment Shi; Liu Sheng; Xu Mei; Guo Lianghua; Jian Yong; Duan Lian; Xie Kaiqiang; Wan Ke; Pan Weidong; (16 pag.)CN110054579; (2019); A;,patent
28,Patent; Istituto di Ricerche di Biologia Molecolare P. Angeletti SPA; US2009/312241; (2009); A1;,Patent; Istituto di Ricerche di Biologia Molecolare P. Angeletti SPA; US2009/312241; (2009); A1;,patent
...,...,...,...
633770,"Patent; TOLERO PHARMACEUTICALS, INC.; SIDDIQUI-JAIN, Adam; WARNER, Steven L.; FLYNN, Paul; BEARSS, David J.; FOULKS, Jason Marc; TOMIMATSU, Nozomi; FUJIMURA, Ken; UMEHARA, Hiroki; NONOYAMA, Akihito; KIGUCHIYA, Akihito; (441 pag.)WO2019/195753; (2019); A1;","Patent; TOLERO PHARMACEUTICALS, INC.; SIDDIQUI-JAIN, Adam; WARNER, Steven L.; FLYNN, Paul; BEARSS, David J.; FOULKS, Jason Marc; TOMIMATSU, Nozomi; FUJIMURA, Ken; UMEHARA, Hiroki; NONOYAMA, Akihito; KIGUCHIYA, Akihito; (441 pag.)WO2019/195753; (2019); A1;",patent
633772,Patent; G. D. Searle &amp; Co.; US4988707; (1991); A;,Patent; G. D. Searle & Co.; US4988707; (1991); A;,patent
633773,Patent; South China Agricultural University; Tang Youzhi; Liu Yahong; Jin Zhen; Wang Le; Xu Zixi; (17 pag.)CN109666009; (2019); A;,Patent; South China Agricultural University; Tang Youzhi; Liu Yahong; Jin Zhen; Wang Le; Xu Zixi; (17 pag.)CN109666009; (2019); A;,patent
633779,Patent; Monsanto Company; US5260262; (1993); A;,Patent; Monsanto Company; US5260262; (1993); A;,patent


In [179]:
def patent2ref(string):
    type_ref = string.split(';')[0]
    ref = [x.strip() for x in string.split(';')[1:] if len(x.strip()) > 0]
    if type_ref == 'Patent': 
        ref_dict = {
             'number': '',
             'year': '',
             'assignee': '',
             'type': '',
             'authors': ''}
        rest = []
       
        for item in ref:
            assigned = False

            year_cands = re.findall(r'\(\d{4}\)', item)
            if len(year_cands) > 0:
                ref_dict['year'] = int(year_cands[0].replace('(', '').replace(')', ''))
                assigned = True
            if not assigned and ref_dict['year'] != '':
                if len(re.findall(r'[A-Z]\d?', item))>0:
                    if re.findall(r'[A-Z]\d?', item)[0] == item:
                        ref_dict['type'] = item
                        assigned = True
            if not assigned and ref_dict['year'] == '':
                if len(re.findall(r'[A-Z]{2}\d{4,15}\/?\d{0,9}', item))> 0:
                    ref_dict['number'] = re.findall(r'[A-Z]{2}\d{4,15}\/?\d{0,9}', item)[0]
                    assigned = True
            if not assigned:
                rest.append(item)
        if len(rest) > 0:
            ref_dict['assignee'] = rest[0]
        if len(rest)> 1:
            ref_dict['authors'] = ';'.join(rest[1:])              
        return ref_dict

In [180]:
patent2ref('Patent; LG Chem, Ltd.; Lee Seong-gyu; Lee Seok-gu; Shin Jun-ho; (9 pag.)KR2020/64619; (2020); A; ')

{'number': 'KR2020/64619',
 'year': 2020,
 'assignee': 'LG Chem, Ltd.',
 'type': 'A',
 'authors': 'Lee Seong-gyu;Lee Seok-gu;Shin Jun-ho'}

In [181]:
patent_df

Unnamed: 0,init_ref,splitted_ref,publ_type
17,"Patent; Institut Univ. de Ciencia i Tecnologia, S.A.; EP2452934; (2012); A1;","Patent; Institut Univ. de Ciencia i Tecnologia, S.A.; EP2452934; (2012); A1;",patent
19,Patent; Taiyuan University of Technology; Li Zhanfeng; Lv Xiang; Yuan Shuqing; Ren Jingkun; Hao Yuying; (18 pag.)CN106495975; (2017); A;,Patent; Taiyuan University of Technology; Li Zhanfeng; Lv Xiang; Yuan Shuqing; Ren Jingkun; Hao Yuying; (18 pag.)CN106495975; (2017); A;,patent
23,Patent; F. HOFFMANN-LA ROCHE AG; WO2008/17465; (2008); A1;,Patent; F. HOFFMANN-LA ROCHE AG; WO2008/17465; (2008); A1;,patent
26,Patent; Guizhou Chinese Academy Of Sciences Natural Result Chemical Emphasis Experiment Shi; Liu Sheng; Xu Mei; Guo Lianghua; Jian Yong; Duan Lian; Xie Kaiqiang; Wan Ke; Pan Weidong; (16 pag.)CN110054579; (2019); A;,Patent; Guizhou Chinese Academy Of Sciences Natural Result Chemical Emphasis Experiment Shi; Liu Sheng; Xu Mei; Guo Lianghua; Jian Yong; Duan Lian; Xie Kaiqiang; Wan Ke; Pan Weidong; (16 pag.)CN110054579; (2019); A;,patent
28,Patent; Istituto di Ricerche di Biologia Molecolare P. Angeletti SPA; US2009/312241; (2009); A1;,Patent; Istituto di Ricerche di Biologia Molecolare P. Angeletti SPA; US2009/312241; (2009); A1;,patent
...,...,...,...
633770,"Patent; TOLERO PHARMACEUTICALS, INC.; SIDDIQUI-JAIN, Adam; WARNER, Steven L.; FLYNN, Paul; BEARSS, David J.; FOULKS, Jason Marc; TOMIMATSU, Nozomi; FUJIMURA, Ken; UMEHARA, Hiroki; NONOYAMA, Akihito; KIGUCHIYA, Akihito; (441 pag.)WO2019/195753; (2019); A1;","Patent; TOLERO PHARMACEUTICALS, INC.; SIDDIQUI-JAIN, Adam; WARNER, Steven L.; FLYNN, Paul; BEARSS, David J.; FOULKS, Jason Marc; TOMIMATSU, Nozomi; FUJIMURA, Ken; UMEHARA, Hiroki; NONOYAMA, Akihito; KIGUCHIYA, Akihito; (441 pag.)WO2019/195753; (2019); A1;",patent
633772,Patent; G. D. Searle &amp; Co.; US4988707; (1991); A;,Patent; G. D. Searle & Co.; US4988707; (1991); A;,patent
633773,Patent; South China Agricultural University; Tang Youzhi; Liu Yahong; Jin Zhen; Wang Le; Xu Zixi; (17 pag.)CN109666009; (2019); A;,Patent; South China Agricultural University; Tang Youzhi; Liu Yahong; Jin Zhen; Wang Le; Xu Zixi; (17 pag.)CN109666009; (2019); A;,patent
633779,Patent; Monsanto Company; US5260262; (1993); A;,Patent; Monsanto Company; US5260262; (1993); A;,patent


In [182]:
refs_list = []
for patent in tqdm(patent_df.to_dict('records')):
    ref_dict = patent2ref(patent['splitted_ref'])
    if ref_dict is not None:
        patent.update(ref_dict)
        refs_list.append(patent)

100%|████████████████████████████████| 143091/143091 [00:01<00:00, 78088.36it/s]


In [183]:
patent_df_new = pd.DataFrame(refs_list)

In [184]:
patent_df_new

Unnamed: 0,init_ref,splitted_ref,publ_type,number,year,assignee,type,authors
0,"Patent; Institut Univ. de Ciencia i Tecnologia, S.A.; EP2452934; (2012); A1;","Patent; Institut Univ. de Ciencia i Tecnologia, S.A.; EP2452934; (2012); A1;",patent,EP2452934,2012,"Institut Univ. de Ciencia i Tecnologia, S.A.",A1,
1,Patent; Taiyuan University of Technology; Li Zhanfeng; Lv Xiang; Yuan Shuqing; Ren Jingkun; Hao Yuying; (18 pag.)CN106495975; (2017); A;,Patent; Taiyuan University of Technology; Li Zhanfeng; Lv Xiang; Yuan Shuqing; Ren Jingkun; Hao Yuying; (18 pag.)CN106495975; (2017); A;,patent,CN106495975,2017,Taiyuan University of Technology,A,Li Zhanfeng;Lv Xiang;Yuan Shuqing;Ren Jingkun;Hao Yuying
2,Patent; F. HOFFMANN-LA ROCHE AG; WO2008/17465; (2008); A1;,Patent; F. HOFFMANN-LA ROCHE AG; WO2008/17465; (2008); A1;,patent,WO2008/17465,2008,F. HOFFMANN-LA ROCHE AG,A1,
3,Patent; Guizhou Chinese Academy Of Sciences Natural Result Chemical Emphasis Experiment Shi; Liu Sheng; Xu Mei; Guo Lianghua; Jian Yong; Duan Lian; Xie Kaiqiang; Wan Ke; Pan Weidong; (16 pag.)CN110054579; (2019); A;,Patent; Guizhou Chinese Academy Of Sciences Natural Result Chemical Emphasis Experiment Shi; Liu Sheng; Xu Mei; Guo Lianghua; Jian Yong; Duan Lian; Xie Kaiqiang; Wan Ke; Pan Weidong; (16 pag.)CN110054579; (2019); A;,patent,CN110054579,2019,Guizhou Chinese Academy Of Sciences Natural Result Chemical Emphasis Experiment Shi,A,Liu Sheng;Xu Mei;Guo Lianghua;Jian Yong;Duan Lian;Xie Kaiqiang;Wan Ke;Pan Weidong
4,Patent; Istituto di Ricerche di Biologia Molecolare P. Angeletti SPA; US2009/312241; (2009); A1;,Patent; Istituto di Ricerche di Biologia Molecolare P. Angeletti SPA; US2009/312241; (2009); A1;,patent,US2009/312241,2009,Istituto di Ricerche di Biologia Molecolare P. Angeletti SPA,A1,
...,...,...,...,...,...,...,...,...
143086,"Patent; TOLERO PHARMACEUTICALS, INC.; SIDDIQUI-JAIN, Adam; WARNER, Steven L.; FLYNN, Paul; BEARSS, David J.; FOULKS, Jason Marc; TOMIMATSU, Nozomi; FUJIMURA, Ken; UMEHARA, Hiroki; NONOYAMA, Akihito; KIGUCHIYA, Akihito; (441 pag.)WO2019/195753; (2019); A1;","Patent; TOLERO PHARMACEUTICALS, INC.; SIDDIQUI-JAIN, Adam; WARNER, Steven L.; FLYNN, Paul; BEARSS, David J.; FOULKS, Jason Marc; TOMIMATSU, Nozomi; FUJIMURA, Ken; UMEHARA, Hiroki; NONOYAMA, Akihito; KIGUCHIYA, Akihito; (441 pag.)WO2019/195753; (2019); A1;",patent,WO2019/195753,2019,"TOLERO PHARMACEUTICALS, INC.",A1,"SIDDIQUI-JAIN, Adam;WARNER, Steven L.;FLYNN, Paul;BEARSS, David J.;FOULKS, Jason Marc;TOMIMATSU, Nozomi;FUJIMURA, Ken;UMEHARA, Hiroki;NONOYAMA, Akihito;KIGUCHIYA, Akihito"
143087,Patent; G. D. Searle &amp; Co.; US4988707; (1991); A;,Patent; G. D. Searle & Co.; US4988707; (1991); A;,patent,US4988707,1991,G. D. Searle & Co.,A,
143088,Patent; South China Agricultural University; Tang Youzhi; Liu Yahong; Jin Zhen; Wang Le; Xu Zixi; (17 pag.)CN109666009; (2019); A;,Patent; South China Agricultural University; Tang Youzhi; Liu Yahong; Jin Zhen; Wang Le; Xu Zixi; (17 pag.)CN109666009; (2019); A;,patent,CN109666009,2019,South China Agricultural University,A,Tang Youzhi;Liu Yahong;Jin Zhen;Wang Le;Xu Zixi
143089,Patent; Monsanto Company; US5260262; (1993); A;,Patent; Monsanto Company; US5260262; (1993); A;,patent,US5260262,1993,Monsanto Company,A,
