# Create IPC scheme mapped to technology concordance tables

Load the IPC scheme XML data downloaded from the WIPO website.

In [1]:
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
from collections import OrderedDict

# Path of the IPC scheme XML file that is parsed with ET.parse further down.
FILEPATH = 'data/EN_ipc_scheme_20180101.xml'

# based on JaminSore's answer in https://stackoverflow.com/questions/28259301/how-to-convert-an-xml-file-to-nice-pandas-dataframe
def iter_docs(entry):
    """Yield a single row dict describing one ``ipcEntry`` element.

    The row holds a copy of the element's XML attributes plus two extra keys:

    * ``num_text_bodies`` -- number of direct ``textBody`` children.
    * ``text`` -- the ``.text`` of every ``text`` element found under the
      first ``textBody``, joined with ``';'``. ``None`` when the entry has no
      ``textBody`` or when any of those ``text`` elements has no leading text.

    Parameters
    ----------
    entry : xml.etree.ElementTree.Element
        The ``ipcEntry`` element to describe.

    Yields
    ------
    dict
        Exactly one dict per call.
    """
    namespace = "{http://www.wipo.int/classifications/ipc/masterfiles}"
    text_bodies = entry.findall(namespace + "textBody")

    text = None
    if text_bodies:
        # Only the first textBody is read; num_text_bodies lets the caller
        # detect entries that carry more than one.
        parts = [node.text for node in text_bodies[0].iter(namespace + "text")]
        # A text element whose .text is None cannot be joined; the whole
        # entry text is reported as None in that case.
        if all(part is not None for part in parts):
            text = ';'.join(parts)

    # Copy the attributes so the parsed XML tree is not modified.
    entry_dict = dict(entry.attrib)
    entry_dict['num_text_bodies'] = len(text_bodies)
    entry_dict['text'] = text

    yield entry_dict
    
def iter_entry(tree):
    """Yield the row dicts produced by ``iter_docs`` for every ``ipcEntry`` in *tree*.

    ``tree.iter`` visits matching elements at any depth, in document order.
    """
    entry_tag = "{http://www.wipo.int/classifications/ipc/masterfiles}ipcEntry"
    for ipc_entry in tree.iter(entry_tag):
        yield from iter_docs(ipc_entry)
            
# Parse the scheme file and build one DataFrame row per ipcEntry element.
tree = ET.parse(FILEPATH)
ipc_scheme = pd.DataFrame(list(iter_entry(tree)))

# Sanity check: each entry is expected to carry exactly one textBody.
# The rows are checked individually; comparing the column mean with 1 would
# let counts of 0 and 2 cancel each other out and hide the anomaly.
if (ipc_scheme['num_text_bodies'] != 1).any():
    print("There's an entry with more than one text body detected. WHYYYY.")
else:
    # The helper column carries no information once the check has passed.
    ipc_scheme.drop('num_text_bodies',axis=1,inplace=True)

Load the ipc_concordance file that was downloaded from the WIPO main page and manually processed 

In [136]:
FILEPATH = 'data/wipo_ipc_concordance_table.csv'
concordance_table = pd.read_csv(FILEPATH)

# create the regex string that will be used for matching later
# - subclass: the first four characters of IPC_code
# - suffix:   everything after the subclass except the final character,
#             stripped of surrounding whitespace; one-character suffixes
#             are left-padded with '0'
# Need the .str.strip() so that pandas doesn't add an additional '\' to the regex that I wrote. RAWR.
concordance_table['IPC_code_subclass'] = concordance_table['IPC_code'].str[0:4]
concordance_table['IPC_code_suffix'] = concordance_table['IPC_code'].str[4:-1].str.strip()
concordance_table['IPC_code_suffix'] = [''.join(['0',x]) if len(x)==1 else x for x in concordance_table['IPC_code_suffix']]
# Raw strings keep the regex backslashes literal without relying on Python
# passing unknown escape sequences such as '\d' through unchanged.
concordance_table['regex_string'] = concordance_table['IPC_code_subclass']+r'\d\d'+concordance_table['IPC_code_suffix']+r'\d+'

# Reindex by regex_string_length. We will be matching regex in this order later so
# that the granular codes are applied after the broad ones, and won't be overwritten.
regex_length_index = concordance_table['regex_string'].str.len().sort_values().index
reindexed_concordance_table = concordance_table.reindex(regex_length_index)

# Ordered mapping regex -> Field_number, shortest regex first.
concordance_map = OrderedDict(zip(reindexed_concordance_table['regex_string'],
                                  reindexed_concordance_table['Field_number']))

# Create the list of 35 technologies from WIPO based on the concordance table
wipo_ipc_technologies = concordance_table[['Field_number','Sector_en','Field_en']].drop_duplicates().reset_index(drop=True)

Add a technology field number to the ipc symbols based on the concordance map

In [139]:
# Substitute each symbol by the field number of the regex it matches, then
# convert to numbers: symbols that matched nothing are still strings after
# replace() and are coerced to NaN by to_numeric.
ipc_scheme['wipo_tech_field_number'] = pd.to_numeric(
    ipc_scheme['symbol'].replace(concordance_map, regex=True),
    errors='coerce',
    downcast='integer',
)

Unnamed: 0,edition,endSymbol,entryType,kind,symbol,text,wipo_tech_field_number
0,,,K,s,A,HUMAN NECESSITIES,
1,,A01,K,t,A01,AGRICULTURE,
2,,,K,c,A01,AGRICULTURE;FORESTRY;ANIMAL HUSBANDRY;HUNTING;...,
3,,,K,u,A01B,"SOIL WORKING IN AGRICULTURE OR FORESTRY;PARTS,...",
4,,,K,i,A01B,HAND TOOLS;PLOUGHS;General construction;Specia...,
5,1968090120060101,,K,m,A01B0001000000,Hand tools,29.0
6,1968090120060101,,K,1,A01B0001020000,Spades;Shovels,29.0
7,1968090120060101,,K,2,A01B0001040000,with teeth,29.0
8,1968090120060101,,K,1,A01B0001060000,Hoes;Hand cultivators,29.0
9,1968090120060101,,K,2,A01B0001080000,with a single blade,29.0


Load the jtag technology concordance file

In [150]:
# IPC_code_suffix is read as str so its values are kept as text rather than parsed as numbers.
jtag_concordance = pd.read_csv('data/jtag_ipc_concordance_table.csv',dtype={'IPC_code_suffix':str})

# create the regex string that will be used for matching later
# Missing suffixes become '' so the concatenation below yields a string for every row.
jtag_concordance['IPC_code_suffix'] = jtag_concordance['IPC_code_suffix'].replace(np.nan,'')
# Raw strings keep the regex backslashes literal without relying on Python
# passing unknown escape sequences such as '\d' through unchanged.
jtag_concordance['regex_string'] = jtag_concordance['IPC_code_subclass']+r'\d\d'+jtag_concordance['IPC_code_suffix']+r'\d+'

# Reindex by regex_string_length. We will be matching regex in this order later so
# that the granular codes are applied after the broad ones, and won't be overwritten.
regex_length_index = jtag_concordance['regex_string'].str.len().sort_values().index
reindexed_jtag_concordance = jtag_concordance.reindex(regex_length_index)

# Ordered mapping regex -> technology_subarea_number, shortest regex first.
jtag_concordance_map = OrderedDict(zip(reindexed_jtag_concordance['regex_string'],
                                       reindexed_jtag_concordance['technology_subarea_number']))

# Distinct subarea numbers present in the concordance file.
jtag_technology_subarea_numbers = jtag_concordance['technology_subarea_number'].unique().tolist()

Add a jtag technology subarea number to the ipc symbols based on the jtag concordance map

In [166]:
# Substitute each symbol by the subarea number of the regex it matches.
ipc_scheme['jtag_technology_subarea_number'] = ipc_scheme['symbol'].replace(jtag_concordance_map,regex=True)
# Symbols that matched no regex still hold their original string; keep only
# values that are real subarea numbers and blank out everything else.
# The set is built once so each row costs a hash lookup instead of a list scan.
_jtag_subarea_number_set = set(jtag_technology_subarea_numbers)
ipc_scheme['jtag_technology_subarea_number'] = [x if x in _jtag_subarea_number_set else None for x in ipc_scheme['jtag_technology_subarea_number']]
ipc_scheme

Unnamed: 0,edition,endSymbol,entryType,kind,symbol,text,wipo_tech_field_number,jtag_technology_subarea_number
0,,,K,s,A,HUMAN NECESSITIES,,
1,,A01,K,t,A01,AGRICULTURE,,
2,,,K,c,A01,AGRICULTURE;FORESTRY;ANIMAL HUSBANDRY;HUNTING;...,,
3,,,K,u,A01B,"SOIL WORKING IN AGRICULTURE OR FORESTRY;PARTS,...",,
4,,,K,i,A01B,HAND TOOLS;PLOUGHS;General construction;Specia...,,
5,1968090120060101,,K,m,A01B0001000000,Hand tools,29.0,
6,1968090120060101,,K,1,A01B0001020000,Spades;Shovels,29.0,
7,1968090120060101,,K,2,A01B0001040000,with teeth,29.0,
8,1968090120060101,,K,1,A01B0001060000,Hoes;Hand cultivators,29.0,
9,1968090120060101,,K,2,A01B0001080000,with a single blade,29.0,


In [172]:
#concordance_table.to_csv('output/wipo_ipc_concordance_table_20180101.csv',index=False)
#wipo_ipc_technologies.to_csv('output/wipo_ipc_technologies_20180101.csv',index=False)
#ipc_scheme.to_csv('output/ipc_scheme_20180101.csv',index=False)

Stuff that wasn't used

In [66]:
# Code from www.austintaylor.io/lxml/python/pandas/xml/dataframe/2016/07/08/convert-xml-to-pandas-dataframe
# didn't work out in the end. But keep here in case.
class XML2DataFrame:
    """Flatten every direct child of an XML root into one DataFrame row.

    Each row is a flat dict collecting, for the child and all of its
    descendants, the XML attributes (keyed by attribute name) and the
    non-empty element text (keyed by tag). Later values overwrite earlier
    ones that share a key.
    """

    def __init__(self, tree):
        # tree must provide getroot(), e.g. xml.etree.ElementTree.ElementTree.
        self.root = tree.getroot()

    def parse_root(self, root):
        """Return one flattened dict per direct child of *root*."""
        rows = []
        for child in root:
            rows.append(self.parse_element(child))
        return rows

    def parse_element(self, element, parsed=None):
        """Merge *element* and its descendants into *parsed* and return it.

        A new dict is created when *parsed* is None.
        """
        record = {} if parsed is None else parsed
        record.update(element.attrib)
        if element.text:
            record[element.tag] = element.text
        # Depth-first: descendants write into the same dict.
        for sub_element in element:
            self.parse_element(sub_element, record)
        return record

    def process_data(self):
        """Return the flattened children of the root as a DataFrame."""
        return pd.DataFrame(self.parse_root(self.root))


# Re-parse the IPC scheme XML and flatten it with XML2DataFrame.
# NOTE: this rebinds the notebook-level names FILEPATH and tree.
FILEPATH = 'data/EN_ipc_scheme_20180101.xml'
tree = ET.parse(FILEPATH)
xml2df = XML2DataFrame(tree)

# One row per direct child of the XML root; the bare name displays the result.
scheme = xml2df.process_data()
scheme

'HUMAN NECESSITIES'