In [1]:
import os
import xml.etree.ElementTree as ET
import pdb
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import pandas as pd
from sklearn.preprocessing import LabelEncoder




In [2]:
def build_classification_tree(node):
    #pdb.set_trace()
    classification_symbol = node.find('classification-symbol').text
    title_text_list=[]
    cpc_text_list=[]
    title_parts = node.findall('class-title/title-part')
    for title_part in title_parts:
        text_tag = title_part.find('text')
        if text_tag is not None:
            text= text_tag.text
            title_text_list.append(text)
            
    cpc_text = node.findall('class-title/title-part/CPC-specific-text')
    for cpc_part in cpc_text:
        text_tag = cpc_part.find('text')
        if text_tag is not None:
            text= text_tag.text
            cpc_text_list.append(text)

    classification_def = title_text_list + cpc_text_list
    classification_def = [def_item for def_item in classification_def if def_item is not None]
    classification_def=';'.join(classification_def)
    children = node.findall('classification-item')

    if children:
        child_results = [build_classification_tree(child) for child in children]
        return (classification_symbol, classification_def, child_results)
    else:
        return (classification_symbol, classification_def)


In [3]:
# Function to process XML files in a folder
def process_xml_files_in_folder(folder_path):
    #pdb.set_trace()
    classification_trees = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.xml'):
            xml_file_path = os.path.join(folder_path, filename)
            tree = ET.parse(xml_file_path)
            root = tree.getroot()
            baseroot = root[0]  # Assuming the structure is consistent
            classification_tree = build_classification_tree(baseroot)
            #print(classification_tree)
            classification_trees.append((filename, classification_tree))
    return classification_trees

In [4]:
# Full path to folder containing XML files
folder_path1 = os.path.join(os.path.expanduser("~"), "Downloads", "CPCSchemeXML202401")
folder_path2 = os.path.join(os.path.expanduser("~"), "Downloads", "CPCSchemeXML202401","ClassXMLfiles")
# Process XML files in the folder
classification_trees = process_xml_files_in_folder(folder_path1)
c_tree=process_xml_files_in_folder(folder_path2)

# Print classification trees
#classification_trees

In [5]:
import pdb

def get_child(child_list, prev_symbols, result):
    #pdb.set_trace()
    
    for child_tuple in child_list:
        symbol = child_tuple[0]
        definition = child_tuple[1]
        result[symbol] = prev_symbols + [definition]
        
        if len(child_tuple) == 3:
            sub_child = child_tuple[2]
            get_child(sub_child, prev_symbols + [definition], result)

    return result


In [6]:
withSlashDict = {}

In [7]:

for i in range(len(c_tree)):
    #child_list_example =[classification_trees[i][1]]
    child_list_example1 =[c_tree[i][1]]
    
    # Calling the function
    get_child(child_list_example1, [], withSlashDict)
    


In [8]:

for i in range(len(classification_trees)):
    prev=classification_trees[i][0].replace('.xml','')[-4:-1]
    child_list_example =[classification_trees[i][1]]
    #print(prev_sym)
    # Calling the function
    get_child(child_list_example, withSlashDict[prev], withSlashDict)
    


In [9]:
withSlashDict

{'A': ['HUMAN NECESSITIES'],
 'A01': ['HUMAN NECESSITIES',
  'AGRICULTURE',
  'AGRICULTURE;FORESTRY;ANIMAL HUSBANDRY;HUNTING;TRAPPING;FISHING'],
 'A01B': ['HUMAN NECESSITIES',
  'AGRICULTURE',
  'AGRICULTURE;FORESTRY;ANIMAL HUSBANDRY;HUNTING;TRAPPING;FISHING',
  'SOIL WORKING IN AGRICULTURE OR FORESTRY;PARTS, DETAILS, OR ACCESSORIES OF AGRICULTURAL MACHINES OR IMPLEMENTS, IN GENERAL '],
 'A01C': ['HUMAN NECESSITIES',
  'AGRICULTURE',
  'AGRICULTURE;FORESTRY;ANIMAL HUSBANDRY;HUNTING;TRAPPING;FISHING',
  'PLANTING;SOWING;FERTILISING '],
 'A01D': ['HUMAN NECESSITIES',
  'AGRICULTURE',
  'AGRICULTURE;FORESTRY;ANIMAL HUSBANDRY;HUNTING;TRAPPING;FISHING',
  'HARVESTING;MOWING'],
 'A01F': ['HUMAN NECESSITIES',
  'AGRICULTURE',
  'AGRICULTURE;FORESTRY;ANIMAL HUSBANDRY;HUNTING;TRAPPING;FISHING',
  'PROCESSING OF HARVESTED PRODUCE;HAY OR STRAW PRESSES;DEVICES FOR STORING AGRICULTURAL OR HORTICULTURAL PRODUCE '],
 'A01G': ['HUMAN NECESSITIES',
  'AGRICULTURE',
  'AGRICULTURE;FORESTRY;ANIMAL HUSBAN

In [10]:

for key, value_list in withSlashDict.items():
    filtered_list = [value for value in value_list if value != '']
    withSlashDict[key] = filtered_list

In [11]:
import re
withoutSlashDict={}
for key,val in withSlashDict.items():
    new_key=re.sub(r'[^a-zA-Z0-9]','',key)
    withoutSlashDict[new_key]='/ '.join(val)


In [12]:
withoutSlashDict

{'A': 'HUMAN NECESSITIES',
 'A01': 'HUMAN NECESSITIES/ AGRICULTURE/ AGRICULTURE;FORESTRY;ANIMAL HUSBANDRY;HUNTING;TRAPPING;FISHING',
 'A01B': 'HUMAN NECESSITIES/ AGRICULTURE/ AGRICULTURE;FORESTRY;ANIMAL HUSBANDRY;HUNTING;TRAPPING;FISHING/ SOIL WORKING IN AGRICULTURE OR FORESTRY;PARTS, DETAILS, OR ACCESSORIES OF AGRICULTURAL MACHINES OR IMPLEMENTS, IN GENERAL ',
 'A01C': 'HUMAN NECESSITIES/ AGRICULTURE/ AGRICULTURE;FORESTRY;ANIMAL HUSBANDRY;HUNTING;TRAPPING;FISHING/ PLANTING;SOWING;FERTILISING ',
 'A01D': 'HUMAN NECESSITIES/ AGRICULTURE/ AGRICULTURE;FORESTRY;ANIMAL HUSBANDRY;HUNTING;TRAPPING;FISHING/ HARVESTING;MOWING',
 'A01F': 'HUMAN NECESSITIES/ AGRICULTURE/ AGRICULTURE;FORESTRY;ANIMAL HUSBANDRY;HUNTING;TRAPPING;FISHING/ PROCESSING OF HARVESTED PRODUCE;HAY OR STRAW PRESSES;DEVICES FOR STORING AGRICULTURAL OR HORTICULTURAL PRODUCE ',
 'A01G': 'HUMAN NECESSITIES/ AGRICULTURE/ AGRICULTURE;FORESTRY;ANIMAL HUSBANDRY;HUNTING;TRAPPING;FISHING/ HORTICULTURE;CULTIVATION OF VEGETABLES, FLOWERS

In [13]:
symbols=list(withoutSlashDict.keys())
definitions=list(withoutSlashDict.values())

In [49]:
print(len(symbols))
print(len(definitions))

260419
260419


In [14]:
df=pd.DataFrame({
    'symbols':symbols,
    'definitions':definitions
})

In [15]:
df

Unnamed: 0,symbols,definitions
0,A,HUMAN NECESSITIES
1,A01,HUMAN NECESSITIES/ AGRICULTURE/ AGRICULTURE;FO...
2,A01B,HUMAN NECESSITIES/ AGRICULTURE/ AGRICULTURE;FO...
3,A01C,HUMAN NECESSITIES/ AGRICULTURE/ AGRICULTURE;FO...
4,A01D,HUMAN NECESSITIES/ AGRICULTURE/ AGRICULTURE;FO...
...,...,...
260414,Y10T4831864,GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME...
260415,Y10T4831873,GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME...
260416,Y10T4831882,GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME...
260417,Y10T4831891,GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME...


In [26]:
#tokenisation
tok_def=Tokenizer()
tok_def.fit_on_texts(definitions)
word_index = tok_def.word_index
seq_def=tok_def.texts_to_sequences(definitions)


In [50]:
len(seq_def)

260419

In [27]:
max_seq_length=max([len(seq) for seq in seq_def])

In [51]:
max_seq_length

231

In [28]:
pad_seq=pad_sequences(seq_def,maxlen=max_seq_length,padding='post')

In [53]:
len(pad_seq)

260419

In [29]:
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(symbols)

In [54]:
encoded_labels.shape

(260419,)

In [30]:
x_train,x_test,y_train,y_test=train_test_split(pad_seq,encoded_labels,test_size=0.2,random_state=42)

In [56]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(208335, 231)
(52084, 231)
(208335,)
(52084,)


In [58]:
model = Sequential()
model.add(Embedding(input_dim=len(word_index) + 1, output_dim=100, input_length=max_seq_length))
model.add(LSTM(units=128))
model.add(Dense(units=len(set(symbols)), activation='softmax'))

In [59]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [60]:
model.fit(x_train, y_train, epochs=2, verbose=1, batch_size=64,validation_split=0.1)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x1bd03d1c310>

In [69]:
import json 
import pandas as pd

In [79]:
file="C:\\Users\\pooja\\Downloads\\JSONFiles\\A\\A01B100.json"
with open(file,'r') as filename:
    data=json.load(filename)
df=pd.DataFrame([data])
df

Unnamed: 0,keyWithSlash,keywithoutSlash,Definition,last_def
0,A01B1/00,A01B100,HUMAN NECESSITIES/ AGRICULTURE/ AGRICULTURE;FO...,Hand tools


In [85]:
file="C:\\Users\\pooja\\Downloads\\JSONFiles\\A\\A.json"
with open(file,'r') as filename:
    data=json.load(filename)
new_df=pd.DataFrame([data])
df=pd.concat([df,new_df],ignore_index=True)
df

Unnamed: 0,keyWithSlash,keywithoutSlash,Definition,last_def
0,A01B1/00,A01B100,HUMAN NECESSITIES/ AGRICULTURE/ AGRICULTURE;FO...,Hand tools
1,A01B,A01B,HUMAN NECESSITIES/ AGRICULTURE/ AGRICULTURE;FO...,"SOIL WORKING IN AGRICULTURE OR FORESTRY;PARTS,..."
2,A01,A01,HUMAN NECESSITIES/ AGRICULTURE/ AGRICULTURE;FO...,AGRICULTURE;FORESTRY;ANIMAL HUSBANDRY;HUNTING;...
3,A,A,HUMAN NECESSITIES,HUMAN NECESSITIES


In [77]:
df=pd.read_json("C:\\Users\\pooja\\Downloads\\JSONFiles\\A\\A01.json")

ValueError: If using all scalar values, you must pass an index

In [None]:
d