In [1]:
import os
import sys
import re
# Update with the path to the directory that holds your config module (pv_config).
# Raw string: the backslashes of the Windows path must stay literal and must not be
# parsed as (unrecognized) escape sequences such as "\C".
sys.path.append(r"F:\Caitlin_PV\Config_File")
import pv_config as config

from lxml import etree
import csv
import unittest
from collections import defaultdict

## Clean Code

#### Helper Functions

In [2]:
def get_xml(valid_xml_file):
    '''
    Parse an xml file with patent data and hand back its root element.
    :param valid_xml_file: the xml source, passed unchanged to etree.parse
                           (the notebook calls this with a file path)
    :return the root element of the parsed tree, ready to walk
    '''
    return etree.parse(valid_xml_file).getroot()

In [3]:
def process_patent_numbers(raw_patent_num):
    '''
    Helper function to transform patent ids into their final format.

    The first run of digits is kept (minus a single leading zero, if there is one)
    and is prefixed with the first run of ASCII letters found in the raw value,
    when such letters exist.

    :param raw_patent_num: patent number extracted from the raw XML
    :return cleaned patent id, e.g. "D0123456" -> "D123456", "9876543" -> "9876543"
    :raises IndexError: if raw_patent_num contains no digits at all
    '''
    num = re.findall(r'\d+', raw_patent_num)[0]  # first run of digits, still a string
    if num.startswith("0"):
        num = num[1:]  # drop one leading zero only

    # The letter prefix must be looked up for every input. It used to be computed
    # only inside the leading-zero branch, which left `let` undefined (and raised
    # UnboundLocalError) for every number that did not start with "0".
    let = re.findall(r'[a-zA-Z]+', raw_patent_num)
    if let:
        return let[0] + num  # first letter run + digits
    return num
    

In [4]:
def process_date(date):
    '''
    Insert dashes into a compact date string and replace a "00" day with "01".
    The slicing assumes the layout YYYYMMDD: first four characters are the year,
    the next two the month, and everything after that the day.
    :params date: the compact date string, or None
    :returns the date as YYYY-MM-DD; None is passed through unchanged
    '''
    if date is None:
        return date

    year, month, day = date[:4], date[4:6], date[6:]
    if day == "00":
        # a day of 00 is not a real calendar day; fall back to the first of the month
        day = '01'
    return year + '-' + month + '-' + day

In [5]:
def recursive_children(xml_element, parent_field=""):
    '''
    Flatten a (possibly nested) xml element into (name, text) pairs, one per leaf.
    :params xml_element: xml element, may contain nested children
    :params parent_field: tag of the direct parent; only used when xml_element is a leaf
    :returns a list of (name, text) tuples in document order, where name is
             "<parent tag>-<leaf tag>" when a parent tag is known, else just the leaf tag
    '''
    if len(xml_element) > 0:
        # Not a leaf: descend, labelling every child with this element's own tag
        # (only the immediate parent is kept, not the full path).
        pairs = []
        for child in xml_element:
            pairs.extend(recursive_children(child, xml_element.tag))
        return pairs

    if parent_field:
        name = parent_field + "-" + xml_element.tag
    else:
        name = xml_element.tag
    return [(name, xml_element.text)]

#### Data Functions

In [195]:
def get_entity(patent, entity_name, attribute_list=None):
    '''
    Collect the data of every occurrence of an entity inside one patent.
    :params patent: the xml element representing a patent
    :params entity_name: a string with the xml tag of the entity; every descendant
                         of patent carrying that tag is processed
    :params attribute_list: optional list of xml attribute names to copy from each
                            matching element (read with attrib[name], so a name the
                            element does not carry is an error)
    :returns a list with one plain dictionary per matching element, holding the
             requested attributes plus the pairs produced by recursive_children;
             every value whose key contains 'date' is run through process_date
    '''
    entities = []
    for node in patent.findall('.//' + entity_name):
        # start with the requested xml attributes, if any were asked for
        record = dict((name, node.attrib[name]) for name in (attribute_list or []))
        # then add the flattened leaf values; on a key clash the leaf value wins
        record.update(recursive_children(node))
        # normalise every date-like field in place
        for date_key in [key for key in record if 'date' in key]:
            record[date_key] = process_date(record[date_key])
        entities.append(record)
    return entities

### Processed Entities

In [57]:
#Single
#number-of-claims
#us-application-series code
#figures
#pct-or-regional-publishing-data
#us-term-of-grant
#pct-or-regional-filing-data
#publication-reference: this gives you the patent data
#application-reference, attribute_list=['appl-type']
#invention-title, attribute_list=['id']
#us-application-series-code
#us-botanic


#Multiple
#us-exemplary-claim
#classifications-ipcr
#classification-national
#inventors
#us-references-cited
#assignees
#'us-applicants', attribute_list=['sequence', 'app-type', 'designation', 'applicant-authority-category']
#'inventors', attribute_list=['sequence', 'designation']
#'agents', attribute_list=['sequence', 'rep-type']
#us_related_docs


## To Run Process

In [10]:
# Load the patent XML file. Raw string: the backslashes of the Windows path must stay
# literal and must not be parsed as (unrecognized) escape sequences such as "\C" or "\D".
patents = get_xml(r"D:\Caitlin_PV\Data\ipg171212_clean.xml")

## Working Area

In [11]:
patents #is the root element returned by get_xml; it holds all the patents and their information
patents[104] #is the element that represents the patent at index 104 (zero-based)

<Element us-patent-grant at 0x2436b48>

In [12]:
#The main highest-level groupings are things like 'abstract', 'figures', and 'us-bibliographic-data-grant'
#'us-bibliographic-data-grant' has most of the fields we are working with

In [205]:
# Bind a single patent element (index 3705 of the parsed file) to `patent` for ad-hoc work.
# NOTE(review): the index looks like an arbitrary sample choice - confirm.
patent = patents[3705]

### PostProcessing

In [None]:
#TODO role for examiners

In [None]:
#TODO #classification-national post processing, line 556

### To do