In [434]:
import re
import pandas as pd
from os import listdir
from os.path import isfile, join

In [507]:
# preparing regular expressions for the desired columns

grant_id = re.compile('file\=\"([U][S]\w\w\d{6})\-\d{8}\.XML\"')
patent_title = re.compile("<invention-title id=\"\w{5,6}\">(.*?)</invention-title>") 
kind = re.compile("<kind>([A-Z]\d)</kind>")
number_of_claim = re.compile("\<number\-of\-claims\>(\d{1,4})\<\/number\-of\-claims\>")
first_name = re.compile("<first-name>(.*?)</first-name>")
last_name = re.compile("<last-name>(.*?)</last-name>")
citation_by_examiner = re.compile("\<category\>cited by examiner<\/category\>")
citation_by_applicant = re.compile("\<category>cited by applicant\<\/category\>")
claim_text = re.compile("<claim-text>[\s\S<]*</claim-text>")
abstract = re.compile("\<abstract id\=\"abstract\"\>\n\<p id\=\"p\-0001\" num\=\"0000\"\>(.*?)\<\/p\>\n\<\/abstract\>")
classification = re.compile("<main-classification>(.*?)</main-classification>")

# cleaner for claim_text
cleaner = re.compile('<.*?>') 
cleaner2 = re.compile('\n')
cleaner3 = re.compile('\,\,\,')
cleaner4 = re.compile("[\.][\,][\,]")
cleaner5 = re.compile("[\,][\,]")
cleaner6 = re.compile("[\;][\,]")

In [508]:
mypath = './dataset'
paths = [mypath+'/'+f for f in listdir(mypath) if isfile(join(mypath, f))]
nbr_of_patents = []

df = pd.DataFrame(columns=['grant_id','patent_title','kind','number_of_claims','inventors',
                           'citations_applicant_count','citations_examiner_count','claims_text','abstract','main_classification'])
for path in paths:
    print(path)
    file = open(file=path, mode='r') #opening the file in read mode
    file_content_raw = file.read()
    file.close()
    text1 = re.compile("<\?xml version=\"1\.0\" encoding\=\"UTF\-8\"\?>")
    file_content = text1.split(file_content_raw)
    while '' in file_content:
        file_content.remove('')
    print("No of patents :", len(file_content))
    nbr_of_patents.append(len(file_content))
    
    gid_list, title_list, kind_list, no_of_claim_list, name_list, applicant_list, examiners_list, claim_list, abstract_list, cpc_list = ([] for i in range(10))

    for line in file_content:

        gid = grant_id.findall(line)
        title = patent_title.findall(line)
        kinds = kind.findall(line)
        sclaim = number_of_claim.findall(line)

        #looking for inventors
        inventors = re.findall("<classification-national.*?>[\s\S]*</classification-national>",line)
        for person in inventors:
            first = first_name.findall(person)
            last = last_name.findall(person)
        name = [firstName +" "+ lastName for firstName, lastName in zip(first,last)]
        if len(name) == 0:
            names = "NA"
        else:
            names = name

        #this code is to count number of citations by applicant
        if len(citation_by_applicant.findall(line)) == 0:
            citation_by_applicants = 0
        else:
            citation_by_applicants = len(citation_by_applicant.findall(line)) 

        # count for citation_by_examiner
        if len(citation_by_examiner.findall(line)) == 0:
            citation_by_examiners = 0
        else:    
            citation_by_examiners = len(citation_by_examiner.findall(line))   

        # Search for claim_text
        if (len(re.findall("<claim-text>[\s\S<]*</claim-text>",line)) == 0):
            claim_text = ["NA"]
        else:
            claim_text = re.findall("<claim-text>[\s\S<]*</claim-text>",line) 

        #Reading abstract
        abst = abstract.findall(line)
        if len(abst) == 0:
            abstracts = ["NA"]
        else:    
            abstracts = abst  
            
        if len(title) == 0:
            title = ["NA"]
            
        #Reading CPC
        classification_national = re.findall("<classification-ipcr>[\s\S<]*</classification-ipcr>",line)
        for cpc in classification_national:
            cpc_section = classification.findall(cpc)
            if len(cpc_section) == 0:
                cpc_section  = ["NA"] 
            else:
                cpc_section ==cpc_section

        # check if there is more than one gid to append all the lists
        if len(gid) != 0:                             
            gid_list.append(gid[0])
            title_list.append(title[0])
            kind_list.append(kinds[0])
            no_of_claim_list.append(sclaim[0])
            name_list.append(names)
            applicant_list.append(citation_by_applicants)
            examiners_list.append(citation_by_examiners)
            claim_list.append(claim_text[0])
            abstract_list.append(abstracts[0])
            cpc_list.append(cpc_section[0])

    #Use cleaners for claim text         
    element = 0
    for items in claim_list:
        claim_list[element] = re.sub(cleaner,'',claim_list[element])
        claim_list[element] = re.sub(cleaner2,',',claim_list[element])
        claim_list[element] = re.sub(cleaner3,',',claim_list[element])
        claim_list[element] = re.sub(cleaner4,'.,',claim_list[element])
        claim_list[element] = re.sub(cleaner5,',',claim_list[element])
        claim_list[element] = re.sub(cleaner6,'; ',claim_list[element])
        element = element + 1

    # Replace abbreviation to get the kind 
    Kind1 = [w.replace('P2', 'Plant Patent Grant(with a published application) issued on or after January 2, 2001') for w in kind_list]
    Kind2 = [w.replace('B2', 'Utility Patent Grant (with a published application) issued on or after January 2, 2001.') for w in Kind1]
    Kind3 = [w.replace('S1', 'Design Patent') for w in Kind2]
    Kind4 = [w.replace('B1', 'Utility Patent Grant (no published application) issued on or after January 2, 2001.') for w in Kind3]

    # Get all results ad DataFrame
    data_frame = pd.DataFrame(
        {'grant_id': gid_list,
         'patent_title': title_list,
         'kind': Kind4,
         'number_of_claims':no_of_claim_list,
         'inventors':name_list,
         'citations_applicant_count':applicant_list,
         'citations_examiner_count':examiners_list,
         'claims_text':claim_list,
         'abstract':abstract_list,
         'main_classification': cpc_list
        })
    
    df = df.append(data_frame, ignore_index=True)

./dataset/1.csv


  df = df.append(data_frame, ignore_index=True)


No of patents : 1
./dataset/ipa230105.xml
No of patents : 8047


  df = df.append(data_frame, ignore_index=True)


In [509]:
stats = pd.DataFrame({'date':[path.split('/')[-1].split('.')[0] for path in paths],'number of patents':nbr_of_patents})
stats.to_csv('./dataset/1.csv')

In [510]:
df.to_csv('./UsptoDataset/1.csv')

In [511]:
# Read uspto dataset
import pandas as pd
import numpy as np


df = pd.read_csv('./UsptoDataset/1.csv')
#usptodataset=df[["grant_id","claims_text","abstract", "main_classification"]]
#usptodataset= usptodataset.dropna()
#Ab_df = usptodataset.reset_index(drop=True)
len(df)

0

In [499]:
df.head()

Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,grant_id,patent_title,kind,number_of_claims,inventors,citations_applicant_count,citations_examiner_count,claims_text,abstract,main_classification


In [420]:
#Ab_df["main_classification"].to_csv('./UsptoDataset/test.txt', header=None, index=None, sep=' ', mode='a')

In [482]:
usd_transactions = df[df['main_classification'].str.startswith('D')]
usd_transactions

Unnamed: 0.1,Unnamed: 0,grant_id,patent_title,kind,number_of_claims,inventors,citations_applicant_count,citations_examiner_count,claims_text,abstract,main_classification
0,0,USD0973998,Fruit gum,Design Patent,1,"['Samuel G&#xfc;ttinger', 'Anja Aubert']",0,13,,,D 1109
1,1,USD0973999,Churro-shaped pet treat,Design Patent,1,"['Allyse McCann', 'Tiffany Dawn Potter', 'Erin...",44,13,,,D 1125
2,2,USD0974000,Neck gaiter,Design Patent,1,['James A. Sonntag'],0,39,,,D 2500
3,3,USD0974001,Ski glove,Design Patent,1,['Haiyan Liu'],1,6,,,D 2617
4,4,USD0974002,Glove upper,Design Patent,1,"['Christopher Matthew Weed', 'Mark Lee Butts',...",9,8,,,D 2619
...,...,...,...,...,...,...,...,...,...,...,...
2230,2230,US11541942,Vehicle bottom structure,Utility Patent Grant (with a published applica...,4,"['Kazuki Ohtake', 'Hideo Takeda', 'Yasuhide Ma...",2,6,,A vehicle bottom structure includes a slope de...,D34 28
3324,3324,US11543043,Free-standing faucet,Utility Patent Grant (with a published applica...,7,"['Chun-Hung Li', 'Yung-Cheng Yu', 'Jiun-Li Tsa...",1,5,,A free-standing faucet which includes a valve ...,D23239
4056,4056,US11543779,Timepiece watch face,Utility Patent Grant (no published application...,3,['Meridyth Mischel Webber'],15,9,,A timepiece having a face member with a face s...,D10 23
4057,4057,US11543780,Time displaying apparatus,Utility Patent Grant (no published application...,6,['Christopher George Garcia'],0,6,,A time keeping apparatus includes a first regi...,D10 15


In [483]:
usptodataset=usd_transactions[["grant_id","patent_title","claims_text","abstract","main_classification"]]
usptodataset= usptodataset["abstract"].dropna()
Ab_df = usptodataset.reset_index(drop=True)
len(Ab_df)

11

In [402]:
dataframes = []

for filename in ['./UsptoDataset/df1-60.csv',
                 './UsptoDataset/df61.csv', 
                 './UsptoDataset/df62.csv',
                 './UsptoDataset/df63.csv',
                 './UsptoDataset/df64.csv',
                 './UsptoDataset/df65.csv',
                 './UsptoDataset/df66.csv',
                 './UsptoDataset/df67.csv',
                 './UsptoDataset/df68.csv',
                 './UsptoDataset/df69.csv',
                 './UsptoDataset/df70.csv',
                 './UsptoDataset/df71.csv',
                 './UsptoDataset/df72.csv',
                 './UsptoDataset/df73.csv',
                 './UsptoDataset/df74.csv',
                 './UsptoDataset/df75.csv',
                 './UsptoDataset/df76.csv',
                 './UsptoDataset/df77.csv',
                 './UsptoDataset/df78.csv'
                 
                ]:
    
    df = pd.read_csv(filename)
    dataframes.append(df)
    
df = pd.concat(dataframes, ignore_index=True)
df.to_csv('./UsptoDataset/df-claim.csv')

In [None]:
len(df)