# Extracting US Patent Abstracts
This notebook extracts US patent abstracts from the USPTO XML files stored in `data/xml/` and saves them, together with their source file paths, to `abstracts.csv`.


In [1]:
import xml.etree.ElementTree as ET
import os
from os import walk
import pandas as pd

In [2]:
#Define helper functions
def get_abstract(file_path):
    """Return the abstract text of one patent XML file, or None if it has none.

    The abstract is the ``abstract`` element that is a direct child of the
    document root. All of its text is collected: every child element (not
    just the first one) and any text nested inside inline markup within a
    child. Each piece is stripped of surrounding whitespace and the non-empty
    pieces are joined with a single space.

    Parameters
    ----------
    file_path : str or path-like
        Location of the XML file to parse.

    Returns
    -------
    str or None
        The abstract text, or None when the file has no ``abstract`` element
        or the element contains no text.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If the file is not well-formed XML.
    OSError
        If the file cannot be opened.
    """
    root = ET.parse(file_path).getroot()

    # Look the element up once and reuse it.
    abstract = root.find('abstract')
    if abstract is None:
        return None

    # Text placed directly inside <abstract>, before its first child.
    pieces = [(abstract.text or "").strip()]
    for child in abstract:
        # itertext() also yields text that follows nested tags, which a plain
        # child.text lookup would cut off at the first nested element.
        pieces.append("".join(child.itertext()).strip())
        # Text sitting between this child and the next one.
        pieces.append((child.tail or "").strip())

    text = " ".join(piece for piece in pieces if piece)
    return text or None

In [3]:
# Folder holding the XML files. File names are appended to this string
# directly, so the trailing slash is required.
mypath = "data/xml/"

# Only the top level of mypath is inspected: take the first
# (dirpath, dirnames, filenames) triple produced by walk() and ignore any
# sub-directories. The fallback triple covers walk() producing nothing.
_top_dir, _sub_dirs, top_level_names = next(walk(mypath), (mypath, [], []))

# Keep the files whose name contains 'us1' and record their paths.
files = [mypath + name for name in top_level_names if 'us1' in name]

print(len(files), "files were found.")

6946 files were found.


In [4]:
%%time
# Run get_abstract on every discovered file, keeping the results in the same
# order as `files` so the two lists line up element by element.
abstracts = [get_abstract(xml_path) for xml_path in files]

CPU times: user 11.3 s, sys: 187 ms, total: 11.5 s
Wall time: 11.5 s


In [5]:
# Create a pandas dataframe with one row per file, pairing each file path
# with the abstract extracted from it.
df = pd.DataFrame({'FileName': files,
                   'Abstract': abstracts
                  })

# Count only the rows that actually hold an abstract. The row count
# (df.shape[0]) equals the number of files and would also include files for
# which no abstract was extracted.
print("{} abstracts found.".format(df['Abstract'].notna().sum()))

6946 abstracts found.


In [6]:
# Save the dataframe as a CSV file in the working directory; index=False
# keeps the row index out of the file.
output_csv = 'abstracts.csv'
df.to_csv(output_csv, index=False)

In [7]:
# Load the CSV back from disk and preview its first five rows to check the
# round trip. Note that this rebinds df to the data read from the file.
df = pd.read_csv(filepath_or_buffer='abstracts.csv')
df.head(n=5)

Unnamed: 0,FileName,Abstract
0,data/xml/us10885437-20210105.xml,Security systems and methods for detecting int...
1,data/xml/us10884005-20210105.xml,The present invention provides biomarkers usef...
2,data/xml/us10887313-20210105.xml,The described technology provides a single sig...
3,data/xml/us10887088-20210105.xml,A computing device includes an interface confi...
4,data/xml/us10887228-20210105.xml,Techniques for enabling peer-to-peer transmiss...
