# Creating a local PatentsView clone using DuckDB

## Imports and utilities

In [1]:
import duckdb
from duckdb import DuckDBPyConnection
from zipfile import ZipFile
from urllib.request import urlopen
from io import BytesIO
import yaml

def zipfile_from_url(filename: str, zipped_url: str):
    """Open one member of a remote ZIP archive.

    The whole archive is downloaded into memory, then the requested member is
    opened for reading. The member itself is decompressed lazily as the
    returned handle is read.

    Args:
        filename: Name of the member inside the archive.
        zipped_url: URL of the ZIP archive to download.

    Returns:
        A readable binary file object for the member ``filename``.

    Raises:
        KeyError: If ``filename`` is not a member of the archive.
        zipfile.BadZipFile: If the downloaded payload is not a ZIP archive.
    """
    # Close the response as soon as the payload is in memory instead of
    # leaving the connection open until garbage collection.
    with urlopen(zipped_url) as remote:
        payload = BytesIO(remote.read())

    with ZipFile(payload) as archive:
        # NOTE(review): the member handle is handed out after the archive's
        # context exits; it stays readable because the in-memory buffer is not
        # closed by ZipFile — confirm if the buffer handling ever changes.
        return archive.open(filename)

def create_patentsview_table(con: DuckDBPyConnection, pv_database: str, table_name: str):
    """Download a PatentsView table into the DuckDB database unless it already exists.

    Args:
        con: Open DuckDB connection in which the table is created.
        pv_database: Base URL under which ``<table_name>.tsv.zip`` is fetched.
        table_name: Name of the PatentsView table; also used as the name of
            the DuckDB table that is created.
    """
    filename = f"{table_name}.tsv"
    url = f"{pv_database}/{filename}.zip"

    existing_tables = con.sql("show tables").df()["name"].values
    if table_name in existing_tables:
        # Table is already present in the database; skip the download.
        return

    print(f"Loading {table_name} from {url} ...")
    # all_varchar=True reads every column as VARCHAR, so no type inference is
    # applied to the tab-separated file.
    table = con.read_csv(zipfile_from_url(filename, url), delimiter="\t", all_varchar=True, header=True)
    print("Saving to duckdb database...")
    table.create(table_name)
    print("Done.")


## PatentsView database cloning script

Download all PatentsView (PV) tables and save them to the DuckDB database. This is a large download and requires at least 50 GB of free disk space.

In [2]:
# Create (or reopen) the local DuckDB database file.
database_name = "patentsview.ddb"
con = duckdb.connect(database_name)

# Load the description of the PatentsView data sources from `sources.yml`.
with open("sources.yml") as file:
    sources = yaml.safe_load(file)

# Replicate every listed table in the DuckDB database. Tables that already
# exist are skipped by `create_patentsview_table`, so this cell can be re-run.
# The loop variable is deliberately not called `type`: at notebook top level
# that would shadow the builtin for every later cell.
for source_type, info in sources.items():
    pv_database = info['database']  # Base URL to use for granted or pre-grant data downloads
    tables = info['tables']  # List of tables
    for table_name in tables:
        create_patentsview_table(con, pv_database, table_name)

## Example usage

Show all tables:

In [3]:
# List the tables known to the connection (with their database, schema and
# columns) as a DataFrame; head() keeps only the first five rows for display.
con.sql("show all tables;").df().head()

Unnamed: 0,database,schema,name,column_names,column_types,temporary
0,patentsview,main,g_applicant_not_disambiguated,"[patent_id, applicant_sequence, raw_applicant_...","[VARCHAR, VARCHAR, VARCHAR, VARCHAR, VARCHAR, ...",False
1,patentsview,main,g_application,"[application_id, patent_id, patent_application...","[VARCHAR, VARCHAR, VARCHAR, VARCHAR, VARCHAR, ...",False
2,patentsview,main,g_assignee_disambiguated,"[patent_id, assignee_sequence, assignee_id, di...","[VARCHAR, VARCHAR, VARCHAR, VARCHAR, VARCHAR, ...",False
3,patentsview,main,g_assignee_not_disambiguated,"[patent_id, assignee_sequence, assignee_id, ra...","[VARCHAR, VARCHAR, VARCHAR, VARCHAR, VARCHAR, ...",False
4,patentsview,main,g_attorney_disambiguated,"[patent_id, attorney_sequence, attorney_id, di...","[VARCHAR, VARCHAR, VARCHAR, VARCHAR, VARCHAR, ...",False


Get a single table:

In [4]:
# Get a relation for one table, then fetch only its first five rows as a DataFrame.
rawinventor = con.table("g_inventor_not_disambiguated")
rawinventor.limit(5).df()

Unnamed: 0,patent_id,inventor_sequence,inventor_id,raw_inventor_name_first,raw_inventor_name_last,deceased_flag,rawlocation_id
0,D1006496,0,fl:we_ln:jiang-128,Wenjing,Jiang,0,30zgod902k0u495w9b2sb8xk9
1,12029253,4,fl:ei_ln:baumker-1,Eiko,Bäumker,0,468lxxjgdkuh0uonw27yj3t3f
2,6584128,0,fl:ri_ln:kroeger-1,Richard,Kroeger,FALSE,o2ema6bl3kkh6iwziprbzu4m9
3,4789863,0,fl:th_ln:bush-1,Thomas A.,Bush,FALSE,tkz70bmoqx88n1lfz3fl657lh
4,11161990,1,fl:ma_ln:boudreaux-4,Matthew F.,Boudreaux,False,up8ym622ssh09vpqv9rdsygs0


Joining multiple tables together:

In [5]:
# Relations for the three tables that take part in the join.
patent = con.table("g_patent")
rawlocation = con.table("g_location_not_disambiguated")
rawinventor = con.table("g_inventor_not_disambiguated")

# Left-join inventors to their raw location, then to the patent record,
# and display the first five joined rows as a DataFrame.
(
    rawinventor
    .join(rawlocation, "rawlocation_id", how="left")
    .join(patent, "patent_id", how="left")
    .limit(5)
    .df()
)

Unnamed: 0,patent_id,inventor_sequence,inventor_id,raw_inventor_name_first,raw_inventor_name_last,deceased_flag,rawlocation_id,location_id,raw_city,raw_state,raw_country,patent_type,patent_date,patent_title,wipo_kind,num_claims,withdrawn,filename
0,5904651,2,fl:do_ln:panescu-1,Dorin,Panescu,False,1npobz83xnovie30ujdjca7jh,c59ba8f0-16c7-11ed-9b5f-1234bde3cd05,Sunnyvale,CA,US,utility,1999-05-18,Systems and methods for visualizing tissue dur...,A,63,0,pftaps19990518_wk20.zip
1,6682379,0,fl:th_ln:walczak-2,Thomas J.,Walczak,False,1npocuytgbqokup1xd2qu75h9,e6abbb2a-16c7-11ed-9b5f-1234bde3cd05,Oconomowoc,WI,US,utility,2004-01-27,Cowl latching system which simplifies the cowl...,B1,22,0,pg040127.zip
2,D825579,0,fl:me_ln:chiu-8,Mei-Ling,Chiu,False,1npoenehi65txp92pt5wfyw6l,b5c9a039-16c7-11ed-9b5f-1234bde3cd05,Taipei,,TW,design,2018-08-14,Portable storage device,S1,1,0,ipg180814.xml
3,6337696,1,fl:jo_ln:buehler-2,John,Buehler,False,1npoi5yhdwdobxdxbk68gqvcc,816d1200-16c8-11ed-9b5f-1234bde3cd05,Redmond,WA,US,utility,2002-01-08,System and method for facilitating generation ...,B1,14,0,pg020108.zip
4,7427186,2,fl:fl_ln:poinsotberthelot-1,Florian,Poinsot-Berthelot,False,1npojfhhva3aejl9h7kdth1zk,173808ad-16c8-11ed-9b5f-1234bde3cd05,Paris,,FR,utility,2008-09-23,Rotary engine with shaft bearing having two st...,B2,12,0,ipg080923.xml


## Using SQL in a notebook

In [6]:
# Install the SQL magic dependencies first if needed:
#   pip install jupysql duckdb-engine
# Then load the `sql` extension, hand it the existing `con` connection, and
# turn off the SqlMagic `displaycon` option.
%load_ext sql
%sql con
%config SqlMagic.displaycon = False

In [7]:
%%sql
SELECT *  -- every column
FROM g_patent  -- table in the local DuckDB database
LIMIT 5;

patent_id,patent_type,patent_date,patent_title,wipo_kind,num_claims,withdrawn,filename
10000000,utility,2018-06-19,Coherent LADAR using intra-pixel quadrature detection,B2,20,0,ipg180619.xml
10000001,utility,2018-06-19,Injection molding machine and mold thickness control method,B2,12,0,ipg180619.xml
10000002,utility,2018-06-19,Method for manufacturing polymer film and co-extruded film,B2,9,0,ipg180619.xml
10000003,utility,2018-06-19,Method for producing a container from a thermoplastic,B2,18,0,ipg180619.xml
10000004,utility,2018-06-19,"Process of obtaining a double-oriented film, co-extruded, and of low thickness made by a three bubble process that at the time of being thermoformed provides a uniform thickness in the produced tray",B2,6,0,ipg180619.xml
