# Creating a local PatentsView clone using DuckDB

## Imports and utilities

In [1]:
import duckdb
from duckdb import DuckDBPyConnection
from zipfile import ZipFile
from urllib.request import urlopen
from io import BytesIO
import yaml

def zipfile_from_url(filename: str, zipped_url: str):
    """Open one member of a remote ZIP archive for reading.

    The whole archive is downloaded into memory first; the requested member is
    then opened from that in-memory copy.

    Args:
        filename: Name of the member inside the archive.
        zipped_url: URL of the ZIP archive.

    Returns:
        A binary file-like object (as returned by ``ZipFile.open``) positioned
        at the start of the member. The caller should close it when done.

    Raises:
        KeyError: If ``filename`` is not a member of the archive.
        urllib.error.URLError: If the archive cannot be fetched.
    """
    # Release the connection as soon as the payload is buffered instead of
    # leaving the response open until it is garbage-collected.
    with urlopen(zipped_url) as remote:
        payload = BytesIO(remote.read())

    # The member handle is returned from inside the archive context, exactly as
    # before; callers read the member's bytes from this handle.
    with ZipFile(payload) as archive:
        return archive.open(filename)

def create_patentsview_table(con: DuckDBPyConnection, pv_database: str, table_name: str):
    """Download a PatentsView table into the DuckDB database unless it already exists.

    The table is fetched from ``{pv_database}/{table_name}.tsv.zip``, parsed as a
    tab-separated file with a header row, and stored under ``table_name`` with
    every column read as VARCHAR.

    Args:
        con: Open DuckDB connection that receives the table.
        pv_database: Base URL under which the zipped TSV files are published.
        table_name: Name of the PatentsView table; also used as the name of
            the table created in the database.
    """
    filename = f"{table_name}.tsv"
    url = f"{pv_database}/{filename}.zip"

    existing_tables = con.sql("show tables").df().name.values
    if table_name in existing_tables:
        # Already cloned: skip the download so re-running the notebook is cheap.
        return

    print(f"Loading {table_name} from {url} ...")
    # Close the archive member once the table has been written.
    with zipfile_from_url(filename, url) as tsv_file:
        table = con.read_csv(tsv_file, delimiter="\t", all_varchar=True, header=True)
        print("Saving to duckdb database...")
        # Materialise while the file handle is still open: the table is only
        # written to the database by this call.
        table.create(table_name)
    print("Done.")


## PatentsView database cloning script

In [2]:
# Create (or open) the local DuckDB database file.
database_name = "patentsview.ddb"
con = duckdb.connect(database_name)

# Load the description of the PatentsView data sources from `sources.yml`.
# Each top-level entry provides a `database` base URL and a list of `tables`.
with open("sources.yml", encoding="utf-8") as file:
    sources = yaml.safe_load(file)

# Replicate every listed table in the DuckDB database. Tables that already
# exist are skipped by `create_patentsview_table`, so this cell can be re-run.
# The loop variable is deliberately not called `type`: notebook variables are
# global, and that name would shadow the builtin in every later cell.
for source_type, info in sources.items():
    pv_database = info['database']  # Base URL to use for granted or pre-grant data downloads
    tables = info['tables']  # List of tables
    for table_name in tables:
        create_patentsview_table(con, pv_database, table_name)

## Example usage

Show all tables:

In [3]:
# List every table visible to the connection; the result includes each table's
# database, schema, name and column names.
con.sql("show all tables;")

┌─────────────┬─────────┬───────────────────────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬───────────┐
│  database   │ schema  │             name              │                                                                                                                                                                                                      column_names                                                                                                                    

Get a single table:

In [4]:
# Look up a stored table by name; ending the cell on the bare variable
# displays a preview of it.
rawinventor = con.table("g_inventor_not_disambiguated")
rawinventor

┌───────────┬───────────────────┬──────────────────────────┬─────────────────────────┬────────────────────────┬───────────────┬───────────────────────────┐
│ patent_id │ inventor_sequence │       inventor_id        │ raw_inventor_name_first │ raw_inventor_name_last │ deceased_flag │      rawlocation_id       │
│  varchar  │      varchar      │         varchar          │         varchar         │        varchar         │    varchar    │          varchar          │
├───────────┼───────────────────┼──────────────────────────┼─────────────────────────┼────────────────────────┼───────────────┼───────────────────────────┤
│ D1006496  │ 0                 │ fl:we_ln:jiang-128       │ Wenjing                 │ Jiang                  │ 0             │ 30zgod902k0u495w9b2sb8xk9 │
│ 12029253  │ 4                 │ fl:ei_ln:baumker-1       │ Eiko                    │ Bäumker                │ 0             │ 468lxxjgdkuh0uonw27yj3t3f │
│ 6584128   │ 0                 │ fl:ri_ln:kroeger-1       │ Ric

Joining multiple tables together:

In [5]:
# Handles to the three tables taking part in the join.
patent = con.table("g_patent")
rawlocation = con.table("g_location_not_disambiguated")
rawinventor = con.table("g_inventor_not_disambiguated")

# Start from the raw inventor records, attach each record's raw location, then
# attach the patent it belongs to. Left joins keep inventor rows without a match.
(
    rawinventor
    .join(rawlocation, "rawlocation_id", how="left")
    .join(patent, "patent_id", how="left")
)

┌───────────┬───────────────────┬──────────────────────┬─────────────────────────┬────────────────────────┬───────────────┬───────────────────────────┬──────────────────────────────────────┬─────────────────────┬───────────┬─────────────┬─────────────┬─────────────┬───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬───────────┬────────────┬───────────┬─────────────────────────┐
│ patent_id │ inventor_sequence │     inventor_id      │ raw_inventor_name_first │ raw_inventor_name_last │ deceased_flag │      rawlocation_id       │             location_id              │      raw_city       │ raw_state │ raw_country │ patent_type │ patent_date │                                                               patent_title                                                                │ wipo_kind │ num_claims │ withdrawn │        filename         │
│  varchar  │      varchar      │       varchar        │    