# Harvard USPTO Patent Dataset (HUPD)

## Loading the Sample Dataset, stored locally

In [28]:
## Import relevant libraries and dependencies
# Pretty print (standard library)
from pprint import pprint

# Datasets load_dataset function
from datasets import load_dataset
# Standard PyTorch DataLoader
from torch.utils.data import DataLoader
# Transformers AutoTokenizer
from transformers import AutoTokenizer

# Instantiate the pretrained DistilBERT (uncased) tokenizer once, after all
# imports, so that the later cells can reuse it
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

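Let's use the `load_dataset` function to load the patent applications in the local sample dataset, which were filed to the USPTO in January 2016. Note that this call does not pass explicit filing-date ranges for the training and validation sets (e.g., January 1-21, 2016 and January 22-31, 2016, respectively); it passes `uniform_split=True` instead, leaving the train/validation split to the loading script.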

In [29]:
# Data loading example: build the dataset dictionary from the local sample
# via the dataset loading script
dataset_dict = load_dataset(
    './sample_dataset.py',                     # dataset loading script
    data_dir='./sample_json',
    metadata_file='sample_metadata.feather',
    cache_dir='./mnt/data/HUPD/cache',         # processed splits are cached under this directory
    ipcr_label=None,
    uniform_split=True,                        # NOTE(review): split is decided by the loading script -- confirm semantics
)

print('Loading is done!')

Using custom data configuration default-6c866e603216e695
Reusing dataset patents (./mnt/data/HUPD/cache/patents/default-6c866e603216e695/1.0.1/0d005d4e2200f89fac8ee7f637a6c4ad0ec749df3f747d586c3e015e0be324b4)
100%|██████████| 2/2 [00:00<00:00, 581.37it/s]

Loading is done!





Let's display some information about the training and validation sets.

In [30]:
# Dataset info: show each split together with its features and row count
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['patent_number', 'decision', 'title', 'abstract', 'claims', 'background', 'summary', 'description', 'cpc_label', 'ipc_label', 'filing_date', 'patent_issue_date', 'date_published', 'examiner_id'],
        num_rows: 8
    })
    validation: Dataset({
        features: ['patent_number', 'decision', 'title', 'abstract', 'claims', 'background', 'summary', 'description', 'cpc_label', 'ipc_label', 'filing_date', 'patent_issue_date', 'date_published', 'examiner_id'],
        num_rows: 1
    })
})


We can also display the contents of the dataset dictionary and the files it is cached to, as well as the sizes of the training and validation sets.

In [31]:
# Pretty-print the dataset dictionary contents, followed by the cache files
# (Arrow files on disk) that back each split
print('Dataset dictionary contents:')
pprint(dataset_dict)
print('Dataset dictionary cached to:')
pprint(dataset_dict.cache_files)

Dataset dictionary contents:
{'train': Dataset({
    features: ['patent_number', 'decision', 'title', 'abstract', 'claims', 'background', 'summary', 'description', 'cpc_label', 'ipc_label', 'filing_date', 'patent_issue_date', 'date_published', 'examiner_id'],
    num_rows: 8
}),
 'validation': Dataset({
    features: ['patent_number', 'decision', 'title', 'abstract', 'claims', 'background', 'summary', 'description', 'cpc_label', 'ipc_label', 'filing_date', 'patent_issue_date', 'date_published', 'examiner_id'],
    num_rows: 1
})}
Dataset dictionary cached to:
{'train': [{'filename': './mnt/data/HUPD/cache/patents/default-6c866e603216e695/1.0.1/0d005d4e2200f89fac8ee7f637a6c4ad0ec749df3f747d586c3e015e0be324b4/patents-train.arrow'}],
 'validation': [{'filename': './mnt/data/HUPD/cache/patents/default-6c866e603216e695/1.0.1/0d005d4e2200f89fac8ee7f637a6c4ad0ec749df3f747d586c3e015e0be324b4/patents-validation.arrow'}]}


In [32]:
# Print the sizes of the train and validation sets; each `.shape` is a
# (number of rows, number of columns) tuple
print(f'Train dataset size: {dataset_dict["train"].shape}')
print(f'Validation dataset size: {dataset_dict["validation"].shape}')

Train dataset size: (8, 14)
Validation dataset size: (1, 14)


## Pre-Processing Steps

First, let's establish the label-to-index mapping for the decision status field by assigning the decision status labels to the class indices.

In [33]:
# Label-to-index mapping for the decision status field.
# NOTE: despite its name, `decision_to_str` maps label strings to integer
# class indices (the name is kept because other cells reference it).
decision_to_str = {
    'REJECTED': 0,
    'ACCEPTED': 1,
    'PENDING': 2,
    'CONT-REJECTED': 3,
    'CONT-ACCEPTED': 4,
    'CONT-PENDING': 5,
}


# Helper function
def map_decision_to_string(example):
    """Return the replacement `decision` field for a single example.

    Args:
        example: Mapping with a `decision` key whose value is one of the
            label strings in `decision_to_str`.

    Returns:
        A one-entry dict `{'decision': <class index>}`.

    Raises:
        KeyError: If the label is not a key of `decision_to_str`.
    """
    label = example['decision']
    return {'decision': decision_to_str[label]}

Let's now re-label the decision status fields of the examples in the training and validation sets.

In [34]:
# Re-labeling/mapping: apply the label-to-index helper to the train split
# first, then to the validation split.
train_set, val_set = (
    dataset_dict[split_name].map(map_decision_to_string)
    for split_name in ('train', 'validation')
)

8ex [00:00, 2655.04ex/s]
1ex [00:00, 845.97ex/s]


In [13]:
# Display the cache files that back the processed (re-labeled) train and
# validation sets
print('Processed train and validation sets are cached to: ')
pprint(train_set.cache_files)
pprint(val_set.cache_files)

Processed train and validation sets are cached to: 
[{'filename': './mnt/data/HUPD/cache/patents/default-4ab836bc029274ca/1.0.1/0d005d4e2200f89fac8ee7f637a6c4ad0ec749df3f747d586c3e015e0be324b4/cache-4a54da1ef066ec70.arrow'}]
[{'filename': './mnt/data/HUPD/cache/patents/default-4ab836bc029274ca/1.0.1/0d005d4e2200f89fac8ee7f637a6c4ad0ec749df3f747d586c3e015e0be324b4/cache-c1ce181364dc69f4.arrow'}]


For the time being, let's focus on the _abstract_ section of the patent applications.

In [14]:
# Name of the patent section to focus on (the abstract); the following cells
# tokenize this field with the tokenizer.
_SECTION_ = 'abstract'

In [15]:
# Training set: tokenize the selected section batch by batch, truncating long
# texts and padding every example to the maximum length
train_set = train_set.map(
    lambda batch: tokenizer(batch[_SECTION_], padding='max_length', truncation=True),
    batched=True,
)

100%|██████████| 1/1 [00:00<00:00, 46.30ba/s]


In [16]:
# Validation set: tokenize the selected section batch by batch, truncating
# long texts and padding every example to the maximum length
val_set = val_set.map(
    lambda batch: tokenizer(batch[_SECTION_], padding='max_length', truncation=True),
    batched=True,
)

100%|██████████| 1/1 [00:00<00:00, 187.16ba/s]


In [17]:
# Set the format: return the token ids, the attention mask, and the label as
# PyTorch tensors for both splits
train_set.set_format(
    columns=['input_ids', 'attention_mask', 'decision'],
    type='torch',
)
val_set.set_format(
    columns=['input_ids', 'attention_mask', 'decision'],
    type='torch',
)

Let's use `DataLoader` to create our training set and validation set loaders.

In [18]:
# train_dataloader and val_dataloader: wrap each set in a DataLoader that
# yields batches of up to 16 examples
train_dataloader = DataLoader(train_set, batch_size=16)
val_dataloader = DataLoader(val_set, batch_size=16)


In [19]:
# Get the first batch from a fresh iterator over the training loader
batch = next(iter(train_dataloader))
# Print the token ids
pprint(batch['input_ids'])
# Print the labels (decision class indices)
pprint(batch['decision'])

tensor([[  101,  1037,  4118,  ...,     0,     0,     0],
        [  101,  1037, 11394,  ...,     0,     0,     0],
        [  101,  2004, 27108,  ...,     0,     0,     0],
        ...,
        [  101,  1037, 14513,  ...,     0,     0,     0],
        [  101,  1037, 10808,  ...,     0,     0,     0],
        [  101,  1996, 11028,  ...,     0,     0,     0]])
tensor([1, 4, 0, 1, 3, 1, 1, 0])


In [20]:
# Report the shapes of the batch's token ids (input) and labels (output)
input_shape, output_shape = batch['input_ids'].shape, batch['decision'].shape
print(f'Input shape: {input_shape}')
print(f'Output shape: {output_shape}')

Input shape: torch.Size([8, 512])
Output shape: torch.Size([8])


In [21]:
# A helper function that converts ids into tokens
def convert_ids_to_string(tokenizer, input):
    """Render a sequence of token ids as one space-separated string of tokens.

    Args:
        tokenizer: Object exposing `convert_ids_to_tokens(ids)`.
        input: Sequence of token ids (the name shadows the builtin `input`
            inside this function; kept for caller compatibility).

    Returns:
        The tokens joined by single spaces.
    """
    tokens = tokenizer.convert_ids_to_tokens(input)
    separator = ' '
    return separator.join(tokens)

Let's print an example in the batch.

In [22]:
# Print the example at index 1 of the batch as a string of tokens
pprint(
    convert_ids_to_string(tokenizer, batch['input_ids'][1])
)

('[CLS] a dental filling material comprising a the ##rm ##op ##lastic polymer '
 '. the the ##rm ##op ##lastic polymer may be bio ##de ##grad ##able . a bio '
 '##active substance may also be included in the filling material . the the '
 '##rm ##op ##lastic polymer acts as a matrix for the bio ##active substance . '
 'the composition may include other polymer ##ic resin ##s , fill ##ers , '
 'plastic ##izer ##s and other additive ##s typically used in dental materials '
 '. the filling material is used for the filing of root canals . [SEP] [PAD] '
 '[PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] '
 '[PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] '
 '[PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] '
 '[PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] '
 '[PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] '
 '[PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD