# Patent Classification System

#### Viswesh Uppalapati

### Setup and Environment

In [1]:
# imports
import numpy as np
import pandas as pd
import os

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

### Dataset and Preprocessing

In [2]:
# read the title and abstract text of each patent into a DataFrame
DESCRIPTION_CSV = 'data/patent_text_test_set_nlp_oct31_2022.csv'
description = pd.read_csv(DESCRIPTION_CSV)
description

Unnamed: 0,patent_id,patent_text
0,3930271,Golf glove A golf glove disclosed extra finger...
1,3930272,Crib leg lock A lock height-adjustable crib pl...
2,3930273,Bed safety side rail arrangement A bed safety ...
3,3930274,Assembly use recreational activities The assem...
4,3930276,Wheel spinning vehicle conveying apparatus aut...
...,...,...
99114,4035655,Method device implantation particles substrate...
99115,4035656,Method apparatus use approaching thermonuclear...
99116,4035657,Ozone generator An ozone generator air pump di...
99117,4035658,High power wind turbine kinetic accumulator A ...


In [3]:
# (patent_id, mainclass_id) pairs -- a patent appears once per class it belongs to
LABELS_CSV = 'data/patents_USPC_map_nlp_oct31_2022.csv'
labels = pd.read_csv(LABELS_CSV)
labels

Unnamed: 0,patent_id,mainclass_id
0,3930271,2
1,3930271,473
2,3930272,5
3,3930272,248
4,3930272,403
...,...,...
186893,4035657,422
186894,4035658,290
186895,4035658,416
186896,4035659,307


In [4]:
# Pick the patent ids that form the test set: 15% of the (patent, class) rows of
# every USPC class are drawn, then de-duplicated. Because one patent can belong
# to several classes, a patent is held out as soon as any one of its rows is
# drawn, so the share of distinct patents displayed below is larger than 15%.
TEST_FRACTION = 0.15
SPLIT_SEED = 42  # fixed seed -> the same split on every Restart & Run All

test_ids = (
    labels.groupby('mainclass_id')['patent_id']
    # len(ids) is the scalar row count of the class; int() floors the sample size
    .apply(lambda ids: ids.sample(int(len(ids) * TEST_FRACTION), random_state=SPLIT_SEED))
    .unique()
)

# fraction of all distinct patents that ended up in the test set
len(test_ids) / labels['patent_id'].nunique()

0.2523935875059272

In [5]:
# distinct USPC classes in order of first appearance; their count is the
# length of the NN output layer
# use this order of classes to assign each class its position in the label vector
classes = [class_id for class_id in labels['mainclass_id'].unique()]
num_classes = len(classes)
num_classes

330

In [6]:
# make the y labels vector of size num_classes using the ordering in classes

# class id -> position in the label vector; a dict lookup avoids a linear
# scan of `classes` for every row of `labels`
class_to_index = {class_id: position for position, class_id in enumerate(classes)}

# patent_id -> multi-hot list of length num_classes (filled in by make_labels)
temp = dict()

def make_labels(row):
    """Mark the class of one (patent_id, mainclass_id) row in the patent's label vector.

    Parameters
    ----------
    row : mapping with keys 'patent_id' and 'mainclass_id'
        One row of ``labels``.

    Side effects
    ------------
    Creates an all-zero vector in ``temp`` the first time a patent id is seen
    and sets the entry belonging to the row's class to 1.

    Raises
    ------
    KeyError
        If the row's class id is not contained in ``classes``.
    """
    patent_id = row['patent_id']
    if patent_id not in temp:
        temp[patent_id] = [0] * num_classes
    temp[patent_id][class_to_index[row['mainclass_id']]] = 1

labels.apply(make_labels, axis = 1)
len(temp.keys())

99119

In [7]:
# create the combined multi-label y values: one row per patent holding its
# id and its label vector
ids = pd.Series(list(temp))
target = pd.Series(list(temp.values()))
target_labels = pd.DataFrame({"patent_id": ids, "labels": target})
target_labels

Unnamed: 0,patent_id,labels
0,3930271,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,3930272,"[0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,3930273,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,3930274,"[0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,3930276,"[0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, ..."
...,...,...
99114,4035655,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
99115,4035656,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
99116,4035657,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
99117,4035658,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [8]:
# combined dataset: patent text joined with its label vector on the patent id
data = pd.merge(description, target_labels, on='patent_id')
data

Unnamed: 0,patent_id,patent_text,labels
0,3930271,Golf glove A golf glove disclosed extra finger...,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,3930272,Crib leg lock A lock height-adjustable crib pl...,"[0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,3930273,Bed safety side rail arrangement A bed safety ...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,3930274,Assembly use recreational activities The assem...,"[0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,3930276,Wheel spinning vehicle conveying apparatus aut...,"[0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
99114,4035655,Method device implantation particles substrate...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
99115,4035656,Method apparatus use approaching thermonuclear...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
99116,4035657,Ozone generator An ozone generator air pump di...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
99117,4035658,High power wind turbine kinetic accumulator A ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [9]:
# split into train and test set, roughly a 25% test and 75% train set
# membership mask is computed once and negated for the training part
in_test = data['patent_id'].isin(test_ids)
train = data[~in_test]
test = data[in_test]
len(test), len(train)

(25017, 74102)

In [10]:
# model input is the patent text column, target is the label column
X_train, y_train = train['patent_text'], train['labels']
X_test, y_test = test['patent_text'], test['labels']

In [11]:
# show the first training text as a sanity check
X_train.iat[0]

"Golf glove A golf glove disclosed extra finger pocket index middle finger pockets securing one finger one hand golf player fingers player's hand."

In [12]:
# show the first test text as a sanity check
X_test.iat[0]

'Wheel spinning vehicle conveying apparatus automatic wheel washers An automobile conveyor use conjunction wheel spinning device automatic wheel washer including endless chain plurality selectively engageable dogs pivotally secured thereto. The dogs normally travel position urge auto washer whereat outboard end close association chain beneath automobile first supporting surface. Operating means included whereby automobile position least one dogs automatically moved position dog extends automobile supporting surface, contacts wheel, urges car washer apparatus. At wheel spinning washing station chain forced downwardly position whereat passes beneath mechanism necessary perform wheel spinning operation still retaining outboard end dog automobile supporting surface. The continual contact dog assures automobile moves washer apparatus continuous motion.'

### Model and BERT Setup

In [13]:
# TensorFlow Hub handles of the text preprocessor and the BERT encoder
BERT_PREPROCESS_HANDLE = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
BERT_ENCODER_HANDLE = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

# wrap both as Keras layers so they can be used inside the functional model
bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_HANDLE)
bert_encoder = hub.KerasLayer(BERT_ENCODER_HANDLE)



In [14]:
# Keras model: raw text -> BERT preprocessing -> BERT encoder -> dropout -> sigmoid output

# BERT part: a scalar string per example goes through the preprocessor and encoder
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# classification head: one sigmoid unit per class on top of the pooled BERT output
dropout_layer = tf.keras.layers.Dropout(0.2, name="dropout")
output_layer = tf.keras.layers.Dense(num_classes, activation='sigmoid', name="output")
class_probabilities = output_layer(dropout_layer(outputs['pooled_output']))

# tie input and output together into the final model
model = tf.keras.Model(inputs=[text_input], outputs=[class_probabilities])

In [15]:
# print the layer-by-layer overview of the model built above
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_word_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                      

In [16]:
# precision and recall are tracked as model metrics during training
precision_metric = tf.keras.metrics.Precision(name='precision')
recall_metric = tf.keras.metrics.Recall(name='recall')
METRICS = [precision_metric, recall_metric]

# Adam optimizer with binary cross-entropy loss
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=METRICS,
)

### Model Training and Saving

In [None]:
# train for two epochs on the training texts and their label vectors
train_texts = X_train.to_list()
train_targets = y_train.to_list()
model.fit(x=train_texts, y=train_targets, epochs=2)

Epoch 1/2
 475/2316 [=====>........................] - ETA: 5:12:24 - loss: 0.0383 - precision: 0.0055 - recall: 0.0028

In [None]:
# persist the model under the path 'uspc_classifier'
model.save('uspc_classifier')