In [145]:
# Extracting the hierarchical structure of topics and sub-topics in USPTO patent claims using the Python package BERTopic

In [146]:
# Install bertopic
#!pip install bertopic

In [147]:
# Data processing
import pandas as pd
import numpy as np

# Text preprocessing
import nltk
nltk.download('stopwords')

# Dimension reduction
from umap import UMAP

# Clustering
from hdbscan import HDBSCAN

# Count vectorization
from sklearn.feature_extraction.text import CountVectorizer

# Topic model
from bertopic import BERTopic

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [148]:
# Load the USPTO export, then keep only the grant identifier and the two text
# columns, drop rows with any missing value and renumber the rows from 0.
usptodata = pd.read_csv('U.S. Patents.csv')
usptodataset = (
    usptodata[["grant_id", "claims_text", "abstract"]]
    .dropna()
    .reset_index(drop=True)
)
usptodataset.head()

Unnamed: 0,grant_id,claims_text,abstract
0,USPP030977,1. A new and distinct Mango plant characterize...,"A new and distinct variety of Mango plant, her..."
1,USPP030978,1. A new and distinct apple tree substantially...,&#x2018;Honeysuckle Rose #1-6&#x2019; is a new...
2,USPP030979,1. A new and distinct variety of peach tree as...,"A new and distinct peach tree variety, <i>Prun..."
3,USPP030980,1. A new and distinct variety of raspberry pla...,This invention relates to a new and distinct v...
4,USPP030981,1. A new and distinct Strawberry plant named &...,A new and distinct cultivar of Strawberry plan...


In [149]:
usptodataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7013 entries, 0 to 7012
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   grant_id     7013 non-null   object
 1   claims_text  7013 non-null   object
 2   abstract     7013 non-null   object
dtypes: object(3)
memory usage: 164.5+ KB


In [150]:
# Split the dataset into train and test parts (test_size=0.2); the fixed
# random_state makes the split repeatable.
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(
    usptodataset,
    test_size=0.2,
    random_state=25,
)

In [151]:
df_train

Unnamed: 0,grant_id,claims_text,abstract
1417,US10457170,1. A flexible motor vehicle work surface opera...,A flexible motor vehicle work surface is opera...
1243,US10456992,1. A layer deposition system to form a plurali...,Apparatus and associated methods for a user co...
3782,US10459649,"1. A method comprising:,receiving a write requ...",One or more techniques and/or systems are prov...
1185,US10456929,1. A soft bending actuator finger for use in c...,A soft robotic bending actuator has a stiff la...
2785,US10458622,1. A reflection member for reflecting light em...,A reflection member includes at least one poly...
...,...,...,...
2934,US10458777,1. A method of measuring a metrology target el...,"Targets, target elements and target design met..."
2191,US10458022,1. A method for anti-corrosive treatment of me...,"A method for corrosion protection treatment, c..."
6618,US10462550,"1. A storage device comprising:,a first case c...","A storage device includes a first case, a seco..."
318,US10456037,1. A terminal device configured to be able to ...,A terminal device is provided which is configu...


In [152]:
#df_test

In [153]:
# Stop-word list: NLTK's English stop words followed by extra corpus-specific terms.
# NOTE(review): "described." keeps its trailing period, while the cleaning step
# strips periods from the text - confirm whether this entry is still needed.
new_stopwords = [
    "shown",
    "design",
    "ornamental",
    "describe",
    "described",
    "described.",
]
stopwords = nltk.corpus.stopwords.words('english') + new_stopwords

In [154]:
#clean Data
def cleantext(df, stop_words=None):
    """Return a cleaned copy of ``df`` with two extra text columns.

    The ``claims_text`` column is normalised and stored as ``cleaned_text``;
    the same text with stop words removed is stored as ``fully_cleaned_text``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``claims_text`` column. It is not modified.
    stop_words : iterable of str, optional
        Words to drop when building ``fully_cleaned_text``. When omitted, the
        notebook-level ``stopwords`` list is used.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (same index) with ``cleaned_text`` and
        ``fully_cleaned_text`` added.
    """
    if stop_words is None:
        stop_words = stopwords
    # A set gives constant-time membership tests in the per-word filter below.
    stop_set = set(stop_words)

    # Work on a copy so the caller's frame (often a slice of a larger frame)
    # is left untouched and re-running the cell gives the same result.
    cleaned = df.copy()
    text = cleaned['claims_text'].fillna('').astype(str)

    # Remove URLs while they are still intact; once ':', '/' and '.' are
    # stripped a URL can no longer be told apart from ordinary words.
    text = text.str.replace(r'https?://\S+', ' ', regex=True)

    # Delete quotes, basic punctuation and newlines without inserting a space
    # (e.g. "well-known" becomes "wellknown").
    text = text.str.replace(r'[\'",.?+\-/=()\n]', '', regex=True)

    # Convert the text to lowercase.
    text = text.str.lower()

    # Every remaining character that is not a letter or digit becomes a space.
    text = text.str.replace(r'[^a-z0-9]', ' ', regex=True)

    # Collapse runs of whitespace and trim both ends.
    text = text.str.replace(r'\s+', ' ', regex=True).str.strip()

    cleaned['cleaned_text'] = text

    # Remove stop words.
    cleaned['fully_cleaned_text'] = text.map(
        lambda doc: ' '.join(word for word in doc.split() if word not in stop_set)
    )

    return cleaned

docs = cleantext(df_train)

In [155]:
# Keep only the fully cleaned text, as a plain Python list of strings.
docs = docs["fully_cleaned_text"].tolist()

In [156]:
#Build a Basic BERTopic Model

In [157]:
# Initiate UMAP (dimension reduction applied before clustering)
umap_model = UMAP(
    # Size of the local neighbourhood UMAP looks at.
    n_neighbors=15,
    # Target dimension; this is the dimension of the data passed on to the clustering model.
    n_components=5,
    # Minimum distance between points in the low-dimensional space, i.e. how
    # tightly UMAP is allowed to pack points together.
    min_dist=0.0,
    # Distances are measured with cosine distance.
    metric='cosine',
    # Random seed so the UMAP results are reproducible.
    random_state=100,
)

# Count vectorizer: counts word frequencies. Passing the extended stop-word
# list removes noise from the top words representing each topic.
vectorizer_model = CountVectorizer(stop_words=stopwords)

In [158]:
# Initiate BERTopic
topic_model = BERTopic(umap_model=umap_model,
                       vectorizer_model=vectorizer_model,
                       diversity=0.8,                      # helps to remove words with the same or similar meanings
                       min_topic_size=200,                 # minimum number of documents in a topic
                       top_n_words=6,                      # number of top words used to represent each topic
                       language="English",
                       calculate_probabilities=True)       # compute, per document, the probability of belonging to each topic

# Run BERTopic model.
# fit_transform returns a pair: the topic assigned to each document and the
# document-topic probabilities, so unpack both instead of binding the whole
# tuple to `topics`.
topics, probabilities = topic_model.fit_transform(docs)

# Get the list of topics
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,1319,-1_least_device_configured_comprises
1,0,1407,0_claim_surface_configured_comprises
2,1,1038,1_data_second_network_method
3,2,949,2_second_display_optical_substrate
4,3,519,3_wherein_composition_said_x2014
5,4,378,4_first_circuit_wherein_terminal


In [159]:
#Extract Topic Hierarchy

In [174]:
# Hierarchical topics
hierarchical_topics = topic_model.hierarchical_topics(docs)

# Take a look at the data
hierarchical_topics
# Columns of the resulting table:
# Parent_ID: a new topic ID created for the parent topic.
# Parent_Name: the top words describing the parent topic.
# Topics: the list of child topic numbers included in the parent topic. All the child topic numbers in this column come from the basic BERTopic model.
# Child_Left_ID: the left child topic number. It can come from the basic BERTopic model or be an existing parent topic number.
# Child_Left_Name: the top words describing the left child topic.
# Child_Right_ID: the right child topic number.
# Child_Right_Name: the top words describing the right child topic.
# Distance: the distance between the left and right child topics.

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 18.43it/s]


Unnamed: 0,Parent_ID,Parent_Name,Topics,Child_Left_ID,Child_Left_Name,Child_Right_ID,Child_Right_Name,Distance
3,8,device_least_comprises_configured_based,"[0, 1, 2, 3, 4]",3,wherein_composition_said_x2014_sequence,7,device_said_configured_comprises_based,1.068359
2,7,device_said_configured_comprises_based,"[0, 1, 2, 4]",5,least_light_configured_layer_direction,6,data_method_plurality_configured_processor,0.763683
1,6,data_method_plurality_configured_processor,"[1, 4]",1,data_second_network_method_plurality,4,first_circuit_wherein_terminal_coupled,0.5907
0,5,least_light_configured_layer_direction,"[0, 2]",0,claim_surface_configured_comprises_inner,2,second_display_optical_substrate_comprises,0.500312


In [175]:
#Create Mapping Between Parent and Child Topics
# Visualize hierarchical topics
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [173]:
# Another way to visualize the topic hierarchy is to build a text topic tree
# from the hierarchical_topics table.
# Topic tree
tree = topic_model.get_topic_tree(hierarchical_topics)

# Print out the tree (print keeps the line breaks of the tree readable)
print(tree)

.
├─■──wherein_composition_said_x2014_sequence ── Topic: 3
└─device_said_configured_comprises_based
     ├─least_light_configured_layer_direction
     │    ├─■──claim_surface_configured_comprises_inner ── Topic: 0
     │    └─■──second_display_optical_substrate_comprises ── Topic: 2
     └─data_method_plurality_configured_processor
          ├─■──data_second_network_method_plurality ── Topic: 1
          └─■──first_circuit_wherein_terminal_coupled ── Topic: 4



In [163]:
#New Document Topic Predictions

In [164]:
#docs_test = cleantext_test(new_review)
new_review = " A method of driving a machine having an electrical drive motor for supplying required torque of predetermined value and having at least one component, which carries out a periodic motion, comprising the steps of: acquiring a first set of control data for the drive motor, the first set of control data being arranged to drive a motor at a first defined torque for a first measured angular speed; acquiring at least one second set of control data for the drive motor, the second set of control data being arranged to drive the motor at a second defined torque at said first measured angular speed; acquiring at least third and fourth sets of control data for the drive motor, the third and fourth sets of control data being arranged to drive the motor at different defined torques at a second measured angular speed; retrievably storing said control data for said plurality of defined torques at each of said plurality of angular speeds of the drive motor a plurality of defined torques applicable by the drive motor; and measuring an actual value of the angular speed of the drive motor; retrieving the control data for the measured speed, and controlling the drive motor based on the retrieved control data so as to apply the required torque at the measured instantaneous angular speed."
# Wrap the single new claim in a one-row DataFrame with a 'claims_text' column.
new_doc = pd.DataFrame({'claims_text': [new_review]})
new_doc

Unnamed: 0,claims_text
0,A method of driving a machine having an elect...


In [165]:
#clean sample Data
def cleantext_test(new_doc, stop_words=None):
    """Return a cleaned copy of ``new_doc`` with two extra text columns.

    The ``claims_text`` column is normalised and stored as ``cleaned_text``;
    the same text with stop words removed is stored as ``fully_cleaned_text``.
    Only the frame passed in is read; no other frame is touched.

    Parameters
    ----------
    new_doc : pandas.DataFrame
        Frame holding a ``claims_text`` column. It is not modified.
    stop_words : iterable of str, optional
        Words to drop when building ``fully_cleaned_text``. When omitted, the
        notebook-level ``stopwords`` list is used.

    Returns
    -------
    pandas.DataFrame
        A copy of ``new_doc`` (same index) with ``cleaned_text`` and
        ``fully_cleaned_text`` added.
    """
    if stop_words is None:
        stop_words = stopwords
    # A set gives constant-time membership tests in the per-word filter below.
    stop_set = set(stop_words)

    # Work on a copy so the caller's frame is left untouched.
    cleaned = new_doc.copy()
    text = cleaned['claims_text'].fillna('').astype(str)

    # Remove URLs while they are still intact; once ':', '/' and '.' are
    # stripped a URL can no longer be told apart from ordinary words.
    text = text.str.replace(r'https?://\S+', ' ', regex=True)

    # Delete quotes, basic punctuation and newlines without inserting a space
    # (e.g. "well-known" becomes "wellknown").
    text = text.str.replace(r'[\'",.?+\-/=()\n]', '', regex=True)

    # Convert the text to lowercase.
    text = text.str.lower()

    # Every remaining character that is not a letter or digit becomes a space.
    text = text.str.replace(r'[^a-z0-9]', ' ', regex=True)

    # Collapse runs of whitespace and trim both ends.
    text = text.str.replace(r'\s+', ' ', regex=True).str.strip()

    cleaned['cleaned_text'] = text

    # Remove stop words.
    cleaned['fully_cleaned_text'] = text.map(
        lambda doc: ' '.join(word for word in doc.split() if word not in stop_set)
    )

    return cleaned


In [166]:
#sample data
docs_test = cleantext_test(new_doc)
docs_test

Unnamed: 0,claims_text,cleaned_text,fully_cleaned_text
0,A method of driving a machine having an elect...,a method of driving a machine having an elect...,method driving machine electrical drive motor ...


In [167]:
# New document topic predictions
num_of_topics = 3
docs_test_clean = docs_test["fully_cleaned_text"].tolist()

# Ask the fitted model for the topics most similar to the first (only) cleaned document
similar_topics, similarity = topic_model.find_topics(
    docs_test_clean[0],
    top_n=num_of_topics,
)

# Print results
print(f'The most similar child topic is {similar_topics[0]}, and the similarities is {np.round(similarity,2)[0]}')
print(f'The top {num_of_topics} similar topics are {similar_topics}, and the similarities are {np.round(similarity,2)}')

The most similar child topic is 1, and the similarities is 0.2
The top 3 similar topics are [1, 2, -1], and the similarities are [0.2 0.1 0.1]


In [168]:
# Each topic has a similarity score; ranking the topics by this score gives their priority for the new document.

In [169]:
# Visualize top topic keywords
topic_model.visualize_barchart(similar_topics)