In [1]:
# Read uspto dataset
import pandas as pd
import numpy as np
import nltk

# Load the raw USPTO export; the file is expected next to the notebook.
usptodata = pd.read_csv('U.S. Patents.csv')

# Keep only the columns used below, drop rows with a missing value in any of
# them, and renumber the remaining rows from 0.
usptodataset = (
    usptodata[["grant_id", "claims_text", "abstract"]]
    .dropna()
    .reset_index(drop=True)
)

# Tokenize every abstract with NLTK. Applying the tokenizer to the single
# column avoids building a row object per record (as a row-wise
# DataFrame.apply(axis=1) would) and yields the same Series of token lists.
# NOTE(review): the column is called "tokenized_claims_text" but it holds the
# tokens of "abstract", not of "claims_text". The name is kept unchanged in
# case other cells rely on it -- confirm which text was intended.
usptodataset['tokenized_claims_text'] = usptodataset['abstract'].apply(nltk.word_tokenize)
usptodataset.head()


Unnamed: 0,grant_id,claims_text,abstract,tokenized_claims_text
0,USPP030977,1. A new and distinct Mango plant characterize...,"A new and distinct variety of Mango plant, her...","[A, new, and, distinct, variety, of, Mango, pl..."
1,USPP030978,1. A new and distinct apple tree substantially...,&#x2018;Honeysuckle Rose #1-6&#x2019; is a new...,"[&, #, x2018, ;, Honeysuckle, Rose, #, 1-6, &,..."
2,USPP030979,1. A new and distinct variety of peach tree as...,"A new and distinct peach tree variety, <i>Prun...","[A, new, and, distinct, peach, tree, variety, ..."
3,USPP030980,1. A new and distinct variety of raspberry pla...,This invention relates to a new and distinct v...,"[This, invention, relates, to, a, new, and, di..."
4,USPP030981,1. A new and distinct Strawberry plant named &...,A new and distinct cultivar of Strawberry plan...,"[A, new, and, distinct, cultivar, of, Strawber..."


In [2]:
from top2vec import Top2Vec
import spacy
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
import en_core_web_sm 

# Part-of-speech tags kept during lemmatization (same values as the default
# of lemmatize() below).
allowed_postags = ["NOUN", "ADJ", "VERB", "ADV"]
# Raw abstracts as a plain Python list, one entry per patent.
docs = list(usptodataset.loc[:, "abstract"].values)

def lemmatize(docs, allowed_postags = ["NOUN", "ADJ", "VERB", "ADV"]):
    """Lemmatize documents, keeping only tokens with the given POS tags.

    Args:
        docs: Iterable of raw text strings.
        allowed_postags: Coarse part-of-speech tags (compared against
            ``token.pos_``) to keep; every other token is discarded.
            The default list is only read, never modified.

    Returns:
        A list with one string per input document: the lemmas of the kept
        tokens joined by single spaces (an empty string when nothing is kept).
    """
    # The parser and NER components are disabled at load time; only the
    # POS tag and lemma of each token are read below.
    nlp = spacy.load("en_core_web_sm", disable = ["parser", "ner"])

    # nlp.pipe() streams the texts through the pipeline in batches and yields
    # the processed documents in input order, instead of one nlp() call each.
    return [
        " ".join(token.lemma_ for token in parsed if token.pos_ in allowed_postags)
        for parsed in nlp.pipe(docs)
    ]


def tokenize(docs):
    """Split each document into tokens with gensim's ``simple_preprocess``.

    Args:
        docs: Iterable of text strings.

    Returns:
        A list containing one token list per input document, in input order.
    """
    # deacc=True is forwarded to simple_preprocess for every document.
    return [gensim.utils.simple_preprocess(text, deacc=True) for text in docs]


# Pre-process input: lemmatize the raw abstracts, then tokenize the result
lemmatized_docs = lemmatize(docs)
tokenized_docs = tokenize(lemmatized_docs)

# Dictionary assigning an integer ID to every distinct token
id2word = corpora.Dictionary(tokenized_docs)

# Bag-of-words corpus: one doc2bow() result per tokenized document
corpus = [id2word.doc2bow(tokens) for tokens in tokenized_docs]

In [3]:
from top2vec import Top2Vec
import spacy
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
import en_core_web_sm 

# Raw (un-lemmatized) abstracts, one string per patent, used as training input
docs = usptodataset["abstract"].tolist()


# Create jointly embedded topic, document and word vectors
Top2Vec_model = Top2Vec(
    docs,
    embedding_model = 'doc2vec',  # Embedding model: See [1,2] for supported models
    min_count = 50,               # Ignore words less frequent than this value
    umap_args = None,             # Dict of custom args for UMAP (None: library defaults)
    hdbscan_args = None,          # Dict of custom args for HDBSCAN (None: library defaults)
)

# Visualization examples: See [1,2] for more details

# Search the closest 5 topics to the input query "faith"
# topic_words, word_scores, topic_scores, topic_nums = Top2Vec_model.search_topics(
#     keywords = ["faith"], 
#     num_topics = 5)

# Plot the resulting topics as wordclouds
# for topic in topic_nums:
#     Top2Vec_model.generate_topic_wordcloud(topic)

2023-01-08 20:48:15,829 - top2vec - INFO - Pre-processing documents for training
2023-01-08 20:48:17,251 - top2vec - INFO - Creating joint document/word embedding
2023-01-08 20:48:39,435 - top2vec - INFO - Creating lower dimension embedding of documents
2023-01-08 20:49:01,375 - top2vec - INFO - Finding dense areas of documents
2023-01-08 20:49:01,663 - top2vec - INFO - Finding topics


In [4]:
# Fetch all topics from the trained model; the three returned values are
# unpacked as (topic_words, word_scores, topic_nums).
topic_words, word_scores, topic_nums = Top2Vec_model.get_topics()
# Show the words of each topic (long output is abbreviated with "...").
print(topic_words)

[['user' 'information' 'server' ... 'program' 'implemented' 'determine']
 ['shaft' 'axial' 'engaging' ... 'wall' 'rotation' 'along']
 ['voltage' 'converter' 'circuit' ... 'driving' 'exceeds' 'time']
 ...
 ['implant' 'bone' 'joint' ... 'distal' 'hollow' 'wall']
 ['neural' 'learning' 'networks' ... 'packets' 'audio' 'features']
 ['compounds' 'compositions' 'treating' ... 'reaction' 'chain' 'activity']]


In [13]:
from gensim.test.utils import common_corpus, common_dictionary
from gensim.models.coherencemodel import CoherenceModel

# UMass coherence of the Top2Vec topics, estimated from the bag-of-words corpus.
# NOTE(review): topic_words come from Top2Vec trained on the raw abstracts,
# whereas id2word/corpus were built from lemmatized tokens. Topic words that
# are missing from id2word may be dropped or rejected by CoherenceModel
# depending on the gensim version -- confirm the two vocabularies line up.
cm = CoherenceModel(
    topics=topic_words,
    corpus=corpus,
    dictionary=id2word,
    coherence='u_mass',
)
coherence_Umass = cm.get_coherence()  # aggregated coherence over all topics

In [14]:
# Report the UMass coherence value stored in coherence_Umass.
print("u_mass is: ", coherence_Umass)

u_mass is:  -7.170394440167691


In [28]:
from gensim.test.utils import common_corpus, common_dictionary
from gensim.models.coherencemodel import CoherenceModel

# C_v coherence of the Top2Vec topics, using the tokenized (lemmatized) texts.
# NOTE(review): topic_words come from Top2Vec trained on the raw abstracts,
# whereas tokenized_docs/id2word were built from lemmatized tokens. Topic words
# missing from id2word may be dropped or rejected by CoherenceModel depending
# on the gensim version -- confirm the two vocabularies line up.
cm = CoherenceModel(
    topics=topic_words,
    corpus=corpus,
    texts=tokenized_docs,
    dictionary=id2word,
    coherence='c_v',
)
coherence_CV = cm.get_coherence()  # aggregated coherence over all topics

In [29]:
# Report the C_v coherence value stored in coherence_CV.
print("c_v is: ", coherence_CV)

c_v is:  0.49601431280473146


In [30]:
from gensim.test.utils import common_corpus, common_dictionary
from gensim.models.coherencemodel import CoherenceModel

# C_NPMI coherence of the Top2Vec topics, using the tokenized (lemmatized) texts.
# NOTE(review): topic_words come from Top2Vec trained on the raw abstracts,
# whereas tokenized_docs/id2word were built from lemmatized tokens. Topic words
# missing from id2word may be dropped or rejected by CoherenceModel depending
# on the gensim version -- confirm the two vocabularies line up.
cm = CoherenceModel(
    topics=topic_words,
    corpus=corpus,
    texts=tokenized_docs,
    dictionary=id2word,
    coherence='c_npmi',
)
coherence_npmi = cm.get_coherence()  # aggregated coherence over all topics

In [31]:
# Report the C_NPMI coherence value stored in coherence_npmi.
print("c_npmi is: ", coherence_npmi)

c_npmi is:  -0.04580006351283724
