In [39]:
import pandas as pd

df_Abstract = pd.read_csv('final-dataset(A1-31306samples)-train-topicmodel.csv')
df_Abstract.head()

len(df_Abstract)

31306

In [40]:
df_Abstract.head()

Unnamed: 0,publication_number,country_code,kind_code,title,abstract,claims,publication_date,ipc_code,cpc_code,first_claim,claim_lengths,sub_classes,sub_class,abstract_lengths
0,US2020097067A1,US,A1,Artificial Intelligence System and Interactive...,"A reality interactive responding system, compr...","1 . A reality interactive responding system, c...",20200326,G06F9/448,G16H40/67,"1 . A reality interactive responding system, c...",200,"['G06F9/448', 'G16H40/67']","['G06F', 'G16H']",123
1,US2020098473A1,US,A1,Data Storage and Retrieval System for Non-Cont...,A web-based interface enables medical personne...,What is claimed is: \n \n 1 . A da...,20200326,G16H40/67,H04L67/1097,What is claimed is: 1 . A data storage and ret...,200,"['G16H40/67', 'H04L67/1097']","['G16H', 'H04L']",142
2,US2020098451A1,US,A1,Hybrid analysis framework for prediction of ou...,A facility for predicting patient outcomes on ...,"1 . A method in a computing system, comprising...",20200326,G16H10/20,G16H10/20,"1 . A method in a computing system, comprising...",200,"['G16H10/20', 'G16H10/20']","['G16H', 'G16H']",81
3,US2020098458A1,US,A1,Medical cannabis platform with physician and p...,"Through a physician&#39;s portal, a platform c...",What is claimed is: \n \n 1 . A me...,20200326,G16H80/00,A61K36/185,What is claimed is: 1 . A method for providing...,200,"['G16H80/00', 'A61K36/185']","['G16H', 'A61K']",148
4,US2020093988A1,US,A1,Patient day planning systems and methods,"Infusion systems, infusion devices, and relate...",What is claimed is: \n \n 1 . A me...,20200326,G16H20/17,A61M2230/201,What is claimed is: 1 . A method of monitoring...,200,"['G16H20/17', 'A61M2230/201']","['G16H', 'A61M']",110


In [41]:
first_record = df_Abstract['abstract'][1]
first_record

'A web-based interface enables medical personnel to remotely monitor medical devices. A monitoring system records operational data and alarms from the medical devices in a file. However, since network connections between the medical devices and the monitoring system are intermittent, the file does not contain a contiguous stream of data for each medical device. The file pauses recording during gaps in network connectivity. The system displays current data, as well as a list of alarms. If medical personnel wish to view more detail about an earlier time or one of the alarms, the system calculates where in the file the medical device data was recorded. This calculation accounts for the discontiguous nature of the data. The system uses times the network connection is made and broken to calculate an index into the file that corresponds to the time of the user-selected alarm.'

In [42]:
import torch
from bertopic import BERTopic
from hdbscan import HDBSCAN
from umap import UMAP
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer



# Load the Sentence Transformers model
sentence_model = SentenceTransformer('AI-Growth-Lab/PatentSBERTa')
#sentence_model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

# Adjust UMAP hyperparameters
umap_model = UMAP(n_neighbors=3, 
                  n_components=3, 
                  min_dist=0.05, 
                  metric='cosine', 
                  random_state=100)  # Utilize all available CPU cores for UMAP preprocessing

# Adjust HDBSCAN hyperparameters
hdbscan_model = HDBSCAN( min_cluster_size=80,
                        min_samples=40, 
                        metric='euclidean', 
                        cluster_selection_method='eom', 
                        prediction_data=True)

# Adjust CountVectorizer hyperparameters
vectorizer_model = CountVectorizer(ngram_range=(1, 3), min_df=10, max_df=0.5)


# c-TF-IDF
#ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
ctfidf_model = ClassTfidfTransformer(bm25_weighting=True)

# Run Model
topic_model = BERTopic(umap_model=umap_model,
                       embedding_model=sentence_model, 
                       hdbscan_model=hdbscan_model, 
                       vectorizer_model=vectorizer_model,
                       ctfidf_model=ctfidf_model,
                       verbose=True)

# Replace 'df['first_claim']' with your data source for the first_claim column
topics, probabilities = topic_model.fit_transform(df_Abstract['abstract'])


Batches:   0%|          | 0/979 [00:00<?, ?it/s]

2023-10-03 19:20:04,003 - BERTopic - Transformed documents to Embeddings
2023-10-03 19:20:12,451 - BERTopic - Reduced dimensionality
2023-10-03 19:20:14,190 - BERTopic - Clustered reduced embeddings


In [43]:
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

documents = pd.DataFrame({"Document": df_Abstract['abstract'],
                          "ID": range(len(df_Abstract['abstract'])),
                          "Topic": topics})

documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

# Extract vectorizer and analyzer from BERTopic
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Extract features for Topic Coherence evaluation
words = vectorizer.get_feature_names_out()
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
topic_words = [[words for words, _ in topic_model.get_topic(topic)] for topic in range(len(set(topics))-1)]


# Evaluate
coherence_model = CoherenceModel(topics=topic_words, 
                                 texts=tokens, 
                                 corpus=corpus,
                                 dictionary=dictionary, 
                                 coherence='c_npmi')
coherence_c_nmpi = coherence_model.get_coherence()
print("c_npmi is: ",coherence_c_nmpi)

c_npmi is:  -0.1921969735818733


In [44]:
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

documents = pd.DataFrame({"Document":df_Abstract['abstract'],
                          "ID": range(len(df_Abstract['abstract'])),
                          "Topic": topics})
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

# Extract vectorizer and analyzer from BERTopic
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Extract features for Topic Coherence evaluation
words = vectorizer.get_feature_names_out()
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
topic_words = [[words for words, _ in topic_model.get_topic(topic)] 
               for topic in range(len(set(topics))-1)]

# Evaluate
coherence_model = CoherenceModel(topics=topic_words, 
                                 texts=tokens, 
                                 corpus=corpus,
                                 dictionary=dictionary, 
                                 coherence='c_v')
coherence_CV= coherence_model.get_coherence()
print("C_V is: ", coherence_CV)

C_V is:  0.3445794195598435


In [45]:
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

documents = pd.DataFrame({"Document":df_Abstract['abstract'],
                          "ID": range(len(df_Abstract['abstract'])),
                          "Topic": topics})
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

# Extract vectorizer and analyzer from BERTopic
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Extract features for Topic Coherence evaluation
words = vectorizer.get_feature_names_out()
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
topic_words = [[words for words, _ in topic_model.get_topic(topic)] 
               for topic in range(len(set(topics))-1)]

# Evaluate
coherence_model = CoherenceModel(topics=topic_words, 
                                 texts=tokens, 
                                 corpus=corpus,
                                 dictionary=dictionary, 
                                 coherence='u_mass')
coherence_u_mass= coherence_model.get_coherence()
print("u_mass is: ", coherence_u_mass)

u_mass is:  -0.6258166391734031


In [46]:
 topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,17053,-1_sleep_anatomical_3d_skin,"[sleep, anatomical, 3d, skin, pump, radiation,...",[The invention relates to a method and a devic...
1,0,1593,0_disclosed are_the disclosure_disclosure prov...,"[disclosed are, the disclosure, disclosure pro...",[Disclosed are methods and apparatus for deter...
2,1,1589,1_invention relates_invention relates to_prese...,"[invention relates, invention relates to, pres...",[The invention relates to a telepresence syste...
3,2,862,2_pharmacy_dispensing_the prescription_container,"[pharmacy, dispensing, the prescription, conta...",[Automated pill dispensing apparatus for filin...
4,3,758,3_ecg_cardiac_blood pressure_the ecg,"[ecg, cardiac, blood pressure, the ecg, pulse,...",[A device for graphically reconstructing infor...
5,4,576,4_the medical image_image processing_medical i...,"[the medical image, image processing, medical ...",[An image management apparatus capable of rece...
6,5,525,5_exercise_fitness_the exercise_an exercise,"[exercise, fitness, the exercise, an exercise,...",[A multiple exercise activity recording system...
7,6,482,6_certain embodiments_invention provide_presen...,"[certain embodiments, invention provide, prese...",[Certain embodiments of the present invention ...
8,7,391,7_glucose_insulin_blood glucose_glucose level,"[glucose, insulin, blood glucose, glucose leve...",[Techniques and devices for determining and ge...
9,8,364,8_ultrasound_an ultrasound_the ultrasound_ultr...,"[ultrasound, an ultrasound, the ultrasound, ul...",[Provided are an ultrasound diagnosis apparatu...


In [47]:
#adding topics and probs for eachdoc in dataset
df_Abstract['topics'] = topics
df_Abstract['prob'] = probabilities
df_Abstract_topic=df_Abstract[["publication_number","title","abstract","sub_classes","sub_class","topics","prob"]]
df_Abstract_topic

Unnamed: 0,publication_number,title,abstract,sub_classes,sub_class,topics,prob
0,US2020097067A1,Artificial Intelligence System and Interactive...,"A reality interactive responding system, compr...","['G06F9/448', 'G16H40/67']","['G06F', 'G16H']",-1,0.000000
1,US2020098473A1,Data Storage and Retrieval System for Non-Cont...,A web-based interface enables medical personne...,"['G16H40/67', 'H04L67/1097']","['G16H', 'H04L']",32,1.000000
2,US2020098451A1,Hybrid analysis framework for prediction of ou...,A facility for predicting patient outcomes on ...,"['G16H10/20', 'G16H10/20']","['G16H', 'G16H']",-1,0.000000
3,US2020098458A1,Medical cannabis platform with physician and p...,"Through a physician&#39;s portal, a platform c...","['G16H80/00', 'A61K36/185']","['G16H', 'A61K']",-1,0.000000
4,US2020093988A1,Patient day planning systems and methods,"Infusion systems, infusion devices, and relate...","['G16H20/17', 'A61M2230/201']","['G16H', 'A61M']",-1,0.000000
...,...,...,...,...,...,...,...
31301,US2016253489A1,User authentication system,A user authentication system performs user rec...,"['G16H10/60', 'G06F21/32']","['G16H', 'G06F']",51,1.000000
31302,US2016253467A1,"Diagnosis support apparatus and method, and no...",A diagnosis support apparatus for diagnosis of...,"['G16H10/60', 'A61B5/743']","['G16H', 'A61B']",-1,0.000000
31303,US2016253462A1,Novel open-access scheduling system that optim...,A patient appointment schedule is generated fo...,"['G16H40/20', 'G06F19/327']","['G16H', 'G06F']",-1,0.000000
31304,US2016249985A1,Interrelated point acquisition for navigated s...,The present invention relates to a method for ...,"['G16H20/40', 'G06F19/324']","['G16H', 'G06F']",1,0.893985


# prediction

In [48]:
import pandas as pd

df_Abstract_test = pd.read_csv('test-queries-USPTO(A1)-2023-G16H.csv')
df_Abstract_test.head()

Unnamed: 0,publication_numbers,abstract,first_claim,class_codes
0,US20230238130A1,A physiological sensor has light emitting sour...,1. A physiological monitoring device comprisin...,"A61B5/14551,A61,A61B5/0295,A61B5/6829,G,A61B5/..."
1,US20230270344A1,A wearable monitoring device includes a band c...,"1. A monitoring device, comprising:\na band co...","A61B5/02405,A61B5/01,A61B5/16,A61B5/02055,A61B..."
2,US20230200909A1,A number of improvements are provided relating...,1-25. (canceled) 26. A method for guiding a fr...,"A61B17/17,A61B2090/061,A61B90/06,A61B2090/365,..."
3,US20230218347A1,Embodiments include a system for determining c...,1-184. (canceled) 185. A computer-implemented ...,"G06V10/46,G06V20/698,G06T2207/20112,G06T7/13,A..."
4,US20230063013A1,A community based response system for providin...,1. (canceled) 2. A community based response sy...,"H04M1/72418,G08B,G08,G08B25/016,G,H04W4/023,G1..."


# test1

In [49]:
df_Abstract_test.iloc[0]


publication_numbers                                      US20230238130A1
abstract               A physiological sensor has light emitting sour...
first_claim            1. A physiological monitoring device comprisin...
class_codes            A61B5/14551,A61,A61B5/0295,A61B5/6829,G,A61B5/...
Name: 0, dtype: object

In [50]:
test1 = df_Abstract_test.loc[0, 'abstract']
test1

'A physiological sensor has light emitting sources, each activated by addressing at least one row and at least one column of an electrical grid. The light emitting sources are capable of transmitting light of multiple wavelengths and a detector is responsive to the transmitted light after attenuation by body tissue.'

In [51]:
import numpy as np 

# Find topics
num_of_topics = 5
similar_topics, similarity = topic_model.find_topics(test1, top_n=num_of_topics); 

# Print results
print(f'The top {num_of_topics} similar topics are {similar_topics}, and the similarities are {np.round(similarity,2)}')

for idx, topic_idx in enumerate(similar_topics):
    topic = topic_model.get_topic(topic_idx)
    keywords = ' '.join(str(keyword) for keyword in topic[0])
    print(f"Topic {idx+1}: {keywords}")

The top 5 similar topics are [34, 44, 3, 41, 32], and the similarities are [0.67 0.66 0.64 0.64 0.64]
Topic 1: vital sign 0.05143725764421457
Topic 2: wearable device 0.041425610176207606
Topic 3: ecg 0.04761030111462619
Topic 4: alarm 0.1034350669712211
Topic 5: clock 0.027062087341974385


In [52]:
filter_topics_filter = df_Abstract_topic[df_Abstract_topic['topics'] == 43]
filter_topics_filter = filter_topics_filter.sort_values('prob', ascending=False)
filter_topics_filter

Unnamed: 0,publication_number,title,abstract,sub_classes,sub_class,topics,prob
30,US2020097651A1,Systems and methods to achieve robustness and ...,"According to some embodiments, a system, metho...","['G16H40/63', 'G06F2221/034']","['G16H', 'G06F']",43,1.000000
18309,US2022076842A1,Medical information processing system and method,"According to one embodiment, a medical informa...","['G16H50/70', 'G06F18/23']","['G16H', 'G06F']",43,1.000000
22830,US2010106518A1,System And Method For Providing Optimized Pati...,"In particular embodiments, method, apparatus a...","['G16H40/20', 'G16H20/40']","['G16H', 'G16H']",43,1.000000
20695,US2013290010A1,Medication service terminal and method of moni...,The inventive concept herein relates to medica...,"['G16H20/10', 'G16H70/40']","['G16H', 'G16H']",43,1.000000
19467,US2018330822A1,Medical information processing device and medi...,A medical information processing apparatus acc...,"['G16H50/20', 'G16H50/70']","['G16H', 'G16H']",43,1.000000
...,...,...,...,...,...,...,...
4419,US2004034289A1,"System for monitoring health, wellness and fit...",An apparatus for monitoring human status param...,"['G16H40/67', 'A61B5/369']","['G16H', 'A61B']",43,0.963390
23621,US2010332443A1,Cyclical Behavior Modification,Embodiments treat cyclical behaviors based on ...,"['G16H20/70', 'G16H40/67']","['G16H', 'G16H']",43,0.950257
21998,US2014243684A1,"System and method for creating, processing, an...",A system and method for creating and processin...,"['A61B5/00', 'G16H30/20']","['A61B', 'G16H']",43,0.948861
24561,US2022248965A1,Self-calibrating glucose monitor,A medical system including processing circuitr...,"['A61B5/1455', 'G16H40/40']","['A61B', 'G16H']",43,0.940127


In [53]:
# Define the range of topic IDs you want to retrieve documents for
selected_topic_range =[43, 5, 49, 42, 16]  # Replace with your desired range of topic IDs

# Create an empty DataFrame to store the top 10 documents
top_10_documents_test1 = pd.DataFrame()

# Iterate through the selected topic IDs
for selected_topic_id in selected_topic_range:
    # Filter the dataset based on the current topic ID
    filtered_df = df_Abstract_topic[df_Abstract_topic['topics'] == selected_topic_id]
    
    # Sort the filtered dataset by similarity scores in descending order
    sorted_df = filtered_df.sort_values(by='prob', ascending=False)
    
    # Retrieve the top 10 documents for the current topic
    top_10_for_topic = sorted_df.head(10)
    
    # Append the top 10 documents for the current topic to the result DataFrame
    top_10_documents_test1 = top_10_documents_test1.append(top_10_for_topic)

# Display the top 10 documents for all selected topics
top_10_documents_test1

Unnamed: 0,publication_number,title,abstract,sub_classes,sub_class,topics,prob
30,US2020097651A1,Systems and methods to achieve robustness and ...,"According to some embodiments, a system, metho...","['G16H40/63', 'G06F2221/034']","['G16H', 'G06F']",43,1.0
18309,US2022076842A1,Medical information processing system and method,"According to one embodiment, a medical informa...","['G16H50/70', 'G06F18/23']","['G16H', 'G06F']",43,1.0
22830,US2010106518A1,System And Method For Providing Optimized Pati...,"In particular embodiments, method, apparatus a...","['G16H40/20', 'G16H20/40']","['G16H', 'G16H']",43,1.0
20695,US2013290010A1,Medication service terminal and method of moni...,The inventive concept herein relates to medica...,"['G16H20/10', 'G16H70/40']","['G16H', 'G16H']",43,1.0
19467,US2018330822A1,Medical information processing device and medi...,A medical information processing apparatus acc...,"['G16H50/20', 'G16H50/70']","['G16H', 'G16H']",43,1.0
19406,US2018310885A1,"Method of managing disease, and apparatuses op...",A disease management method and apparatuses pe...,"['A61B5/00', 'G16H40/67']","['A61B', 'G16H']",43,1.0
19181,US2018264258A1,Detection of noise signals in cardiac signals,Medical device systems include processing circ...,"['A61N1/365', 'G16H50/20']","['A61N', 'G16H']",43,1.0
18950,US2022157412A1,Clinical trial matching apparatus,A clinical trial matching apparatus according ...,"['G16B40/00', 'G16H70/40']","['G16B', 'G16H']",43,1.0
18776,US2022130554A1,Method for an ai powered automated analyzer fo...,Method for an AI Powered Automated Analyzer fo...,"['G16H15/00', 'G16H50/70']","['G16H', 'G16H']",43,1.0
18739,US2022130556A1,Health management apparatus and health managem...,A health management apparatus according to an ...,"['G16H50/80', 'A61B5/7275']","['G16H', 'A61B']",43,1.0


In [54]:
num_records = top_10_documents_test1.shape[0]
num_records

50

In [55]:
# # Define a custom function to format the text
# def format_text(text):
#     # Remove brackets and single quotes, split by comma and strip whitespace
#     cleaned_text = ''.join(text).replace('[', '').replace(']', '').replace("'", '').split(',')
#     # Filter out empty strings and strip whitespace
#     cleaned_text = [item.strip() for item in cleaned_text if item.strip()]
#     # Join the cleaned text with commas and spaces
#     formatted_text = ', '.join(cleaned_text)
#     return formatted_text

# # Apply the custom function to the 'sub_classes' and 'sub_class' columns
# top_10_documents_test1['combined_class'] = top_10_documents_test1.apply(lambda x: format_text(x['sub_classes']) + ', ' + format_text(x['sub_class']), axis=1)

# # Display the resulting DataFrame with the 'combined' column
# print(top_10_documents_test1[['sub_classes', 'sub_class', 'combined_class']])


In [56]:
# # Get the class_codes from the first record of df_Abstract_test and convert it to a string, then split it by comma
# class_codes_value = df_Abstract_test['class_codes'].iloc[0]
# if pd.notna(class_codes_value):
#     target_sub_classes = str(class_codes_value).split(',')
# else:
#     target_sub_classes = []

# # Define a function to check for matching classes
# def has_matching_class(row):
#     # Extract the first 4 letters from each code in class_codes and combined_class
#     target_codes = [code[:4] for code in target_sub_classes]
#     combined_codes = [code[:4] for code in str(row['combined_class']).split(',')]

#     # Check if there are any common codes
#     return any(code in combined_codes for code in target_codes)

# # Apply the function to create a boolean mask
# top_10_documents_test1['has_matching_class'] = top_10_documents_test1.apply(has_matching_class, axis=1)

# top_10_documents_test1

In [57]:
# top_10_documents_test1.shape[0]

# list of 100 queries

In [58]:
# Assuming you want to predict topics for the first 100 samples in 'df_Abstract_test'
num_samples_to_predict = 100

num_of_topics = 5
results = []

# Assuming you have a list of 100 queries in test
for query in df_Abstract_test['abstract'][:num_samples_to_predict]:
    similar_topics, similarity = topic_model.find_topics(query, top_n=num_of_topics)
    results.append((similar_topics, similarity))

# Now, the 'results' list contains the similar topics and similarities for each query
# You can access the results for a specific query like this:
for i, (similar_topics, similarity) in enumerate(results):
    print(f"Query {i + 1}: Similar Topics {similar_topics}, Similarity {similarity}")


Query 1: Similar Topics [34, 44, 3, 41, 32], Similarity [0.6707194, 0.66445553, 0.64145833, 0.6402898, 0.63622487]
Query 2: Similar Topics [44, 34, 29, 51, 56], Similarity [0.66972625, 0.6250118, 0.61390185, 0.5831846, 0.57547414]
Query 3: Similar Topics [48, 37, 45, 40, 53], Similarity [0.65560573, 0.6495347, 0.643225, 0.62408566, 0.62396574]
Query 4: Similar Topics [23, 6, 3, 26, 40], Similarity [0.8103322, 0.787099, 0.7512642, 0.7457191, 0.74056697]
Query 5: Similar Topics [41, 26, 34, 30, 3], Similarity [0.6394714, 0.625569, 0.62008077, 0.6158669, 0.61405635]
Query 6: Similar Topics [36, 32, 34, 44, -1], Similarity [0.74751335, 0.70330185, 0.70124954, 0.67774254, 0.67121464]
Query 7: Similar Topics [5, 44, 29, 35, 3], Similarity [0.7631782, 0.70626724, 0.7045376, 0.67319113, 0.6695421]
Query 8: Similar Topics [5, 29, 44, 32, 22], Similarity [0.7967857, 0.6920734, 0.686291, 0.6781874, 0.665908]
Query 9: Similar Topics [48, 37, 2, -1, 56], Similarity [0.6860242, 0.6817516, 0.6791699,

In [59]:
# Assuming you want to predict topics for the first 100 samples in 'df_Abstract_test'
num_samples_to_predict = 100

num_of_topics = 5
results = []

# Create an empty DataFrame to store the results
result_df_q = pd.DataFrame(columns=['query_publication_numbers', 'query_class_codes', 'query_abstract', 'query_predicted_topics'])

# Assuming you have a list of 100 queries in 'df_Abstract_test'
for i, (query, publication_number, abstract, class_codes) in enumerate(zip(df_Abstract_test['abstract'][:num_samples_to_predict], 
                                                    df_Abstract_test['publication_numbers'][:num_samples_to_predict],
                                                    df_Abstract_test['abstract'][:num_samples_to_predict],
                                                    df_Abstract_test['class_codes'][:num_samples_to_predict])):
    similar_topics, similarity = topic_model.find_topics(query, top_n=num_of_topics)
    results.append((similar_topics, similarity))
    
    # Store the results in the DataFrame
    result_df_q = result_df_q.append({
        'query_abstract': query,
        'query_predicted_topics': similar_topics,
        'query_publication_numbers': publication_number,
        'query_class_codes': class_codes
    }, ignore_index=True)
    
result_df_q


Unnamed: 0,query_publication_numbers,query_class_codes,query_abstract,query_predicted_topics
0,US20230238130A1,"A61B5/14551,A61,A61B5/0295,A61B5/6829,G,A61B5/...",A physiological sensor has light emitting sour...,"[34, 44, 3, 41, 32]"
1,US20230270344A1,"A61B5/02405,A61B5/01,A61B5/16,A61B5/02055,A61B...",A wearable monitoring device includes a band c...,"[44, 34, 29, 51, 56]"
2,US20230200909A1,"A61B17/17,A61B2090/061,A61B90/06,A61B2090/365,...",A number of improvements are provided relating...,"[48, 37, 45, 40, 53]"
3,US20230218347A1,"G06V10/46,G06V20/698,G06T2207/20112,G06T7/13,A...",Embodiments include a system for determining c...,"[23, 6, 3, 26, 40]"
4,US20230063013A1,"H04M1/72418,G08B,G08,G08B25/016,G,H04W4/023,G1...",A community based response system for providin...,"[41, 26, 34, 30, 3]"
...,...,...,...,...
95,US20230017310A1,"G06Q10/10,G,G06F,G06F16/90,G06F16/95,G16H30/20...",Features are disclosed for remote storage of m...,"[16, 55, 30, 17, 18]"
96,US20230091925A1,"H04L41/5061,G,G06,G06F,H04L67/00,H,H04,G06F16/...",Some of the embodiments herein provide a seaml...,"[6, 47, 55, -1, 40]"
97,US20230010638A1,"A61M5/142,A61,Y10S128/00,G06T2219/2016,G06F8/6...",A method and system is disclosed for operating...,"[19, 0, 36, 1, -1]"
98,US20230009812A1,"A61B5/14551,A61,G,A61B5/021,A61B5/746,A61B,A61...",A patient monitor including a physiological me...,"[52, 34, 43, 26, 3]"


In [60]:
# Check if the 'query_publication_numbers' field is unique
is_unique = result_df_q['query_publication_numbers'].nunique() == len(result_df_q)

if is_unique:
    print("The 'query_publication_numbers' field is unique.")
else:
    print("The 'query_publication_numbers' field is not unique.")


The 'query_publication_numbers' field is unique.


In [61]:
import pandas as pd

# Create an empty DataFrame to store the results
result_df = pd.DataFrame(columns=['publication_number', 'title', 'abstract', 'sub_classes', 'sub_class', 'topics', 'prob', 'query_publication_numbers', 'query_class_codes', 'query_abstract', 'query_predicted_topics'])

# Define the number of documents to retrieve for each topic
num_of_documents_to_retrieve = 10

for i, row in result_df_q.iterrows():
    query = row['query_abstract']
    predicted_topics = row['query_predicted_topics']
    
    for topic_id in predicted_topics:
        # Filter 'df_Abstract_topic' to get the top 'num_of_documents_to_retrieve' documents for the current topic_id
        topic_documents = df_Abstract_topic[df_Abstract_topic['topics'] == topic_id]
        
        # Sort the documents by probability in descending order
        topic_documents = topic_documents.sort_values(by='prob', ascending=False).head(num_of_documents_to_retrieve)
        
        # Append the results to the 'result_df' DataFrame
        for _, doc_row in topic_documents.iterrows():
            result_df = result_df.append({
                'query_publication_numbers': row['query_publication_numbers'],
                'query_class_codes': row['query_class_codes'],
                'query_abstract': query,
                'query_predicted_topics': [topic_id],  # Assign the current topic_id as a list
                'publication_number': doc_row['publication_number'],
                'title': doc_row['title'],
                'abstract': doc_row['abstract'],
                'sub_classes': doc_row['sub_classes'],
                'sub_class': doc_row['sub_class'],
                'topics': doc_row['topics'],
                'prob': doc_row['prob']
            }, ignore_index=True)

# Now, 'result_df' contains the top 10 most probable documents for each predicted topic list for each query
result_df


Unnamed: 0,publication_number,title,abstract,sub_classes,sub_class,topics,prob,query_publication_numbers,query_class_codes,query_abstract,query_predicted_topics
0,US2020185085A1,Predictive maintenance for large medical imagi...,A predictive maintenance alerting device ( 40 ...,"['G16H30/40', 'G06N20/00']","['G16H', 'G06N']",34,1.0,US20230238130A1,"A61B5/14551,A61,A61B5/0295,A61B5/6829,G,A61B5/...",A physiological sensor has light emitting sour...,[34]
1,US2009163832A1,Data managing device for a diagnostic instrument,A data managing device for connection to a dia...,"['A61B5/20', 'G16H40/63']","['A61B', 'G16H']",34,1.0,US20230238130A1,"A61B5/14551,A61,A61B5/0295,A61B5/6829,G,A61B5/...",A physiological sensor has light emitting sour...,[34]
2,US2014247153A1,Patient monitoring systems and messages that s...,A system for is provided for using telemetry d...,"['H04W4/029', 'G16H50/30']","['H04W', 'G16H']",34,1.0,US20230238130A1,"A61B5/14551,A61,A61B5/0295,A61B5/6829,G,A61B5/...",A physiological sensor has light emitting sour...,[34]
3,US2014206950A1,Ward cloud system,A ward cloud system comprises an intelligent c...,"['G16H10/60', 'A61B5/7285']","['G16H', 'A61B']",34,1.0,US20230238130A1,"A61B5/14551,A61,A61B5/0295,A61B5/6829,G,A61B5/...",A physiological sensor has light emitting sour...,[34]
4,US2013303863A1,Patient Monitoring Apparatus,A patient monitoring system including a sensin...,"['A61B5/00', 'G16H40/67']","['A61B', 'G16H']",34,1.0,US20230238130A1,"A61B5/14551,A61,A61B5/0295,A61B5/6829,G,A61B5/...",A physiological sensor has light emitting sour...,[34]
...,...,...,...,...,...,...,...,...,...,...,...
4995,US2010240980A1,Wan-Based Remote Mobile Monitoring Method And ...,A WAN-based remote mobile monitoring method an...,"['G06Q50/22', 'G16H40/67']","['G06Q', 'G16H']",50,1.0,US20230138516A1,"G16H,G16H10/00,G06Q10/1093,G06Q10/10,G16,G06Q1...",A system to manage records in a healthcare pra...,[50]
4996,US2010185711A1,Online monitoring of patient for routine checkups,"A method, system, and computer program product...","['G16H70/40', 'G06F16/258']","['G16H', 'G06F']",50,1.0,US20230138516A1,"G16H,G16H10/00,G06Q10/1093,G06Q10/10,G16,G06Q1...",A system to manage records in a healthcare pra...,[50]
4997,US2010076275A1,System and Method for Remote Healthcare Monito...,A system and method for remote health monitori...,"['G16H40/67', 'G16H10/20']","['G16H', 'G16H']",50,1.0,US20230138516A1,"G16H,G16H10/00,G06Q10/1093,G06Q10/10,G16,G06Q1...",A system to manage records in a healthcare pra...,[50]
4998,US2010049543A1,Health data integration system and the method ...,A health data integration system and the metho...,"['G06Q50/00', 'G16H50/20']","['G06Q', 'G16H']",50,1.0,US20230138516A1,"G16H,G16H10/00,G06Q10/1093,G06Q10/10,G16,G06Q1...",A system to manage records in a healthcare pra...,[50]


In [62]:
result_df['query_class_codes'][0]

'A61B5/14551,A61,A61B5/0295,A61B5/6829,G,A61B5/7221,A61B5/14532,G16H40/00,A61B5/14546,G16H40/60,A61B5/68,A61B2562/08,H,A61B1/00,A61B2562/221,A61B5/7475,A61B2562/18,A61B5/746,A61B5/7405,H05K,G16H40/67,A61B,Y10,A61B5/6813,Y,A61B2562/185,G16H10/40,A61B5/6826,A61B5/02,A61B5/0002,A61B5/1455,A61B5/6832,A,A61B5/7235,A61B5/6801,A61B5/7278,G16H,A61B5/026,A61B5/742,A61B2562/22,A61B5/024,G16,A61B5/02416,A61B5/683,A61B5/0015,A61B5/6825,Y10S,A61B5/1495,A61B5/72,A61B5/0261,A61B2562/085,A61B2562/222,A61B5/7275,A61B5/00,A61B5/145,A61B5/74,G16H10/00,H05K999/99,A61B5/7246,Y10S439/909,A61B5/14552,A61B5/6838,A61B5/6815,A61B5/7271,H05K999/00,A61B5/02427,A61B5/0205,Y10S439/00,A61B5/0022,A61B5/6814,A61B2562/00,H05'

In [63]:
# Add a new column to store the filtered codes
result_df['query_codes_G16H'] = ''

# Define a function to extract codes starting with 'G61H' from the class codes
def extract_G16H_codes(class_codes):
    codes = class_codes.split(',')
    return ','.join([code for code in codes if code.startswith('G16H')])

# Iterate through rows and update the 'query_codes_G61H' column
for index, row in result_df.iterrows():
    class_codes = row['query_class_codes']
    filtered_codes = extract_G16H_codes(class_codes)
    result_df.at[index, 'query_codes_G16H'] = filtered_codes

# Display the updated DataFrame
result_df


Unnamed: 0,publication_number,title,abstract,sub_classes,sub_class,topics,prob,query_publication_numbers,query_class_codes,query_abstract,query_predicted_topics,query_codes_G16H
0,US2020185085A1,Predictive maintenance for large medical imagi...,A predictive maintenance alerting device ( 40 ...,"['G16H30/40', 'G06N20/00']","['G16H', 'G06N']",34,1.0,US20230238130A1,"A61B5/14551,A61,A61B5/0295,A61B5/6829,G,A61B5/...",A physiological sensor has light emitting sour...,[34],"G16H40/00,G16H40/60,G16H40/67,G16H10/40,G16H,G..."
1,US2009163832A1,Data managing device for a diagnostic instrument,A data managing device for connection to a dia...,"['A61B5/20', 'G16H40/63']","['A61B', 'G16H']",34,1.0,US20230238130A1,"A61B5/14551,A61,A61B5/0295,A61B5/6829,G,A61B5/...",A physiological sensor has light emitting sour...,[34],"G16H40/00,G16H40/60,G16H40/67,G16H10/40,G16H,G..."
2,US2014247153A1,Patient monitoring systems and messages that s...,A system for is provided for using telemetry d...,"['H04W4/029', 'G16H50/30']","['H04W', 'G16H']",34,1.0,US20230238130A1,"A61B5/14551,A61,A61B5/0295,A61B5/6829,G,A61B5/...",A physiological sensor has light emitting sour...,[34],"G16H40/00,G16H40/60,G16H40/67,G16H10/40,G16H,G..."
3,US2014206950A1,Ward cloud system,A ward cloud system comprises an intelligent c...,"['G16H10/60', 'A61B5/7285']","['G16H', 'A61B']",34,1.0,US20230238130A1,"A61B5/14551,A61,A61B5/0295,A61B5/6829,G,A61B5/...",A physiological sensor has light emitting sour...,[34],"G16H40/00,G16H40/60,G16H40/67,G16H10/40,G16H,G..."
4,US2013303863A1,Patient Monitoring Apparatus,A patient monitoring system including a sensin...,"['A61B5/00', 'G16H40/67']","['A61B', 'G16H']",34,1.0,US20230238130A1,"A61B5/14551,A61,A61B5/0295,A61B5/6829,G,A61B5/...",A physiological sensor has light emitting sour...,[34],"G16H40/00,G16H40/60,G16H40/67,G16H10/40,G16H,G..."
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,US2010240980A1,Wan-Based Remote Mobile Monitoring Method And ...,A WAN-based remote mobile monitoring method an...,"['G06Q50/22', 'G16H40/67']","['G06Q', 'G16H']",50,1.0,US20230138516A1,"G16H,G16H10/00,G06Q10/1093,G06Q10/10,G16,G06Q1...",A system to manage records in a healthcare pra...,[50],"G16H,G16H10/00,G16H40/20,G16H40/00,G16H10/60"
4996,US2010185711A1,Online monitoring of patient for routine checkups,"A method, system, and computer program product...","['G16H70/40', 'G06F16/258']","['G16H', 'G06F']",50,1.0,US20230138516A1,"G16H,G16H10/00,G06Q10/1093,G06Q10/10,G16,G06Q1...",A system to manage records in a healthcare pra...,[50],"G16H,G16H10/00,G16H40/20,G16H40/00,G16H10/60"
4997,US2010076275A1,System and Method for Remote Healthcare Monito...,A system and method for remote health monitori...,"['G16H40/67', 'G16H10/20']","['G16H', 'G16H']",50,1.0,US20230138516A1,"G16H,G16H10/00,G06Q10/1093,G06Q10/10,G16,G06Q1...",A system to manage records in a healthcare pra...,[50],"G16H,G16H10/00,G16H40/20,G16H40/00,G16H10/60"
4998,US2010049543A1,Health data integration system and the method ...,A health data integration system and the metho...,"['G06Q50/00', 'G16H50/20']","['G06Q', 'G16H']",50,1.0,US20230138516A1,"G16H,G16H10/00,G06Q10/1093,G06Q10/10,G16,G06Q1...",A system to manage records in a healthcare pra...,[50],"G16H,G16H10/00,G16H40/20,G16H40/00,G16H10/60"


In [64]:
# Create a new column to store the common code
import re
result_df['exact_match_code'] = ''

# Iterate through rows and compare 'sub_classes' and 'query_codes_G16H'
for index, row in result_df.iterrows():
    sub_classes_str = row['sub_classes']  # Data format in this field "['A61B5/00', 'G16H40/67']"
    query_codes_G16H = row['query_codes_G16H']  # Data format in this field 'G16H40/00,G16H40/60,G16H40/67,G16H10/40,G16H'
    
    # Custom extraction of values from sub_classes_str
    sub_classes = re.findall(r"'([^']*)'", sub_classes_str)
    
    # Split the codes into lists
    sub_class_list = [code.strip() for code in sub_classes]
    query_codes_list = query_codes_G16H.split(',')
    
    # Check for common codes
    exact_match_code = [code for code in sub_class_list if code in query_codes_list]
    
    # Join the common codes into a single string
    exact_match_code_str = ','.join(exact_match_code)
    
    # Update the 'exact_match_code' column with the exact_match_code
    result_df.at[index, 'exact_match_code'] = exact_match_code_str

# Display the updated DataFrame
result_df


Unnamed: 0,publication_number,title,abstract,sub_classes,sub_class,topics,prob,query_publication_numbers,query_class_codes,query_abstract,query_predicted_topics,query_codes_G16H,exact_match_code
0,US2020185085A1,Predictive maintenance for large medical imagi...,A predictive maintenance alerting device ( 40 ...,"['G16H30/40', 'G06N20/00']","['G16H', 'G06N']",34,1.0,US20230238130A1,"A61B5/14551,A61,A61B5/0295,A61B5/6829,G,A61B5/...",A physiological sensor has light emitting sour...,[34],"G16H40/00,G16H40/60,G16H40/67,G16H10/40,G16H,G...",
1,US2009163832A1,Data managing device for a diagnostic instrument,A data managing device for connection to a dia...,"['A61B5/20', 'G16H40/63']","['A61B', 'G16H']",34,1.0,US20230238130A1,"A61B5/14551,A61,A61B5/0295,A61B5/6829,G,A61B5/...",A physiological sensor has light emitting sour...,[34],"G16H40/00,G16H40/60,G16H40/67,G16H10/40,G16H,G...",
2,US2014247153A1,Patient monitoring systems and messages that s...,A system for is provided for using telemetry d...,"['H04W4/029', 'G16H50/30']","['H04W', 'G16H']",34,1.0,US20230238130A1,"A61B5/14551,A61,A61B5/0295,A61B5/6829,G,A61B5/...",A physiological sensor has light emitting sour...,[34],"G16H40/00,G16H40/60,G16H40/67,G16H10/40,G16H,G...",
3,US2014206950A1,Ward cloud system,A ward cloud system comprises an intelligent c...,"['G16H10/60', 'A61B5/7285']","['G16H', 'A61B']",34,1.0,US20230238130A1,"A61B5/14551,A61,A61B5/0295,A61B5/6829,G,A61B5/...",A physiological sensor has light emitting sour...,[34],"G16H40/00,G16H40/60,G16H40/67,G16H10/40,G16H,G...",
4,US2013303863A1,Patient Monitoring Apparatus,A patient monitoring system including a sensin...,"['A61B5/00', 'G16H40/67']","['A61B', 'G16H']",34,1.0,US20230238130A1,"A61B5/14551,A61,A61B5/0295,A61B5/6829,G,A61B5/...",A physiological sensor has light emitting sour...,[34],"G16H40/00,G16H40/60,G16H40/67,G16H10/40,G16H,G...",G16H40/67
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,US2010240980A1,Wan-Based Remote Mobile Monitoring Method And ...,A WAN-based remote mobile monitoring method an...,"['G06Q50/22', 'G16H40/67']","['G06Q', 'G16H']",50,1.0,US20230138516A1,"G16H,G16H10/00,G06Q10/1093,G06Q10/10,G16,G06Q1...",A system to manage records in a healthcare pra...,[50],"G16H,G16H10/00,G16H40/20,G16H40/00,G16H10/60",
4996,US2010185711A1,Online monitoring of patient for routine checkups,"A method, system, and computer program product...","['G16H70/40', 'G06F16/258']","['G16H', 'G06F']",50,1.0,US20230138516A1,"G16H,G16H10/00,G06Q10/1093,G06Q10/10,G16,G06Q1...",A system to manage records in a healthcare pra...,[50],"G16H,G16H10/00,G16H40/20,G16H40/00,G16H10/60",
4997,US2010076275A1,System and Method for Remote Healthcare Monito...,A system and method for remote health monitori...,"['G16H40/67', 'G16H10/20']","['G16H', 'G16H']",50,1.0,US20230138516A1,"G16H,G16H10/00,G06Q10/1093,G06Q10/10,G16,G06Q1...",A system to manage records in a healthcare pra...,[50],"G16H,G16H10/00,G16H40/20,G16H40/00,G16H10/60",
4998,US2010049543A1,Health data integration system and the method ...,A health data integration system and the metho...,"['G06Q50/00', 'G16H50/20']","['G06Q', 'G16H']",50,1.0,US20230138516A1,"G16H,G16H10/00,G06Q10/1093,G06Q10/10,G16,G06Q1...",A system to manage records in a healthcare pra...,[50],"G16H,G16H10/00,G16H40/20,G16H40/00,G16H10/60",


In [65]:
# result_df.sample(n=100)

In [66]:
# Calculate the count of 'exact_match_code' for each group and assign it to all rows within the group
result_df['count_exact_match_top50'] = result_df.groupby('query_publication_numbers')['exact_match_code'].transform(lambda x: x[x != ''].count())
result_df

Unnamed: 0,publication_number,title,abstract,sub_classes,sub_class,topics,prob,query_publication_numbers,query_class_codes,query_abstract,query_predicted_topics,query_codes_G16H,exact_match_code,count_exact_match_top50
0,US2020185085A1,Predictive maintenance for large medical imagi...,A predictive maintenance alerting device ( 40 ...,"['G16H30/40', 'G06N20/00']","['G16H', 'G06N']",34,1.0,US20230238130A1,"A61B5/14551,A61,A61B5/0295,A61B5/6829,G,A61B5/...",A physiological sensor has light emitting sour...,[34],"G16H40/00,G16H40/60,G16H40/67,G16H10/40,G16H,G...",,14
1,US2009163832A1,Data managing device for a diagnostic instrument,A data managing device for connection to a dia...,"['A61B5/20', 'G16H40/63']","['A61B', 'G16H']",34,1.0,US20230238130A1,"A61B5/14551,A61,A61B5/0295,A61B5/6829,G,A61B5/...",A physiological sensor has light emitting sour...,[34],"G16H40/00,G16H40/60,G16H40/67,G16H10/40,G16H,G...",,14
2,US2014247153A1,Patient monitoring systems and messages that s...,A system for is provided for using telemetry d...,"['H04W4/029', 'G16H50/30']","['H04W', 'G16H']",34,1.0,US20230238130A1,"A61B5/14551,A61,A61B5/0295,A61B5/6829,G,A61B5/...",A physiological sensor has light emitting sour...,[34],"G16H40/00,G16H40/60,G16H40/67,G16H10/40,G16H,G...",,14
3,US2014206950A1,Ward cloud system,A ward cloud system comprises an intelligent c...,"['G16H10/60', 'A61B5/7285']","['G16H', 'A61B']",34,1.0,US20230238130A1,"A61B5/14551,A61,A61B5/0295,A61B5/6829,G,A61B5/...",A physiological sensor has light emitting sour...,[34],"G16H40/00,G16H40/60,G16H40/67,G16H10/40,G16H,G...",,14
4,US2013303863A1,Patient Monitoring Apparatus,A patient monitoring system including a sensin...,"['A61B5/00', 'G16H40/67']","['A61B', 'G16H']",34,1.0,US20230238130A1,"A61B5/14551,A61,A61B5/0295,A61B5/6829,G,A61B5/...",A physiological sensor has light emitting sour...,[34],"G16H40/00,G16H40/60,G16H40/67,G16H10/40,G16H,G...",G16H40/67,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,US2010240980A1,Wan-Based Remote Mobile Monitoring Method And ...,A WAN-based remote mobile monitoring method an...,"['G06Q50/22', 'G16H40/67']","['G06Q', 'G16H']",50,1.0,US20230138516A1,"G16H,G16H10/00,G06Q10/1093,G06Q10/10,G16,G06Q1...",A system to manage records in a healthcare pra...,[50],"G16H,G16H10/00,G16H40/20,G16H40/00,G16H10/60",,20
4996,US2010185711A1,Online monitoring of patient for routine checkups,"A method, system, and computer program product...","['G16H70/40', 'G06F16/258']","['G16H', 'G06F']",50,1.0,US20230138516A1,"G16H,G16H10/00,G06Q10/1093,G06Q10/10,G16,G06Q1...",A system to manage records in a healthcare pra...,[50],"G16H,G16H10/00,G16H40/20,G16H40/00,G16H10/60",,20
4997,US2010076275A1,System and Method for Remote Healthcare Monito...,A system and method for remote health monitori...,"['G16H40/67', 'G16H10/20']","['G16H', 'G16H']",50,1.0,US20230138516A1,"G16H,G16H10/00,G06Q10/1093,G06Q10/10,G16,G06Q1...",A system to manage records in a healthcare pra...,[50],"G16H,G16H10/00,G16H40/20,G16H40/00,G16H10/60",,20
4998,US2010049543A1,Health data integration system and the method ...,A health data integration system and the metho...,"['G06Q50/00', 'G16H50/20']","['G06Q', 'G16H']",50,1.0,US20230138516A1,"G16H,G16H10/00,G06Q10/1093,G06Q10/10,G16,G06Q1...",A system to manage records in a healthcare pra...,[50],"G16H,G16H10/00,G16H40/20,G16H40/00,G16H10/60",,20


In [67]:
import pandas as pd


# Initialize RFR as NaN
result_df['RFR'] = float('nan')

# Create a dictionary to store the first occurrence index for each group
first_occurrence_indices = {}

# Iterate through the DataFrame
for index, row in result_df.iterrows():
    query_pub_num = row['query_publication_numbers']
    exact_match_code = row['exact_match_code']

    # Check if the exact_match_code is not empty and it's the first occurrence within the group
    if exact_match_code != '' and not pd.isna(exact_match_code) and query_pub_num not in first_occurrence_indices:
        first_occurrence_indices[query_pub_num] = index

# Calculate RFR for each group based on the first occurrence within each group
for query_pub_num, first_occurrence_index in first_occurrence_indices.items():
    group_indices = result_df[result_df['query_publication_numbers'] == query_pub_num].index
    result_df.loc[group_indices, 'RFR'] = first_occurrence_index - group_indices[0] + 1

# Print the updated DataFrame
result_df

Unnamed: 0,publication_number,title,abstract,sub_classes,sub_class,topics,prob,query_publication_numbers,query_class_codes,query_abstract,query_predicted_topics,query_codes_G16H,exact_match_code,count_exact_match_top50,RFR
0,US2020185085A1,Predictive maintenance for large medical imagi...,A predictive maintenance alerting device ( 40 ...,"['G16H30/40', 'G06N20/00']","['G16H', 'G06N']",34,1.0,US20230238130A1,"A61B5/14551,A61,A61B5/0295,A61B5/6829,G,A61B5/...",A physiological sensor has light emitting sour...,[34],"G16H40/00,G16H40/60,G16H40/67,G16H10/40,G16H,G...",,14,5.0
1,US2009163832A1,Data managing device for a diagnostic instrument,A data managing device for connection to a dia...,"['A61B5/20', 'G16H40/63']","['A61B', 'G16H']",34,1.0,US20230238130A1,"A61B5/14551,A61,A61B5/0295,A61B5/6829,G,A61B5/...",A physiological sensor has light emitting sour...,[34],"G16H40/00,G16H40/60,G16H40/67,G16H10/40,G16H,G...",,14,5.0
2,US2014247153A1,Patient monitoring systems and messages that s...,A system for is provided for using telemetry d...,"['H04W4/029', 'G16H50/30']","['H04W', 'G16H']",34,1.0,US20230238130A1,"A61B5/14551,A61,A61B5/0295,A61B5/6829,G,A61B5/...",A physiological sensor has light emitting sour...,[34],"G16H40/00,G16H40/60,G16H40/67,G16H10/40,G16H,G...",,14,5.0
3,US2014206950A1,Ward cloud system,A ward cloud system comprises an intelligent c...,"['G16H10/60', 'A61B5/7285']","['G16H', 'A61B']",34,1.0,US20230238130A1,"A61B5/14551,A61,A61B5/0295,A61B5/6829,G,A61B5/...",A physiological sensor has light emitting sour...,[34],"G16H40/00,G16H40/60,G16H40/67,G16H10/40,G16H,G...",,14,5.0
4,US2013303863A1,Patient Monitoring Apparatus,A patient monitoring system including a sensin...,"['A61B5/00', 'G16H40/67']","['A61B', 'G16H']",34,1.0,US20230238130A1,"A61B5/14551,A61,A61B5/0295,A61B5/6829,G,A61B5/...",A physiological sensor has light emitting sour...,[34],"G16H40/00,G16H40/60,G16H40/67,G16H10/40,G16H,G...",G16H40/67,14,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,US2010240980A1,Wan-Based Remote Mobile Monitoring Method And ...,A WAN-based remote mobile monitoring method an...,"['G06Q50/22', 'G16H40/67']","['G06Q', 'G16H']",50,1.0,US20230138516A1,"G16H,G16H10/00,G06Q10/1093,G06Q10/10,G16,G06Q1...",A system to manage records in a healthcare pra...,[50],"G16H,G16H10/00,G16H40/20,G16H40/00,G16H10/60",,20,3.0
4996,US2010185711A1,Online monitoring of patient for routine checkups,"A method, system, and computer program product...","['G16H70/40', 'G06F16/258']","['G16H', 'G06F']",50,1.0,US20230138516A1,"G16H,G16H10/00,G06Q10/1093,G06Q10/10,G16,G06Q1...",A system to manage records in a healthcare pra...,[50],"G16H,G16H10/00,G16H40/20,G16H40/00,G16H10/60",,20,3.0
4997,US2010076275A1,System and Method for Remote Healthcare Monito...,A system and method for remote health monitori...,"['G16H40/67', 'G16H10/20']","['G16H', 'G16H']",50,1.0,US20230138516A1,"G16H,G16H10/00,G06Q10/1093,G06Q10/10,G16,G06Q1...",A system to manage records in a healthcare pra...,[50],"G16H,G16H10/00,G16H40/20,G16H40/00,G16H10/60",,20,3.0
4998,US2010049543A1,Health data integration system and the method ...,A health data integration system and the metho...,"['G06Q50/00', 'G16H50/20']","['G06Q', 'G16H']",50,1.0,US20230138516A1,"G16H,G16H10/00,G06Q10/1093,G06Q10/10,G16,G06Q1...",A system to manage records in a healthcare pra...,[50],"G16H,G16H10/00,G16H40/20,G16H40/00,G16H10/60",,20,3.0


In [68]:
# Filter the DataFrame to keep only the top 5 records within each group
result_df_filter_top5 = result_df.groupby('query_publication_numbers').apply(lambda x: x.head(5)).reset_index(drop=True)

# Count the occurrences of count_exact_match if it has data
result_df_filter_top5['count_exact_match_count'] = result_df_filter_top5['exact_match_code'].apply(lambda x: 1 if x != '' else 0)

# Calculate the total count for each group
result_df_filter_top5['count_exact_match_top5'] = result_df_filter_top5.groupby('query_publication_numbers')['count_exact_match_count'].transform('sum')
df_top5=result_df_filter_top5[["publication_number","title","abstract","sub_classes","sub_class","topics","prob", "query_publication_numbers","query_abstract","query_codes_G16H","exact_match_code","count_exact_match_top50","count_exact_match_top5"]]
df_top5

Unnamed: 0,publication_number,title,abstract,sub_classes,sub_class,topics,prob,query_publication_numbers,query_abstract,query_codes_G16H,exact_match_code,count_exact_match_top50,count_exact_match_top5
0,US2020160993A1,Artificial Intelligence Based Alert System,A mechanism is provided to implement an artifi...,"['G16H15/00', 'G06T7/0012']","['G16H', 'G06T']",41,1.0,US20230001090A1,Methods and systems for delaying alarms that i...,"G16H20/17,G16H40/00,G16H40/60,G16H40/67,G16H20...",,17,1
1,US2012303085A1,Methods and apapratus for manually suspending ...,The capability to suspend a patient alert rela...,"['A61N1/365', 'G16H40/63']","['A61N', 'G16H']",41,1.0,US20230001090A1,Methods and systems for delaying alarms that i...,"G16H20/17,G16H40/00,G16H40/60,G16H40/67,G16H20...",,17,1
2,US2019180592A1,Closed loop alarm management,"Methods, systems, and devices for patient moni...","['G16H40/63', 'G08B29/10']","['G16H', 'G08B']",41,1.0,US20230001090A1,Methods and systems for delaying alarms that i...,"G16H20/17,G16H40/00,G16H40/60,G16H40/67,G16H20...",,17,1
3,US2019156937A1,Priority alerts based on medical information,A method and apparatus are disclosed herein fo...,"['G16H50/20', 'G16H50/30']","['G16H', 'G16H']",41,1.0,US20230001090A1,Methods and systems for delaying alarms that i...,"G16H20/17,G16H40/00,G16H40/60,G16H40/67,G16H20...",,17,1
4,US2019130730A1,Alarm Management,"Methods, systems, and devices for patient moni...","['G16H40/67', 'A61B5/0022']","['G16H', 'A61B']",41,1.0,US20230001090A1,Methods and systems for delaying alarms that i...,"G16H20/17,G16H40/00,G16H40/60,G16H40/67,G16H20...",G16H40/67,17,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,US2020194106A1,System and device for non-invasive detection o...,Wearable device has at least one sensor operab...,"['G16H40/67', 'G16H20/30']","['G16H', 'G16H']",44,1.0,US20230270344A1,A wearable monitoring device includes a band c...,"G16H40/00,G16H40/67,G16H20/30,G16H30/40,G16H30...","G16H40/67,G16H20/30",25,4
496,US2018261329A1,Personalized Health-Information Based on Genet...,This disclosure relates to technologies for ev...,"['C12Q1/6827', 'G16H50/30']","['C12Q', 'G16H']",44,1.0,US20230270344A1,A wearable monitoring device includes a band c...,"G16H40/00,G16H40/67,G16H20/30,G16H30/40,G16H30...",,25,4
497,US2014052405A1,Concussion detection and communication system,A concussion detection and communication syste...,"['G16H20/70', 'G16H40/67']","['G16H', 'G16H']",44,1.0,US20230270344A1,A wearable monitoring device includes a band c...,"G16H40/00,G16H40/67,G16H20/30,G16H30/40,G16H30...",G16H40/67,25,4
498,US2014018945A1,Method and apparatus for determining effect of...,The present disclosure concerns determining ph...,"['A63B24/00', 'G16H20/30']","['A63B', 'G16H']",44,1.0,US20230270344A1,A wearable monitoring device includes a band c...,"G16H40/00,G16H40/67,G16H20/30,G16H30/40,G16H30...",G16H20/30,25,4


In [69]:
filtered_df = result_df[result_df['query_publication_numbers'] == 'US20230001090A1']
filtered_df

Unnamed: 0,publication_number,title,abstract,sub_classes,sub_class,topics,prob,query_publication_numbers,query_class_codes,query_abstract,query_predicted_topics,query_codes_G16H,exact_match_code,count_exact_match_top50,RFR
3900,US2020160993A1,Artificial Intelligence Based Alert System,A mechanism is provided to implement an artifi...,"['G16H15/00', 'G06T7/0012']","['G16H', 'G06T']",41,1.0,US20230001090A1,"A61M5/142,A61,G08B,G08,G16H20/17,A61B5/4836,G,...",Methods and systems for delaying alarms that i...,[41],"G16H20/17,G16H40/00,G16H40/60,G16H40/67,G16H20...",,17,5.0
3901,US2012303085A1,Methods and apapratus for manually suspending ...,The capability to suspend a patient alert rela...,"['A61N1/365', 'G16H40/63']","['A61N', 'G16H']",41,1.0,US20230001090A1,"A61M5/142,A61,G08B,G08,G16H20/17,A61B5/4836,G,...",Methods and systems for delaying alarms that i...,[41],"G16H20/17,G16H40/00,G16H40/60,G16H40/67,G16H20...",,17,5.0
3902,US2019180592A1,Closed loop alarm management,"Methods, systems, and devices for patient moni...","['G16H40/63', 'G08B29/10']","['G16H', 'G08B']",41,1.0,US20230001090A1,"A61M5/142,A61,G08B,G08,G16H20/17,A61B5/4836,G,...",Methods and systems for delaying alarms that i...,[41],"G16H20/17,G16H40/00,G16H40/60,G16H40/67,G16H20...",,17,5.0
3903,US2019156937A1,Priority alerts based on medical information,A method and apparatus are disclosed herein fo...,"['G16H50/20', 'G16H50/30']","['G16H', 'G16H']",41,1.0,US20230001090A1,"A61M5/142,A61,G08B,G08,G16H20/17,A61B5/4836,G,...",Methods and systems for delaying alarms that i...,[41],"G16H20/17,G16H40/00,G16H40/60,G16H40/67,G16H20...",,17,5.0
3904,US2019130730A1,Alarm Management,"Methods, systems, and devices for patient moni...","['G16H40/67', 'A61B5/0022']","['G16H', 'A61B']",41,1.0,US20230001090A1,"A61M5/142,A61,G08B,G08,G16H20/17,A61B5/4836,G,...",Methods and systems for delaying alarms that i...,[41],"G16H20/17,G16H40/00,G16H40/60,G16H40/67,G16H20...",G16H40/67,17,5.0
3905,US2019131014A1,"Process, control unit, computer program produc...",A process for providing failure safety for a p...,"['G16H80/00', 'G16H10/60']","['G16H', 'G16H']",41,1.0,US20230001090A1,"A61M5/142,A61,G08B,G08,G16H20/17,A61B5/4836,G,...",Methods and systems for delaying alarms that i...,[41],"G16H20/17,G16H40/00,G16H40/60,G16H40/67,G16H20...",,17,5.0
3906,US2019051383A1,Intelligent sepsis alert,A system for determining a likelihood of curre...,"['G16H10/60', 'G16H50/20']","['G16H', 'G16H']",41,1.0,US20230001090A1,"A61M5/142,A61,G08B,G08,G16H20/17,A61B5/4836,G,...",Methods and systems for delaying alarms that i...,[41],"G16H20/17,G16H40/00,G16H40/60,G16H40/67,G16H20...",,17,5.0
3907,US2019035493A1,"Methods of analyte monitoring, and devices and...","Methods, devices, and systems are provided tha...","['G16H15/00', 'G16H50/20']","['G16H', 'G16H']",41,1.0,US20230001090A1,"A61M5/142,A61,G08B,G08,G16H20/17,A61B5/4836,G,...",Methods and systems for delaying alarms that i...,[41],"G16H20/17,G16H40/00,G16H40/60,G16H40/67,G16H20...",,17,5.0
3908,US2019008384A1,Systems and methods for managing patient-trigg...,Systems and methods for managing machine-gener...,"['A61B5/00', 'G16H40/67']","['A61B', 'G16H']",41,1.0,US20230001090A1,"A61M5/142,A61,G08B,G08,G16H20/17,A61B5/4836,G,...",Methods and systems for delaying alarms that i...,[41],"G16H20/17,G16H40/00,G16H40/60,G16H40/67,G16H20...",G16H40/67,17,5.0
3909,US2018271454A1,System and method for the identification and s...,The system and method described herein represe...,"['G16H10/60', 'A61B5/024']","['G16H', 'A61B']",41,1.0,US20230001090A1,"A61M5/142,A61,G08B,G08,G16H20/17,A61B5/4836,G,...",Methods and systems for delaying alarms that i...,[41],"G16H20/17,G16H40/00,G16H40/60,G16H40/67,G16H20...",,17,5.0


In [70]:
result_df.to_csv('BERTopic-without-Preprocessing-Abstract-BM25', index=False)

In [71]:
average_count_exact_match_top5 = df_top5.groupby('query_publication_numbers')['count_exact_match_top5'].mean().reset_index()
average_count_exact_match_top5


Unnamed: 0,query_publication_numbers,count_exact_match_top5
0,US20230001090A1,1.0
1,US20230001263A1,3.0
2,US20230005591A1,4.0
3,US20230005611A1,0.0
4,US20230009812A1,2.0
...,...,...
95,US20230255518A1,3.0
96,US20230259481A1,2.0
97,US20230260617A1,5.0
98,US20230264029A1,1.0


In [72]:
average_count_exact_match_top5 = average_count_exact_match_top5['count_exact_match_top5'].mean()
print("Average of count_exact_match_top5:", average_count_exact_match_top5)

Average of count_exact_match_top5: 1.85


In [73]:
average_count_exact_match_top50 = result_df.groupby('query_publication_numbers')['count_exact_match_top50'].mean().reset_index()
average_count_exact_match_top50

Unnamed: 0,query_publication_numbers,count_exact_match_top50
0,US20230001090A1,17.0
1,US20230001263A1,12.0
2,US20230005591A1,25.0
3,US20230005611A1,7.0
4,US20230009812A1,11.0
...,...,...
95,US20230255518A1,10.0
96,US20230259481A1,13.0
97,US20230260617A1,13.0
98,US20230264029A1,9.0


In [74]:
average_count_exact_match_top50 = average_count_exact_match_top50['count_exact_match_top50'].mean()
print("Average of count_exact_match_top50:", average_count_exact_match_top50)

Average of count_exact_match_top50: 15.19


In [75]:
avg_RFR =  result_df.groupby('query_publication_numbers')['RFR'].mean().reset_index()
avg_RFR

Unnamed: 0,query_publication_numbers,RFR
0,US20230001090A1,5.0
1,US20230001263A1,1.0
2,US20230005591A1,1.0
3,US20230005611A1,16.0
4,US20230009812A1,1.0
...,...,...
95,US20230255518A1,2.0
96,US20230259481A1,2.0
97,US20230260617A1,1.0
98,US20230264029A1,5.0


In [76]:
avg_RFR = avg_RFR['RFR'].mean()
print("Average of RFR:", avg_RFR)

Average of RFR: 4.8979591836734695
