In [2]:
import requests
import pandas as pd
import json

from numba.scripts.generate_lower_listing import description

with open('credentials.json') as credentials:
     config = json.load(credentials)
       
        
    
    
api_key = config['api_key']

if api_key:
    print("Key loaded successfully")

url = 'https://cpsideas.aha.io/api/v1/bookmarks/custom_pivots'
ideas_url = f'{url}/7422318987526014353'

headers = {
    'Authorization': f'Bearer {api_key}',
    'Content-Type': 'application/json'
}

# Initialize an empty list to collect data from all pages
all_ideas = []

# Start by setting the current page to 1
current_page = 1

# Fetch data while the current page is less than or equal to the total number of pages
while True:
    # Append the current page number to the URL
    paginated_url = f'{ideas_url}?page={current_page}'
    
    # Make the request
    response = requests.get(paginated_url, headers=headers)
    
    if response.status_code == 200:
        data = response.json()  # Convert response to JSON
        
        # Extract the total number of pages from the first response
        if current_page == 1:
            total_pages = data['pagination'][0]['total_pages']
            print(f"Total pages: {total_pages}")
        
        # Extract the rows part of the JSON which contains the actual data
        rows_data = data.get('rows', [])
        
        # Convert rows into a flat structure for each idea
        for row in rows_data:
            idea = { 
                'id': row[0].get('plain_value', ''),
                'name': row[1].get('plain_value', ''),
                'categories': row[2].get('plain_value', ''),
                'assigned_to': row[3].get('plain_value', ''),
                'status': row[4].get('plain_value', ''),
                'created_at': row[5].get('plain_value', ''),
                'votes': row[6].get('plain_value', ''),
                'tags': row[7].get('plain_value', ''),
                'description': row[8].get('plain_value', ''),
            }
            all_ideas.append(idea)
        
        # Increment to the next page
        current_page += 1

        # Break the loop if we've reached the last page
        if current_page > total_pages:
            break
    else:
        print(f"Error: {response.status_code} - {response.text}")
        break

# Convert the collected data into a DataFrame
df = pd.DataFrame(all_ideas)

# Display the DataFrame
print(df.head())

    
    

Key loaded successfully
Total pages: 21
          id                                               name  \
0   DFE-I-62          OR preference card search in applications   
1  DFE-I-521       Add OR Preference Cards Patient Demographics   
2   GRH-I-44       Most recent data displayed within the portal   
3   DFE-I-16                        Prioritize escribe request    
4   DFE-I-78  340B: Capture actual NDC that is scanned with ...   

                                        categories     assigned_to  \
0                               Patient Scheduling   Blakely Prine   
1     Patient Scheduling,TruBridge EHR - Financial   Blakely Prine   
2                                     MyCareCorner  Brandon Cooper   
3                            Medication Management    Brent Runkle   
4  Medication Management,TruBridge EHR - Financial    Brent Runkle   

                 status            created_at  votes           tags  \
0  Product Owner Review  2022-05-17T19:47:17Z     13  OR Manageme

In [3]:
documents = df['description'].dropna().tolist()
documents[0]


'EWS preference cards are antiquated so the ability to search and do patient charges from applications. Also the ability to choose multiple cards for a case that will cross reference so there are not duplicate charges.'

In [4]:
print(documents)




In [5]:
from top2vec import Top2Vec



  from .autonotebook import tqdm as notebook_tqdm


In [6]:
!pip install nltk



In [7]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

stop_words = set(stopwords.words('english'))



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\trey0\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
df['description'].dropna(inplace=True)

In [9]:
df['description']


0       EWS preference cards are antiquated so the abi...
1       Can the OR preference cards include the patien...
2       Showing the most recent available date so that...
3       providers need to have the ability to prioriti...
4       340B/HRSA requires documentation of actual NDC...
                              ...                        
2008    This is a great report to follow up on any ord...
2009    Historically there has always been an inconsis...
2010    When Pharmacy Verifies can free text unit if m...
2011    Our pediatric physicians would like the abilit...
2012    We need to be able to send letters via Thrive ...
Name: description, Length: 2013, dtype: object

In [10]:
def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word.lower() not in stop_words])

df['description'] = df['description'].apply(remove_stopwords)

In [11]:
df['description']

0       EWS preference cards antiquated ability search...
1       preference cards include patient's height, wei...
2       Showing recent available date something shows ...
3       providers need ability prioritize escribe inco...
4       340B/HRSA requires documentation actual NDC ad...
                              ...                        
2008    great report follow orders providers ordered d...
2009    Historically always inconsistency mouse clicks...
2010    Pharmacy Verifies free text unit mis-match uni...
2011    pediatric physicians would like ability add pa...
2012    need able send letters via Thrive patients res...
Name: description, Length: 2013, dtype: object

In [12]:
df['cleaned_description'] = df['description']


In [13]:
df['cleaned_description']

0       EWS preference cards antiquated ability search...
1       preference cards include patient's height, wei...
2       Showing recent available date something shows ...
3       providers need ability prioritize escribe inco...
4       340B/HRSA requires documentation actual NDC ad...
                              ...                        
2008    great report follow orders providers ordered d...
2009    Historically always inconsistency mouse clicks...
2010    Pharmacy Verifies free text unit mis-match uni...
2011    pediatric physicians would like ability add pa...
2012    need able send letters via Thrive patients res...
Name: cleaned_description, Length: 2013, dtype: object

In [14]:
df

Unnamed: 0,id,name,categories,assigned_to,status,created_at,votes,tags,description,cleaned_description
0,DFE-I-62,OR preference card search in applications,Patient Scheduling,Blakely Prine,Product Owner Review,2022-05-17T19:47:17Z,13,OR Management,EWS preference cards antiquated ability search...,EWS preference cards antiquated ability search...
1,DFE-I-521,Add OR Preference Cards Patient Demographics,"Patient Scheduling,TruBridge EHR - Financial",Blakely Prine,Product Owner Review,2023-02-21T16:59:13Z,2,OR Management,"preference cards include patient's height, wei...","preference cards include patient's height, wei..."
2,GRH-I-44,Most recent data displayed within the portal,MyCareCorner,Brandon Cooper,Open for Comment,2021-08-10T17:43:09Z,3,,Showing recent available date something shows ...,Showing recent available date something shows ...
3,DFE-I-16,Prioritize escribe request,Medication Management,Brent Runkle,Open for Comment,2022-05-17T15:42:48Z,5,,providers need ability prioritize escribe inco...,providers need ability prioritize escribe inco...
4,DFE-I-78,340B: Capture actual NDC that is scanned with ...,"Medication Management,TruBridge EHR - Financial",Brent Runkle,Open for Comment,2022-05-18T15:44:17Z,22,,340B/HRSA requires documentation actual NDC ad...,340B/HRSA requires documentation actual NDC ad...
...,...,...,...,...,...,...,...,...,...,...
2008,DFE-I-2040,Patient identifier on Physician Order Data Rep...,Clinical Report,Default (Unassigned),Needs Review,2024-10-09T15:27:27Z,2,,great report follow orders providers ordered d...,great report follow orders providers ordered d...
2009,DFE-I-2041,MOUSE CLICKS SHOULD BE CONSISTENT THROUGHOUT T...,Ancillary,Default (Unassigned),Needs Review,2024-10-09T17:04:12Z,6,,Historically always inconsistency mouse clicks...,Historically always inconsistency mouse clicks...
2010,DFE-I-2042,When Pharmacy Verifies can free text unit if m...,Medication Management,Default (Unassigned),Needs Review,2024-10-10T18:39:57Z,1,,Pharmacy Verifies free text unit mis-match uni...,Pharmacy Verifies free text unit mis-match uni...
2011,DFE-I-2043,Ability to add growth charts to notes,Notes,Default (Unassigned),Needs Review,2024-10-11T16:23:15Z,1,,pediatric physicians would like ability add pa...,pediatric physicians would like ability add pa...


In [15]:
# Function to clean the text
import re
import nltk
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


lemmatizer = WordNetLemmatizer()
custom_stopwords = set(stopwords.words('english')).union(['it', 'an', 'the', 'that', 'on', 'is', 'in', 'as', 'to'])

def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Lowercase the text
    text = text.lower()
    # Remove stopwords and lemmatize
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in custom_stopwords])
    return text


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\trey0\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
df['cleaned_description'] = df['cleaned_description'].apply(clean_text)

In [17]:
df['cleaned_description']

0       ew preference card antiquated ability search p...
1       preference card include patient height weight ...
2       showing recent available date something show w...
3       provider need ability prioritize escribe incom...
4       bhrsa requires documentation actual ndc admini...
                              ...                        
2008    great report follow order provider ordered day...
2009    historically always inconsistency mouse click ...
2010    pharmacy verifies free text unit mismatch unit...
2011    pediatric physician would like ability add pat...
2012    need able send letter via thrive patient resul...
Name: cleaned_description, Length: 2013, dtype: object

In [18]:
df.dropna(inplace=True)

In [19]:
df

Unnamed: 0,id,name,categories,assigned_to,status,created_at,votes,tags,description,cleaned_description
0,DFE-I-62,OR preference card search in applications,Patient Scheduling,Blakely Prine,Product Owner Review,2022-05-17T19:47:17Z,13,OR Management,EWS preference cards antiquated ability search...,ew preference card antiquated ability search p...
1,DFE-I-521,Add OR Preference Cards Patient Demographics,"Patient Scheduling,TruBridge EHR - Financial",Blakely Prine,Product Owner Review,2023-02-21T16:59:13Z,2,OR Management,"preference cards include patient's height, wei...",preference card include patient height weight ...
2,GRH-I-44,Most recent data displayed within the portal,MyCareCorner,Brandon Cooper,Open for Comment,2021-08-10T17:43:09Z,3,,Showing recent available date something shows ...,showing recent available date something show w...
3,DFE-I-16,Prioritize escribe request,Medication Management,Brent Runkle,Open for Comment,2022-05-17T15:42:48Z,5,,providers need ability prioritize escribe inco...,provider need ability prioritize escribe incom...
4,DFE-I-78,340B: Capture actual NDC that is scanned with ...,"Medication Management,TruBridge EHR - Financial",Brent Runkle,Open for Comment,2022-05-18T15:44:17Z,22,,340B/HRSA requires documentation actual NDC ad...,bhrsa requires documentation actual ndc admini...
...,...,...,...,...,...,...,...,...,...,...
2008,DFE-I-2040,Patient identifier on Physician Order Data Rep...,Clinical Report,Default (Unassigned),Needs Review,2024-10-09T15:27:27Z,2,,great report follow orders providers ordered d...,great report follow order provider ordered day...
2009,DFE-I-2041,MOUSE CLICKS SHOULD BE CONSISTENT THROUGHOUT T...,Ancillary,Default (Unassigned),Needs Review,2024-10-09T17:04:12Z,6,,Historically always inconsistency mouse clicks...,historically always inconsistency mouse click ...
2010,DFE-I-2042,When Pharmacy Verifies can free text unit if m...,Medication Management,Default (Unassigned),Needs Review,2024-10-10T18:39:57Z,1,,Pharmacy Verifies free text unit mis-match uni...,pharmacy verifies free text unit mismatch unit...
2011,DFE-I-2043,Ability to add growth charts to notes,Notes,Default (Unassigned),Needs Review,2024-10-11T16:23:15Z,1,,pediatric physicians would like ability add pa...,pediatric physician would like ability add pat...


In [20]:
df_2 = df['cleaned_description']


In [21]:
df_2.apply(clean_text)



0       ew preference card antiquated ability search p...
1       preference card include patient height weight ...
2       showing recent available date something show w...
3       provider need ability prioritize escribe incom...
4       bhrsa requires documentation actual ndc admini...
                              ...                        
2008    great report follow order provider ordered day...
2009    historically always inconsistency mouse click ...
2010    pharmacy verifies free text unit mismatch unit...
2011    pediatric physician would like ability add pat...
2012    need able send letter via thrive patient resul...
Name: cleaned_description, Length: 2013, dtype: object

In [22]:
df_2.dropna(inplace=True)

In [23]:
df_2


0       ew preference card antiquated ability search p...
1       preference card include patient height weight ...
2       showing recent available date something show w...
3       provider need ability prioritize escribe incom...
4       bhrsa requires documentation actual ndc admini...
                              ...                        
2008    great report follow order provider ordered day...
2009    historically always inconsistency mouse click ...
2010    pharmacy verifies free text unit mismatch unit...
2011    pediatric physician would like ability add pat...
2012    need able send letter via thrive patient resul...
Name: cleaned_description, Length: 2013, dtype: object

In [24]:
docs = df_2.tolist()


In [25]:
docs[0]


'ew preference card antiquated ability search patient charge application also ability choose multiple card case cross reference duplicate charge'

In [26]:
docs[8]

''

In [27]:
while('' in docs):
    docs.remove('')



In [28]:
length = len(docs)

print(length)



1967


In [29]:
!pip install top2vec[sentence-transformers]



In [30]:
!pip install top2vec[sentence_encoders]



In [31]:
!pip install torch sentence_transformers






In [32]:
umap_args = { 'n_neighbors' : 10,
              'n_components' : 2,
              'metric' : 'cosine',
              'random_state' : 42,
            }

hdbscan_args = {   
                'min_cluster_size' : 10,
                'min_samples' : 5,
                'metric' : 'euclidean',
                'cluster_selection_method': 'eom'
}

model = Top2Vec(docs,
                embedding_model = 'distiluse-base-multilingual-cased',
                speed = 'learn',
                workers = 4,
                umap_args = umap_args,
                hdbscan_args = hdbscan_args
                )
model.get_num_topics()



2024-10-13 10:14:13,865 - top2vec - INFO - Pre-processing documents for training
2024-10-13 10:14:14,418 - top2vec - INFO - Downloading distiluse-base-multilingual-cased model
2024-10-13 10:14:20,806 - top2vec - INFO - Creating joint document/word embedding
2024-10-13 10:16:58,429 - top2vec - INFO - Creating lower dimension embedding of documents
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
2024-10-13 10:17:36,614 - top2vec - INFO - Finding dense areas of documents
2024-10-13 10:17:36,775 - top2vec - INFO - Finding topics


44

In [33]:
topic_sizes, topic_nums = model.get_topic_sizes()


In [34]:
model.get_topic_sizes()



(array([124,  98,  82,  77,  72,  72,  67,  66,  64,  62,  59,  51,  50,
         50,  47,  46,  46,  42,  42,  41,  41,  41,  40,  37,  36,  36,
         34,  32,  32,  31,  31,  30,  30,  29,  29,  29,  29,  29,  26,
         20,  18,  17,  16,  16], dtype=int64),
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43], dtype=int64))

In [35]:
documents, document_scores, document_ids = model.search_documents_by_topic(topic_num=0, num_docs=5)
for doc, score, doc_id in zip(documents, document_scores, document_ids):
    print(f"Document: {doc_id}, Score: {score}")
    print("-----------")
    print(doc)
    print("-----------")
    print()
    


Document: 325, Score: 0.8231557011604309
-----------
need add ability print order order req order chron would formatted based current functionality future order order requisition applicable order type nursing pharmacy ancillary would available hospital clinic released future order additional need current order req header pull clinic table create logic apply order nonfuture order would need pull header different table add order req action button future order detail screen order pending future order status order released button visible future order hx button via order detail
-----------

Document: 1026, Score: 0.774498701095581
-----------
need report audit log type show removesdeletes order future order prior released weve multiple time see clinic ehrs interface dashboard order sent thrive look ims dashboard thrive see order received future order order run released future order report either mean removeddeleted directly future order queue need figure staff member able correct since happ

AttributeError: 'Top2Vec' object has no attribute 'topic'

In [38]:
topic_words, word_scores, topic_nums = model.get_topics(43)

In [40]:
for words, scores, num in zip(topic_words, word_scores, topic_nums):
    print(num)
    print(f"words: {words}")
    


0
words: ['order' 'patient' 'scheduled' 'schedule' 'nurse' 'hospital' 'radiology'
 'nursing' 'clinical' 'medical' 'physician' 'multiple' 'future' 'clinic'
 'prescription' 'prior' 'require' 'setting' 'needed' 'column' 'type'
 'already' 'easier' 'required' 'cannot' 'time' 'line' 'system'
 'demographic' 'well' 'nice' 'easily' 'many' 'set' 'address' 'better'
 'need' 'great' 'management' 'request' 'list' 'staff' 'select' 'right'
 'client' 'correct' 'number' 'twc' 'history' 'pharmacy']
1
words: ['nurse' 'patient' 'clinical' 'nursing' 'hospital' 'medical' 'clinic'
 'physician' 'date' 'history' 'record' 'radiology' 'time' 'schedule'
 'documentation' 'document' 'hour' 'needed' 'easily' 'need' 'scheduled'
 'data' 'column' 'well' 'health' 'chart' 'easier' 'already' 'scan'
 'currently' 'current' 'update' 'log' 'better' 'require' 'year' 'scanned'
 'great' 'nice' 'required' 'helpful' 'day' 'demographic' 'could' 'right'
 'check' 'cannot' 'registration' 'multiple' 'evident']
2
words: ['patient' 'nurse

In [41]:
umap_args = { 'n_neighbors' : 10,
              'n_components' : 2,
              'metric' : 'cosine',
              'random_state' : 42,
            }

hdbscan_args = {   
                'min_cluster_size' : 10,
                'min_samples' : 5,
                'max_cluster_size' : 20,
                'metric' : 'euclidean',
                'cluster_selection_method': 'eom'
}

model2 = Top2Vec(docs,
                embedding_model = 'distiluse-base-multilingual-cased',
                speed = 'learn',
                workers = 4,
                umap_args = umap_args,
                hdbscan_args = hdbscan_args
                )
model2.get_num_topics()



2024-10-13 10:19:10,166 - top2vec - INFO - Pre-processing documents for training
2024-10-13 10:19:10,726 - top2vec - INFO - Downloading distiluse-base-multilingual-cased model
2024-10-13 10:19:13,797 - top2vec - INFO - Creating joint document/word embedding
2024-10-13 10:20:59,637 - top2vec - INFO - Creating lower dimension embedding of documents
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
2024-10-13 10:21:25,091 - top2vec - INFO - Finding dense areas of documents
2024-10-13 10:21:25,212 - top2vec - INFO - Finding topics


25

In [42]:
topic_sizes2, topic_nums2 = model2.get_topic_sizes()

In [43]:
model2.get_topic_sizes()

(array([312, 237, 114, 105,  91,  91,  89,  82,  79,  75,  72,  68,  66,
         62,  54,  51,  51,  50,  44,  33,  32,  29,  28,  27,  25],
       dtype=int64),
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24], dtype=int64))

In [45]:
topic_words2, word_scores2, topic_nums2 = model2.get_topics(25)

In [46]:
for words, scores, num in zip(topic_words2, word_scores2, topic_nums2):
    print(num)
    print(f"words: {words}")
    

0
words: ['patient' 'nurse' 'clinical' 'hospital' 'nursing' 'physician' 'medical'
 'clinic' 'could' 'able' 'documentation' 'document' 'may' 'schedule'
 'hour' 'cannot' 'time' 'scan' 'radiology' 'column' 'scanned' 'check'
 'needed' 'record' 'well' 'easily' 'helpful' 'nice' 'scheduled' 'right'
 'better' 'available' 'would' 'staff' 'table' 'evident' 'ability' 'health'
 'great' 'easier' 'correct' 'require' 'allow' 'need' 'print' 'client'
 'already' 'currently' 'pdf' 'required']
1
words: ['nurse' 'nursing' 'patient' 'clinical' 'hospital' 'medical' 'clinic'
 'physician' 'radiology' 'staff' 'demographic' 'name' 'well' 'correct'
 'communication' 'right' 'twc' 'employee' 'update' 'number' 'column'
 'field' 'health' 'insurance' 'person' 'client' 'already' 'profile'
 'better' 'flowchart' 'evident' 'contact' 'tracking' 'list' 'easily'
 'history' 'date' 'dashboard' 'text' 'chart' 'workflow' 'pharmacy' 'nice'
 'multiple' 'schedule' 'message' 'documentation' 'cannot' 'code' 'easier']
2
words: ['presc

In [47]:
documents, document_scores, document_ids = model2.search_documents_by_topic(topic_num=0, num_docs=20)
for doc, score, doc_id in zip(documents, document_scores, document_ids):
    print(f"Document: {doc_id}, Score: {score}")
    print("-----------")
    print(doc)
    print("-----------")
    print()
    
    

Document: 1036, Score: 0.697073221206665
-----------
need easy way see order signed provider audit survey etc idea would show order chronology go review style choose expanded would also show signed datetime course able print necessary right could see one order time go report attachment nurse note patient progress note show lot information order
-----------

Document: 495, Score: 0.691240131855011
-----------
lab staff primarily spend time lab schedule currently need reprint lab label access patient function screen adding schedule would save efficient adding patient list account number field would allow lab staff select patient directly schedule save time
-----------

Document: 1800, Score: 0.6906635165214539
-----------
could able customize hour summary build patient summary print would beneficial pick specific thing would like include facility would help nurse able something look glance even next computer specific facility floor
-----------

Document: 1242, Score: 0.6880486011505127
-

In [48]:
df3 = pd.DataFrame(all_ideas)

In [60]:
df3

Unnamed: 0,id,name,categories,assigned_to,status,created_at,votes,tags,description
0,DFE-I-62,OR preference card search in applications,Patient Scheduling,Blakely Prine,Product Owner Review,2022-05-17T19:47:17Z,13,OR Management,EWS preference cards are antiquated so the abi...
1,DFE-I-521,Add OR Preference Cards Patient Demographics,"Patient Scheduling,TruBridge EHR - Financial",Blakely Prine,Product Owner Review,2023-02-21T16:59:13Z,2,OR Management,Can the OR preference cards include the patien...
2,GRH-I-44,Most recent data displayed within the portal,MyCareCorner,Brandon Cooper,Open for Comment,2021-08-10T17:43:09Z,3,,Showing the most recent available date so that...
3,DFE-I-16,Prioritize escribe request,Medication Management,Brent Runkle,Open for Comment,2022-05-17T15:42:48Z,5,,providers need to have the ability to prioriti...
4,DFE-I-78,340B: Capture actual NDC that is scanned with ...,"Medication Management,TruBridge EHR - Financial",Brent Runkle,Open for Comment,2022-05-18T15:44:17Z,22,,340B/HRSA requires documentation of actual NDC...
...,...,...,...,...,...,...,...,...,...
2005,DFE-I-2037,Omit vs Override reasons > OTHER to prompt for...,Medication Management,Default (Unassigned),Needs Review,2024-10-09T12:48:58Z,4,,"When on the EMAR: 1. If a med is omitted, the ..."
2006,DFE-I-2038,Communication Center Fax Preview,Communication Center,Default (Unassigned),Needs Review,2024-10-09T13:18:52Z,1,,It would be great to have a Fax preview panel ...
2007,DFE-I-2039,Communication Center Faxing forward fax to ano...,Communication Center,Default (Unassigned),Needs Review,2024-10-09T13:22:01Z,1,,When you have the fax open and are looking at ...
2008,DFE-I-2040,Patient identifier on Physician Order Data Rep...,Clinical Report,Default (Unassigned),Needs Review,2024-10-09T15:27:27Z,1,,This is a great report to follow up on any ord...


In [49]:
df_stop_test = df3['description']


In [62]:
df_stop_test


0       EWS preference cards are antiquated so the abi...
1       Can the OR preference cards include the patien...
2       Showing the most recent available date so that...
3       providers need to have the ability to prioriti...
4       340B/HRSA requires documentation of actual NDC...
                              ...                        
2005    When on the EMAR: 1. If a med is omitted, the ...
2006    It would be great to have a Fax preview panel ...
2007    When you have the fax open and are looking at ...
2008    This is a great report to follow up on any ord...
2009    Historically there has always been an inconsis...
Name: description, Length: 2010, dtype: object

In [50]:
docs2 = df_stop_test.tolist()
docs2[0]



'EWS preference cards are antiquated so the ability to search and do patient charges from applications. Also the ability to choose multiple cards for a case that will cross reference so there are not duplicate charges.'

In [51]:
umap_args = { 'n_neighbors' : 10,
              'n_components' : 2,
              'metric' : 'cosine',
              'random_state' : 42,
            }

hdbscan_args = {   
                'min_cluster_size' : 10,
                'min_samples' : 5,
                'max_cluster_size' : 20,
                'metric' : 'euclidean',
                'cluster_selection_method': 'eom'
}

model3 = Top2Vec(docs2,
                embedding_model = 'doc2vec',
                speed = 'deep-learn',
                workers = 4,
                umap_args = umap_args,
                hdbscan_args = hdbscan_args
                )
model3.get_num_topics()


2024-10-13 10:23:51,470 - top2vec - INFO - Pre-processing documents for training
2024-10-13 10:23:52,027 - top2vec - INFO - Creating joint document/word embedding
2024-10-13 10:25:15,671 - top2vec - INFO - Creating lower dimension embedding of documents
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
2024-10-13 10:25:27,895 - top2vec - INFO - Finding dense areas of documents
2024-10-13 10:25:27,990 - top2vec - INFO - Finding topics


33

In [52]:
topic_sizes3, topic_nums3 = model3.get_topic_sizes()
model3.get_topic_sizes()


(array([91, 83, 80, 80, 80, 78, 76, 74, 73, 73, 69, 68, 68, 66, 63, 62, 60,
        59, 58, 54, 54, 52, 52, 51, 50, 50, 48, 48, 45, 42, 42, 35, 29],
       dtype=int64),
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32],
       dtype=int64))

In [54]:
topic_words3, word_scores3, topic_nums3 = model.get_topics(43)


In [55]:
for words, scores, num in zip(topic_words3, word_scores3, topic_nums3):
    print(num)
    print(f"words: {words}")


0
words: ['order' 'patient' 'scheduled' 'schedule' 'nurse' 'hospital' 'radiology'
 'nursing' 'clinical' 'medical' 'physician' 'multiple' 'future' 'clinic'
 'prescription' 'prior' 'require' 'setting' 'needed' 'column' 'type'
 'already' 'easier' 'required' 'cannot' 'time' 'line' 'system'
 'demographic' 'well' 'nice' 'easily' 'many' 'set' 'address' 'better'
 'need' 'great' 'management' 'request' 'list' 'staff' 'select' 'right'
 'client' 'correct' 'number' 'twc' 'history' 'pharmacy']
1
words: ['nurse' 'patient' 'clinical' 'nursing' 'hospital' 'medical' 'clinic'
 'physician' 'date' 'history' 'record' 'radiology' 'time' 'schedule'
 'documentation' 'document' 'hour' 'needed' 'easily' 'need' 'scheduled'
 'data' 'column' 'well' 'health' 'chart' 'easier' 'already' 'scan'
 'currently' 'current' 'update' 'log' 'better' 'require' 'year' 'scanned'
 'great' 'nice' 'required' 'helpful' 'day' 'demographic' 'could' 'right'
 'check' 'cannot' 'registration' 'multiple' 'evident']
2
words: ['patient' 'nurse

In [56]:
documents, document_scores, document_ids = model3.search_documents_by_topic(topic_num=3, num_docs=80)
for doc, score, doc_id in zip(documents, document_scores, document_ids):
    print(f"Document: {doc_id}, Score: {score}")
    print("-----------")
    print(doc)
    print("-----------")
    print()


    
    

Document: 1091, Score: 0.5925801396369934
-----------
I would like to be able to pin or make a list of the situations that I am keeping track of without having to search for them everytime I need to look at it.
-----------

Document: 1136, Score: 0.5917020440101624
-----------
This will save me time when searching to add a new problem. Also, will have the refined ICD-10 instead of searching each time through problems. This would go along with the DFE-1-692/290
-----------

Document: 1928, Score: 0.5492861270904541
-----------
I am using the comments to update the status of a problem each time like other EMR's do. however, there is not timestamp on these comments and you need to delete old info and type in new info each time.

ideally, would love for the old comments to be visible at least with an expanded view.

but, at the minimum, would love a way to have the comment be time-stamped.
-----------

Document: 1682, Score: 0.5066575407981873
-----------
MYCARECORNER SHOULD GIVE THE OPTIO

In [57]:
documents, document_scores, document_ids = model3.search_documents_by_keywords(keywords=["patient"], num_docs=15)
for doc, score, doc_id in zip(documents, document_scores, document_ids):
    print(f"Document: {doc_id}, Score: {score}")
    print("-----------")
    print(doc)
    print("-----------")
    print()
    
    
  
    

Document: 1659, Score: 0.3585568070411682
-----------
see attached information regarding urgent patient safety issue
-----------

Document: 453, Score: 0.31905072927474976
-----------
Having alert on patient's chart can help staff and physicians know more about a patient's restrictions on medication, discharge, updating insurance next visit. By putting alerts when opening the charts, it helps from having to look back at past visits for urgent details. For example, if a patient was counseled on seeing a specialist for an issue regarding a medication that our provider can no longer provide, when the patient returns and is seen by a different provider, a message will pop up as a precaution in case they are returning for the same issue.
-----------

Document: 144, Score: 0.2927452027797699
-----------
This would be HUGE for patient safety! Most titration medications are high risk meds.
-----------

Document: 926, Score: 0.2715892493724823
-----------
Request to update whiteboard person pro