In [None]:
# Setting the environment and libraries
#[cmd]python -m venv pubmedreq
#[cmd] .\pubmedreq\Scripts\activate
#%pip install requests
#%pip install pandas
#%pip install eutils


##### Step 1-1: Getting data from pubmed
How to work with E-utils: 
- https://www.ncbi.nlm.nih.gov/books/NBK25497/
- https://www.youtube.com/watch?v=BCG-M5k-gvE

- E-utils: https://www.nlm.nih.gov/oet/ed/insidersguide/welcome-to-e-utilities.html
- https://readthedocs.org/projects/eutils/downloads/pdf/stable/#:~:text=eutils%20is%20a%20Python%20package,using%20their%20E%2Dutilities%20interface.&text=You%20are%20encouraged%20to%20browse%20issues. 
- https://www.youtube.com/watch?v=iCFVVexp30o

[2024Feb] Note: E-utils has been updated 

In [None]:
# v1: transforming normal query to formatted query DONE >> Get search results (esearch) DONE >> get the data (efetch) DONE >> Store in database (epost) DONE 
import urllib.parse
from typing import Tuple
from math import nan
import requests
import xml.etree.ElementTree as ET
import pickle
import xml.etree.ElementTree as ET



def format_pubmed_query(query: str) -> str:
    """
    Format a free-text query so it can be placed in an E-utilities URL.

    Every character is handled on its own: a space becomes ``+``, a double
    quotation mark becomes ``%22`` and a hash sign becomes ``%23``. All other
    characters are lower-cased and kept as they are (no further URL-encoding
    is applied). The formatted query is also printed.

    Parameters:
    query (str): The string to be formatted.

    Returns:
    str: The formatted query string.

    Examples:
    >>> format_pubmed_query('cancer biology')
    The formatted query::: cancer+biology
    'cancer+biology'
    >>> format_pubmed_query('human genetics')
    The formatted query::: human+genetics
    'human+genetics'
    """

    transform_dic = {
        ' ': '+',
        '"': '%22',
        '#': '%23',
    }

    # Characters without a replacement are lower-cased one by one.
    formatted_query = "".join(
        transform_dic.get(char, char.lower()) for char in query
    )

    print(f'The formatted query::: {formatted_query}')
    return formatted_query

def validate_download_mode(download_mode:str) -> str :
    """
    Check that the requested download mode is one of the supported values.

    Args:
    download_mode: the download mode defined by the user ('full' or 'summary').

    Returns:
    download_mode: the same value, unchanged, when it is supported.

    Raises:
    ValueError: when download_mode is neither 'full' nor 'summary'.
    """
    supported_modes = ('full', 'summary')

    if download_mode in supported_modes:
        return download_mode

    raise ValueError("Invalid download_mode. Please choose between 'full' or 'summary'.")
def validate_retmode_rettype_basedon_db(ret_mode:str,ret_type:str, db:str) -> Tuple[str, str]:
    """
    Check a (ret_type, ret_mode) pair against the pairs known for a database.

    Args:
    ret_mode (str): The return mode to validate (compared case-insensitively).
    ret_type (str): The return type to validate.
    db (str): The database to base the validation on.

    Returns:
    Tuple[str, str]: The lower-cased return mode and the unchanged return type.
    For a database without a validation table the pair is returned unchecked
    after printing a warning.

    Raises:
    ValueError: when the database has a validation table and the pair is not in it.
    """
    # TODO: only pubmed is covered. Complete the table from:
    # https://www.ncbi.nlm.nih.gov/books/NBK25499/table/chapter4.T._valid_values_of__retmode_and/?report=objectonly
    validation_dic = {
        # rettype -> retmode
        'pubmed': {
            '': 'xml',
            'medline': 'text',
            'uilist': 'text',
            'abstract': 'text',
        },
    }

    if db not in tuple(validation_dic):
        print('WARNING: The correct validation for your database is not complete. Add this to the validation_dic or use it carefully.')
        return ret_mode.lower(), ret_type

    normalized_mode = ret_mode.lower()
    pair_is_known = any(
        (ret_type, normalized_mode) == known_pair
        for known_pair in validation_dic[db].items()
    )
    if not pair_is_known:
        raise ValueError(f"Invalid ret_type ('{ret_type}') and ret_mode ('{ret_mode}') pair for your database. Please choose the current pair based on your database (See here: https://www.ncbi.nlm.nih.gov/books/NBK25499/table/chapter4.T._valid_values_of__retmode_and/?report=objectonly).")

    return normalized_mode, ret_type
def get_UID_from_search(formatted_query:str, db:str):
    """
    Run an ESearch with ``usehistory=y`` and return the history handles.

    Args:
    formatted_query: the query text, already formatted for use inside a URL.
    db: the name of the database to search.

    Returns:
    A tuple ``(response, web, key)``: the ``requests`` response object, the
    WebEnv string and the QueryKey string found in the reply.

    Raises:
    requests.HTTPError: when the server answers with an HTTP error status.
    ValueError: when the reply contains no WebEnv or QueryKey element
        (for example because the search itself failed).
    """
    base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'

    #assemble the esearch URL
    url = base + "esearch.fcgi?db=" + db + "&term=" + formatted_query + "&usehistory=y"
    print(f"The esearch url is: {url}")
    #post the esearch URL
    response = requests.get(url)
    # Fail early on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()

    #parse WebEnv and QueryKey
    root = ET.fromstring(response.content)
    web = root.findtext('WebEnv')
    key = root.findtext('QueryKey')
    count = root.findtext('Count')
    if web is None or key is None:
        error_text = root.findtext('ERROR')
        raise ValueError(
            "The esearch reply has no WebEnv/QueryKey, so the results cannot be fetched."
            + (f" Server message: {error_text}" if error_text else "")
        )
    print(f'The count of retrieved objects using esearch::: {count}')
    return response, web, key


def get_data_from_UID(web, key, 
                        download_mode:str='full', #full or summary
                        db:str = 'pubmed', 
                        ret_mode: str='XML',
                        ret_type:str ='',):
    """
    Download the records of a previous history search (WebEnv + QueryKey).

    Args:
    web: the WebEnv string returned by the search step.
    key: the QueryKey string returned by the search step.
    download_mode: 'summary' calls esummary, 'full' calls efetch.
    db: the name of the database the search was run against.
    ret_mode: the return mode; validated and, for efetch, sent as ``retmode``.
    ret_type: the return type; validated and, for efetch, sent as ``rettype``
        when it is not empty.

    Returns:
    The body of the server reply as text.

    Raises:
    ValueError: when download_mode or the ret_mode/ret_type pair is invalid.
    requests.HTTPError: when the server answers with an HTTP error status.
    """
    
    download_mode=validate_download_mode(download_mode)
    ret_mode, ret_type=validate_retmode_rettype_basedon_db(ret_mode, ret_type , db)
    base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
    history_params = "?db=" + db + "&query_key=" + key + "&WebEnv=" + web
    
    if download_mode == 'summary':
        #assemble the esummary URL
        url = base + "esummary.fcgi" + history_params
        print(f"The esummary url is: {url}")
    else:
        #assemble the efetch URL #for more detail see this page: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch or this table: https://www.ncbi.nlm.nih.gov/books/NBK25499/table/chapter4.T._valid_values_of__retmode_and/?report=objectonly
        url = base + "efetch.fcgi" + history_params
        # The validated return mode/type must be sent, otherwise the caller's
        # choice has no effect and the server default is used.
        if ret_mode:
            url += "&retmode=" + ret_mode
        if ret_type:
            url += "&rettype=" + ret_type
        print(f"The efetch url is: {url}")

    #post the URL
    response = requests.get(url)
    # Do not hand an HTTP error page back to the caller as if it were data.
    response.raise_for_status()
    return response.text

def save_cache(cache, cache_file_path):
    """Pickle ``cache`` into ``cache_file_path`` (overwriting the file) and print a confirmation."""
    with open(cache_file_path, mode='wb') as cache_file:
        pickle.dump(cache, cache_file)
        confirmation = f'Updated cache saved at {cache_file_path}'
        print (confirmation)

def load_cache(cache_file_path):
    """
    Read a pickled cache from ``cache_file_path``.

    Returns the unpickled object, or None (after printing a notice) when the
    file does not exist.

    NOTE: unpickling executes code embedded in the file, so only load cache
    files that were written by this notebook.
    """
    try:
        with open(cache_file_path, mode='rb') as cache_file:
            print("Cache loaded.")
            stored_cache = pickle.load(cache_file)
            return stored_cache
    except FileNotFoundError:
        print("No previously saved cache was found. Initializing cache.")
        return None
    
def searchquery_cache(query:str, cache):
    """
    Look up ``query`` in ``cache``.

    Returns the cached value when the query is present; otherwise returns the
    marker string 'new_query'. A message is printed in both cases.
    """
    if query not in cache:
        # Cache miss: signal it with the marker string.
        print("Query not found in cache. Returning 'new_query'.")
        return 'new_query'

    # Cache hit.
    print("Loading data from cache...")
    return cache[query]

def handler_query(query:str, cache_file_path='cache.pkl', overwrite_cache:bool=False,
                         download_mode:str ='full' , db:str = 'pubmed', ret_mode: str='xml',ret_type:str =''):
    """
    Return the documents for ``query``, using a pickle file as a cache.

    The cache is loaded from ``cache_file_path``. When it already holds a
    non-empty result for ``query`` and ``overwrite_cache`` is False, that
    result is returned. Otherwise the query is formatted, searched and
    downloaded, the result is stored under ``query`` and the cache file is
    saved again.

    Args:
    query: the raw (unformatted) search query; it is also the cache key.
    cache_file_path: path of the pickle file holding the cache.
    overwrite_cache: when True, a cached result for this query is ignored and
        replaced by a fresh download. Other cached queries are kept.
    download_mode, db, ret_mode, ret_type: passed on to the download step.

    Returns:
    The downloaded (or cached) documents.
    """
    # NOTE(review): the cache key is the query text only, so a result cached
    # with another db/download_mode/ret_mode is returned as-is — confirm that
    # this is intended.
    #loading cache
    cache = load_cache(cache_file_path) or {}

    docs = searchquery_cache(query, cache)
    # searchquery_cache signals a miss with the (truthy) string 'new_query',
    # so it has to be compared explicitly; a plain truthiness test would hand
    # the marker back to the caller as if it were data.
    if docs and docs != 'new_query':
        if not overwrite_cache:
            return docs
        print('Previous query in the cache found, but I ignored it due to overwrite_cache=True')

    formatted_query = format_pubmed_query(query)

    response, web, key = get_UID_from_search(formatted_query, db)
    docs = get_data_from_UID(web, key, download_mode=download_mode, db = db, ret_mode=ret_mode, ret_type=ret_type)

    # Only this query's entry is replaced; the rest of the cache is preserved.
    cache[query]=docs
    save_cache(cache,cache_file_path)
    return docs


In [None]:
#---------use area ---------------
# Example usage
#query=AI_term= r'''"computer assist*"[Title/Abstract] OR "computer assist*"[Title/Abstract] OR "computer aid*"[Title/Abstract] OR "computer aid*"[Title/Abstract] OR "artificial intelligen*"[Title/Abstract] OR "deep learning*"[Title/Abstract] OR "Machine Learning"[Title/Abstract] OR "Deep learning"[Title/Abstract] OR "neural network*"[Title/Abstract] OR "random forest*"[Title/Abstract] OR "decision tree*"[Title/Abstract] OR "support vector machine*"[Title/Abstract] OR "naive bayes"[Title/Abstract] OR "k nearest neighbor*"[Title/Abstract] OR "Gradient Boosting"[Title/Abstract]'''
def lab_main():
    """Run the example author/date query through handler_query and return the resulting documents."""
    # NOTE(review): the term contains `"Hofer IS\"` with a backslash before the
    # closing quote; it looks like a typo, confirm before changing the query.
    search_term = r'( ( (Nadkarni GN) OR ((("Ira Hofer") OR ("Ira S Hofer") OR ("Hofer IS\") OR ("Hofer IS")) AND (Sinai)) OR (("Lili Chan" OR "Chan L") AND (sinai)) OR (((Ashwin Sawant) OR (AS Sawant)) AND (Sinai)) OR ((Ali Soroush) AND (Sinai)) OR (Duong SQ) OR (Apakama D) OR (Abbott EE) OR ((Vaid A) AND (Sinai)) ) AND ("2018/01/01"[Date - Publication] : "3000"[Date - Publication]) )'

    return handler_query(
        search_term,
        cache_file_path='cache.pkl',
        overwrite_cache=False,
        download_mode='full',  # full or summary
        db='pubmed',
        ret_mode='xml',
        ret_type='',
    )


# Download (or load from cache) the documents used by the following cells.
docs = lab_main()

##### Step 1-2: Cleaning and Structuring data

In [None]:
# v1 turning xml to str, then to dictionary DONE >> clean dataframe with desired information DONE
#[Done >> def xmlstring_to_dic_v2] When I transform XML to a dictionary I am losing some information, such as AbstractText, when it is mentioned under named XML headings.
#[] >> After the previous debug I should change the code, but the way we transform XML to a dictionary is much better and cleaner.
# Added MeSH terms, language, ISSN, and DOI to the extractor
# Note: to understand the amount of data lost using the previous transformation, the size of the dataframe went from 800 MB to 4.5 GB.
import xml.etree.ElementTree as ET
import pandas as pd
from pandas import json_normalize
from IPython.display import Markdown
import random
import pandas
import sys

def xmlstring_to_dic_v2(xml_string):
    """Parse ``xml_string`` and convert its root element with _element_to_dict."""
    parsed_root = ET.fromstring(xml_string)
    converted = _element_to_dict(parsed_root)
    return converted
def _element_to_dict(element):
    if len(element) == 0:
        return element.text

    result = {}
    for child in element:
        child_dict = _element_to_dict(child)
        if child.tag in result:
            if isinstance(result[child.tag], list):
                result[child.tag].append(child_dict)
            else:
                result[child.tag] = [result[child.tag], child_dict]
        else:
            result[child.tag] = child_dict

    return result

#def xml_to_dic_v1(elem):
#    # Gather the element's attributes
#    data_dict = elem.attrib   
#    
#    # Gather the element's children
#    for child in elem:
#        # If element already exists then append to it, otherwise assign a list to it
#        data_dict[child.tag] = data_dict.get(child.tag, [])
#        data_dict[child.tag].append(elem_to_dict(child))
#        
#    # If it has no children, the text data are stored in the dictionary too.
#    if not data_dict:  
#        data_dict = elem.text  
#        
#    return data_dict


def find_selected_article_info_2extract(docs_dic: dict) -> list:
    """
    Interactively choose which 'MedlineCitation' keys should be extracted.

    All keys found under 'MedlineCitation' across the articles are displayed.
    Then, for each key, the values of up to three randomly sampled articles
    are shown and the user answers y/yes to keep the key.

    Args:
    docs_dic: dictionary with a 'PubmedArticle' entry holding one article
        (dict) or several articles (list of dicts).

    Returns:
    list: the keys the user chose to include.
    """
    articles = docs_dic['PubmedArticle']
    if isinstance(articles, dict):
        # A result with a single article is stored as a dict, not a list.
        articles = [articles]

    articles_info_uniquekeys =set() # to store all unique keys in articles
    for article in articles:
        articles_info_uniquekeys.update(article['MedlineCitation'].keys())
    
    display(Markdown(f"All unique keys: {articles_info_uniquekeys}"))
    input("""Press any key after thinking about what information you want to extract from the printed list.
          Afterward, I will ask you to include the information you want to include, one-by-one.""")
    
    # randrange() yields indices 0 .. len-1, so the first article can be
    # sampled and the index is never past the end of the list.
    sample_indices = [random.randrange(len(articles)) for _ in range(3)] if articles else []

    def sample_value(position, info):
        """Return the value of ``info`` in the sampled article, or " " when it is not available."""
        try:
            return articles[sample_indices[position]]['MedlineCitation'][info]
        except (IndexError, KeyError, TypeError):
            return " "

    selected_article_info_2extract=[]
    for info in articles_info_uniquekeys:
        example_1 = sample_value(0, info)
        example_2 = sample_value(1, info)
        example_3 = sample_value(2, info)
            
        examples=f"""
        -------- Example 1: {example_1} --------
        -------- Example 2: {example_2} --------
        -------- Example 3: {example_3} --------
        """

        answer = input(f'''Enter y/yes if you want to include this info, or any other thing to exclude it. -----INFO KEY:  {info} -----
                       Three sample of the infromation in this info header is: {examples}
                       
                       ''')
        if answer.lower() in ['yes', 'y']:
            selected_article_info_2extract.append(info)
            
    print(f"""
            Store this desired list, and re-use it for future runs of this function.
            selected_article_info_2extract = {selected_article_info_2extract}
            """)
    
    return selected_article_info_2extract

def get_info_from_RealArticle(RealArticle_dic: dict):
    """
    Pick the fields of interest out of an 'Article' dictionary.

    Args:
    RealArticle_dic: the dictionary stored under the 'Article' key of a citation.

    Returns:
    A 12-tuple in this order: PubModel, Elocation, Journal_Title, Language,
    Journal_TitleAbbreviation, Journal_ISSN, ArticleTitle, AbstractText,
    Author_affiliation_pair_dic, GrantList, ArticleDate, PublicationTypeList.
    Missing top-level entries default to '' (PubModel to ' '); the three
    journal fields are None when the journal information is missing.
    """
    PubModel=RealArticle_dic.get('PubModel',' ')
    Elocation=RealArticle_dic.get('ELocationID','')
    Language=RealArticle_dic.get('Language','') 
    
    # The journal entry can be absent, empty or not a dict; fall back to an
    # empty dict so the three journal fields are always defined.
    Journal_dic = RealArticle_dic.get('Journal')
    if not isinstance(Journal_dic, dict):
        Journal_dic = {}
    Journal_Title= Journal_dic.get('Title')
    Journal_TitleAbbreviation= Journal_dic.get('ISOAbbreviation')
    Journal_ISSN=Journal_dic.get('ISSN')

    ArticleTitle= RealArticle_dic.get('ArticleTitle','')
    AbstractText=RealArticle_dic.get('Abstract','')

    Author_affiliation_pair_dic = RealArticle_dic.get('AuthorList','') 
    GrantList = RealArticle_dic.get('GrantList','') 
    ArticleDate=RealArticle_dic.get('ArticleDate','')
    PublicationTypeList= RealArticle_dic.get('PublicationTypeList','') 
    
    return PubModel, Elocation,Journal_Title,Language, Journal_TitleAbbreviation,Journal_ISSN, ArticleTitle, AbstractText, Author_affiliation_pair_dic, GrantList, ArticleDate,PublicationTypeList
    
def PubmedArticleSet_dic_extractor(docs_dic, selected_article_info_2extract=None):
    """
    Flatten the selected article information into a DataFrame of strings.

    Args:
    docs_dic: dictionary with a 'PubmedArticle' entry holding one article
        (dict) or several articles (list of dicts), each with a
        'MedlineCitation' dictionary.
    selected_article_info_2extract: the 'MedlineCitation' keys to keep. When
        None, the user is asked interactively.

    Returns:
    pandas.DataFrame: one row per article that has at least one selected key,
    indexed by the article's position. The 'Article' key is expanded into the
    'Article_*' columns; every other selected key becomes a column holding
    ``str(value)``.
    """
    
    if selected_article_info_2extract is None:
        selected_article_info_2extract = find_selected_article_info_2extract(docs_dic)

    articles = docs_dic['PubmedArticle']
    if isinstance(articles, dict):
        # A result with a single article is stored as a dict, not a list.
        articles = [articles]

    # Collect one record per article and build the frame once at the end;
    # growing a DataFrame cell by cell re-allocates it on every new row/column.
    records = {}
    for row, article in enumerate(articles):
        article_info_dic = article['MedlineCitation']
        record = {}
        for key, info_value in article_info_dic.items():
            if key not in selected_article_info_2extract:
                continue

            if key == 'Article':
                PubModel,Elocation, Journal_Title,Language, Journal_TitleAbbreviation,Journal_ISSN, ArticleTitle, AbstractText, \
                    Author_affiliation_pair_dic, GrantList, ArticleDate,PublicationTypeList = get_info_from_RealArticle(info_value)
                record['Article_PubModel'] = str(PubModel)
                record['Article_Language'] = str(Language)
                record['Article_Elocation'] = str(Elocation)
                record['Article_Journal_Title'] = str(Journal_Title)
                record['Article_Journal_TitleAbbreviation'] = str(Journal_TitleAbbreviation)
                record['Article_Journal_ISSN'] = str(Journal_ISSN)

                record['Article_ArticleTitle'] = str(ArticleTitle)
                record['Article_AbstractText'] = str(AbstractText)

                record['Article_Author_affiliation_pair_dic'] = str(Author_affiliation_pair_dic)
                record['Article_GrantList'] = str(GrantList)
                record['Article_ArticleDate'] = str(ArticleDate)
                record['Article_PublicationTypeList'] = str(PublicationTypeList)
            else:
                record[key] = str(info_value)

        if record:
            records[row] = record

    if not records:
        return pd.DataFrame()

    return pd.DataFrame(list(records.values()), index=list(records))
        
# Parse the downloaded XML string into nested Python containers.
docs_dic = xmlstring_to_dic_v2(docs)
# sys.getsizeof() returns bytes, so divide by 1,000,000 to report megabytes.
# NOTE: for a dict this is a shallow size (the contained objects are not counted).
print(f"size of dictionary: {sys.getsizeof(docs_dic)/1_000_000} mb")

#transforming to dataframe and cleaning the dictionary
# NOTE(review): `selected_article_info_2extract` is not defined in the code shown
# above; it is presumably the list printed by find_selected_article_info_2extract
# on an earlier run. Define it (or pass None to be asked interactively) before
# running this cell.
docs_df=PubmedArticleSet_dic_extractor(docs_dic, selected_article_info_2extract)
print(f"size of cleaned dataframe: {sys.getsizeof(docs_df)/1_000_000} mb")

In [None]:
# deeper cleaning for website
import ast
import pandas
from zmq import Errno

def get_clean_doi(liststring:str):
    """
    Build a https://doi.org/ address from a stringified location value.

    ``liststring`` is parsed as a Python literal. When it is a list, the
    second element is used if there is one, otherwise the first. When it is
    not a list (or cannot be parsed), the original value itself is used.

    Args:
    liststring: a plain identifier, or the string form of a list of identifiers.

    Returns:
    str: "https://doi.org/" followed by the chosen identifier. When no usable
    string is found (empty list, non-string value) only the prefix is returned.
    """
    try:
        parsed = ast.literal_eval(liststring)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal: treat the value as a plain identifier.
        parsed = None

    if isinstance(parsed, list):
        if len(parsed) > 1:
            doi = parsed[1]
        elif parsed:
            doi = parsed[0]
        else:
            doi = ''
    else:
        doi = liststring

    # Guard the concatenation below against None/NaN/numbers.
    if not isinstance(doi, str):
        doi = ''

    doi_address="https://doi.org/"+ doi
    return doi_address

def get_clean_abstract(abstractdict_string):
    """
    Extract plain abstract text from a stringified abstract dictionary.

    Args:
    abstractdict_string: the string form of a dict with an 'AbstractText' entry.

    Returns:
    str: the abstract text. A list of sections is joined with spaces, keeping
    only the string items. An empty string is returned when the input cannot
    be parsed, is not a dict, has no 'AbstractText', or the text is neither a
    string nor a list.
    """
    try:
        abstracttext = ast.literal_eval(abstractdict_string)['AbstractText']
    except (ValueError, SyntaxError, KeyError, TypeError):
        # TypeError covers parsed literals that are not dicts (e.g. a number
        # or a list), which cannot be indexed by 'AbstractText'.
        return ""

    if isinstance(abstracttext, list):
        # Several sections: keep the string ones and join them with spaces.
        return ' '.join(part for part in abstracttext if isinstance(part, str))
    if isinstance(abstracttext, str):
        return abstracttext
    return ""

def get_clean_author_list(dict_string):
    """
    Turn a stringified author dictionary into "Last, Fore; Last, Fore; ...".

    ``dict_string`` is parsed as a Python literal and its 'Author' entry is
    read; a single author (dict) is treated like a list of one. For each
    author the fore name is used when present, otherwise the initials.

    Returns:
    str: the formatted author list. On any error the problem is printed and a
    single space is returned.
    """
    try:
        author_entries = ast.literal_eval(dict_string)['Author']

        # A lone author is stored as a dict rather than a list of dicts.
        if isinstance(author_entries, dict):
            author_entries = [author_entries]

        formatted_names = [
            f"{entry.get('LastName', '')}, {entry.get('ForeName', '') or entry.get('Initials', '')}"
            for entry in author_entries
        ]
        return "; ".join(formatted_names)
    except Exception as e:
        print(f"An unexpected error occurred: {e}. \n dict_string: {dict_string}")
        return " "

def get_clean_date(date_string):
    """
    Convert a stringified date dictionary into "Year-Month-Day".

    Args:
    date_string: the string form of a dict with 'Year', 'Month' and 'Day'
        entries. A missing month or day defaults to '01'.

    Returns:
    str: the joined date, or an empty string when the year is missing, a part
    is empty, or the input cannot be parsed into a dict of strings.
    """
    try:
        date_dict=ast.literal_eval(date_string)
        year = date_dict.get('Year', '')
        month = date_dict.get('Month', '01')
        day = date_dict.get('Day', '01')

        if year and month and day:
            return "-".join([year, month, day])
        return ""
    except (ValueError, SyntaxError, TypeError, AttributeError, MemoryError, RecursionError):
        # Unparsable input, a literal that is not a dict, or non-string parts.
        return ""


# Work on a copy so the extracted frame (docs_df) is left untouched.
clean_docs_df=docs_df.copy()
# Drop the rows whose title text starts with '{'. na=False makes a missing
# title count as "does not start with '{'" so the mask stays boolean, and
# .copy() gives an independent frame so the column assignments below do not
# raise SettingWithCopyWarning.
clean_docs_df = clean_docs_df.loc[~clean_docs_df['Article_ArticleTitle'].str.startswith('{', na=False)].copy()

clean_docs_df['clean_doi'] = clean_docs_df['Article_Elocation'].apply(get_clean_doi)
clean_docs_df['clean_abstract'] = clean_docs_df['Article_AbstractText'].apply(get_clean_abstract)
clean_docs_df['clean_authors'] = clean_docs_df['Article_Author_affiliation_pair_dic'].apply(get_clean_author_list)
clean_docs_df['clean_date']=clean_docs_df['Article_ArticleDate'].apply(get_clean_date)
clean_docs_df

##### Step 1-3: Add tags to articles

#v1_Pipeline: Prompt for generating the list [you should send it one by one]

**E1**: my list

**E2**: You are an expert librarian. You are an expert in the field of medical terminology and machine learning terminology. Please give me a list of words that can point to the mentioned field. I want to find relevant articles based on these set of keywords. I will use title and abstract. Return a dictionary like this "mentioned fields": ["relevant word 1", "relevant word 2",...]. After that, carefully review the list and keep the words that are specific and sensitive to this field. Field: Critical Care.


**E3**: You are an expert librarian. You are an expert in the field of medical terminology and machine learning terminology. I want to search for the existence of keywords in a field in the title/abstract of scientific articles. I will use regex in python. Please enrich (add) more equivalent terms that I should look for. Here are two examples:
When searching "Cardiovascular Disease" I will miss "cardiovascular Diseases" so I should change "Cardiovascular Disease" to "Cardiovascular Disease*".
When searching for "Acute Kidney Injury (AKI)" I will miss articles that has "acute kidney injury" so I should change this to "Acute Kidney Injury" and " AKI " (with spaces before and after it to make sure I will not include a word with aki at the middle e.g. "akin"). I will give you the initial list, and you return the revised list.   

**E4**: I want to use this list to find an existence of a keyword in a text and classify the text. The text is the concatenated text of title and abstract of a scientific paper. The keyword is a major research topic or medical subspecialty. For each keyword I prepared a list of words that if they exist, I will understand that this article is related to my keyword (research topic or medical subspecialty). I am using regex for this purpose. I want you to revise the provided list and I want to have the best specific and sensitive list related to the keyword. carefully revisit the term list to: 1- add or remove terms 2- correct the conditions (OR or *), considering that I am using regex. 3- shorten the terms to capture more accurate text classification. Your only output should be the best possible corrected dictionary.   The initial dictionary: 

**E5**: look through text to:
- Remove un-related terms
- Remove repetitive words (e.g. cardiovascular disease when cardiovascular is in the list)
- If necessary, add \\\bsomething\\\b where needed to avoid capturing unwanted words (e.g. capturing "akin" when looking for "AKI")

**E6**: running the loop and getting the un-tagged results, and manually adding terms

#v2_Pipeline: All-in-One-Prompt [preferably should be run on GPT-4]

**E_allinone**

---
**Your role:** You are an expert librarian with profound knowledge in medical terminology and machine learning jargon.

**Objective**: Create a refined keyword dictionary for accurate text classification, targeting the research topic of "XXXXX." Utilize PubMed articles, focusing on titles and abstracts, and employ Python regex for sophisticated text search and matching.

**Context:** I require your assistance in constructing a text classification pipeline. The objective is to systematically follow each step, designated as E, to establish a dictionary of keywords relevant to a specified TOPIC, which is a significant research domain or medical subspecialty. I have previously gathered articles from PubMed. The text classification is performed using Python's regex capabilities.

**Initial Keyword Generation (E1):** Provide a catalog of specific words indicative of the mentioned TOPIC. This compilation will serve as the foundation for identifying pertinent articles. 
- Output: A dictionary format, {"XXXXX": ["keyword 1", "keyword 2", ...]}.

**Keyword Enrichment with Regex Patterns (E2):** Revise the term list to add/remove terms, correct conditions (using OR, *, etc.), and shorten terms for more accurate classification.
- Examples: Adjust "Cardiovascular Disease" to "Cardiovascular Disease*" to include plural forms, and "Acute Kidney Injury (AKI)" to include both "Acute Kidney Injury" and spaced " AKI " for precision.
- Output: An updated keyword dictionary with added terms and regex patterns.

**Term List Optimization (E3):** 
1. Incorporate or eliminate terms as deemed necessary.
2. Rectify conditions (OR or *), considering the regex framework.
3. Condense terms to facilitate more accurate text classification. Output only the optimized dictionary.
- Output: The most refined version of the keyword dictionary.

**E4:** Scrutinize the text to:
- Eliminate irrelevant or overly general terms.
- Remove redundant terms (e.g., "cardiovascular disease" when "cardiovascular" is already included).
- Append pertinent terms (shortened versions), if warranted.
- For terms with less than 5 strings, integrate \\b into the term, like \\bTERM\\b. 
- Finalize the dictionary by ensuring it captures the essence of "XXXXX" accurately and efficiently.


**TOPIC:** "time series"

---
**Your role:** You are an expert librarian with profound knowledge in medical terminology and machine learning jargon.

**Objective**: Create a refined keyword dictionary for accurate text classification, targeting the research topic of "XXXXX." Utilize PubMed articles, focusing on titles and abstracts, and employ Python regex for sophisticated text search and matching.

**Context:** I require your assistance in constructing a text classification pipeline. The objective is to systematically follow each step, designated as E, to establish a dictionary of keywords relevant to a specified TOPIC, which is a significant research domain or medical subspecialty. I have previously gathered articles from PubMed. The text classification is performed using Python's regex capabilities.

**Initial Keyword Generation (E1):** Provide a catalog of specific words indicative of the mentioned TOPIC. This compilation will serve as the foundation for identifying pertinent articles. 
- Output: A dictionary format, {"XXXXX": ["keyword 1", "keyword 2", ...]}.

**Keyword Enrichment with Regex Patterns (E2):** Revise the term list to add/remove terms, correct conditions (using OR, *, etc.), and shorten terms for more accurate classification.
- Examples: Adjust "Cardiovascular Disease" to "Cardiovascular Disease*" to include plural forms, and "Acute Kidney Injury (AKI)" to include both "Acute Kidney Injury" and spaced " AKI " for precision.
- Output: An updated keyword dictionary with added terms and regex patterns.

**Term List Optimization (E3):** 
1. Incorporate or eliminate terms as deemed necessary.
2. Rectify conditions (OR or *), considering the regex framework.
3. Condense terms to facilitate more accurate text classification. Output only the optimized dictionary.
- Output: The most refined version of the keyword dictionary.

**E4:** Scrutinize the text to:
- Eliminate irrelevant or overly general terms.
- Remove redundant terms (e.g., "cardiovascular disease" when "cardiovascular" is already included).
- Append pertinent terms (shortened versions), if warranted.
- For terms with less than 5 strings, integrate \\b into the term, like \\bTERM\\b. 
- Finalize the dictionary by ensuring it captures the essence of "XXXXX" accurately and efficiently.

**Example of desired output:**
"Cardiology": [
        "cardiovascular (disease|diseases?)", "heart (disease|diseases?)", "\\bMI\\b", 
        "angina", "coronary artery disease", "heart failure", "arrhythmia", 
        "\\bAF\\b", "cardiac (surgery|catheterization)", "stent", 
        "angioplasty", "electrocardiogram", "echocardiography", "hypertension", "stroke", 
        "cardiomyopathy", "peripheral arterial disease", "valvular heart disease", 
        "(acute|chronic) (heart failure|MI)", "\\b(PCI|CABG|ECG|EKG)\\b", 
        "\\b(PAD|VHD)\\b"
    ]



**TOPIC:** "time series"



[**NEXT**: running the loop and getting the un-tagged results, and manually adding terms]


#v3 left terms looking manually

Cardiac Arrest
coronary artery disease
Cardiometabolic
Peripheral Artery Disease

acute kidney injury
Chronic Kidney Disease
Nephrology
kidney function
"\\bAKI"
renal
Albuminuria
Hemodialysis
Glomerulopathy
Genetically
Kidney Disease
Glomerular


Machine Learning

postoperative
Preoperative
reoperation
Bariatric Surgery
Readmission
Fracture

Natural Language Processing

Epigenetic
Genome-Wide
gene expression
Proteomic
Whole-genome sequencing
genotype
Polygenic
gene expression
Genetic pleiotropy
Phenome-Wide
Whole genome sequence

Respiratory: asthma

COVID-19, SARS-CoV-2, Vaccinees

Articles with no good term

Discrepancy between predicted and measured exercise intensity for eliciting the maximal rate of lipid oxidation.
Epidemiology of Cutaneous Lupus Erythematosus Among Adults Over Four Decades (1976-2018): A Lupus Midwest Network (LUMEN) Study.
Rheumatoid Arthritis, Cognitive Impairment, and Neuroimaging Biomarkers: Results from the Mayo Clinic Study of Aging.
A Platform for Designing and Conducting Innovative Digital N-of-1 Trials.
Rising incidence and prevalence of systemic lupus erythematosus: a population-based study over four decades.
Multi-ancestry genetic study of type 2 diabetes highlights the power of diverse populations for discovery and translation.
Genetic discovery and risk characterization in type 2 diabetes across diverse populations.




In [None]:
#tags_dic

# Topic labels used for tagging articles. Most of these labels also appear as
# keys of the tags_dic_* dictionaries defined in this cell.
# Stored as a set: only membership matters, ordering carries no meaning.
topics = {
    # Clinical domains
    "Cardiology", "Gastroenterology", "Nephrology",
    "Critical Care", "Post-operative Care",
    # Methods / techniques
    "Machine Learning", "Deep Learning", "Recurrent Learning",
    "LLM-NLP", "Machine Vision", "Signal Processing",
    "Bioinformatics", "time series",
}

# First experiment (E1): topic label -> short list of keywords/synonyms.
# The NLP key is spelled "LLM-NLP" so that it matches the label used in the
# `topics` set and in tags_dic_E2; the previous spelling "LLM + NLP" could not
# be found when looking a topic up by its canonical label.
# NOTE(review): there is no entry for "Bioinformatics" or "time series" here,
# so code that indexes this dict with every element of `topics` should use
# .get() or handle the missing keys -- confirm whether E1 is still in use.
tags_dic_E1 = {
    "Cardiology": ["Heart", "Cardiac", "Cardiovascular", "Coronary"],
    "Gastroenterology": ["Gastroenterology", "Digestive", "Gastrointestinal", "Stomach", "Liver", "Pancreatic"],
    "Nephrology": ["Kidney", "Renal", "Urology", "Dialysis", "Chronic kidney disease"],
    "Critical Care": ["Intensive Care", "ICU", "Critical Health", "Emergency Medicine"],
    "Post-operative Care": ["Post-surgery Care", "Recovery Care", "Surgical Aftercare", "Rehabilitation"],
    "LLM-NLP": ["Language Models", "Natural Language Processing", "Text Mining", "Semantic Analysis"],
    "Machine Vision": ["Computer Vision", "Image Analysis", "Echo", "Computed tomography", "CT scan","CT image", "MRI", "Ultrasound", "Radiography", "PET"],
    "Recurrent Learning": ["Recurrent Neural Networks", "RNN", "LSTM", "Feedback Networks", "Sequence Analysis"],
    "Machine Learning": ["ML", "Artificial Intelligence", "AI", "Statistical Learning", "Predictive Modeling"],
    "Deep Learning": ["Neural Networks", "Deep Neural Networks", "DNN", "Convolutional Neural Networks", "CNN", "GANs"],
    "Signal Processing": ["Digital Signal Processing", "DSP", "ICP", "ECG", "EEG", "Biomedical Signals", "Biomedical Imaging"],
}


# Second experiment (E2): topic label -> long list of descriptive terms.
# The dictionary is filled one topic at a time; insertion order is the same
# as the order of the assignments below.
# NOTE(review): only the first three Cardiology terms end in "*". How "*" is
# interpreted depends on the matching code, which is not visible here --
# confirm whether the mixed use of wildcards is intentional.
tags_dic_E2 = {}

# ---------------------------------------------------------------- Cardiology
tags_dic_E2["Cardiology"] = [
    "Cardiolog*",  # The primary field of study
    "Cardiovascular Disease*",  # Diseases involving the heart or blood vessels
    "Coronary Artery Disease*",  # A major disease category within cardiology
    "Heart Failure",  # A key condition treated by cardiologists
    "Arrhythmia",  # Includes various types of irregular heartbeat
    "Electrocardiography",  # A diagnostic tool in cardiology
    "Echocardiography",  # Ultrasound of the heart, a main diagnostic technique
    "Cardiac Catheterization",  # Procedure for diagnosing and treating cardiovascular conditions
    "Percutaneous Coronary Intervention",  # Non-surgical treatment of coronary artery disease
    "Stent Placement",  # A common treatment for coronary artery disease
    "Cardiac Surgery",  # Bypass surgery and other heart-related surgical procedures
    "Heart Transplant",  # Replacing a diseased heart with a healthy one
    "Hypertension",  # High blood pressure, a major cardiovascular risk factor
    "Myocardial Infarction",  # Scientific term for a heart attack
    "Atrial Fibrillation",  # A common type of arrhythmia
    "Cardiac Imaging",  # CT, MRI, and other imaging technologies specific to cardiology
    "Vascular Health",  # The circulatory system outside of the heart
    "Angiography",  # Imaging of blood vessels, used for diagnosing heart diseases
    "Heart Valve Disease",  # Diseases involving one or more of the heart valves
    "Cardiac Rehabilitation",  # Programs for recovering heart disease patients
    "Preventive Cardiology",  # Focused on preventing heart diseases
    "Lipidology",  # Study of blood lipids and their relation to heart disease
    "Thrombosis",  # Blood clots that can lead to heart attack or stroke
    "Cardiomyopathy",  # Diseases of the heart muscle
    "Congenital Heart Disease",  # Heart abnormalities present from birth
    "Heart Rhythm Disorders",  # Conditions affecting the heart's rhythm
    "Interventional Cardiology",  # Subspecialty focusing on catheter-based treatments
]

# ---------------------------------------------------------- Gastroenterology
tags_dic_E2["Gastroenterology"] = [
    "Gastroenterology",  # The primary field of study
    "Gastrointestinal Diseases",  # Broad term for diseases of the GI tract
    "Inflammatory Bowel Disease (IBD)",  # Including Crohn's disease and Ulcerative Colitis
    "Irritable Bowel Syndrome (IBS)",  # A common disorder affecting the large intestine
    "Hepatology",  # Study of liver, gallbladder, biliary tree, and pancreas
    "Endoscopy",  # Diagnostic procedure widely used in gastroenterology
    "Colonoscopy",  # A type of endoscopy of the colon
    "Gastroesophageal Reflux Disease (GERD)",  # Chronic digestive disease
    "Peptic Ulcer Disease",  # Ulcers of the stomach or duodenum
    "Celiac Disease",  # An immune reaction to eating gluten
    "Gastrointestinal Cancer",  # Includes colorectal, gastric, liver cancers, etc.
    "Hepatitis",  # Inflammation of the liver
    "Liver Transplantation",  # Surgical option for liver failure or disease
    "Pancreatitis",  # Inflammation of the pancreas
    "Gallstones",  # Hardened deposits in the gallbladder
    "Gastrointestinal Bleeding",  # Bleeding in the GI tract
    "Gastrointestinal Endoscopy",  # Procedure to examine the GI tract
    "Esophageal Diseases",  # Diseases affecting the esophagus
    "Barrett's Esophagus",  # Condition related to GERD
    "Helicobacter pylori Infection",  # Common cause of peptic ulcers
    "Colitis",  # Inflammation of the inner lining of the colon
    "Diverticulitis",  # Inflammation or infection of diverticula in the colon
    "Fecal Microbiota Transplantation (FMT)",  # Treatment for certain GI conditions
    "GI Motility Disorders",  # Problems with movement in the GI tract
    "Capsule Endoscopy",  # Tiny wireless camera that photographs the digestive tract
    "Biliary Tract Disorders",  # Disorders of bile ducts, gallbladder, and associated structures
    "Nutritional Gastroenterology",  # Study of nutrition's impact on GI diseases
]

# ---------------------------------------------------------------- Nephrology
tags_dic_E2["Nephrology"] = [
    "Nephrology",  # The primary field of study
    "Chronic Kidney Disease (CKD)",  # Long-term kidney damage and decreased function
    "Acute Kidney Injury (AKI)",  # Sudden loss of kidney function
    "End-Stage Renal Disease (ESRD)",  # Final stage of chronic kidney disease
    "Kidney Transplantation",  # Placing a healthy donor kidney into a person with ESRD
    "Dialysis",  # Removing waste products and excess fluid when the kidneys stop working
    "Hemodialysis",  # A type of dialysis using an artificial kidney machine
    "Peritoneal Dialysis",  # Dialysis where the lining of the abdomen filters the blood
    "Renal Biopsy",  # Removing kidney tissue for laboratory examination
    "Glomerulonephritis",  # Inflammation of the tiny filters in the kidneys (glomeruli)
    "Polycystic Kidney Disease (PKD)",  # Genetic disorder with numerous cysts in the kidneys
    "Nephrotic Syndrome",  # Disorder causing excretion of too much protein in the urine
    "Electrolyte Imbalances",  # Disturbances in the levels of electrolytes in the blood
    "Hypertension and Kidneys",  # High blood pressure related to kidney function
    "Renal Pharmacology",  # Study of how drugs affect the kidney and its functions
    "Kidney Stones",  # Solid masses made of crystals that originate in the kidneys
    "Renal Pathology",  # Study of kidney diseases at the microscopic level
    "Uremia",  # Accumulation of waste products in the blood
    "Anemia in Chronic Kidney Disease",  # Common CKD complication: decrease in red blood cells
    "Renal Replacement Therapy (RRT)",  # Treatments to replace the function of failing kidneys
    "Kidney Function Tests",  # Tests to check how well the kidneys are working
    "Proteinuria",  # Presence of excess proteins in the urine
    "Hematuria",  # Presence of blood in the urine
    "Renal Nutrition",  # Dietary management for kidney disease patients
    "Vascular Access for Dialysis",  # Methods to access the bloodstream for hemodialysis
    "Renal Genetics",  # Study of genetic factors in kidney diseases
    "Diabetic Nephropathy",  # Kidney damage resulting from diabetes
]

# ------------------------------------------------------------- Critical Care
tags_dic_E2["Critical Care"] = [
    "Critical Care",  # The primary field of study
    "Intensive Care Medicine",  # Another term for Critical Care
    "Mechanical Ventilation",  # Life support technique used in critical care
    "Acute Respiratory Distress Syndrome (ARDS)",  # A severe lung condition requiring intensive care
    "Sepsis",  # A life-threatening response to infection treated in ICUs
    "Septic Shock",  # A severe and deadly form of sepsis
    "Cardiac Arrest",  # Sudden, unexpected loss of heart function
    "Resuscitation",  # Revival techniques for cardiac arrest or near-death conditions
    "Critical Care Nursing",  # Specialized nursing for critically ill patients
    "Trauma Care",  # Care for critically injured patients
    "Life Support",  # Medical interventions to support vital organ functions
    "Intensive Care Unit (ICU)",  # Hospital unit for critically ill patients
    "Ventilator-associated Pneumonia (VAP)",  # Pneumonia in people using mechanical ventilators
    "Extracorporeal Membrane Oxygenation (ECMO)",  # Cardiac and respiratory support for critically ill patients
    "Invasive Monitoring",  # Monitoring involving the insertion of devices into the body
    "Shock",  # A critical condition of decreased blood flow to organs
    "Multi-organ Failure",  # Failure of multiple organ systems, often seen in critical care
    "Neurocritical Care",  # Critical care of patients with neurological injuries or conditions
    "Critical Care Pharmacology",  # Study of drug therapy for critically ill patients
    "Palliative Care in ICU",  # Relief from the symptoms and stress of serious illness in the ICU
    "Fluid Resuscitation",  # Administration of fluids to critically ill patients
    "Blood Product Transfusion",  # Transfusion of blood products in critical care settings
    "Nosocomial Infections",  # Infections acquired in hospital settings, significant in ICU
    "Acute Kidney Injury (AKI) in ICU",  # Kidney failure that occurs in critically ill patients
    "Critical Care Outcomes",  # Research and metrics on patient outcomes in critical care
    "Advanced Life Support (ALS)",  # Life-saving protocols and skills that extend Basic Life Support
]

# ------------------------------------------------------- Post-operative Care
tags_dic_E2["Post-operative Care"] = [
    "Post-operative Care",  # Direct care provided after surgery
    "Post-surgical Recovery",  # Process of recovery following surgery
    "Surgical Wound Care",  # Management of surgical incisions and wounds
    "Post-operative Complications",  # Complications that may arise after surgery
    "Pain Management after Surgery",  # Techniques and medications to manage post-surgical pain
    "Rehabilitation after Surgery",  # Rehabilitative processes following surgical procedures
    "Post-operative Infections",  # Infections that can occur after surgery
    "Wound Infection",  # Infection occurring at the site of a surgical wound
    "Deep Vein Thrombosis (DVT) Prevention",  # Measures to prevent blood clots after surgery
    "Pulmonary Embolism Prevention",  # Strategies to prevent lung blood clots post-surgery
    "Post-operative Mobility",  # Encouraging movement to aid in recovery after surgery
    "Post-operative Nutrition",  # Nutritional support for surgical recovery
    "Surgical Drain Management",  # Handling of drains inserted during surgery
    "Post-operative Physiotherapy",  # Physical therapy to aid in recovery after surgery
    "Venous Thromboembolism (VTE) Prophylaxis",  # Prevention of thrombosis after surgery
    "Incisional Care",  # Care specific to surgical incisions
    "Post-anesthesia Care Unit (PACU)",  # Immediate care after anesthesia
    "Post-operative Monitoring",  # Continuous monitoring for potential complications after surgery
    "Suture Care",  # Care for surgical stitches
    "Post-operative Delirium",  # Acute confusion following surgery
    "Post-operative Ileus",  # Temporary impairment of bowel function after abdominal surgery
    "Fluid and Electrolyte Management",  # Balancing fluids and electrolytes post-surgery
    "Blood Transfusion in Post-operative Care",  # Use of blood transfusions as needed after surgery
    "Post-operative Urinary Retention",  # Difficulty urinating after surgery
    "Early Ambulation",  # Early movement to promote recovery
    "Post-operative Respiratory Exercises",  # Exercises to prevent respiratory complications
]

# ------------------------------------------------------------------- LLM-NLP
tags_dic_E2["LLM-NLP"] = [
    "Natural Language Processing",  # Core field of study
    "Large Language Models",  # Specific focus on large-scale language models
    "Transformer Models",  # Model architecture pivotal to recent advancements in NLP
    "BERT",  # Bidirectional Encoder Representations from Transformers
    "GPT",  # Generative Pretrained Transformer (e.g. GPT-3, GPT-4)
    "Tokenization in NLP",  # Converting text into tokens for processing
    "Semantic Analysis",  # Understanding the meaning of words and sentences
    "Sentiment Analysis",  # Determining the sentiment expressed in a piece of text
    "Machine Translation",  # Automatic translation of text from one language to another
    "Text Summarization",  # Shortening text while preserving meaning
    "Language Generation",  # Generating human-like text based on input data
    "Named Entity Recognition (NER)",  # Identifying and classifying named entities in text
    "Part-of-Speech Tagging",  # Identifying parts of speech such as nouns, verbs, adjectives
    "Question Answering Systems",  # Systems that answer questions posed in natural language
    "Dialogue Systems",  # Systems that can converse with humans in natural language
    "Speech Recognition",  # Translating spoken language into text
    "NLP for Healthcare",  # NLP applied to medical records, diagnosis, and treatment
    "Bias in Language Models",  # Identifying bias in language model training and outputs
    "Language Model Fine-tuning",  # Adjusting pre-trained models for specific tasks or domains
    "Text Classification",  # Categorizing text into predefined groups
    "Word Embeddings",  # Representing words in numerical vector spaces
    "Deep Learning for NLP",  # Utilizing deep learning techniques in NLP applications
    "Cross-lingual Transfer Learning",  # Applying knowledge gained from one language to others
    "Conversational AI",  # AI that powers dialogue systems and chatbots
    "NLP in Social Media Analysis",  # Interpreting text data from social media platforms
]

# ------------------------------------------------------------ Machine Vision
tags_dic_E2["Machine Vision"] = [
    "Medical Imaging",  # Broad term covering all imaging modalities
    "Computed Tomography (CT)",  # Computer-processed combinations of many X-ray measurements
    "Magnetic Resonance Imaging (MRI)",  # Radiology technique picturing anatomy and physiological processes
    "Ultrasound Imaging",  # Sound waves used to produce images of structures within the body
    "Positron Emission Tomography (PET)",  # Nuclear medicine functional imaging of metabolic processes
    "Single Photon Emission Computed Tomography (SPECT)",  # Nuclear medicine tomographic imaging using gamma rays
    "X-ray Imaging",  # X-rays used to view the internal form of an object
    "Digital Pathology",  # Study of pathology with the help of digital images
    "Radiomics",  # Extraction of large amounts of features from radiographic images
    "Image Segmentation in Medical Imaging",  # Partitioning a digital image into multiple segments
    "Image Reconstruction",  # Techniques used to recreate images from projections
    "3D Medical Imaging",  # Creating three-dimensional images for medical analysis
    "Machine Learning in Medical Imaging",  # Machine learning applied to medical imaging processes
    "Deep Learning in Medical Imaging",  # Deep learning algorithms for improving imaging analysis
    "Computer-Aided Diagnosis (CAD)",  # Systems that help radiologists interpret medical images
    "Radiogenomics",  # Association between cancer imaging features and gene expression
    "Functional MRI (fMRI)",  # MRI measuring the haemodynamic response related to neural activity
    "Optical Coherence Tomography (OCT)",  # Micrometer-resolution 3D images from within optical scattering media
    "Biomedical Imaging",  # Imaging modalities for visualizing biological tissues
    "Neuroimaging",  # Imaging the structure, function/pharmacology of the nervous system
    "Image Analysis Algorithms in Medicine",  # Algorithms to enhance, recognize, and quantify medical images
    "Texture Analysis in Medical Imaging",  # Quantifying variation in surface intensity or patterns
    "Artificial Intelligence in Radiology",  # AI for analyzing medical images and aiding diagnosis
    "Diagnostic Imaging",  # Producing images for the assessment of medical conditions
    "Interventional Radiology",  # Medical procedures conducted with the guidance of imaging technologies
]

# -------------------------------------------------------- Recurrent Learning
tags_dic_E2["Recurrent Learning"] = [
    "Recurrent Neural Networks (RNN)",  # Core architecture in recurrent learning
    "Long Short-Term Memory (LSTM)",  # RNN architecture for dealing with long dependencies
    "Gated Recurrent Unit (GRU)",  # Simplified version of LSTM, yet effective in many cases
    "Sequence Prediction",  # Predicting sequential data, a common application of RNNs
    "Time Series Analysis",  # Analyzing time-ordered data points, often with RNNs
    "Natural Language Processing (NLP)",  # Understanding human languages, where RNNs are frequently applied
    "Speech Recognition",  # Transcribing spoken language into text
    "Machine Translation",  # Automatic translation of text or speech between languages
    "Sentiment Analysis",  # Determining the sentiment behind text, often using RNNs
    "Text Generation",  # Generating text automatically, where RNNs can be applied
    "Music Generation",  # Creating music sequences, an artistic application of recurrent learning
    "Video Frame Prediction",  # Predicting future frames in a video sequence
    "Stock Market Prediction",  # Forecasting future stock prices, a financial application of RNNs
    "Bioinformatics",  # Analyzing biological sequences, where recurrent learning can be applied
    "Deep Learning for Sequences",  # Deep learning approaches tailored to sequential data
    "Attention Mechanisms",  # Focusing on specific parts of the sequence to improve RNN performance
    "Sequence to Sequence Models",  # Models that map input sequences to output sequences
    "Neural Machine Translation (NMT)",  # Deep learning used to translate text or speech
    "Language Modeling",  # Predicting the probability distribution of language sequences
    "Memory Networks",  # RNNs enhanced with external or internal memory components
    "Dynamic Computational Graphs",  # Flexible models that can handle variable-length sequences
    "Continuous Time Recurrent Neural Networks (CTRNN)",  # RNNs designed for modeling time-continuous dynamics
]

# ---------------------------------------------------------- Machine Learning
tags_dic_E2["Machine Learning"] = [
    "Machine Learning",  # The core discipline
    "Supervised Learning",  # Models trained using labeled data
    "Unsupervised Learning",  # Learning patterns from unlabeled data
    "Semi-supervised Learning",  # Combining labeled and unlabeled data for training
    "Reinforcement Learning",  # Learning actions from feedback to maximize a reward
    "Deep Learning",  # Subset of ML using deep neural networks
    "Neural Networks",  # Computational models inspired by the brain's network of neurons
    "Convolutional Neural Networks (CNN)",  # Deep networks most commonly applied to visual imagery
    "Recurrent Neural Networks (RNN)",  # Networks with loops allowing information to be persisted
    "Long Short-Term Memory (LSTM)",  # A kind of RNN capable of learning long-term dependencies
    "Decision Trees",  # Tree-like model of decisions and their possible consequences
    "Random Forests",  # Ensemble learning method for classification, regression, and other tasks
    "Gradient Boosting Machines (GBM)",  # Technique for regression and classification problems
    "Support Vector Machines (SVM)",  # Supervised learning models with associated learning algorithms
    "Natural Language Processing (NLP)",  # ML focused on interaction through natural language
    "Generative Adversarial Networks (GAN)",  # ML frameworks designed by opposing networks
    "Feature Engineering",  # Using domain knowledge to extract features from raw data
    "Model Evaluation",  # Assessing the performance of a ML model
    "Hyperparameter Tuning",  # Adjusting the parameters of ML models
    "Data Augmentation",  # Increasing the amount of data by adding slightly modified copies
    "Transfer Learning",  # Reusing a pre-trained model on a new problem
    "Dimensionality Reduction",  # Reducing the number of random variables under consideration
    "Cluster Analysis",  # Grouping objects so that members of a group are more similar to each other
    "Principal Component Analysis (PCA)",  # Simplifying the complexity in high-dimensional data
    "Regularization",  # Techniques to prevent overfitting in ML models
    "Anomaly Detection",  # Identifying unusual patterns that do not conform to expected behavior
    "Predictive Modeling",  # Using data and statistical algorithms to predict future outcomes
    "Machine Learning Pipelines",  # Sequences of data processing and learning steps in ML projects
    "Ethics in Machine Learning",  # Fairness, accountability, and transparency in ML
    "Bias and Fairness in ML",  # Mitigating bias in ML algorithms and datasets
]

# ------------------------------------------------------------- Deep Learning
tags_dic_E2["Deep Learning"] = [
    "Deep Learning",  # The core field: algorithms and neural network architectures
    "Artificial Neural Networks (ANN)",  # The foundation of deep learning models
    "Convolutional Neural Networks (CNN)",  # Deep networks for structured grid data such as images
    "Recurrent Neural Networks (RNN)",  # Networks designed for sequential data processing
    "Long Short-Term Memory (LSTM)",  # A type of RNN effective in learning long-term dependencies
    "Generative Adversarial Networks (GANs)",  # Frameworks for generating data resembling the input data
    "Deep Reinforcement Learning",  # Deep learning combined with reinforcement learning
    "Transfer Learning",  # Leveraging pre-trained models on new problems
    "Autoencoders",  # Networks for unsupervised tasks such as feature learning and representation
    "Deep Belief Networks (DBN)",  # Generative models with multiple layers of stochastic, latent variables
    "Attention Mechanisms",  # Improving the focus on specific aspects of the input
    "Neural Architecture Search (NAS)",  # Automated process of designing deep learning models
    "Feature Learning",  # Learning features automatically from raw data
    "Representation Learning",  # Finding data representations that ease extraction of useful information
    "Self-supervised Learning",  # Unsupervised learning where the data itself provides supervision
    "Semantic Segmentation",  # Classifying each pixel of an image into a predefined category
    "Object Detection",  # Identifying objects within images or videos
    "Image Classification",  # Assigning a label to an entire image or photograph
    "Natural Language Understanding",  # Part of NLP focusing on machine reading comprehension
    "Speech Recognition",  # Converting spoken words into text
    "Machine Translation",  # Automatic translation of text from one language to another
    "Deep Learning Optimization",  # Techniques for improving the training of deep neural networks
    "Graph Neural Networks (GNN)",  # Neural networks that directly operate on the graph structure
    "Federated Learning",  # Training across multiple decentralized devices or servers
    "Bias and Fairness in Deep Learning",  # Addressing biases in datasets and models
    "Explainable AI (XAI)",  # Making the results of AI systems more understandable to humans
    "Adversarial Machine Learning",  # Vulnerabilities of ML models to adversarial examples
]

# --------------------------------------------------------- Signal Processing
tags_dic_E2["Signal Processing"] = [
    "Signal Processing",  # Core discipline: analysis and manipulation of signals
    "Digital Signal Processing (DSP)",  # Processing of digital signals and the algorithms for it
    "Analog Signal Processing",  # Handling of continuous signals
    "Time-Frequency Analysis",  # Analyzing signals in both time and frequency domains
    "Fourier Transform",  # Decomposing functions into their sinusoidal components
    "Fast Fourier Transform (FFT)",  # An algorithm to compute the Fourier transform quickly
    "Wavelet Transform",  # Wavelet-based analysis for better time and frequency localization
    "Filter Design",  # Creating filters to modify or enhance certain aspects of signals
    "Adaptive Filtering",  # Filters that adjust their parameters based on signal input
    "Signal Reconstruction",  # Rebuilding a signal from a set of samples
    "Spectral Analysis",  # Examination of the spectral content of a signal
    "Convolution",  # Operation on two functions to produce a third function
    "Modulation",  # Varying one or more properties of a periodic waveform
    "Demodulation",  # Extracting the original signal from a modulated carrier wave
    "Sampling Theory",  # Converting continuous signals into discrete signals
    "Quantization",  # Mapping input values from a large set to a smaller set
    "Signal Enhancement",  # Improving the quality or clarity of a signal
    "Noise Reduction",  # Methods for diminishing noise within a signal
    "Biomedical Signal Processing",  # Signal processing in the biomedical context, such as ECG and EEG analysis
    "Image Processing",  # Processing of images using mathematical operations
    "Audio Signal Processing",  # Analysis and manipulation of audio signals
    "Voice Recognition",  # Identifying and responding to the sound of a voice
    "Machine Learning for Signal Processing",  # ML algorithms used to analyze and interpret signals
    "Signal Processing in Communications",  # Enhancing communication signals for transmission and reception
    "Radar Signal Processing",  # Analyzing radar signals for detection, estimation, and classification
    "Sonar Signal Processing",  # Handling and interpretation of sonar data and signals
    "Seismic Data Processing",  # Analyzing seismic signals for geophysical and exploration purposes
]

# ------------------------------------------------------------ Bioinformatics
tags_dic_E2["Bioinformatics"] = [
    "Bioinformatics",  # Core field
    "Computational Genomics",  # Specific to the genomic aspect
    "Computational Biology",  # Broad term within the field
    "Genome Sequencing",  # Specific technique
    "Genomic Data Mining",  # Data analysis specific to genomics
    "Proteomics Analysis",  # Study of proteomes
    "Transcriptomics Analysis",  # Study of transcriptomes
    "Metabolomics Data Analysis",  # Study of metabolomes
    "Phylogenetic Analysis",  # Evolutionary analyses
    "Molecular Dynamics Simulations",  # Computational modeling in bioinformatics
    "Structural Bioinformatics",  # Focus on the structure of biological macromolecules
    "Biological Sequence Analysis",  # Core activity in bioinformatics
    "Functional Genomics",  # Understanding the function of genetic information
    "Comparative Genomics",  # Comparing genomes of different species
    "Systems Biology",  # Integrative approach, closely related to bioinformatics
    "Network Biology",  # Study of biological networks
    "Pathway Analysis",  # Analysis of biological pathways
    "Gene Expression Analysis",  # Study of gene expression patterns
    "Single-Cell Sequencing",  # Single-cell genomic analysis
    "Machine Learning in Genomics",  # Application of ML in genomics
    "Bioinformatics Pipelines",  # Automated workflows in bioinformatics
    "Next-Generation Sequencing",  # Modern sequencing technologies
    "Omics Data Integration",  # Integrating different omics data types
    "ChIP-seq Analysis",  # Chromatin immunoprecipitation sequencing analysis
    "RNA-Seq Data Analysis",  # Sequencing of RNA transcripts
    "Variant Calling",  # Identifying variants from sequence data
    "Microbiome Analysis",  # Study of microbial communities
    "Population Genomics",  # Genomic analysis at the population level
]


# Topic -> search-term lexicon, revision "E3".
#
# Each key is a topic label; each value is an ordered list of term strings.
# Notation used inside the strings (its exact interpretation depends on the
# code that consumes this dict, which is not part of this definition):
#   - trailing "*" : marks a stem/phrase meant to cover variant endings
#                    (presumably a truncation wildcard -- confirm in consumer)
#   - "A OR B"     : a phrase bundled with an abbreviation or synonym inside a
#                    single string (presumably a boolean OR -- confirm)
# "candidate:" notes record broader/alternative spellings that were suggested
# for a term but are NOT applied in this revision.
tags_dic_E3 = {
    # ------------------------------------------------------------------
    # Clinical topics
    # ------------------------------------------------------------------
    "Cardiology": [
        "Cardiolog*",
        "Cardiovascular Disease*",
        "Coronary Artery Disease*",
        "Heart Failure*",
        "Arrhythmia*",
        "Electrocardiograph*",
        "Echocardiograph*",
        "Cardiac Catheterization*",
        "Percutaneous Coronary Intervention*",
        "Stent Placement*",
        "Cardiac Surgery*",
        "Heart Transplant*",
        "Hypertension*",
        "Myocardial Infarction*",
        "Atrial Fibrillation*",
        "Cardiac Imaging*",
        "Vascular Health*",
        "Angiograph*",
        "Heart Valve Disease*",
        "Cardiac Rehabilitation*",
        "Preventive Cardiology*",
        "Lipidolog*",
        "Thrombosis*",
        "Cardiomyopathy*",
        "Congenital Heart Disease*",
        "Heart Rhythm Disorder*",
        "Interventional Cardiology*",
        # NOTE(review): the next two are separate list entries (not joined by
        # " OR " as in the Nephrology list) and name a kidney condition under
        # Cardiology -- confirm this is intended.
        "Acute Kidney Injury",
        "AKI",
    ],
    "Gastroenterology": [
        "Gastroenterolog*",
        "Gastrointestinal Disease*",
        "Inflammatory Bowel Disease* OR IBD",
        "Irritable Bowel Syndrome* OR IBS",
        "Hepatolog*",
        "Endoscop*",
        "Colonoscop*",
        "Gastroesophageal Reflux Disease* OR GERD",
        "Peptic Ulcer Disease*",
        "Celiac Disease*",
        "Gastrointestinal Cancer*",
        "Hepatitis*",
        "Liver Transplantation*",
        "Pancreatit*",
        "Gallstone*",
        "Gastrointestinal Bleeding",  # candidate: also "GI Bleeding"
        "Gastrointestinal Endoscop*",
        "Esophageal Disease*",
        "Barrett's Esophagus",  # candidate: "Barrett* Esophagus"
        # Full name and short form are two separate entries here.
        "Helicobacter pylori Infection",
        "H. pylori",
        "Colitis",  # candidate: "Colitis*"
        "Diverticulitis",  # candidate: "Diverticul*"
        "Fecal Microbiota Transplantation OR FMT",
        "GI Motility Disorder*",
        "Capsule Endoscop*",
        "Biliary Tract Disorder*",
        "Nutritional Gastroenterolog*",
    ],
    "Nephrology": [
        "Nephrolog*",
        "Chronic Kidney Disease* OR CKD",
        "Acute Kidney Injury* OR AKI",  # candidate: " AKI " (space-padded) for precision
        "End-Stage Renal Disease* OR ESRD",
        "Kidney Transplantation*",
        "Dialysis",  # candidate: "Dialys*"
        "Hemodialysis",
        "Peritoneal Dialysis",  # candidate: "Peritoneal Dialys*"
        "Renal Biopsy",  # candidate: "Renal Biops*"
        "Glomerulonephritis",  # candidate: "Glomerulonephrit*"
        "Polycystic Kidney Disease* OR PKD",
        "Nephrotic Syndrome*",
        "Electrolyte Imbalance*",
        "Hypertension and Kidney*",
        "Renal Pharmacolog*",
        "Kidney Stone*",
        "Renal Patholog*",
        "Uremia",  # candidate: "Uremi*"
        "Anemia in Chronic Kidney Disease* OR CKD Anemia",
        "Renal Replacement Therapy* OR RRT",
        "Kidney Function Test*",
        "Proteinuria",  # candidate: "Proteinuri*"
        "Hematuria",  # candidate: "Hematuri*"
        "Renal Nutrition",  # candidate: "Renal Nutritional*"
        "Vascular Access for Dialysis",  # candidate: "Vascular Access*"
        "Renal Genetic*",
        "Diabetic Nephropathy* OR Diabetic Kidney Disease",  # two full names, no abbreviation
    ],
    "Critical Care": [
        "Critical Care*",
        "Intensive Care Medicine*",
        "Mechanical Ventilation*",
        "Acute Respiratory Distress Syndrome* OR ARDS",
        "Sepsis",  # candidate: "Sepsis*"
        "Septic Shock*",
        "Cardiac Arrest*",
        "Resuscitation*",
        "Critical Care Nursing",  # candidate: "Critical Care Nurs*"
        "Trauma Care*",
        "Life Support*",
        "Intensive Care Unit* OR ICU",
        "Ventilator-associated Pneumonia* OR VAP",
        "Extracorporeal Membrane Oxygenation* OR ECMO",
        "Invasive Monitoring*",
        "Shock*",
        "Multi-organ Failure*",
        "Neurocritical Care*",
        "Critical Care Pharmacolog*",
        "Palliative Care in ICU*",
        "Fluid Resuscitation*",
        "Blood Product Transfusion*",
        "Nosocomial Infection*",
        # NOTE(review): right-hand side of the OR is "AKI in ICU", so the ICU
        # qualifier may apply to the abbreviation only -- depends on consumer.
        "Acute Kidney Injury* OR AKI in ICU",
        "Critical Care Outcomes*",
        "Advanced Life Support* OR ALS",
    ],
    "Post-operative Care": [
        "Post-operative Care*",
        "Post-surgical Recovery*",
        "Surgical Wound Care*",
        "Post-operative Complication*",
        "Pain Management after Surgery*",
        "Rehabilitation after Surgery*",
        "Post-operative Infection*",
        "Wound Infection*",
        "Deep Vein Thrombosis* OR DVT Prevention",  # right-hand side is "DVT Prevention"
        "Pulmonary Embolism* Prevention",  # "*" sits mid-phrase here, not at the end
        "Post-operative Mobility*",
        "Post-operative Nutrition*",
        "Surgical Drain Management*",
        "Post-operative Physiotherapy*",
        "Venous Thromboembolism* OR VTE Prophylaxis",  # right-hand side is "VTE Prophylaxis"
        "Incisional Care*",
        "Post-anesthesia Care Unit* OR PACU",
        "Post-operative Monitoring*",
        "Suture Care*",
        "Post-operative Delirium*",
        "Post-operative Ileus*",
        "Fluid and Electrolyte Management*",
        "Blood Transfusion* in Post-operative Care",  # "*" sits mid-phrase here as well
        "Post-operative Urinary Retention*",
        "Early Ambulation*",
        "Post-operative Respiratory Exercise*",
    ],
    # ------------------------------------------------------------------
    # Methodological / computational topics
    # ------------------------------------------------------------------
    "LLM-NLP": [
        "Natural Language Processing OR NLP",
        "Large Language Model*",
        "Transformer Model*",
        "BERT OR Bidirectional Encoder Representations from Transformers",
        "GPT OR Generative Pretrained Transformer*",
        "Tokenization in NLP",  # candidate: "Tokenization"
        "Semantic Analysis",  # candidate: "Semantic Analysis in NLP"
        "Sentiment Analysis",  # candidate: "Sentiment Analysis in NLP"
        "Machine Translation",
        "Text Summarization",
        "Language Generation",  # candidate: "Language Generation in NLP"
        "Named Entity Recognition OR NER",
        "Part-of-Speech Tagging",  # candidate: "POS Tagging"
        "Question Answering Systems",
        "Dialogue Systems OR Conversational Systems",
        "Speech Recognition",  # candidate: "Automatic Speech Recognition"
        "NLP for Healthcare",  # candidate: "NLP in Medical Records"
        "Bias in Language Model*",
        "Language Model Fine-tuning",
        "Text Classification",
        "Word Embedding*",
        "Deep Learning for NLP",
        "Cross-lingual Transfer Learning",  # candidate: "Cross-lingual Learning"
        "Conversational AI",
        "NLP in Social Media Analysis",
    ],
    "Machine Vision": [
        "Medical Imaging*",
        "Computed Tomography* OR CT",
        "Magnetic Resonance Imaging* OR MRI",
        "Ultrasound Imaging*",
        "Positron Emission Tomography* OR PET",
        "Single Photon Emission Computed Tomography* OR SPECT",
        "X-ray Imaging*",
        "Digital Pathology*",
        "Radiomics*",
        "Image Segmentation in Medical Imaging",  # candidate: "Image Segmentation*"
        "Image Reconstruction*",
        "3D Medical Imaging*",
        "Machine Learning in Medical Imaging",  # candidate: "Machine Learning* AND Medical Imaging"
        "Deep Learning in Medical Imaging",  # candidate: "Deep Learning* AND Medical Imaging"
        "Computer-Aided Diagnosis* OR CAD",
        "Radiogenomics*",
        "Functional MRI* OR fMRI",
        "Optical Coherence Tomography* OR OCT",
        "Biomedical Imaging*",
        "Neuroimaging*",
        "Image Analysis Algorithms in Medicine",  # candidate: "Image Analysis* AND Medicine"
        "Texture Analysis in Medical Imaging",  # candidate: "Texture Analysis* AND Medical Imaging"
        "Artificial Intelligence in Radiology",  # candidate: "AI AND Radiology"
        "Diagnostic Imaging*",
        "Interventional Radiology*",
    ],
    "Recurrent Learning": [
        "Recurrent Neural Networks* OR RNN",
        "Long Short-Term Memory* OR LSTM",
        "Gated Recurrent Unit* OR GRU",
        "Sequence Prediction*",
        "Time Series Analysis*",
        "Natural Language Processing OR NLP",
        "Speech Recognition*",
        "Machine Translation*",
        "Sentiment Analysis*",
        "Text Generation*",
        "Music Generation*",
        "Video Frame Prediction*",
        "Stock Market Prediction*",
        "Bioinformatics*",
        "Deep Learning for Sequences*",
        "Attention Mechanisms*",
        "Sequence to Sequence Models* OR Seq2Seq",
        "Neural Machine Translation* OR NMT",
        "Language Modeling*",
        "Memory Networks*",
        "Dynamic Computational Graphs*",
        "Continuous Time Recurrent Neural Networks* OR CTRNN",
    ],
    "Machine Learning": [
        "Machine Learning*",
        "Supervised Learning*",
        "Unsupervised Learning*",
        "Semi-supervised Learning*",
        "Reinforcement Learning*",
        "Deep Learning*",
        "Neural Networks*",
        "Convolutional Neural Networks*",
        # The acronym is listed as three separate delimiter-bounded variants
        # (space-padded, hyphen-prefixed, hyphen-suffixed) rather than a bare
        # "CNN"; presumably to avoid matching inside longer words -- confirm.
        " CNN ",
        "-CNN",
        "CNN-",
        "Recurrent Neural Networks* OR RNN",
        "Long Short-Term Memory* OR LSTM",
        "Decision Trees*",
        "Random Forests*",
        "Gradient Boosting Machines* OR GBM",
        "Support Vector Machines* OR SVM",
        "Natural Language Processing OR NLP",
        "Generative Adversarial Networks* OR GAN",  # "GAN" here, "GANs" under Deep Learning
        "Feature Engineering*",
        "Model Evaluation*",
        "Hyperparameter Tuning*",
        "Data Augmentation*",
        "Transfer Learning*",
        "Dimensionality Reduction*",
        "Cluster Analysis*",
        "Principal Component Analysis* OR PCA",
        "Regularization*",
        "Anomaly Detection*",
        "Predictive Modeling*",
        "Machine Learning Pipelines*",
        "Ethics in Machine Learning*",
        "Bias and Fairness in ML*",
    ],
    "Deep Learning": [
        "Deep Learning*",
        "Artificial Neural Networks* OR ANN",
        "Convolutional Neural Networks* OR CNN",  # bare "CNN" here, unlike the Machine Learning list
        "Recurrent Neural Networks* OR RNN",
        "Long Short-Term Memory* OR LSTM",
        "Generative Adversarial Networks* OR GANs",
        "Deep Reinforcement Learning*",
        "Transfer Learning*",
        "Autoencoders*",
        "Deep Belief Networks* OR DBN",
        "Attention Mechanisms*",
        "Neural Architecture Search* OR NAS",
        "Feature Learning*",
        "Representation Learning*",
        "Self-supervised Learning*",
        "Semantic Segmentation*",
        "Object Detection*",
        "Image Classification*",
        "Natural Language Understanding*",
        "Speech Recognition*",
        "Machine Translation*",
        "Deep Learning Optimization*",
        "Graph Neural Networks* OR GNN",
        "Federated Learning*",
        "Bias and Fairness in Deep Learning*",
        "Explainable AI* OR XAI",
        "Adversarial Machine Learning*",
    ],
    "Signal Processing": [
        "Signal Processing*",
        "Digital Signal Processing* OR DSP",
        "Analog Signal Processing*",
        "Time-Frequency Analysis*",
        "Fourier Transform*",
        "Fast Fourier Transform* OR FFT",
        "Wavelet Transform*",
        "Filter Design*",
        "Adaptive Filtering*",
        "Signal Reconstruction*",
        "Spectral Analysis*",
        "Convolution*",
        "Modulation*",
        "Demodulation*",
        "Sampling Theory*",
        "Quantization*",
        "Signal Enhancement*",
        "Noise Reduction*",
        "Biomedical Signal Processing*",
        "Image Processing*",
        "Audio Signal Processing*",
        "Voice Recognition*",
        "Machine Learning for Signal Processing",  # candidate: "Machine Learning* AND Signal Processing"
        "Signal Processing in Communications*",
        "Radar Signal Processing*",
        "Sonar Signal Processing*",
        "Seismic Data Processing*",
    ],
    "Bioinformatics": [
        "Bioinformatics*",
        "Computational Genomics*",
        "Computational Biology*",
        "Genome Sequencing*",
        "Genomic Data Mining*",
        "Proteomics Analysis*",
        "Transcriptomics Analysis*",
        "Metabolomics Data Analysis*",
        "Phylogenetic Analysis*",
        "Molecular Dynamics Simulations*",
        "Structural Bioinformatics*",
        "Biological Sequence Analysis*",
        "Functional Genomics*",
        "Comparative Genomics*",
        "Systems Biology*",
        "Network Biology*",
        "Pathway Analysis*",
        "Gene Expression Analysis*",
        "Single-Cell Sequencing*",
        "Machine Learning in Genomics",  # the only entry in this list without a trailing "*"/OR form
        "Bioinformatics Pipelines*",
        "Next-Generation Sequencing* OR NGS",
        "Omics Data Integration*",
        "ChIP-seq Analysis*",
        "RNA-Seq Data Analysis*",
        "Variant Calling*",
        "Microbiome Analysis*",
        "Population Genomics*",
    ],
}


tags_dic_E4 = {
    
    "Cardiology": [
        "Cardiolog*",
        "Cardiovascular Disease*",
        "Coronary Artery Disease*",
        "Heart Failure*",
        "Arrhythmia*",
        "Electrocardiograph*",
        "Echocardiograph*",
        "Cardiac Catheterization*",
        "Percutaneous Coronary Intervention*",
        "Stent Placement*",
        "Cardiac Surgery*",
        "Heart Transplant*",
        "Hypertension*",
        "Myocardial Infarction*",
        "Atrial Fibrillation*",
        "Cardiac Imaging*",
        "Vascular Health*",
        "Angiograph*",
        "Heart Valve Disease*",
        "Cardiac Rehabilitation*",
        "Preventive Cardiology*",
        "Lipidolog*",
        "Thrombosis*",
        "Cardiomyopathy*",
        "Congenital Heart Disease*",
        "Heart Rhythm Disorder*",
        "Interventional Cardiology*",
        "Acute Kidney Injury|AKI" 
    ],
    
    "Gastroenterology": [
        "Gastroenterolog*",
        "Gastrointestinal Disease*",
        "Inflammatory Bowel Disease*|IBD",
        "Irritable Bowel Syndrome*|IBS",
        "Hepatolog*",
        "Endoscop*",
        "Colonoscop*",
        "Gastroesophageal Reflux Disease*|GERD",
        "Peptic Ulcer Disease*",
        "Celiac Disease*",
        "Gastrointestinal Cancer*",
        "Hepatitis*",
        "Liver Transplantation*",
        "Pancreatit*",
        "Gallstone*",
        "Gastrointestinal Bleeding|GI Bleeding",
        "Gastrointestinal Endoscop*",
        "Esophageal Disease*",
        "Barrett* Esophagus",
        "Helicobacter pylori Infection|H. pylori",
        "Colitis*",
        "Diverticul*",
        "Fecal Microbiota Transplantation|FMT",
        "GI Motility Disorder*",
        "Capsule Endoscop*",
        "Biliary Tract Disorder*",
        "Nutritional Gastroenterolog*"
    ],
    
    "Nephrology": [
        "Nephrolog*",
        "Chronic Kidney Disease*|CKD",
        "Acute Kidney Injury*|AKI",
        "End-Stage Renal Disease*|ESRD",
        "Kidney Transplantation*",
        "Dialys*",
        "Hemodialysis",
        "Peritoneal Dialys*",
        "Renal Biops*",
        "Glomerulonephrit*",
        "Polycystic Kidney Disease*|PKD",
        "Nephrotic Syndrome*",
        "Electrolyte Imbalance*",
        "Hypertension and Kidney*",
        "Renal Pharmacolog*",
        "Kidney Stone*",
        "Renal Patholog*",
        "Uremi*",
        "Anemia in Chronic Kidney Disease*|CKD Anemia",
        "Renal Replacement Therapy*|RRT",
        "Kidney Function Test*",
        "Proteinuri*",
        "Hematuri*",
        "Renal Nutritional*",
        "Vascular Access*",
        "Renal Genetic*",
        "Diabetic Nephropathy*|Diabetic Kidney Disease"
    ],

    "Critical Care": [
        "Critical Care*",
        "Intensive Care Medicine*",
        "Mechanical Ventilation*",
        "Acute Respiratory Distress Syndrome*|ARDS",
        "Sepsis*",
        "Septic Shock*",
        "Cardiac Arrest*",
        "Resuscitation*",
        "Critical Care Nurs*",
        "Trauma Care*",
        "Life Support*",
        "Intensive Care Unit*|ICU",
        "Ventilator-associated Pneumonia*|VAP",
        "Extracorporeal Membrane Oxygenation*|ECMO",
        "Invasive Monitoring*",
        "Shock*",
        "Multi-organ Failure*",
        "Neurocritical Care*",
        "Critical Care Pharmacolog*",
        "Palliative Care in ICU*",
        "Fluid Resuscitation*",
        "Blood Product Transfusion*",
        "Nosocomial Infection*",
        "Acute Kidney Injury*|AKI in ICU",
        "Critical Care Outcomes*",
        "Advanced Life Support*|ALS"
    ],

    "Post-operative Care": [
        "Post-operative Care*",
        "Post-surgical Recovery*",
        "Surgical Wound Care*",
        "Post-operative Complication*",
        "Pain Management after Surgery*",
        "Rehabilitation after Surgery*",
        "Post-operative Infection*",
        "Wound Infection*",
        "Deep Vein Thrombosis*|DVT Prevention",
        "Pulmonary Embolism* Prevention",
        "Post-operative Mobility*",
        "Post-operative Nutrition*",
        "Surgical Drain Management*",
        "Post-operative Physiotherapy*",
        "Venous Thromboembolism*|VTE Prophylaxis",
        "Incisional Care*",
        "Post-anesthesia Care Unit*|PACU",
        "Post-operative Monitoring*",
        "Suture Care*",
        "Post-operative Delirium*",
        "Post-operative Ileus*",
        "Fluid and Electrolyte Management*",
        "Blood Transfusion* in Post-operative Care",
        "Post-operative Urinary Retention*",
        "Early Ambulation*",
        "Post-operative Respiratory Exercise*"
    ],
    
    "LLM-NLP": [
        "Natural Language Processing|NLP",
        "Large Language Model*",
        "Transformer Model*",
        "BERT|Bidirectional Encoder Representations from Transformers",
        "GPT|Generative Pretrained Transformer*",
        "Tokenization",
        "Semantic Analysis in NLP",
        "Sentiment Analysis in NLP",
        "Machine Translation",
        "Text Summarization",
        "Language Generation in NLP",
        "Named Entity Recognition|NER",
        "POS Tagging",
        "Question Answering Systems",
        "Dialogue Systems|Conversational Systems",
        "Speech Recognition",
        "NLP in Medical Records",
        "Bias in Language Model*",
        "Language Model Fine-tuning",
        "Text Classification",
        "Word Embedding*",
        "Deep Learning for NLP",
        "Cross-lingual Learning",
        "Conversational AI",
        "NLP in Social Media Analysis"
    ],
    "Machine Vision": [
        "Medical Imaging*",
        "Computed Tomography*|CT",
        "Magnetic Resonance Imaging*|MRI",
        "Ultrasound Imaging*",
        "Positron Emission Tomography*|PET",
        "Single Photon Emission Computed Tomography*|SPECT",
        "X-ray Imaging*",
        "Digital Pathology*",
        "Radiomics*",
        "Image Segmentation*",
        "Image Reconstruction*",
        "3D Medical Imaging*",
        "Machine Learning* AND Medical Imaging",
        "Deep Learning* AND Medical Imaging",
        "Computer-Aided Diagnosis*|CAD",
        "Radiogenomics*",
        "Functional MRI*|fMRI",
        "Optical Coherence Tomography*|OCT",
        "Biomedical Imaging*",
        "Neuroimaging*",
        "Image Analysis* AND Medicine",
        "Texture Analysis* AND Medical Imaging",
        "AI AND Radiology",
        "Diagnostic Imaging*",
        "Interventional Radiology*"
    ],

    "Recurrent Learning": [
        "Recurrent Neural Networks*|RNN",
        "Long Short-Term Memory*|LSTM",
        "Gated Recurrent Unit*|GRU",
        "Sequence Prediction*",
        "Time Series Analysis*",
        "Natural Language Processing|NLP",
        "Speech Recognition*",
        "Machine Translation*",
        "Sentiment Analysis*",
        "Text Generation*",
        "Music Generation*",
        "Video Frame Prediction*",
        "Stock Market Prediction*",
        "Bioinformatics*",
        "Deep Learning for Sequences*",
        "Attention Mechanisms*",
        "Sequence to Sequence Models*|Seq2Seq",
        "Neural Machine Translation*|NMT",
        "Language Modeling*",
        "Memory Networks*",
        "Dynamic Computational Graphs*",
        "Continuous Time Recurrent Neural Networks*|CTRNN"
    ],

    "Machine Learning": [
        "Machine Learning*",
        "Supervised Learning*",
        "Unsupervised Learning*",
        "Semi-supervised Learning*",
        "Reinforcement Learning*",
        "Deep Learning*",
        "Neural Networks*",
        "Convolutional Neural Networks*|CNN",
        "Recurrent Neural Networks*|RNN",
        "Long Short-Term Memory*|LSTM",
        "Decision Trees*",
        "Random Forests*",
        "Gradient Boosting Machines*|GBM",
        "Support Vector Machines*|SVM",
        "Natural Language Processing|NLP",
        "Generative Adversarial Networks*|GAN",
        "Feature Engineering*",
        "Model Evaluation*",
        "Hyperparameter Tuning*",
        "Data Augmentation*",
        "Transfer Learning*",
        "Dimensionality Reduction*",
        "Cluster Analysis*",
        "Principal Component Analysis*|PCA",
        "Regularization*",
        "Anomaly Detection*",
        "Predictive Modeling*",
        "Machine Learning Pipelines*",
        "Ethics in Machine Learning*",
        "Bias and Fairness in ML*"
    ],

    "Deep Learning": [
        "Deep Learning*",
        "Artificial Neural Networks*|ANN",
        "Convolutional Neural Networks*|CNN",
        "Recurrent Neural Networks*|RNN",
        "Long Short-Term Memory*|LSTM",
        "Generative Adversarial Networks*|GANs",
        "Deep Reinforcement Learning*",
        "Transfer Learning*",
        "Autoencoders*",
        "Deep Belief Networks*|DBN",
        "Attention Mechanisms*",
        "Neural Architecture Search*|NAS",
        "Feature Learning*",
        "Representation Learning*",
        "Self-supervised Learning*",
        "Semantic Segmentation*",
        "Object Detection*",
        "Image Classification*",
        "Natural Language Understanding*",
        "Speech Recognition*",
        "Machine Translation*",
        "Deep Learning Optimization*",
        "Graph Neural Networks*|GNN",
        "Federated Learning*",
        "Bias and Fairness in Deep Learning*",
        "Explainable AI*|XAI",
        "Adversarial Machine Learning*"
    ],
    "Signal Processing": [
        "Signal Processing*",
        "Digital Signal Processing*|DSP",
        "Analog Signal Processing*",
        "Time-Frequency Analysis*",
        "Fourier Transform*",
        "Fast Fourier Transform*|FFT",
        "Wavelet Transform*",
        "Filter Design*",
        "Adaptive Filtering*",
        "Signal Reconstruction*",
        "Spectral Analysis*",
        "Convolution*",
        "Modulation*",
        "Demodulation*",
        "Sampling Theory*",
        "Quantization*",
        "Signal Enhancement*",
        "Noise Reduction*",
        "Biomedical Signal Processing*",
        "Image Processing*",
        "Audio Signal Processing*",
        "Voice Recognition*",
        "Machine Learning for Signal Processing",
        "Signal Processing in Communications*",
        "Radar Signal Processing*",
        "Sonar Signal Processing*",
        "Seismic Data Processing*"
    ],
    
    "Bioinformatics": [
        "Bioinformatics*",
        "Computational Genomics*",
        "Computational Biology*",
        "Genome Sequencing*",
        "Genomic Data Mining*",
        "Proteomics Analysis*",
        "Transcriptomics Analysis*",
        "Metabolomics Data Analysis*",
        "Phylogenetic Analysis*",
        "Molecular Dynamics Simulations*",
        "Structural Bioinformatics*",
        "Biological Sequence Analysis*",
        "Functional Genomics*",
        "Comparative Genomics*",
        "Systems Biology*",
        "Network Biology*",
        "Pathway Analysis*",
        "Gene Expression Analysis*",
        "Single-Cell Sequencing*",
        "Machine Learning in Genomics",
        "Bioinformatics Pipelines*",
        "Next-Generation Sequencing*|NGS",
        "Omics Data Integration*",
        "ChIP-seq Analysis*",
        "RNA-Seq Data Analysis*",
        "Variant Calling*",
        "Microbiome Analysis*",
        "Population Genomics*"
    ],

}


# Topic -> search-pattern lookup (revision E5).
# Each key is a topic label and each value is a list of pattern strings.
# Inside a pattern, "|" separates alternatives and "\\b...\\b" wraps short
# acronyms in word-boundary markers so they do not match inside longer words.
# NOTE(review): the patterns are presumably consumed as regular expressions by
# the tagging code elsewhere in this file -- confirm how "*" is interpreted.
# Every list ends with a trailing comma on purpose: a missing comma between two
# adjacent string literals silently concatenates them into a single pattern.
tags_dic_E5 = {
    "Cardiology": [
        "Cardiolog*",
        "Cardiovascular Disease*",
        "Coronary Artery Disease*",
        "Heart Failure*",
        "Arrhythmia*",
        "Electrocardiograph*",
        "Echocardiograph*",
        "Cardiac Catheterization*",
        "Percutaneous Coronar*",
        "Stent Placement*",
        "Cardiac Surgery*",
        "Heart Transplant*",
        "Hypertension*",
        "Myocardial Infarction*",
        "Atrial Fibrillation*",
        "Cardiac Imaging*",
        "Vascular Health*",
        "Angiograph*",
        "Heart Valve Disease*",
        "Cardiac Rehabilitation*",
        "Preventive Cardiology*",
        "Thrombosis*",
        "Cardiomyopathy*",
        "Congenital Heart Disease*",
        "Heart Rhythm Disorder*",
        "Interventional Cardiology*",
    ],

    "Gastroenterology": [
        "Gastroenterolog*",
        "Gastrointestinal Disease*",
        "Inflammatory Bowel Disease*|\\bIBD\\b",
        "Irritable Bowel Syndrome*|\\bIBS\\b",
        "Hepatolog*",
        "Endoscop*",
        "Colonoscop*",
        "Gastroesophageal Reflux Disease*|\\bGERD\\b",
        "Peptic Ulcer Disease*",
        "Celiac Disease*",
        "Gastrointestinal Cancer*",
        "Hepatitis*",
        "Liver Transplantation*",
        "Pancreatit*",
        "Gallstone*",
        "Gastrointestinal Bleeding|GI Bleeding",
        "Gastrointestinal Endoscop*",
        "Esophageal Disease*",
        "Barrett* Esophagus",
        "Helicobacter pylori Infection|H. pylori",
        "Colitis*",
        "Diverticul*",
        "Fecal Microbiota Transplantation|FMT",
        "GI Motility Disorder*",
        "Capsule Endoscop*",
        "Biliary Tract Disorder*",
        "Nutritional Gastroenterolog*",
    ],

    "Nephrology": [
        "Nephrolog*",
        "Chronic Kidney Disease*|\\bCKD\\b",
        "Acute Kidney Injury*|\\bAKI\\b",
        "End-Stage Renal Disease*|\\bESRD\\b",
        "Kidney Transplantation*",
        "Dialys*",
        "Hemodialysis",
        "Peritoneal Dialys*",
        "Renal Biops*",
        "Glomerulonephrit*",
        "Polycystic Kidney Disease*|\\bPKD\\b",
        "Nephrotic Syndrome*",
        "Electrolyte Imbalance*",
        "Hypertension and Kidney*",
        "Renal Pharmacolog*",
        "Kidney Stone*",
        "Renal Patholog*",
        "Uremi*",
        "Anemia in Chronic Kidney Disease*|CKD Anemia",
        "Renal Replacement Therapy*|\\bRRT\\b",
        "Kidney Function Test*",
        "Proteinuri*",
        "Hematuri*",
        "Renal Nutritional*",
        "Vascular Access*",
        "Renal Genetic*",
        "Diabetic Nephropathy*|Diabetic Kidney Disease",
    ],

    "Critical Care": [
        "Critical Care*",
        "Intensive Care Medicine*",
        "Mechanical Ventilation*",
        "Acute Respiratory Distress Syndrome*|\\bARDS\\b",
        "Sepsis*",
        "Septic Shock*",
        "Cardiac Arrest*",
        "Resuscitation*",
        "Critical Care Nurs*",
        "Trauma Care*",
        "Life Support*",
        "Intensive Care Unit*|\\bICU\\b",
        "Ventilator-associated Pneumonia*|\\bVAP\\b",
        "Extracorporeal Membrane Oxygenation*|\\bECMO\\b",
        "Invasive Monitoring*",
        "Shock*",
        "Multi-organ Failure*",
        "Neurocritical Care*",
        "Critical Care Pharmacolog*",
        "Fluid Resuscitation*",
        "Blood Product Transfusion*",
        "Nosocomial Infection*",
        "Critical Care Outcomes*",
        "Advanced Life Support*|\\bALS\\b",
    ],

    "Post-operative Care": [
        "Post-operative Care*",
        "Post-surgical Recovery*",
        "Surgical Wound Care*",
        "Post-operative Complication*",
        "Pain Management after Surgery*",
        "Rehabilitation after Surgery*",
        "Post-operative Infection*",
        "Wound Infection*",
        "Deep Vein Thrombosis*|DVT Prevention",
        "Pulmonary Embolism* Prevention",
        "Post-operative Mobility*",
        "Post-operative Nutrition*",
        "Surgical Drain Management*",
        "Post-operative Physiotherapy*",
        "Venous Thromboembolism*|VTE Prophylaxis",
        "Incisional Care*",
        "Post-anesthesia Care Unit*|PACU",
        "Post-operative Monitoring*",
        "Suture Care*",
        "Post-operative Delirium*",
        "Post-operative Ileus*",
        "Fluid and Electrolyte Management*",
        "Blood Transfusion* in Post-operative Care",
        "Post-operative Urinary Retention*",
        "Early Ambulation*",
        "Post-operative Respiratory Exercise*",
    ],

    "LLM-NLP": [
        "Natural Language Processing|\\bNLP\\b",
        "Large Language Model*",
        "Transformer Model*",
        "BERT|Bidirectional Encoder Representations from Transformers",
        "\\bGPT\\b|chatgpt|chat-gpt|Generative Pretrained Transformer*",
        "Tokenization",
        "Semantic Analysis",
        "Sentiment Analysis",
        "Machine Translation",
        "Text Summarization",
        "Language Generation",
        "Named Entity Recognition|\\bNER\\b",
        "POS Tagging",
        "Question Answering Systems",
        "Dialogue Systems|Conversational Systems",
        "Speech Recognition",
        "Bias in Language Model*",
        "Language Model Fine-tuning",
        "Text Classification",
        "Word Embedding*",
        "Deep Learning for NLP",
        "Cross-lingual Learning",
        "Conversational AI",
        "NLP in Social Media Analysis",
    ],
    "Machine Vision": [
        "Medical Imaging*",
        # The bare "CT" alternative was removed because of its high occurrence.
        "Computed Tomography*|CT scan|CT imag*",
        "Magnetic Resonance Imaging*|\\bMRI\\b",
        "Enterograph*",
        "Ultrasound Imaging*",
        "Positron Emission Tomography*|\\bPET\\b",
        "Single Photon Emission Computed Tomography*|\\bSPECT\\b",
        "X-ray Imaging*",
        "Digital Pathology*",
        "whole slide imag*",
        "Radiomics*",
        "Image Segmentation*",
        "Image Reconstruction*",
        "3D reconstruct*",
        "Radiogenomics*",
        # Keep the comma after "Radiomic*": without it this entry and the next
        # one are concatenated into "Radiomic*Functional MRI*|fMRI".
        "Radiomic*",
        "Functional MRI*|fMRI",
        "Optical Coherence Tomography*|\\bOCT\\b",
        "Biomedical Imaging*",
        "Neuroimaging*",
        "Image Analysis*",
        "Texture Analysis*",
        "Diagnostic Imaging*",
        "Interventional Radiology*",
    ],

    "Recurrent Learning": [
        "Recurrent Neural Networks*|RNN",
        "Long Short-Term Memory*|LSTM",
        "Gated Recurrent Unit*|GRU",
        "Sequence Prediction*",
        "Time Series Analysis*",
        "Natural Language Processing|NLP",
        "Speech Recognition*",
        "Machine Translation*",
        "Sentiment Analysis*",
        "Text Generation*",
        "Music Generation*",
        "Video Frame Prediction*",
        "Stock Market Prediction*",
        "Bioinformatics*",
        "Deep Learning for Sequences*",
        "Attention Mechanisms*",
        "Sequence to Sequence Models*|Seq2Seq",
        "Neural Machine Translation*|NMT",
        "Language Modeling*",
        "Memory Networks*",
        "Dynamic Computational Graphs*",
        "Continuous Time Recurrent Neural Networks*|CTRNN",
    ],

    "Machine Learning": [
        "Machine Learning*",
        "Supervised Learning*",
        "Unsupervised Learning*",
        "Semi-supervised Learning*",
        "Reinforcement Learning*",
        "Deep Learning*",
        "Neural Networks*",
        "Convolutional Neural Networks*|CNN",
        "Recurrent Neural Networks*|RNN",
        "Long Short-Term Memory*|LSTM",
        "Decision Trees*",
        "Random Forests*",
        "Gradient Boosting Machines*|GBM",
        "Support Vector Machines*|SVM",
        "Natural Language Processing|NLP",
        "Generative Adversarial Networks*|GAN",
        "Feature Engineering*",
        "Model Evaluation*",
        "Hyperparameter Tuning*",
        "Data Augmentation*",
        "Transfer Learning*",
        "Dimensionality Reduction*",
        "Cluster Analysis*",
        "Principal Component Analysis*|PCA",
        "Regularization*",
        "Anomaly Detection*",
        "Predictive Modeling*",
        "Machine Learning Pipelines*",
        "Ethics in Machine Learning*",
        "Bias and Fairness in ML*",
        "Computer-Aided Diagnosis*|\\bCAD\\b",
    ],

    "Deep Learning": [
        "Deep Learning*",
        "Artificial Neural Network*|\\bANN\\b",
        "Convolutional Neural Network*|\\bCNN\\b",
        "Recurrent Neural Networks*|\\bRNN\\b",
        "Long Short-Term Memory*|\\bLSTM\\b",
        "Generative Adversarial Network*|\\bGAN\\b",
        "Deep Reinforcement Learning*",
        "Transfer Learning*",
        "Autoencoders*",
        "Deep Belief Networks*|\\bDBN\\b",
        "Attention Mechanisms*",
        "Neural Architecture Search*|\\bNAS\\b",
        "Feature Learning*",
        "Representation Learning*",
        "Self-supervised Learning*",
        "Semantic Segmentation*",
        "Object Detection*",
        "Image Classification*",
        "Natural Language Understanding*",
        "Speech Recognition*",
        "Machine Translation*",
        "Deep Learning Optimization*",
        "Graph Neural Networks*|\\bGNN\\b",
        "Federated Learning*",
        "Bias and Fairness in Deep Learning*",
        "Explainable AI*|\\bXAI\\b",
        "Adversarial Machine Learning*",
    ],
    "Signal Processing": [
        "Signal Processing*",
        "Digital Signal Processing*|\\bDSP\\b",
        "Analog Signal Processing*",
        "Time-Frequency Analysis*",
        "Fourier Transform*",
        "Fast Fourier Transform*|\\bFFT\\b",
        "Wavelet Transform*",
        "Filter Design*",
        "Adaptive Filtering*",
        "Signal Reconstruction*",
        "Spectral Analysis*",
        "Convolution*",
        "Modulation*",
        "Demodulation*",
        "Sampling Theory*",
        "Quantization*",
        "Signal Enhancement*",
        "Noise Reduction*",
        "Biomedical Signal Processing*",
        "Image Processing*",
        "Audio Signal Processing*",
        "Voice Recognition*",
        "Machine Learning for Signal Processing",
        "Signal Processing in Communications*",
        "Radar Signal Processing*",
        "Sonar Signal Processing*",
        "Seismic Data Processing*",
    ],

    "Bioinformatics": [
        "Bioinformatics*",
        "Computational Genomics*",
        "Computational Biology*",
        "Genome Sequencing*",
        "Genomic Data Mining*",
        "Proteomics Analysis*",
        "Transcriptomics Analysis*",
        "Metabolomics Data Analysis*",
        "Phylogenetic Analysis*",
        "Molecular Dynamics Simulations*",
        "Structural Bioinformatics*",
        "Biological Sequence Analysis*",
        "Functional Genomics*",
        "Comparative Genomics*",
        "Systems Biology*",
        "Network Biology*",
        "Pathway Analysis*",
        "Gene Expression Analysis*",
        "Single-Cell Sequencing*",
        "Machine Learning in Genomics",
        "Bioinformatics Pipelines*",
        "Next-Generation Sequencing*|\\bNGS\\b",
        "Omics Data Integration*",
        "ChIP-seq Analysis*",
        "RNA-Seq Data Analysis*",
        "Variant Calling*",
        "Microbiome Analysis*",
        "Population Genomics*",
    ],

}

# Topic -> search-pattern lookup (revision E6).
# Same layout as the earlier revisions: each key is a topic label and each value
# is a list of pattern strings, with "|" separating alternatives and
# "\\b...\\b" wrapping short acronyms in word-boundary markers.
# Compared with E5 this revision adds two epigenetics patterns under
# "Bioinformatics" and a new "time series" topic.
# NOTE(review): the patterns are presumably consumed as regular expressions by
# the tagging code elsewhere in this file -- confirm how "*" is interpreted.
# Every list ends with a trailing comma on purpose: a missing comma between two
# adjacent string literals silently concatenates them into a single pattern.
tags_dic_E6 = {
    "Cardiology": [
        "Cardiolog*",
        "Cardiovascular Disease*",
        "Coronary Artery Disease*",
        "Heart Failure*",
        "Arrhythmia*",
        "Electrocardiograph*",
        "Echocardiograph*",
        "Cardiac Catheterization*",
        "Percutaneous Coronar*",
        "Stent Placement*",
        "Cardiac Surgery*",
        "Heart Transplant*",
        "Hypertension*",
        "Myocardial Infarction*",
        "Atrial Fibrillation*",
        "Cardiac Imaging*",
        "Vascular Health*",
        "Angiograph*",
        "Heart Valve Disease*",
        "Cardiac Rehabilitation*",
        "Preventive Cardiology*",
        "Thrombosis*",
        "Cardiomyopathy*",
        "Congenital Heart Disease*",
        "Heart Rhythm Disorder*",
        "Interventional Cardiology*",
    ],

    "Gastroenterology": [
        "Gastroenterolog*",
        "Gastrointestinal Disease*",
        "Inflammatory Bowel Disease*|\\bIBD\\b",
        "Irritable Bowel Syndrome*|\\bIBS\\b",
        "Hepatolog*",
        "Endoscop*",
        "Colonoscop*",
        "Gastroesophageal Reflux Disease*|\\bGERD\\b",
        "Peptic Ulcer Disease*",
        "Celiac Disease*",
        "Gastrointestinal Cancer*",
        "Hepatitis*",
        "Liver Transplantation*",
        "Pancreatit*",
        "Gallstone*",
        "Gastrointestinal Bleeding|GI Bleeding",
        "Gastrointestinal Endoscop*",
        "Esophageal Disease*",
        "Barrett* Esophagus",
        "Helicobacter pylori Infection|H. pylori",
        "Colitis*",
        "Diverticul*",
        "Fecal Microbiota Transplantation|FMT",
        "GI Motility Disorder*",
        "Capsule Endoscop*",
        "Biliary Tract Disorder*",
        "Nutritional Gastroenterolog*",
    ],

    "Nephrology": [
        "Nephrolog*",
        "Chronic Kidney Disease*|\\bCKD\\b",
        "Acute Kidney Injury*|\\bAKI\\b",
        "End-Stage Renal Disease*|\\bESRD\\b",
        "Kidney Transplantation*",
        "Dialys*",
        "Hemodialysis",
        "Peritoneal Dialys*",
        "Renal Biops*",
        "Glomerulonephrit*",
        "Polycystic Kidney Disease*|\\bPKD\\b",
        "Nephrotic Syndrome*",
        "Electrolyte Imbalance*",
        "Hypertension and Kidney*",
        "Renal Pharmacolog*",
        "Kidney Stone*",
        "Renal Patholog*",
        "Uremi*",
        "Anemia in Chronic Kidney Disease*|CKD Anemia",
        "Renal Replacement Therapy*|\\bRRT\\b",
        "Kidney Function Test*",
        "Proteinuri*",
        "Hematuri*",
        "Renal Nutritional*",
        "Vascular Access*",
        "Renal Genetic*",
        "Diabetic Nephropathy*|Diabetic Kidney Disease",
    ],

    "Critical Care": [
        "Critical Care*",
        "Intensive Care Medicine*",
        "Mechanical Ventilation*",
        "Acute Respiratory Distress Syndrome*|\\bARDS\\b",
        "Sepsis*",
        "Septic Shock*",
        "Cardiac Arrest*",
        "Resuscitation*",
        "Critical Care Nurs*",
        "Trauma Care*",
        "Life Support*",
        "Intensive Care Unit*|\\bICU\\b",
        "Ventilator-associated Pneumonia*|\\bVAP\\b",
        "Extracorporeal Membrane Oxygenation*|\\bECMO\\b",
        "Invasive Monitoring*",
        "Shock*",
        "Multi-organ Failure*",
        "Neurocritical Care*",
        "Critical Care Pharmacolog*",
        "Fluid Resuscitation*",
        "Blood Product Transfusion*",
        "Nosocomial Infection*",
        "Critical Care Outcomes*",
        "Advanced Life Support*|\\bALS\\b",
    ],

    "Post-operative Care": [
        "Post-operative Care*",
        "Post-surgical Recovery*",
        "Surgical Wound Care*",
        "Post-operative Complication*",
        "Pain Management after Surgery*",
        "Rehabilitation after Surgery*",
        "Post-operative Infection*",
        "Wound Infection*",
        "Deep Vein Thrombosis*|DVT Prevention",
        "Pulmonary Embolism* Prevention",
        "Post-operative Mobility*",
        "Post-operative Nutrition*",
        "Surgical Drain Management*",
        "Post-operative Physiotherapy*",
        "Venous Thromboembolism*|VTE Prophylaxis",
        "Incisional Care*",
        "Post-anesthesia Care Unit*|PACU",
        "Post-operative Monitoring*",
        "Suture Care*",
        "Post-operative Delirium*",
        "Post-operative Ileus*",
        "Fluid and Electrolyte Management*",
        "Blood Transfusion* in Post-operative Care",
        "Post-operative Urinary Retention*",
        "Early Ambulation*",
        "Post-operative Respiratory Exercise*",
    ],

    "LLM-NLP": [
        "Natural Language Processing|\\bNLP\\b",
        "Large Language Model*",
        "Transformer Model*",
        "BERT|Bidirectional Encoder Representations from Transformers",
        "\\bGPT\\b|chatgpt|chat-gpt|Generative Pretrained Transformer*",
        "Tokenization",
        "Semantic Analysis",
        "Sentiment Analysis",
        "Machine Translation",
        "Text Summarization",
        "Language Generation",
        "Named Entity Recognition|\\bNER\\b",
        "POS Tagging",
        "Question Answering Systems",
        "Dialogue Systems|Conversational Systems",
        "Speech Recognition",
        "Bias in Language Model*",
        "Language Model Fine-tuning",
        "Text Classification",
        "Word Embedding*",
        "Deep Learning for NLP",
        "Cross-lingual Learning",
        "Conversational AI",
        "NLP in Social Media Analysis",
    ],
    "Machine Vision": [
        "Medical Imaging*",
        # The bare "CT" alternative was removed because of its high occurrence.
        "Computed Tomography*|CT scan|CT imag*",
        "Magnetic Resonance Imaging*|\\bMRI\\b",
        "Enterograph*",
        "Ultrasound Imaging*",
        "Positron Emission Tomography*|\\bPET\\b",
        "Single Photon Emission Computed Tomography*|\\bSPECT\\b",
        "X-ray Imaging*",
        "Digital Pathology*",
        "whole slide imag*",
        "Radiomics*",
        "Image Segmentation*",
        "Image Reconstruction*",
        "3D reconstruct*",
        "Radiogenomics*",
        # Keep the comma after "Radiomic*": without it this entry and the next
        # one are concatenated into "Radiomic*Functional MRI*|fMRI".
        "Radiomic*",
        "Functional MRI*|fMRI",
        "Optical Coherence Tomography*|\\bOCT\\b",
        "Biomedical Imaging*",
        "Neuroimaging*",
        "Image Analysis*",
        "Texture Analysis*",
        "Diagnostic Imaging*",
        "Interventional Radiology*",
    ],

    "Recurrent Learning": [
        "Recurrent Neural Networks*|RNN",
        "Long Short-Term Memory*|LSTM",
        "Gated Recurrent Unit*|GRU",
        "Sequence Prediction*",
        "Time Series Analysis*",
        "Natural Language Processing|NLP",
        "Speech Recognition*",
        "Machine Translation*",
        "Sentiment Analysis*",
        "Text Generation*",
        "Music Generation*",
        "Video Frame Prediction*",
        "Stock Market Prediction*",
        "Bioinformatics*",
        "Deep Learning for Sequences*",
        "Attention Mechanisms*",
        "Sequence to Sequence Models*|Seq2Seq",
        "Neural Machine Translation*|NMT",
        "Language Modeling*",
        "Memory Networks*",
        "Dynamic Computational Graphs*",
        "Continuous Time Recurrent Neural Networks*|CTRNN",
    ],

    "Machine Learning": [
        "Machine Learning*",
        "Supervised Learning*",
        "Unsupervised Learning*",
        "Semi-supervised Learning*",
        "Reinforcement Learning*",
        "Deep Learning*",
        "Neural Networks*",
        "Convolutional Neural Networks*|CNN",
        "Recurrent Neural Networks*|RNN",
        "Long Short-Term Memory*|LSTM",
        "Decision Trees*",
        "Random Forests*",
        "Gradient Boosting Machines*|GBM",
        "Support Vector Machines*|SVM",
        "Natural Language Processing|NLP",
        "Generative Adversarial Networks*|GAN",
        "Feature Engineering*",
        "Model Evaluation*",
        "Hyperparameter Tuning*",
        "Data Augmentation*",
        "Transfer Learning*",
        "Dimensionality Reduction*",
        "Cluster Analysis*",
        "Principal Component Analysis*|PCA",
        "Regularization*",
        "Anomaly Detection*",
        "Predictive Modeling*",
        "Machine Learning Pipelines*",
        "Ethics in Machine Learning*",
        "Bias and Fairness in ML*",
        "Computer-Aided Diagnosis*|\\bCAD\\b",
    ],

    "Deep Learning": [
        "Deep Learning*",
        "Artificial Neural Network*|\\bANN\\b",
        "Convolutional Neural Network*|\\bCNN\\b",
        "Recurrent Neural Networks*|\\bRNN\\b",
        "Long Short-Term Memory*|\\bLSTM\\b",
        "Generative Adversarial Network*|\\bGAN\\b",
        "Deep Reinforcement Learning*",
        "Transfer Learning*",
        "Autoencoders*",
        "Deep Belief Networks*|\\bDBN\\b",
        "Attention Mechanisms*",
        "Neural Architecture Search*|\\bNAS\\b",
        "Feature Learning*",
        "Representation Learning*",
        "Self-supervised Learning*",
        "Semantic Segmentation*",
        "Object Detection*",
        "Image Classification*",
        "Natural Language Understanding*",
        "Speech Recognition*",
        "Machine Translation*",
        "Deep Learning Optimization*",
        "Graph Neural Networks*|\\bGNN\\b",
        "Federated Learning*",
        "Bias and Fairness in Deep Learning*",
        "Explainable AI*|\\bXAI\\b",
        "Adversarial Machine Learning*",
    ],
    "Signal Processing": [
        "Signal Processing*",
        "Digital Signal Processing*|\\bDSP\\b",
        "Analog Signal Processing*",
        "Time-Frequency Analysis*",
        "Fourier Transform*",
        "Fast Fourier Transform*|\\bFFT\\b",
        "Wavelet Transform*",
        "Filter Design*",
        "Adaptive Filtering*",
        "Signal Reconstruction*",
        "Spectral Analysis*",
        "Convolution*",
        "Modulation*",
        "Demodulation*",
        "Sampling Theory*",
        "Quantization*",
        "Signal Enhancement*",
        "Noise Reduction*",
        "Biomedical Signal Processing*",
        "Image Processing*",
        "Audio Signal Processing*",
        "Voice Recognition*",
        "Machine Learning for Signal Processing",
        "Signal Processing in Communications*",
        "Radar Signal Processing*",
        "Sonar Signal Processing*",
        "Seismic Data Processing*",
    ],

    "Bioinformatics": [
        "Bioinformatics*",
        "Computational Genomics*",
        "Computational Biology*",
        "Genome Sequencing*",
        "Genomic Data Mining*",
        "Proteomics Analysis*",
        "Transcriptomics Analysis*",
        "Metabolomics Data Analysis*",
        "Phylogenetic Analysis*",
        "Molecular Dynamics Simulations*",
        "Structural Bioinformatics*",
        "Biological Sequence Analysis*",
        "Functional Genomics*",
        "Comparative Genomics*",
        "Systems Biology*",
        "Network Biology*",
        "Pathway Analysis*",
        "Gene Expression Analysis*",
        "Single-Cell Sequencing*",
        "Machine Learning in Genomics",
        "Bioinformatics Pipelines*",
        "Next-Generation Sequencing*|\\bNGS\\b",
        "Omics Data Integration*",
        "ChIP-seq Analysis*",
        "RNA-Seq Data Analysis*",
        "Variant Calling*",
        "Microbiome Analysis*",
        "Population Genomics*",
        "Epigenomic*",
        "Epigenetic*",
    ],
    # NOTE(review): "Time-series trends" and "Temporal trends" each appear twice
    # below; they are kept as-is because the consumer of this list is not
    # visible here -- confirm whether the duplicates are intentional.
    "time series": [
        "Temporal data",
        "Sequential data",
        "Time-dependent data",
        "Time-based data",
        "Time-varying data",
        "Time sequence",
        "Temporal sequence",
        "Time domain",
        "Temporal patterns",
        "Time-based analysis",
        "Time-series forecasting",
        "Time-series analysis",
        "Time-series prediction",
        "Sequential modeling",
        "Time-evolving data",
        "Time-driven analysis",
        "Time-centric analysis",
        "Time-oriented data",
        "Temporal analytics",
        "Time-series modeling",
        "Temporal information",
        "Time-related data",
        "Time-dependent patterns",
        "Time-series trends",
        "Time-evolving patterns",
        "Time-sensitive data",
        "Time-aware analysis",
        "Time-based modeling",
        "Time-series exploration",
        "Time-series trends",
        "Time-based prediction",
        "Temporal pattern recognition",
        "Temporal trends",
        "Time trend",
        "Time analysis",
        "Time pattern",
        "Time behavior",
        "Time correlation",
        "Time prediction",
        "Temporal trends",
        "Temporal behavior",
        "Temporal correlation",
        "Temporal prediction",
    ],

}

E_allinone = {  # Drafted half with GPT-4 and half with GPT-3.5 (web UI); refine later if needed.
    # Topic name -> list of regex terms. A term is searched case-insensitively
    # by the tagging code, which also expands a literal '*' into '.*'.
    "Cardiology": [
        "cardiovascular (disease|diseases?)", "heart (disease|diseases?)", "\\bMI\\b",
        "angina", "coronary artery disease", "heart failure", "arrhythmia",
        "\\bAF\\b", "cardiac (surgery|catheterization)", "stent",
        "angioplasty", "electrocardiogram", "echocardiography", "hypertension", "stroke",
        "cardiomyopathy", "peripheral arterial disease", "valvular heart disease",
        "(acute|chronic) (heart failure|MI)", "\\b(PCI|CABG|ECG|EKG)\\b",
        "\\b(PAD|VHD)\\b"
    ],
    "Gastroenterology": [
        "\\bgastrointestinal (disease|diseases|cancer)\\b", "\\binflammatory bowel (disease|diseases)|\\bIBD\\b",
        "\\bCrohn('s)? (disease|diseases)\\b", "\\bulcerative colitis\\b", "\\bhepatology\\b",
        "\\bcirrhosis\\b", "\\bhepatitis\\b", "\\bgastroesophageal reflux|\\bGERD\\b",
        "\\bcolonoscopy\\b", "\\bendoscopy\\b", "\\bliver transplant\\b", "\\bpancreatitis\\b",
        "\\bgallstones\\b", "\\bceliac disease\\b", "\\birritable bowel (syndrome|syndromes)|\\bIBS\\b",
        "\\bpeptic ulcer\\b"
    ],
    "Nephrology": [
        "\\bnephrology\\b", "kidney (disease|diseases)", "renal failure",
        "(acute|chronic) kidney (injury|disease)|\\b(AKI|CKD)\\b", "glomerulonephritis",
        "polycystic kidney disease", "nephrotic syndrome", "(hemodialysis|peritoneal dialysis)",
        "kidney transplant", "renal biopsy", "uremia", "electrolyte imbalance",
        "proteinuria", "hematuria", "kidney stones", "renal tubular acidosis", "nephrectomy",
        "end-stage renal disease|\\bESRD\\b", "dialysis"
    ],
    "Critical Care": [
        "\\bcritical care\\b", "intensive (care|therapy|treatment) unit|ICU",
        "\\bcritical illness\\b", "\\bsevere illness\\b", "\\bcritical care medicine\\b", "life support|ventilator",
        "(respiratory )?support|ventilation", "\\bhemodynamic monitoring\\b", "\\borgan support\\b", "\\bsepsis\\b", "\\bshock\\b",
        "(multiple )?organ failure", "\\bintubation\\b", "\\bvasopressor\\b", "(\\bsedation\\b|\\banalgesia\\b)",
        "\\bdelirium\\b", "(acute )?respiratory distress syndrome|ARDS"
    ],
    "post-operative care": [
        "postoperative (care|management)", "perioperative (care|management)",
        "surgical (recovery|aftercare|follow-up)", "post-op (treatment|monitoring)",
        "recovery (phase|care)"
    ],
    "LLM-NLP": [
        # Backslashes are doubled: "\s", "\d" and "\." are not valid string
        # escapes and trigger a SyntaxWarning on Python 3.12+.
        "large (language )?model", "natural language processing", "GPT(-|\\s)?\\d+(\\.\\d+)?",
        "transformer", "machine learning", "artificial intelligence",
        "deep learning", "text (classification|analysis)", "language (understanding|generation)",
        "(NLP|neural network)", "tokenization", "sequence generation",
        "contextual understanding"
    ],
    "Machine Vision": [
        "machine vision", "computer vision", "image (processing|analysis)",
        "(visual|pattern) recognition", "object detection", "deep learning",
        "convolutional neural network", "image (segmentation|classification)",
        "(feature )?extraction", "(scene|visual) understanding",
        "visual perception"
    ],
    # This key used to be listed twice with identical content; a dict keeps
    # only the last occurrence, so a single entry is equivalent.
    "Recurrent Learning": [
        "recurrent (neural )?network", "(RNN|LSTM|GRU)",
        "(long short-term memory|gated recurrent unit)",
        "(sequential|temporal) learning", "time series analysis"
    ],
    "Machine Learning": [
        "machine learning", "artificial intelligence",
        "(supervised|unsupervised) learning", "deep learning",
        "(neural )?network", "(classification|regression|clustering)",
        "(natural language )?processing", "reinforcement learning",
        "(decision tree|random forest|support vector machine|gradient boosting)",
        "(convolutional|recurrent) neural network"
    ],
    "Signal Processing": [
        "signal processing", "(digital|analog) signal processing",
        "time series analysis", "frequency analysis", "filtering",
        "(Fourier|wavelet) transform", "spectral analysis",
        "signal (denoising|reconstruction)", "feature extraction",
        "(pattern|image|audio) processing"
    ],
    "Bioinformatics": [
        "bioinformatics", "(genomics|proteomics|transcriptomics|metabolomics)",
        "computational biology", "(sequence )?analysis", "(sequence )?alignment",
        "gene prediction", "protein structure prediction", "phylogenetics",
        "(functional )?genomics", "systems biology", "biological databases",
        "microarray analysis", "next-generation sequencing",
        "gene expression analysis", "(sequence )?motif", "bioinformatic analysis"
    ],
    "time series": [
        "time series",
        "time series (analysis|forecast*|regress*|cluster*|classif*|decompos*)",
        "\\btemporal data\\b", "\\bsequential data\\b", "\\blongitudinal data\\b",
        "\\bchronological data\\b", "\\btime-point data\\b", "\\btime-stamped data\\b",
        "\\bseasonality\\b", "\\btrend analysis\\b", "\\bautoregression\\b",
        "\\bmoving average\\b", "\\bstationar*\\b", "\\b(AR|MA|ARMA|ARIMA|SARIMA)\\b",
        "\\blag plot\\b"
    ],
}
    
# Baseline for the manually extended dictionary. Its content is, entry for
# entry, the same as E_allinone (defined just above in this cell), so it is
# derived from that mapping instead of repeating the whole literal. Each term
# list is copied so that later edits to one mapping do not leak into the other.
E_allinone_addmanual = {
    topic: list(patterns) for topic, patterns in E_allinone.items()
}

#tags_dic=tags_dic_E6
#tags_dic=E_allinone

#tags_dic=tags_dic_E6
#tags_dic=E_allinone


In [None]:
E_allinone_addmanual = {  # Drafted half with GPT-4 and half with GPT-3.5 (web UI); refine later if needed.
    # Topic name -> list of regex terms. In each list the first group is the
    # generated baseline; the terms after the blank line were added by hand.
    "Cardiology": [
        "cardiovascular (disease|diseases?)", "heart (disease|diseases?)", "\\bMI\\b",
        "angina", "coronary artery disease", "heart failure", "arrhythmia",
        "\\bAF\\b", "cardiac (surgery|catheterization)", "stent",
        "angioplasty", "electrocardiogram", "echocardiography", "hypertension", "stroke",
        "cardiomyopathy", "peripheral arterial disease", "valvular heart disease",
        "(acute|chronic) (heart failure|MI)", "\\b(PCI|CABG|ECG|EKG)\\b",
        "\\b(PAD|VHD)\\b",

        "Cardiac Arrest",
        "coronary artery disease",
        "Cardiometabolic",
        "Peripheral Artery Disease",
    ],
    "Gastroenterology": [
        "\\bgastrointestinal (disease|diseases|cancer)\\b", "\\binflammatory bowel (disease|diseases)|\\bIBD\\b",
        "\\bCrohn('s)? (disease|diseases)\\b", "\\bulcerative colitis\\b", "\\bhepatology\\b",
        "\\bcirrhosis\\b", "\\bhepatitis\\b", "\\bgastroesophageal reflux|\\bGERD\\b",
        "\\bcolonoscopy\\b", "\\bendoscopy\\b", "\\bliver transplant\\b", "\\bpancreatitis\\b",
        "\\bgallstones\\b", "\\bceliac disease\\b", "\\birritable bowel (syndrome|syndromes)|\\bIBS\\b",
        "\\bpeptic ulcer\\b"
    ],
    "Nephrology": [
        "\\bnephrology\\b", "kidney (disease|diseases)", "renal failure",
        "(acute|chronic) kidney (injury|disease)|\\b(AKI|CKD)\\b", "glomerulonephritis",
        "polycystic kidney disease", "nephrotic syndrome", "(hemodialysis|peritoneal dialysis)",
        "kidney transplant", "renal biopsy", "uremia", "electrolyte imbalance",
        "proteinuria", "hematuria", "kidney stones", "renal tubular acidosis", "nephrectomy",
        "end-stage renal disease|\\bESRD\\b", "dialysis",

        "acute kidney injury",
        "Chronic Kidney Disease",
        "Nephrolog*",
        "kidney function",
        "\\bAKI",
        "renal function",
        "Albuminuria",
        "Hemodialysis",
        "Glomerulopathy",
        "Genetically",
        "Kidney Disease",
        "Glomerular"
    ],
    "Critical Care": [
        "\\bcritical care\\b", "intensive (care|therapy|treatment) unit|ICU",
        "\\bcritical illness\\b", "\\bsevere illness\\b", "\\bcritical care medicine\\b", "life support|ventilator",
        "(respiratory )?support|ventilation", "\\bhemodynamic monitoring\\b", "\\borgan support\\b", "\\bsepsis\\b", "\\bshock\\b",
        "(multiple )?organ failure", "\\bintubation\\b", "\\bvasopressor\\b", "(\\bsedation\\b|\\banalgesia\\b)",
        "\\bdelirium\\b", "(acute )?respiratory distress syndrome|ARDS"
    ],
    "post-operative care": [
        "postoperative (care|management)", "perioperative (care|management)",
        "surgical (recovery|aftercare|follow-up)", "post-op (treatment|monitoring)",
        "recovery (phase|care)",

        "postoperative",
        "Preoperative",
        "reoperation",
        "Bariatric Surgery",
        "Readmission",
        "Fracture"
    ],
    "LLM-NLP": [
        # Backslashes are doubled: "\s", "\d" and "\." are not valid string
        # escapes and trigger a SyntaxWarning on Python 3.12+.
        "large (language )?model", "natural language processing", "GPT(-|\\s)?\\d+(\\.\\d+)?",
        "transformer", "machine learning", "artificial intelligence",
        "deep learning", "text (classification|analysis)", "language (understanding|generation)",
        "(NLP|neural network)", "tokenization", "sequence generation",
        "contextual understanding",

        "Natural Language Processing"
    ],
    "Machine Vision": [
        "machine vision", "computer vision", "image (processing|analysis)",
        "(visual|pattern) recognition", "object detection", "deep learning",
        "convolutional neural network", "image (segmentation|classification)",
        "(feature )?extraction", "(scene|visual) understanding",
        "visual perception"
    ],
    "Recurrent Learning": [
        "recurrent (neural )?network", "(RNN|LSTM|GRU)",
        "(long short-term memory|gated recurrent unit)",
        "(sequential|temporal) learning", "time series analysis"
    ],

    "Machine Learning": [
        "machine learning", "artificial intelligence",
        "(supervised|unsupervised) learning", "deep learning",
        "(neural )?network", "(classification|regression|clustering)",
        "(natural language )?processing", "reinforcement learning",
        "(decision tree|random forest|support vector machine|gradient boosting)",
        "(convolutional|recurrent) neural network",

        "Machine Learning"
    ],
    "Signal Processing": [
        "signal processing", "(digital|analog) signal processing",
        "time series analysis", "frequency analysis", "filtering",
        "(Fourier|wavelet) transform", "spectral analysis",
        "signal (denoising|reconstruction)", "feature extraction",
        "(pattern|image|audio) processing",

        "Signal Processing*",
        "Digital Signal Processing*|\\bDSP\\b",
        "Analog Signal Processing*",
        "Time-Frequency Analysis*",
        "Fourier Transform*",
        "Fast Fourier Transform*|\\bFFT\\b",
        "Wavelet Transform*",
        "Filter Design*",
        "Adaptive Filtering*",
        "Signal Reconstruction*",
        "Spectral Analysis*",
        "Convolution*",
        "Modulation*",
        "Demodulation*",
        "Sampling Theory*",
        "Quantization*",
        "Signal Enhancement*",
        "Noise Reduction*",
        "Biomedical Signal Processing*",
        "Image Processing*",
        "Audio Signal Processing*",
        "Voice Recognition*",
        "Machine Learning for Signal Processing",
        "Signal Processing in Communications*",
        "Radar Signal Processing*",
        "Sonar Signal Processing*",
        "Seismic Data Processing*"
    ],
    "Bioinformatics": [
        "bioinformatics", "(genomics|proteomics|transcriptomics|metabolomics)",
        "computational biology", "(sequence )?analysis", "(sequence )?alignment",
        "gene prediction", "protein structure prediction", "phylogenetics",
        "(functional )?genomics", "systems biology", "biological databases",
        "microarray analysis", "next-generation sequencing",
        "gene expression analysis", "(sequence )?motif", "bioinformatic analysis",

        "Epigenetic",
        "Genome-Wide",
        "gene expression",
        "Proteomic",
        "Whole-genome sequencing",
        "genotype",
        "Polygenic",
        "gene expression",
        "Genetic pleiotropy",
        "Phenome-Wide",
        "Whole genome sequence"
    ],
    "time series": [
        "time series",
        "time series (analysis|forecast*|regress*|cluster*|classif*|decompos*)",
        "\\btemporal data\\b", "\\bsequential data\\b", "\\blongitudinal data\\b",
        "\\bchronological data\\b", "\\btime-point data\\b", "\\btime-stamped data\\b",
        "\\bseasonality\\b", "\\btrend analysis\\b", "\\bautoregression\\b",
        "\\bmoving average\\b", "\\bstationar*\\b", "\\b(AR|MA|ARMA|ARIMA|SARIMA)\\b",
        "\\blag plot\\b",

        "Temporal data",
        "Sequential data",
        "Time-dependent data",
        "Time-based data",
        "Time-varying data",
        "Time sequence",
        "Temporal sequence",
        "Time domain",
        "Temporal patterns",
        "Time-based analysis",
        "Time-series forecasting",
        "Time-series analysis",
        "Time-series prediction",
        "Sequential modeling",
        "Time-evolving data",
        "Time-driven analysis",
        "Time-centric analysis",
        "Time-oriented data",
        "Temporal analytics",
        "Time-series modeling",
        "Temporal information",
        "Time-related data",
        "Time-dependent patterns",
        "Time-series trends",
        "Time-evolving patterns",
        "Time-sensitive data",
        "Time-aware analysis",
        "Time-based modeling",
        "Time-series exploration",
        "Time-series trends",
        "Time-based prediction",
        "Temporal pattern recognition",
        "Temporal trends",
        "Time trend",
        "Time analysis",
        "Time pattern",
        "Time behavior",
        "Time correlation",
        "Time prediction",
        "Temporal trends",
        "Temporal behavior",
        "Temporal correlation",
        "Temporal prediction"
    ],
    "Deep Learning": [
        "Deep Learning*",
        "Artificial Neural Network*|\\bANN\\b",
        "Convolutional Neural Network*|\\bCNN\\b",
        "Recurrent Neural Networks*|\\bRNN\\b",
        "Long Short-Term Memory*|\\bLSTM\\b",
        "Generative Adversarial Network*|\\bGAN\\b",
        "Deep Reinforcement Learning*",
        "Transfer Learning*",
        "Autoencoders*",
        "Deep Belief Networks*|\\bDBN\\b",
        "Attention Mechanisms*",
        "Neural Architecture Search*|\\bNAS\\b",
        "Feature Learning*",
        "Representation Learning*",
        "Self-supervised Learning*",
        "Semantic Segmentation*",
        "Object Detection*",
        "Image Classification*",
        "Natural Language Understanding*",
        "Speech Recognition*",
        "Machine Translation*",
        "Deep Learning Optimization*",
        "Graph Neural Networks*|\\bGNN\\b",
        "Federated Learning*",
        "Bias and Fairness in Deep Learning*",
        "Explainable AI*|\\bXAI\\b",
        "Adversarial Machine Learning*"
    ],

    "COVID-19": [
        "COVID-19", "SARS-CoV-2"
    ]
}


# Dictionary used by the tagging step below.
tags_dic = E_allinone_addmanual

In [None]:
# function to get a string (title/abstract) and a list of strings [term list]
# [Done] Adding 1 in the 'other' column, when no keyword is present
# [ ] The terms are incorrectly added in the loop perhaps (something is wrong with adding the topics e.g. Evaluating the role of ChatGPT in gastroenterology: a comprehensive systematic review of applications, benefits, and limitations.)

import pandas as pd
import re
import openpyxl

# Given function to check keyword existence using regex
def check_keyword_existence_with_regex(title_abstract, keyword_equivalents):
    """Return 1 if any term matches *title_abstract*, otherwise 0.

    Parameters:
    - title_abstract: text to search.
    - keyword_equivalents: iterable of regex terms. A literal '*' in a term is
      expanded to '.*' before the term is used as a regular expression.

    Matching is case-insensitive and stops at the first term that matches.
    """
    regex_patterns = [keyword.replace('*', '.*') for keyword in keyword_equivalents]
    for pattern in regex_patterns:
        if re.search(pattern, title_abstract, re.IGNORECASE):
            return 1  # Pattern found
    return 0  # No pattern found



# Function to add a column for each keyword in the DataFrame
def add_tags_to_df(docs_df, tags_dic):
    """Return a copy of *docs_df* with one 0/1 column per topic plus 'others'.

    Parameters:
    - docs_df: DataFrame with 'Article_ArticleTitle' and 'Article_AbstractText'
      columns. It is not modified.
    - tags_dic: mapping of topic name -> list of regex terms.

    Returns:
    - A new DataFrame where each topic column is 1 when any of its terms
      matches the row's "title abstract" text and 0 otherwise, and 'others'
      is 1 for rows that matched no topic at all and 0 otherwise.
    """
    # Work on a copy so the caller's DataFrame stays untouched.
    temp_df = docs_df.copy()

    # Created first so 'others' keeps its position ahead of the topic columns.
    temp_df['others'] = 0

    # The searchable text depends only on the row, so build it once instead of
    # once per topic. str() turns missing titles/abstracts into plain text.
    texts = [
        " ".join([str(title), str(abstract)])
        for title, abstract in zip(
            temp_df['Article_ArticleTitle'], temp_df['Article_AbstractText']
        )
    ]

    for keyword, equivalents in tags_dic.items():
        temp_df[keyword] = [
            check_keyword_existence_with_regex(text, equivalents) for text in texts
        ]

    # Rows where no topic matched are flagged with 1 (an integer, like the
    # topic columns, rather than a boolean).
    keyword_columns = list(tags_dic.keys())
    temp_df['others'] = (temp_df[keyword_columns].sum(axis=1) == 0).astype(int)

    return temp_df

def get_rows_without_keywords(docs_df_tagged, tags_dic):
    """
    Print per-topic hit counts and return the rows that carry no topic tag.

    Parameters:
    - docs_df_tagged: DataFrame that already holds one column per topic.
    - tags_dic: Dictionary whose keys are the topic (column) names.

    Returns:
    - The subset of docs_df_tagged whose topic columns sum to zero. Topics in
      tags_dic that have no column in the DataFrame are reported and ignored.
    """
    present_columns = []
    for topic in tags_dic:
        if topic not in docs_df_tagged.columns:
            print(f"--XX-Column '{topic}' not found in the DataFrame.")
            continue
        count = docs_df_tagged[topic].sum()
        print(f"---Count of '{topic}': {count}")
        present_columns.append(topic)

    # A row is untagged when every available topic column is zero for it.
    untagged_mask = docs_df_tagged[present_columns].sum(axis=1) == 0
    print(f"Count of rows without any valid keyword: {untagged_mask.sum()}")

    return docs_df_tagged[untagged_mask]



# Tag every record of clean_docs_df with the active topic dictionary (tags_dic).
docs_df_tagged = add_tags_to_df(clean_docs_df, tags_dic)



# Reference notes: the text below is a static literal recording counts for
# different tag dictionaries; nothing in it is recomputed when this cell runs.
print(""" 
The E_allinone_addmanual result:
    
---Count of 'Cardiology': 94
---Count of 'Gastroenterology': 7
---Count of 'Nephrology': 128
---Count of 'Critical Care': 106
---Count of 'post-operative care': 22
---Count of 'LLM-NLP': 69
---Count of 'Machine Vision': 23
---Count of 'Recurrent Learning': 3
---Count of 'Machine Learning': 129
---Count of 'Signal Processing': 1
---Count of 'Bioinformatics': 120
---Count of 'time series': 3
---Count of 'COVID-19': 79
Count of rows without any valid keyword: 36

____________________________________
Winner of raw prompts is:  tags_dic_E6
_____________________________________

The E6 results:
---Count of 'Cardiology': 62
---Count of 'Gastroenterology': 9
---Count of 'Nephrology': 103
---Count of 'Critical Care': 45
---Count of 'Post-operative Care': 6
---Count of 'LLM-NLP': 18
---Count of 'Machine Vision': 10
---Count of 'Recurrent Learning': 13
---Count of 'Machine Learning': 74
---Count of 'Deep Learning': 26
---Count of 'Signal Processing': 5
---Count of 'Bioinformatics': 7
---Count of 'time series': 2
Count of rows without any valid keyword: 114


The tags_dic_E6 result:
---Count of 'Cardiology': 80
---Count of 'Gastroenterology': 6
---Count of 'Nephrology': 105
---Count of 'Critical Care': 105
---Count of 'post-operative care': 0
---Count of 'LLM-NLP': 56
---Count of 'Machine Vision': 20
---Count of 'Recurrent Learning': 3
---Count of 'Machine Learning': 115
---Count of 'Signal Processing': 1
---Count of 'Bioinformatics': 80
---Count of 'time series': 2
Count of rows without any valid keyword: 102
""")

# Report per-topic counts and list the records that received no topic tag.
# The tagged frame has to be passed here: add_tags_to_df works on a copy, so
# clean_docs_df itself never receives the topic columns this report reads.
docs_df_no_keywords = get_rows_without_keywords(docs_df_tagged, tags_dic)
print()
for index, row in docs_df_no_keywords.iterrows():
    print(row['Article_ArticleTitle'])
    print(row['Article_AbstractText'])
    print("--------")

##### Step 1-4: Preparing the result for a clean table/posting to WP

In [None]:
# Base name (without extension) of the CSV file written by this cell.
fileanme_date = 'websitedf_cleantaged_20240305'

# Columns carried into the website table: article metadata first, then the
# topic flag columns.
columns_to_keep = [
    "Article_ArticleTitle",
    "clean_abstract",
    "clean_date",
    "clean_doi",
    "clean_authors",
    "Cardiology",
    "Gastroenterology",
    "Nephrology",
    "Critical Care",
    "post-operative care",
    "LLM-NLP",
    "Machine Vision",
    "Recurrent Learning",
    "Deep Learning",
    "others",
    "Machine Learning",
    "Signal Processing",
    "Bioinformatics",
    "time series",
    "COVID-19",
]

# Independent working copy restricted to those columns; docs_df_tagged itself
# is left unchanged.
df_temp = docs_df_tagged[columns_to_keep].copy()

# Empty frame that receives the export columns one by one further down.
csv_format_4website = pd.DataFrame()

topic_columns = [
    # Flag columns, in the order their names appear in the tag string.
    "Cardiology", "Gastroenterology", "Nephrology", "Critical Care",
    "post-operative care", "LLM-NLP", "Machine Vision", "Recurrent Learning", "Deep Learning", "others",
    "Machine Learning", "Signal Processing", "Bioinformatics", "time series", "COVID-19"
]


def concatenate_topics(row):
    """Return the comma-separated tag string for one row.

    Every column listed in topic_columns whose value equals 1 contributes its
    name; the fixed tag '4research' is always appended last. When no topic
    column is set the result is just '4research' (no leading separator, which
    would otherwise produce an empty tag).
    """
    tags = [col for col in topic_columns if row[col] == 1]
    tags.append('4research')
    return ', '.join(tags)
# Row-wise apply (axis=1): store each row's tag string in a new 'Topics' column.
df_temp['Topics'] = df_temp.apply(concatenate_topics, axis=1)

#create unique id 
import re
from datetime import datetime

def get_eight_numbers(string):
    """Derive a numeric id string from *string*.

    The argument is converted with str(), all digit runs in it are joined in
    order, and the trailing nine digits are returned (the whole digit string
    when it is shorter). Note that this is nine digits, despite the function
    name.

    When the text contains no digit at all, a notice is printed and the
    current local time formatted as YYYYmmddHHMMSS is returned instead.
    """
    digits = ''.join(re.findall(r'\d+', str(string)))

    if not digits:
        # Nothing numeric to build an id from: fall back to a timestamp.
        stamp = datetime.now().strftime("%Y%m%d%H%M%S")
        print('formatted_time used and no doi was available')
        return stamp

    # Slicing with -9 also covers the short case: it yields the full string.
    return digits[-9:]


# Parallel lists, one entry per row of df_temp; they become the columns of the
# export table below.
post_id=[]
post_name=[]
post_title=[]
post_content=[]
post_date=[]
post_tags=[]

for index, row in df_temp.iterrows():
    title=row['Article_ArticleTitle']
    post_title.append(title)
    
    doi=row['clean_doi']
    authors=row['clean_authors']
    abstract=row['clean_abstract']
    
    # HTML body of the post: link, authors, abstract.
    # NOTE(review): the "</b>" at the end of the first two lines has no opening
    # "<b>"; possibly "<br>" (line break) was intended -- confirm before changing.
    # NOTE(review): doi/authors/abstract are interpolated without HTML escaping.
    content=f"""Link: <a href="{doi}">{doi}</a></b>
    <em>Authors: {authors}</em></b>
    
    Abstract: {abstract} """
    
    post_content.append(content)
    
    
    # The id is built from the digits of the DOI by get_eight_numbers.
    postid=get_eight_numbers(str(row["clean_doi"]))
    post_id.append(postid)
    
    postname=f"4research_{postid}"
    post_name.append(postname)

    post_date.append(row['clean_date'])
    post_tags.append(row['Topics'])
    
# Sanity check: print the length of every list built above.
print(f"""
      post_id:{len(post_id)}
      post_name:{len(post_name)}
      post_title:{len(post_title)}
      post_content:{len(post_content)}
      post_date:{len(post_date)}
      post_tags:{len(post_tags)}
      """)



# Assemble the export table; scalar values are repeated for every row.
csv_format_4website['post_id']=post_id
csv_format_4website['post_name']=post_name
csv_format_4website['post_author']='amirsafavi'
csv_format_4website['post_date']=post_date
csv_format_4website['post_type']='post'
csv_format_4website['post_status']='publish'
csv_format_4website['post_title']=post_title
csv_format_4website['post_content']=post_content
csv_format_4website['post_category']='4research'
csv_format_4website['post_tags']=post_tags
csv_format_4website['custom_field']=''


# Drop rows with an empty title (probably titles that started with '{' and were turned into NaN earlier)
csv_format_4website.dropna(subset=['post_title'], inplace=True)

import csv
# Write "<fileanme_date>.csv" without the index column, quoting every field.
csv_format_4website.to_csv(f"{fileanme_date}.csv", index=False,quoting=csv.QUOTE_ALL)

# Step 1-website: Preparing the dataframe for the website

##### Step 2-1: How to get large chunk of data (i.e. 500,000) -> Enhancing Step 1-1

##### Step 2-2: Adding GPT for tagging/cleaning -> Enhancing Step 1-2 and 1-3

##### Step 2-3: Turning this into CSS/HTML page with filtering feature -> Enhancing Step 1-4

## PREVIOUS CODE & TRASH

In [None]:
# Previous Solution 2023
# Legacy harvest: search PubMed one [Date - Entry] day at a time, split each
# day into two complementary chunks (with / without "image*" in title or
# abstract), fetch the hits and append one row per article to `df`.
import calendar
import pandas as pd
import time

import eutils  # used right below; imported here so the cell runs on its own

start_time = time.time()

# NOTE(review): the API key is hard-coded in the notebook; consider loading it
# from the environment and rotating this one.
api_key="05b7ff399af9219b0537c9b74db39bdf0709"
ec = eutils.Client(api_key=api_key)

df=pd.DataFrame(columns=["Title","Abstract","Journal","Year","Authors","Article_Type"])


# Raw string: a backslash would be sent to PubMed literally, so quotes are
# written unescaped, and every OR is followed by a space.
term= r"""((Computer-Assist*[tiab] OR "Computer Assist*"[tiab] OR "Computer Aid*"[tiab] OR Computer-Aid*[tiab] OR "Artificial Intelligen*"[tiab] OR "Deep Learning*"[tiab] OR "Machine Learning"[tiab] OR "Deep learning"[tiab] OR "Neural network*"[tiab] OR "Random forest*"[tiab] OR "Decision tree*"[tiab] OR "Support Vector Machine*"[tiab] OR "naïve bayes"[tiab] OR "k-Nearest Neighbor*"[tiab] OR "Gradient Boosting"[Tiab]))"""

n=1
total_article=0
warning_list=[]
for year in [str(y) for y in range(2004, 2024)]:
    for month in [f"{m:02d}" for m in range(1, 13)]:
        # Only query days that exist in this month (no 02/30, 04/31, ...).
        days_in_month = calendar.monthrange(int(year), int(month))[1]
        for day in [f"{d:02d}" for d in range(1, days_in_month + 1)]:
            for chunker in ['AND "image*"[Tiab]', 'NOT "image*"[Tiab]']:
                Date='AND (("{}/{}/{}"[Date - Entry] : "{}/{}/{}"[Date - Entry]))'.format(year,month,day,year,month,day)
                # Join with spaces so the boolean operators do not fuse with
                # the closing parentheses of the previous part.
                search_term=" ".join([term, Date, chunker])
                Lit = ec.esearch(db='pubmed',term=search_term)

                end_time = time.time()
                elapsed_time = end_time - start_time
                print(search_term)
                print("\n number of articles = {}. \n but AI.id has {} ids withing it.".format(Lit.count, len(Lit.ids)))
                total_article=total_article+Lit.count

                # NOTE(review): 249 presumably mirrors the number of ids a
                # single esearch call returns - confirm against the client.
                if Lit.count > 249 :
                    warning="warningggggg: for search term {} you should do the search again".format(search_term)
                    # list.append() returns None, so its result must not be
                    # assigned back to warning_list.
                    warning_list.append(warning)
                    print (warning)

                # An empty chunk says nothing about the complementary chunk,
                # so move on to it instead of leaving the loop.
                if Lit.count == 0:
                    continue

                Lit_get = ec.efetch(db='pubmed', id=Lit.ids)
                iterator = iter(Lit_get)
                lit_list = list(iterator)
                length = len(lit_list)

                for get1 in lit_list:
                    title=get1.title
                    abstract=get1.abstract
                    authors=get1.authors
                    journal=get1.jrnl
                    # Kept separate from the loop variable `year`, which is
                    # still needed to build the next date filters.
                    pub_year=get1.year
                    Atype=get1.pub_types

                    output=[title,abstract,journal,pub_year,authors,Atype]

                    df.loc[len(df)] = output

                # n counts the (day, chunk) queries that returned articles.
                n=n+1
df
print("Elapsed time for loop {} is {}. \n total articles should be: {}".format(n,elapsed_time,total_article))


#------------------
df.info(verbose = False)

#---------------
# Single-day probe of the search: builds the same query as the harvest loop
# for one entry date and prints how many articles match.
#pip install eutils
import eutils
# NOTE(review): the API key is hard-coded in the notebook; consider loading it
# from the environment and rotating this one.
api_key="05b7ff399af9219b0537c9b74db39bdf0709"
ec = eutils.Client(api_key=api_key)

# search for all AI in pubmed
Start="2023/03/01"
End="2023/03/01"
Date='AND (("{}"[Date - Entry] : "{}"[Date - Entry]))'.format(Start,End)

term= '((Computer-Assist*[tiab] OR "Computer Assist*"[tiab] OR "Computer Aid*"[tiab] OR Computer-Aid*[tiab] OR "Artificial Intelligen*"[tiab] OR "Deep Learning*"[tiab] OR "Machine Learning"[tiab] OR "Deep learning"[tiab] OR "Neural network*"[tiab] OR "Random forest*"[tiab] OR "Decision tree*"[tiab] OR "Support Vector Machine*"[tiab] OR "naïve bayes"[tiab] OR "k-Nearest Neighbor*"[tiab] OR "Gradient Boosting"[Tiab]))'

# Parts are joined with spaces, and the filter carries no trailing quote:
# an extra '"' at the end would leave the query with unbalanced quotes.
search_term=" ".join([term, Date, 'NOT "image*"[Tiab]'])


Lit = ec.esearch(db='pubmed',term=search_term)



print("\n number of articles = {}. \n but AI.id has {} ids withing it.".format(Lit.count, len(Lit.ids)))

print(Date)

#----------
# Scratch check: an empty frame with the article columns receives the
# `output` row left over from a previous run.

import pandas as pd
record_columns = ["Title","Abstract","Journal","Year","Authors","Article_Type"]
df=pd.DataFrame(columns=record_columns)
df.loc[len(df)] = output
df

#---------
# Fetch the records behind the ids of the last search.
Lit_get = ec.efetch(db='pubmed', id=Lit.ids)
iterator = iter(Lit_get)


#---------
# Drain the iterator into a list so the records can be counted and reused.
lit_list = list(iterator)
length = len(lit_list)
print("Length of the iterator:", length)
lit_list


# Start again from an empty frame and add one row per fetched record.
df=pd.DataFrame(columns=record_columns)
for record in lit_list:
    title, abstract, authors = record.title, record.abstract, record.authors
    journal, year, Atype = record.jrnl, record.year, record.pub_types

    # Row order follows the column order of the frame.
    output = [title, abstract, journal, year, authors, Atype]
    df.loc[len(df)] = output
df


#------------------
# Inspect one fetched record field by field.
# `iterator` has already been drained by `lit_list = list(iterator)`, so
# next(iterator) would raise StopIteration; read the first stored record.
get1 = lit_list[0]

title=get1.title
abstract=get1.abstract
authors=get1.authors
journal=get1.jrnl
year=get1.year
Atype=get1.pub_types

# Same field order as the columns of `df`.
output=[title,abstract,journal,year,authors,Atype]
for i in output:
    print(i)

In [None]:
# RAG

!pip install -qU \
    nemoguardrails==0.4.0 \
    pinecone-client==2.2.2 \
    datasets==2.14.3 \
    openai==0.27.8
    
    
#------
# Locate the retrieval sources: every PDF below the data folder (searched
# recursively) plus the spreadsheet of guidelines.

import os
import glob
import pandas as pd

pdf_directory= r"D:\MY WORK\MaGHALE\Guidline GI LLM\Data"
pdf_search_pattern = os.path.join(pdf_directory, '**/*.pdf')
pdf_files = glob.glob(pdf_search_pattern, recursive=True)

# Despite its name, this variable holds the path of the workbook itself.
excel_directory=r"D:\MY WORK\MaGHALE\Guidline GI LLM\Data\guidelines.xlsx"
excel_guide=pd.read_excel(excel_directory)

# List what was found, one path per line, followed by the total.
for found_pdf in pdf_files:
    print(found_pdf)
print(f"Total count of PDF files: {len(pdf_files)}")

excel_guide

#----
# Print every entry of the "Title" column of the guideline sheet.
for guideline_title in excel_guide["Title"]:
    print(guideline_title)

#----
# Second pass over the same column (same output as above).
for guideline_title in excel_guide["Title"]:
    print(guideline_title)
# Unfinished draft of a completion request. The two assignments below have no
# valid right-hand side, so this snippet does not parse as Python as written.
# NOTE(review): `openai` and `model_engine` are not defined before this point
# in the visible part of the file - confirm where they were meant to come from.
i= 

prompt= For the 
completion = openai.Completion.create(
    engine=model_engine,
    prompt=prompt,
    max_tokens=1024,
    n=1, # we want one answer
    stop=None, # no special stopping rule
    temperature=0.5 # internal parameter for the LLM, you can play with it
)


#-----
#pip install openai
pip install langchain
pip install tiktoken
pip install chromadb
pip install untructured

#--------
## Install these packages
# pip install openai langchain tiktoken chromadb unstructured

# Import modules
import openai
import os
import sys

#import class from modules

from langchain.chains import ConversationalRetrievalChain, RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader, DirectoryLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.indexes import VectorstoreIndexCreator
from langchain.indexes.vectorstore import VectorStoreIndexWrapper

# Create an API key on the OpenAI website and place it here. To keep the key
# out of this file, store it in a separate "constant" module and assign
# constant.APIKEY instead.
os.environ["OPENAI_API_KEY"] = ""


# The first command-line argument, when there is one, becomes the first query.

query = sys.argv[1] if len(sys.argv) > 1 else None


# Read the custom dataset from mydata/ and split it into chunks
# (chunk_size=1000, no overlap). Other loaders (pdf, text file, html file,
# WebBaseLoader) can be swapped in here.
loader = DirectoryLoader("mydata/")
documents = loader.load()
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(documents)

# Embed the chunks and keep them in a Chroma vector store; retrievers are
# derived from this store below.
embeddings = OpenAIEmbeddings()
docsearch = Chroma.from_documents(texts, embeddings)

# Plain question-answering object over the store.
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
)

# Conversational variant used by the chat loop: a chat model combined with a
# retriever configured with search_kwargs {"k": 1}.
chain = ConversationalRetrievalChain.from_llm(
    llm=ChatOpenAI(model="gpt-3.5-turbo"),
    retriever=docsearch.as_retriever(search_kwargs={"k": 1}),
)

# Interactive question loop.
# - chat_history collects (question, answer) pairs and is handed to the chain
#   on every call.
# - When no query is pending (none given at start-up, or the previous one has
#   been answered) the user is prompted for one.
# - "quit", "q" or "exit" ends the program through sys.exit().
# - Otherwise the chain is called, its answer is printed and stored, and the
#   loop asks again.

chat_history = []
while True:
    query = query or input("Prompt: ")
    if query in ('quit', 'q', 'exit'):
        sys.exit()

    result = chain({"question": query, "chat_history": chat_history})
    answer = result['answer']
    print(answer)

    chat_history.append((query, answer))
    query = None
