<a href="https://colab.research.google.com/github/Sug-ar-N-Spice/Dr.Chats/blob/Simon/Pub_med_Simon_DR-Chat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install langchain_openai
!pip install openai
!pip install google-api-python-client
!pip install google-auth-oauthlib
!pip install google-auth-httplib2
!pip install dotenv


Collecting dotenv
  Downloading dotenv-0.0.5.tar.gz (2.4 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [5]:
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
import os
from openai import OpenAI
from google.colab import userdata
userdata.get('secretName')

ModuleNotFoundError: No module named 'dotenv'

In [6]:
import requests
import pandas as pd
import xml.etree.ElementTree as ET
from pathlib import Path
import time
from tqdm import tqdm
import re
from collections import Counter

class PMCOpenAccessCollector:
    def __init__(self, save_dir: str = "pmc_data", email: str = None):
        """
        Initialize the collector with a directory to save downloaded files.
        """
        self.save_dir = Path(save_dir)
        self.save_dir.mkdir(exist_ok=True)
        self.email = "NCBI_API_KEY"

        # API endpoints
        self.esearch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
        self.efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

    def search_articles(self, query: str, max_results: int = 100) -> list:
        """
        Search for articles matching the query in PMC.
        """
        params = {
            'db': 'pmc',
            'term': f"{query} AND open access[filter]",  # Only get open access articles
            'retmax': max_results,
            'retmode': 'xml',
            'tool': 'PMCOpenAccessCollector',
            'API_KEY': self.email
        }

        response = requests.get(self.esearch_url, params=params)
        if response.status_code != 200:
            raise Exception(f"Search failed with status code: {response.status_code}")

        root = ET.fromstring(response.content)
        id_list = root.findall('.//IdList/Id')
        return [id_elem.text for id_elem in id_list]

    def fetch_article(self, pmcid: str) -> dict:
        """
        Fetch and parse a single article by PMC ID.
        """
        params = {
            'db': 'pmc',
            'id': pmcid,
            'retmode': 'xml',
            'tool': 'PMCOpenAccessCollector',
            'API_KEY': self.email
        }

        try:
            response = requests.get(self.efetch_url, params=params)
            if response.status_code != 200:
                print(f"Failed to fetch PMCID {pmcid}")
                return None

            root = ET.fromstring(response.content)

            # Extract article data
            article_data = {
                'pmc_id': pmcid,
                'title': '',
                'abstract': '',
                'full_text': '',
                'keywords': [],
                'publication_date': '',
                'journal': ''
            }

            # Extract title
            title_elem = root.find(".//article-title")
            if title_elem is not None and title_elem.text:
                article_data['title'] = title_elem.text

            # Extract abstract
            abstract_paras = root.findall(".//abstract//p")
            article_data['abstract'] = " ".join(
                p.text for p in abstract_paras if p is not None and p.text
            )

            # Extract main text
            body_paras = root.findall(".//body//p")
            article_data['full_text'] = " ".join(
                p.text for p in body_paras if p is not None and p.text
            )

            # Extract keywords
            kwd_group = root.findall(".//kwd-group/kwd")
            article_data['keywords'] = [
                k.text for k in kwd_group if k is not None and k.text
            ]

            # Extract journal title
            journal_elem = root.find(".//journal-title")
            if journal_elem is not None and journal_elem.text:
                article_data['journal'] = journal_elem.text

            # Extract publication date
            pub_date = root.find(".//pub-date[@pub-type='epub']")
            if pub_date is not None:
                year = pub_date.find('year')
                month = pub_date.find('month')
                day = pub_date.find('day')
                date_parts = []
                for part in [year, month, day]:
                    if part is not None and part.text:
                        date_parts.append(part.text)
                article_data['publication_date'] = '-'.join(date_parts)

            return article_data

        except Exception as e:
            print(f"Error processing PMCID {pmcid}: {str(e)}")
            return None

    def collect_dataset(self, queries: list, max_articles_per_query: int = 25) -> pd.DataFrame:
        """
        Collect and process articles based on specific queries.
        """
        all_articles = []

        for query in queries:
            print(f"\nProcessing query: {query}")

            # Search for articles
            pmcids = self.search_articles(query, max_results=max_articles_per_query)
            print(f"Found {len(pmcids)} matching articles")

            # Fetch each article
            print("Fetching articles...")
            for pmcid in tqdm(pmcids):
                article_data = self.fetch_article(pmcid)
                if article_data:
                    article_data['search_query'] = query
                    all_articles.append(article_data)
                time.sleep(0.34)  # Respect NCBI's rate limits

        return pd.DataFrame(all_articles)

def main():
    collector = PMCOpenAccessCollector()
    queries = [
        "STI",
        "HIV",
        "AIDS",
        "STD",
        "Gonorrhea",
        "Chlamydia",
    ]

    # Collect data
    print("Starting data collection...")
    df = collector.collect_dataset(
        queries=queries,
        max_articles_per_query=25  # Adjust as needed
    )

    # Save to CSV
    output_file = "pmc_dataset_simon.csv"
    df.to_csv(output_file, index=False)

    # Display statistics
    print("\nDataset Statistics:")
    print(f"Total articles: {len(df)}")
    print("\nArticles per query:")
    print(df['search_query'].value_counts())
    print("\nSample titles:")
    print(df[['title', 'search_query']].head())

    # Save some sample text for verification
    with open("sample_articles.txt", "w", encoding="utf-8") as f:
        for _, row in df.head().iterrows():
            f.write(f"Title: {row['title']}\n")
            f.write(f"Query: {row['search_query']}\n")
            f.write(f"Abstract: {row['abstract'][:500]}...\n")
            f.write("-" * 80 + "\n")

if __name__ == "__main__":
    main()

Starting data collection...

Processing query: STI
Found 25 matching articles
Fetching articles...


100%|██████████| 25/25 [00:17<00:00,  1.43it/s]



Processing query: HIV
Found 25 matching articles
Fetching articles...


  4%|▍         | 1/25 [00:00<00:10,  2.40it/s]

Failed to fetch PMCID 11470818


100%|██████████| 25/25 [00:15<00:00,  1.63it/s]



Processing query: AIDS
Found 25 matching articles
Fetching articles...


 28%|██▊       | 7/25 [00:04<00:12,  1.45it/s]

Failed to fetch PMCID 11470618


100%|██████████| 25/25 [00:15<00:00,  1.64it/s]



Processing query: STD
Found 25 matching articles
Fetching articles...


  4%|▍         | 1/25 [00:00<00:10,  2.29it/s]

Failed to fetch PMCID 11470406


 92%|█████████▏| 23/25 [00:12<00:01,  1.52it/s]

Failed to fetch PMCID 11466866


100%|██████████| 25/25 [00:15<00:00,  1.65it/s]



Processing query: Gonorrhea
Found 25 matching articles
Fetching articles...


100%|██████████| 25/25 [00:13<00:00,  1.87it/s]



Processing query: Chlamydia
Found 25 matching articles
Fetching articles...


100%|██████████| 25/25 [00:13<00:00,  1.84it/s]


Dataset Statistics:
Total articles: 146

Articles per query:
search_query
STI          25
Gonorrhea    25
Chlamydia    25
HIV          24
AIDS         24
STD          23
Name: count, dtype: int64

Sample titles:
                                               title search_query
0  Examining concordance of sexual-related factor...          STI
1  Halotolerant phosphate solubilizing bacteria i...          STI
2  Methamphetamine abuse impairs sequential worki...          STI
3  Barriers and facilitators to women’s access to...          STI
4  8th Public Health Palliative Care Internationa...          STI





In [7]:
# Analyze keywords frequency
df = pd.read_csv("pmc_dataset_simon.csv")
all_keywords = [kw for kws in df['keywords'] for kw in eval(kws)]
keyword_freq = Counter(all_keywords).most_common(20)

In [8]:
all_keywords

['HIV risk perception',
 'HIV prevention',
 'Pre-exposure prophylaxis',
 'Adolescent girls and young women',
 'DREAMS',
 'Eastern and Southern Africa',
 'Phosphorus-solubilizing bacteria',
 'PAE',
 'PUE',
 'Auxin',
 'Barley',
 'Photosynthetic activity',
 'working memory',
 'sequential working memory',
 'cognitive deficits',
 'methamphetamine dependence',
 'digit ordering task',
 'Sexual and reproductive health',
 'Health service',
 'Access',
 'Systematic review',
 'Delivery of healthcare',
 'Health equity',
 'Rural health',
 'Barriers and facilitators',
 'Women’s health',
 'diagnostic tool',
 'hypermobile Ehlers-Danlos syndrome',
 'patient experiences',
 'diagnostic odyssey',
 'affinity mapping',
 'mobile health app',
 'mobile phone',
 'Advanced Cardiac Life Support',
 'Out-of-hospital cardiac arrest',
 'Emergency medical services',
 'Return of spontaneous circulation',
 'Epinephrine',
 'minimal hepatic encephalopathy',
 'butyrylcholinesterase',
 'quantitative susceptibility mapping',


In [9]:
keyword_freq

[('COVID-19', 6),
 ('HIV', 4),
 ('HIV risk perception', 3),
 ('HIV prevention', 3),
 ('Pre-exposure prophylaxis', 3),
 ('Adolescent girls and young women', 3),
 ('DREAMS', 3),
 ('Eastern and Southern Africa', 3),
 ('Sexual and reproductive health', 3),
 ('Systematic review', 3),
 ('Health equity', 3),
 ('Colonization resistance', 3),
 ('gastrointestinal microbiota', 3),
 ('host-microbe interactions', 3),
 ('metagenomic sequencing', 3),
 ('pediatric infectious diseases', 3),
 ('respiratory microbiota', 3),
 ('Attitude', 3),
 ('Knowledge', 3),
 ('Risk factors', 3)]

In [10]:
pd.DataFrame(keyword_freq, columns=['Keyword', 'Frequency'])

Unnamed: 0,Keyword,Frequency
0,COVID-19,6
1,HIV,4
2,HIV risk perception,3
3,HIV prevention,3
4,Pre-exposure prophylaxis,3
5,Adolescent girls and young women,3
6,DREAMS,3
7,Eastern and Southern Africa,3
8,Sexual and reproductive health,3
9,Systematic review,3


In [11]:
pd.DataFrame(all_keywords, columns=['keyword']).groupby('keyword').size().reset_index(name='Frequency').sort_values('Frequency', ascending=False)



Unnamed: 0,keyword,Frequency
50,COVID-19,6
125,HIV,4
266,Sexual and reproductive health,3
235,Pre-exposure prophylaxis,3
257,Risk factors,3
...,...,...
45,Brazil,1
46,Bryophyta,1
233,Poison information,1
232,Placenta mesenchymal stem cell,1


In [12]:
# Analyze publication dates
df['publication_date'] = pd.to_datetime(df['publication_date'], errors='coerce')
df['publication_year'] = df['publication_date'].dt.year
year_counts = df['publication_year'].value_counts().sort_index()
year_counts

Unnamed: 0_level_0,count
publication_year,Unnamed: 1_level_1
2022.0,2
2023.0,6
2024.0,116


In [13]:
#clean the data
df['full_text'] = df['full_text'].str.replace(r'\[.*?\]', '', regex=True)

df

Unnamed: 0,pmc_id,title,abstract,full_text,keywords,publication_date,journal,search_query,publication_year
0,11470662,Examining concordance of sexual-related factor...,HIV risk perception is an important cognition ...,The United Nations has committed to ending HIV...,"['HIV risk perception', 'HIV prevention', 'Pre...",2024-10-12,BMC Public Health,STI,2024.0
1,11470655,Halotolerant phosphate solubilizing bacteria i...,Forty-seven (47) bacterial strains were isolat...,Phosphorus (P) represents about 0.2 % of plant...,"['Phosphorus-solubilizing bacteria', 'PAE', 'P...",2024-09-27,Heliyon,STI,2024.0
2,11468863,Methamphetamine abuse impairs sequential worki...,The ability to maintain and manipulate sequent...,Methamphetamine is a potent stimulant that exe...,"['working memory', 'sequential working memory'...",2024-09-27,Frontiers in Psychiatry,STI,2024.0
3,11468210,Barriers and facilitators to women’s access to...,Accessing sexual and reproductive health (SRH)...,Access to healthcare is a multifaceted indicat...,"['Sexual and reproductive health', 'Health ser...",2024-10-11,BMC Health Services Research,STI,2024.0
4,11467999,8th Public Health Palliative Care Internationa...,,\n \n \n Dear Colleagues Dear Friends It bring...,[],2024-10-10,Palliative Care and Social Practice,STI,2024.0
...,...,...,...,...,...,...,...,...,...
141,11457441,Acute necrotizing encephalopathy caused by bac...,"Acute necrotizing encephalopathy (ANE), a rare...",Acute necrotizing encephalopathy (ANE) is a di...,"['Acute necrotizing encephalopathy', 'Bacteria...",2024-10-07,BMC Infectious Diseases,Chlamydia,2024.0
142,11456218,Modulation of Inflammation in McCoy Cells by Z...,Introduction: This study investigated biosynth...,Inflammation is a fundamental biological respo...,"['cytokines', 'green synthesis', 'cytotoxicity...",NaT,Cureus,Chlamydia,
143,11452993,Neurobartonelloses: emerging from obscurity!,A systematic literature search was conducted o...,With the development of more sensitive and spe...,"['Bartonellosis', 'Neurological', 'Neuropsychi...",2024-10-05,Parasites & Vectors,Chlamydia,2024.0
144,11452002,Comprehensive,Due to the rise of multidrug-resistant strains...,CgtA is a well-studied potential drug target [...,[],2024-10-04,PLOS ONE,Chlamydia,2024.0


In [14]:
#clean the data
df['full_text'] = df['full_text'].str.replace(r'\[.*?\]', '', regex=True)
df['full_text'] = df['full_text'].dropna().apply(lambda x: re.sub(r'\s+', ' ', x))
df


Unnamed: 0,pmc_id,title,abstract,full_text,keywords,publication_date,journal,search_query,publication_year
0,11470662,Examining concordance of sexual-related factor...,HIV risk perception is an important cognition ...,The United Nations has committed to ending HIV...,"['HIV risk perception', 'HIV prevention', 'Pre...",2024-10-12,BMC Public Health,STI,2024.0
1,11470655,Halotolerant phosphate solubilizing bacteria i...,Forty-seven (47) bacterial strains were isolat...,Phosphorus (P) represents about 0.2 % of plant...,"['Phosphorus-solubilizing bacteria', 'PAE', 'P...",2024-09-27,Heliyon,STI,2024.0
2,11468863,Methamphetamine abuse impairs sequential worki...,The ability to maintain and manipulate sequent...,Methamphetamine is a potent stimulant that exe...,"['working memory', 'sequential working memory'...",2024-09-27,Frontiers in Psychiatry,STI,2024.0
3,11468210,Barriers and facilitators to women’s access to...,Accessing sexual and reproductive health (SRH)...,Access to healthcare is a multifaceted indicat...,"['Sexual and reproductive health', 'Health ser...",2024-10-11,BMC Health Services Research,STI,2024.0
4,11467999,8th Public Health Palliative Care Internationa...,,Dear Colleagues Dear Friends It brings me imm...,[],2024-10-10,Palliative Care and Social Practice,STI,2024.0
...,...,...,...,...,...,...,...,...,...
141,11457441,Acute necrotizing encephalopathy caused by bac...,"Acute necrotizing encephalopathy (ANE), a rare...",Acute necrotizing encephalopathy (ANE) is a di...,"['Acute necrotizing encephalopathy', 'Bacteria...",2024-10-07,BMC Infectious Diseases,Chlamydia,2024.0
142,11456218,Modulation of Inflammation in McCoy Cells by Z...,Introduction: This study investigated biosynth...,Inflammation is a fundamental biological respo...,"['cytokines', 'green synthesis', 'cytotoxicity...",NaT,Cureus,Chlamydia,
143,11452993,Neurobartonelloses: emerging from obscurity!,A systematic literature search was conducted o...,With the development of more sensitive and spe...,"['Bartonellosis', 'Neurological', 'Neuropsychi...",2024-10-05,Parasites & Vectors,Chlamydia,2024.0
144,11452002,Comprehensive,Due to the rise of multidrug-resistant strains...,CgtA is a well-studied potential drug target [...,[],2024-10-04,PLOS ONE,Chlamydia,2024.0


In [19]:
# Load environment variables.
#load_dotenv("/Users/simonponce/Desktop/AI Bootcamp /06-Sourcing-AI-Project-Data/1/02-Ins_Pandas_Read_HTML_and DR.CHATS/Solved/thekeys.env")
# Set the model name for our LLMs.
OPENAI_MODEL = "gpt-3.5-turbo"
# Store the API key in a variable.
OPENAI_API_KEY =

In [20]:
# Initialize the model.
llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model_name=OPENAI_MODEL, temperature=0.3)

# Define a query as a string.
query = "What is the relationship between HIV and AIDS?"

# Pass the query to the invoke method and print the result.
result = llm.invoke(query)
print(result.content)

HIV (Human Immunodeficiency Virus) is the virus that causes AIDS (Acquired Immunodeficiency Syndrome). HIV attacks the immune system, specifically targeting CD4 cells (T cells), which are crucial in fighting off infections. As the virus replicates and destroys these cells, the immune system becomes weakened, making the individual more susceptible to opportunistic infections and certain cancers. If left untreated, HIV can progress to AIDS, which is the most advanced stage of HIV infection. AIDS is diagnosed when the individual's CD4 cell count falls below a certain level and they develop one or more opportunistic infections or AIDS-defining illnesses. Treatment with antiretroviral therapy can help control the virus and prevent the progression to AIDS.


In [21]:
!pip install langchain
!pip install openai
!pip install gradio

Collecting langchain
  Downloading langchain-0.3.3-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain)
  Downloading langchain_text_splitters-0.3.0-py3-none-any.whl.metadata (2.3 kB)
Downloading langchain-0.3.3-py3-none-any.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading langchain_text_splitters-0.3.0-py3-none-any.whl (25 kB)
Installing collected packages: langchain-text-splitters, langchain
Successfully installed langchain-0.3.3 langchain-text-splitters-0.3.0
Collecting gradio
  Downloading gradio-5.0.2-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0 (from gradio)
  Downloading fastapi-0.115.2-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Co

In [25]:
import gradio as gr
# Assuming chatbot is already defined and initialized
# and processed_df_1 is your DataFrame
# Function to handle the summary and answering the question
def generate_response(query):
    context_2 = result['df'].iloc[0]
    summary = chatbot.generate_summary(context_2)
    answer = chatbot.answer_question(summary, question)
    return result.content
# Create a Gradio interface
iface = gr.Interface(
    fn=generate_response,            # The function to call
    inputs=gr.Textbox(label="Your Question"),  # Input type
    outputs=gr.Textbox(label="Answer"),        # Output type
    title="Symptoms Q&A",  # Title of the interface
    description="Ask about the main symptoms you are concerned with."
)
# Launch the Gradio app
iface.launch()
















Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://c7c0d56f1a3903be1e.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


