In [1]:
from openai import OpenAI
import requests
from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning
import json
import os
import re
import csv
import subprocess
import sys
import warnings

# Filter BeautifulSoup warnings about XML being parsed as HTML
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

# Check and install required dependencies.
# lxml is only needed indirectly: BeautifulSoup selects it by name when an
# XML parser is requested, so it is installed on demand if it is missing.
try:
    import lxml
except ImportError:
    print("Installing lxml parser for BeautifulSoup...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "lxml"])
    print("lxml installed successfully")

# Initialize the client from the environment instead of embedding the secret
# in source. SECURITY: a key was previously hard-coded on this line; treat it
# as compromised and revoke it in the OpenAI dashboard.
_api_key = os.environ.get("OPENAI_API_KEY")
if not _api_key:
    raise RuntimeError(
        "OPENAI_API_KEY environment variable is not set; "
        "export it before running this script."
    )
client = OpenAI(api_key=_api_key)

# Function to search PubMed for scientific articles
def search_pubmed(query, max_results=5, timeout=10):
    """Search PubMed through the NCBI E-utilities and summarise the top hits.

    Args:
        query: PubMed search term.
        max_results: Maximum number of article IDs requested from esearch.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        On success, a dict with "results" (a list of dicts with the keys
        title, link, pmid, authors, journal, publication_date) and a
        human-readable "message". On any failure, a dict with "error" and
        "message". Exceptions are not propagated, so the result can always
        be serialised and handed back to the model as tool output.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    try:
        # Step 1: resolve the query to a list of PubMed IDs.
        search_response = requests.get(
            f"{base_url}esearch.fcgi",
            params={
                'db': 'pubmed',
                'term': query,
                'retmode': 'json',
                'retmax': max_results
            },
            timeout=timeout,
        )
        # Surface HTTP errors here rather than as a confusing JSON failure.
        search_response.raise_for_status()

        id_list = search_response.json().get('esearchresult', {}).get('idlist')
        if id_list is None:
            return {"results": [], "message": "No PubMed articles found or error in search."}
        if not id_list:
            return {"results": [], "message": "No PubMed articles found."}

        # Step 2: fetch the summary record for every ID in one request.
        summary_response = requests.get(
            f"{base_url}esummary.fcgi",
            params={
                'db': 'pubmed',
                'id': ','.join(id_list),
                'retmode': 'json'
            },
            timeout=timeout,
        )
        summary_response.raise_for_status()
        summaries = summary_response.json().get('result', {})

        results = []
        for pmid in id_list:
            article = summaries.get(pmid)
            if article is None:
                continue
            results.append({
                'title': article.get('title', 'No title available'),
                'link': f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
                'pmid': pmid,
                'authors': ', '.join(
                    author['name']
                    for author in article.get('authors', [])
                    if 'name' in author
                ),
                'journal': article.get('fulljournalname', 'Journal not specified'),
                'publication_date': article.get('pubdate', 'Date not specified')
            })

        return {"results": results, "message": f"Found {len(results)} articles on PubMed."}
    except Exception as e:
        # Tool boundary: report the failure as data instead of raising.
        return {"error": str(e), "message": "Error searching PubMed."}

# Function to search for gene information in NCBI Gene database
def search_gene_info(gene_symbol, timeout=10):
    """Look up a human gene in the NCBI Gene database via the E-utilities.

    Args:
        gene_symbol: Gene symbol to search for; the search is restricted
            to human with the "human[ORGN]" filter.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A dict with "info" and "message". "info" holds gene_id, symbol,
        description, summary, aliases, location and ncbi_link for the top
        match, or is empty when nothing was found. On any failure, a dict
        with "error" and "message" is returned instead; exceptions are not
        propagated.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    try:
        # Step 1: resolve the symbol to a Gene ID (only the top match is used).
        search_response = requests.get(
            f"{base_url}esearch.fcgi",
            params={
                'db': 'gene',
                'term': f"{gene_symbol}[GENE] AND human[ORGN]",  # Focus on human genes
                'retmode': 'json',
                'retmax': 1
            },
            timeout=timeout,
        )
        # Surface HTTP errors here rather than as a confusing JSON failure.
        search_response.raise_for_status()

        id_list = search_response.json().get('esearchresult', {}).get('idlist')
        if not id_list:
            return {"info": {}, "message": f"Gene {gene_symbol} not found in NCBI Gene database"}
        gene_id = id_list[0]

        # Step 2: fetch the summary record for that Gene ID.
        summary_response = requests.get(
            f"{base_url}esummary.fcgi",
            params={
                'db': 'gene',
                'id': gene_id,
                'retmode': 'json'
            },
            timeout=timeout,
        )
        summary_response.raise_for_status()

        gene_data = summary_response.json().get('result', {}).get(gene_id)
        if gene_data is None:
            return {"info": {}, "message": f"No detailed information found for gene {gene_symbol}"}

        gene_info = {
            'gene_id': gene_id,
            'symbol': gene_data.get('name', gene_symbol),
            'description': gene_data.get('description', 'No description available'),
            'summary': gene_data.get('summary', 'No summary available'),
            'aliases': gene_data.get('otheraliases', 'No aliases available'),
            'location': gene_data.get('maplocation', 'Location unknown'),
            'ncbi_link': f"https://www.ncbi.nlm.nih.gov/gene/{gene_id}"
        }
        return {"info": gene_info, "message": f"Found gene information for {gene_symbol}"}
    except Exception as e:
        # Tool boundary: report the failure as data instead of raising.
        return {"error": str(e), "message": f"Error searching gene information for {gene_symbol}"}

# Function to search the web (improved version)
def search_web(query, max_results=5, timeout=10):
    """Scrape a Google results page and return the top organic hits.

    Args:
        query: Free-text search query.
        max_results: Maximum number of results to return.
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        On success, a dict with "results" (a list of dicts with the keys
        title, link, snippet, is_scholarly) and "message". On any failure,
        a dict with "error" and "message"; exceptions are not propagated.
    """
    # Local import keeps this block self-contained; urllib.parse is stdlib.
    from urllib.parse import parse_qs, urlparse

    # Substrings that mark a link as a scholarly/medical source.
    scholarly_markers = (
        'nih.gov', 'ncbi.nlm', 'pubmed', 'nature.com', 'sciencedirect',
        'scholar.google', 'researchgate', 'academic', 'science', 'journal',
        'medical', 'health', 'gene', 'genomic', 'genetics', 'omics'
    )
    # Request headers to mimic a browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # Passing the query through `params` lets requests URL-encode it, so
        # characters such as '&', '#' or '+' no longer corrupt the URL.
        response = requests.get(
            "https://www.google.com/search",
            params={'q': query},
            headers=headers,
            timeout=timeout,
        )
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        results = []
        for g in soup.select('div.g'):
            title_elem = g.select_one('h3')
            if not title_elem:
                continue

            link_elem = g.select_one('a')
            if not link_elem or 'href' not in link_elem.attrs:
                continue

            link = link_elem['href']
            if link.startswith('/url?'):
                # Redirect link: the real target is the (percent-encoded)
                # "q" parameter. Skip this entry if it is absent instead of
                # letting one odd link abort the whole search.
                targets = parse_qs(urlparse(link).query).get('q')
                if not targets:
                    continue
                link = targets[0]
            elif not link.startswith('http'):
                continue

            snippet_elem = g.select_one('.VwiC3b, .st')
            snippet = snippet_elem.get_text() if snippet_elem else "No description available"

            lowered_link = link.lower()
            results.append({
                'title': title_elem.get_text(),
                'link': link,
                'snippet': snippet,
                'is_scholarly': any(marker in lowered_link for marker in scholarly_markers)
            })

            # Limit results
            if len(results) >= max_results:
                break

        return {
            "results": results,
            "message": f"Found {len(results)} web results for '{query}'."
        }
    except Exception as e:
        # Tool boundary: report the failure as data instead of raising.
        return {"error": str(e), "message": "Error during web search."}

# Function to fetch abstracts for a specific PubMed article
def fetch_pubmed_abstract(pmid, timeout=10):
    """Fetch the title and full abstract of one PubMed article.

    Args:
        pmid: PubMed ID of the article.
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        On success, a dict with "pmid", "title", "abstract" and "url".
        For structured abstracts, all sections are joined with single
        spaces. On any failure, a dict with "error", "pmid" and "message";
        exceptions are not propagated.
    """
    try:
        # Use E-utilities to fetch the article record as XML
        base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        response = requests.get(
            f"{base_url}efetch.fcgi",
            params={
                'db': 'pubmed',
                'id': pmid,
                'retmode': 'xml'
            },
            timeout=timeout,
        )
        response.raise_for_status()

        # Prefer a real XML parser; fall back to html.parser as a last
        # resort if no XML parser can be used.
        soup = None
        for parser in ("lxml-xml", "xml"):
            try:
                soup = BeautifulSoup(response.text, features=parser)
                break
            except Exception as e:
                print(f"XML parsing with {parser} failed: {e}")
        if soup is None:
            soup = BeautifulSoup(response.text, 'html.parser')

        # html.parser lowercases tag names, so every lookup accepts both the
        # CamelCase (XML parser) and lowercase (HTML parser) spelling.
        abstract_root = soup.find(['Abstract', 'abstract'])
        sections = (
            abstract_root.find_all(['AbstractText', 'abstracttext'])
            if abstract_root else []
        )
        if sections:
            # Structured abstracts have several AbstractText sections;
            # keep all of them, not just the first.
            abstract = " ".join(section.get_text() for section in sections)
        else:
            abstract = "Abstract not available for this article."

        title_element = soup.find(['ArticleTitle', 'articletitle'])
        title = title_element.get_text() if title_element else "Title not available"

        return {
            "pmid": pmid,
            "title": title,
            "abstract": abstract,
            "url": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
        }
    except Exception as e:
        # Tool boundary: report the failure as data instead of raising.
        return {"error": str(e), "pmid": pmid, "message": "Error fetching abstract."}

# Function to fetch content from a webpage
def fetch_webpage(url):
    """Download a page and return its visible text.

    Script and style elements are removed before the text is extracted.
    Text longer than 8000 characters is cut at that length and a
    truncation marker is appended.

    Returns:
        {"content": <text>, "url": url} on success, or
        {"error": <message>, "url": url} if anything raises.
    """
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # Cap on returned characters (OpenAI has token limits)
    char_limit = 8000

    try:
        page = requests.get(url, headers=browser_headers, timeout=10)
        document = BeautifulSoup(page.text, 'html.parser')

        # Drop non-visible elements before extracting text
        for hidden_tag in document(["script", "style"]):
            hidden_tag.extract()

        visible_text = document.get_text(separator='\n', strip=True)
        if len(visible_text) > char_limit:
            visible_text = visible_text[:char_limit] + "... [Content truncated due to length]"

        return {"content": visible_text, "url": url}
    except Exception as exc:
        return {"error": str(exc), "url": url}

def _safe_filename(gene, disease):
    """Return "<gene>_<disease>.txt" with whitespace and path-hostile characters replaced by "_"."""
    return re.sub(r'[\\/:*?"<>|\s]', '_', f"{gene}_{disease}") + ".txt"


def _tool_spec(name, description, param, param_description):
    """Return an OpenAI function-tool schema that takes one required string parameter."""
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": {
                    param: {
                        "type": "string",
                        "description": param_description
                    }
                },
                "required": [param]
            }
        }
    }


def _run_tool_call(function_name, raw_arguments, log_message):
    """Run one model-requested tool, log the outcome, and return a JSON-serialisable result."""
    # tool name -> (argument key, label used in the log, progress text, handler)
    handlers = {
        "search_pubmed": ("query", "query", "Searching PubMed for", search_pubmed),
        "search_gene_info": ("gene_symbol", "gene", "Fetching information for gene", search_gene_info),
        "search_web": ("query", "query", "Searching the web for", search_web),
        "fetch_pubmed_abstract": ("pmid", "pmid", "Fetching abstract for PubMed ID", fetch_pubmed_abstract),
        "fetch_webpage": ("url", "url", "Fetching webpage", fetch_webpage),
    }

    if function_name not in handlers:
        # Tell the model what went wrong instead of sending back "null".
        result = {"error": f"Unknown tool: {function_name}"}
        log_message("tool", f"{function_name}\nResults: {json.dumps(result, indent=2)}")
        return result

    try:
        function_args = json.loads(raw_arguments)
    except (TypeError, ValueError) as e:
        function_args = None
        parse_error = str(e)
    else:
        parse_error = "arguments must be a JSON object"
    if not isinstance(function_args, dict):
        # Malformed arguments should not abort the whole gene-disease pair.
        result = {"error": f"Invalid arguments for {function_name}: {parse_error}"}
        log_message("tool", f"{function_name}\nResults: {json.dumps(result, indent=2)}")
        return result

    arg_key, log_label, progress_text, handler = handlers[function_name]
    value = function_args.get(arg_key)
    print(f"{progress_text}: {value}")
    result = handler(value)

    if function_name == "fetch_webpage":
        # Log URL and truncated content to avoid huge files
        content_preview = result.get("content", "")[:1000] + "..." if "content" in result else ""
        log_message("tool", f"fetch_webpage url: {value}\nContent preview: {content_preview}")
    else:
        log_message("tool", f"{function_name} {log_label}: {value}\nResults: {json.dumps(result, indent=2)}")
    return result


# Function to process a gene-disease pair and store results
def process_gene_disease_pair(gene, disease, model="gpt-4o-mini", max_iterations=10):
    """Research one gene-disease association with a tool-using chat model.

    The model may call the search/fetch tools defined in this file for up
    to `max_iterations` rounds. The tool activity and the final answer are
    written to "<gene>_<disease>.txt" in the current working directory.

    Args:
        gene: Gene symbol to analyse.
        disease: Disease name to analyse the gene against.
        model: Chat-completions model name.
        max_iterations: Maximum number of tool-calling rounds before the
            model is asked for a final summary without tools.

    Returns:
        None. Errors are printed and appended to the output file rather
        than raised, so one failing pair does not stop a batch.
    """
    print(f"\n{'-' * 50}")
    print(f"Processing: {gene} association with {disease}")
    print(f"{'-' * 50}")

    # Create a safe filename
    filename = _safe_filename(gene, disease)

    # Define available functions
    tools = [
        _tool_spec(
            "search_pubmed",
            "Search PubMed for scientific articles about genes and diseases",
            "query", "The search query for PubMed"),
        _tool_spec(
            "search_gene_info",
            "Search for information about a specific gene in NCBI Gene database",
            "gene_symbol", "The gene symbol to search for"),
        _tool_spec(
            "search_web",
            "Search the web for general information",
            "query", "The search query"),
        _tool_spec(
            "fetch_pubmed_abstract",
            "Fetch the abstract for a specific PubMed article by its ID",
            "pmid", "The PubMed ID (PMID) of the article"),
        _tool_spec(
            "fetch_webpage",
            "Fetch the content of a webpage",
            "url", "The URL to fetch"),
    ]

    # Initialize conversation with a specific query about the gene and disease
    messages = [
        {"role": "system", "content": "You are a helpful assistant with knowledge about genetics, genomics, and medical research. You can search for gene information, scientific literature, and analyze relationships between genes and diseases, particularly from TWAS (Transcriptome-Wide Association Study) results."},
        {"role": "user", "content": f"Analyze the association between gene {gene} and {disease} based on TWAS results. Research the function of this gene, its potential role in disease pathways, existing evidence for its involvement in {disease}, and functional mechanisms that might explain this association. Provide a comprehensive analysis including molecular mechanisms, expression patterns, and potential therapeutic implications."}
    ]

    try:
        with open(filename, "w", encoding="utf-8") as output_file:
            output_file.write(f"Analysis of {gene} association with {disease}\n")
            output_file.write(f"{'-' * 50}\n\n")

            # Log all messages to the file for complete conversation tracking
            def log_message(role, content):
                output_file.write(f"[{role.upper()}]: {content}\n\n")
                output_file.write(f"{'-' * 30}\n\n")

            def ask_model():
                reply = client.chat.completions.create(
                    model=model,
                    messages=messages,
                    tools=tools,
                    tool_choice="auto"
                )
                return reply.choices[0].message

            response_message = ask_model()

            iteration = 0
            while getattr(response_message, 'tool_calls', None) and iteration < max_iterations:
                iteration += 1
                print(f"Iteration {iteration}: Model is making tool calls...")

                # Add the assistant's message to the history
                messages.append(response_message)
                log_message("assistant", f"Making the following tool calls: {[tc.function.name for tc in response_message.tool_calls]}")

                # Every tool call must get a matching "tool" message.
                for tool_call in response_message.tool_calls:
                    function_name = tool_call.function.name
                    function_response = _run_tool_call(
                        function_name, tool_call.function.arguments, log_message)
                    messages.append({
                        "role": "tool",
                        "tool_call_id": tool_call.id,
                        "name": function_name,
                        "content": json.dumps(function_response)
                    })

                response_message = ask_model()

            if getattr(response_message, 'tool_calls', None):
                # Iteration budget exhausted while the model still wants
                # tools. The pending tool-call message is deliberately NOT
                # added to the history, since its calls would stay unanswered.
                print("Reached maximum number of iterations. Requesting final summary...")
                final_prompt = {"role": "user", "content": "Please provide a final comprehensive summary of all the information you've gathered about this gene-disease relationship, including molecular mechanisms, pathways, cell lines, and potential therapeutic implications."}
                messages.append(final_prompt)
                log_message("user", final_prompt["content"])

                final_response = client.chat.completions.create(
                    model=model,
                    messages=messages
                )
                log_message("final_summary", final_response.choices[0].message.content)
            else:
                # Plain answer. This also covers the case where the model
                # never used a tool, which previously left the file empty.
                ai_response = response_message.content
                print("\nFinal response received.")
                log_message("final_response", ai_response)
                messages.append({"role": "assistant", "content": ai_response})

            print(f"Completed processing {gene} association with {disease}. Results saved to {filename}")

    except Exception as e:
        print(f"Error processing {gene} for {disease}: {e}")
        # Write error to file
        with open(filename, "a", encoding="utf-8") as output_file:
            output_file.write(f"ERROR: {str(e)}\n")

# Main function to read input file and process each pair
def process_input_file(input_file_path):
    """Process every gene-disease pair listed in a delimited text file.

    Each row supplies the gene in its first column and the disease in its
    second. The delimiter is a tab if the first line contains one,
    otherwise a comma. Results are written into a "gene_disease_results"
    directory under the current working directory.

    Args:
        input_file_path: Path to the input file. A relative path is
            resolved against the working directory at call time.
            Surrounding whitespace and quotes are ignored.

    Returns:
        None. Errors are printed rather than raised.
    """
    original_cwd = os.getcwd()
    try:
        # Resolve the path BEFORE changing directory; otherwise a relative
        # path would be looked up inside the output directory.
        path_text = os.fspath(input_file_path).strip().strip('"\'')
        input_file_path = os.path.abspath(path_text)

        # Create output directory if it doesn't exist
        output_dir = "gene_disease_results"
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
            print(f"Created output directory: {output_dir}")

        # The per-pair writer saves files relative to the working directory.
        os.chdir(output_dir)

        # newline='' is what the csv module expects for its input files.
        with open(input_file_path, 'r', newline='') as file:
            # Determine if the file is tab-separated or comma-separated
            first_line = file.readline()
            delimiter = '\t' if '\t' in first_line else ','

            # Return to the beginning of the file
            file.seek(0)

            reader = csv.reader(file, delimiter=delimiter)
            for row_number, row in enumerate(reader, start=1):
                if len(row) < 2:
                    print(f"Skipping row {row_number}: Insufficient columns")
                    continue

                gene = row[0].strip()
                disease = row[1].strip()
                if not (gene and disease):
                    print(f"Skipping row {row_number}: Missing gene or disease")
                    continue

                print(f"\nProcessing pair {row_number}: {gene} - {disease}")
                process_gene_disease_pair(gene, disease)

        print("\nAll gene-disease pairs have been processed.")

    except Exception as e:
        print(f"Error processing input file: {e}")
    finally:
        # Restore the working directory so that repeated calls do not nest
        # output directories inside each other.
        os.chdir(original_cwd)
# Script entry point: prompt for the input file, then process every pair in it.
if __name__ == "__main__":
    process_input_file(input("Enter the path to your gene-disease file: "))

Enter the path to your gene-disease file:  C:\Users\Shaoyi Zhang\Desktop\Jupyter NoteBook\gene_disease.txt


Created output directory: gene_disease_results

Processing pair 1: APOE - alzheimer

--------------------------------------------------
Processing: APOE association with alzheimer
--------------------------------------------------
Iteration 1: Model is making tool calls...
Fetching information for gene: APOE
Searching PubMed for: APOE Alzheimer disease TWAS association
Iteration 2: Model is making tool calls...
Searching PubMed for: APOE Alzheimer's disease
Iteration 3: Model is making tool calls...
Searching the web for: APOE Alzheimer's disease TWAS results

Final response received.
Completed processing APOE association with alzheimer. Results saved to APOE_alzheimer.txt

Processing pair 2: TNF - rheumatoid arthritis

--------------------------------------------------
Processing: TNF association with rheumatoid arthritis
--------------------------------------------------
Iteration 1: Model is making tool calls...
Fetching information for gene: TNF
Searching PubMed for: TNF rheumatoid