# 🧠 Scientific Q&A Assistant using Gen AI (RAG Pipeline)

# 🧠 Scientific Q&A Assistant using Gen AI (RAG Pipeline)

In [None]:
# 📌 1. Install Dependencies
!pip install -q openai arxiv wikipedia sentence-transformers langchain matplotlib langchain-community

### 📌 Step 1: Install all necessary dependencies
We install OpenAI, arXiv, Wikipedia, LangChain, Sentence Transformers, and visualization libraries.

In [None]:
# 📌 2. Import Libraries
import requests
import arxiv
import wikipedia
import pandas as pd
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage

### 📌 Step 2: Import required Python libraries
These include APIs for data retrieval, embedding models, and plotting tools.

In [None]:
# 📌 3. Load Sentence Transformer Embedder
# Instantiate a SentenceTransformer from the model name 'all-MiniLM-L6-v2'.
# NOTE(review): `embedder` is not referenced by any later cell shown in this
# notebook — confirm whether an embedding/retrieval step is missing.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

### 📌 Step 3: Load the SentenceTransformer model
This model produces sentence embeddings for scientific text. Note: the `embedder` object is loaded here but is not yet called by any of the cells shown below.

In [None]:
# 📌 4. Fetch Articles from PubMed (Entrez API)
def fetch_pubmed_articles(query, max_results=5, timeout=30):
    """Search PubMed and return title/journal metadata for the top matches.

    Two Entrez E-utilities requests are made: ``esearch`` to resolve the query
    to PubMed IDs, then a single batched ``esummary`` for all of those IDs.

    Args:
        query: Free-text PubMed search term.
        max_results: Maximum number of article IDs to request from ``esearch``.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of ``{"source": "PubMed", "title": ..., "summary": ...}`` dicts
        in ``esearch`` order. ``summary`` carries the record's ``source``
        field (the journal) because ``esummary`` does not include abstracts.
        Returns ``[]`` for a blank query, a non-positive ``max_results``, or
        when nothing matches.

    Raises:
        requests.HTTPError: If either endpoint answers with an error status.
    """
    if not query or not query.strip() or max_results <= 0:
        return []

    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"

    # Pass the query through ``params`` so spaces, ``&`` and ``#`` are
    # URL-encoded instead of corrupting the query string.
    search_response = requests.get(
        f"{base_url}/esearch.fcgi",
        params={
            "db": "pubmed",
            "term": query,
            "retmax": max_results,
            "retmode": "json",
        },
        timeout=timeout,
    )
    search_response.raise_for_status()
    ids = search_response.json().get("esearchresult", {}).get("idlist", [])
    if not ids:
        return []

    # One esummary call covering every ID: avoids N sequential round-trips
    # and keeps the request rate low enough for NCBI's usage limits.
    summary_response = requests.get(
        f"{base_url}/esummary.fcgi",
        params={"db": "pubmed", "id": ",".join(ids), "retmode": "json"},
        timeout=timeout,
    )
    summary_response.raise_for_status()
    records = summary_response.json().get("result", {})

    return [
        {
            "source": "PubMed",
            "title": records[pid].get("title", ""),
            "summary": records[pid].get("source", ""),
        }
        for pid in ids
        if pid in records
    ]

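### 📌 Step 4: Define a function to fetch article metadata from PubMed
Uses the NCBI Entrez E-utilities API: `esearch` finds matching article IDs and `esummary` returns each article's title and journal (`source`). Note that `esummary` does not return abstracts, so the `summary` field holds the journal name rather than the abstract text.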

In [None]:
# 📌 5. Fetch Articles from Wikipedia
def get_wikipedia_background(topic):
    """Return a one-item list with a five-sentence Wikipedia summary of *topic*.

    The lookup is best-effort: if it raises for any reason, an empty list is
    returned instead of propagating the error.

    Args:
        topic: Page title or search phrase passed to ``wikipedia.summary``.

    Returns:
        ``[{"source": "Wikipedia", "title": topic, "summary": ...}]`` on
        success, otherwise ``[]``.
    """
    try:
        intro = wikipedia.summary(topic, sentences=5)
    except Exception:
        return []

    entry = {"source": "Wikipedia", "title": topic, "summary": intro}
    return [entry]

### 📌 Step 5: Define a function to retrieve a short summary from Wikipedia
Uses the `wikipedia` Python package.

In [None]:
# 📌 6. Fetch Articles from arXiv
def fetch_arxiv_articles(query, max_results=5):
    """Search arXiv and return the title and summary of the top matches.

    Args:
        query: arXiv search query string.
        max_results: Maximum number of results to retrieve.

    Returns:
        A list of ``{"source": "arXiv", "title": ..., "summary": ...}`` dicts
        in the order yielded by the relevance-sorted search.
    """
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance,
    )
    # ``Search.results()`` is deprecated in arxiv 2.x; results are fetched
    # through a Client instead.
    client = arxiv.Client()
    return [
        {"source": "arXiv", "title": result.title, "summary": result.summary}
        for result in client.results(search)
    ]

### 📌 Step 6: Define a function to fetch abstracts from arXiv
Uses the `arxiv` Python SDK to search for relevant publications.

In [None]:
# 📌 7. Build Combined Report
def build_merged_report(topic, pubmed_limit=5, arxiv_limit=5):
    """Collect articles about *topic* from PubMed, arXiv and Wikipedia.

    Args:
        topic: Search phrase forwarded to all three sources.
        pubmed_limit: Maximum number of PubMed results to request.
        arxiv_limit: Maximum number of arXiv results to request.

    Returns:
        One list containing the PubMed entries, then the arXiv entries, then
        the Wikipedia entry (if any).
    """
    merged = []
    merged.extend(fetch_pubmed_articles(topic, max_results=pubmed_limit))
    merged.extend(fetch_arxiv_articles(topic, max_results=arxiv_limit))
    merged.extend(get_wikipedia_background(topic))
    return merged

### 📌 Step 7: Merge articles from PubMed, arXiv, and Wikipedia
Creates a unified list of relevant articles for the topic.

In [None]:
# 📌 8. Summarization with Grounding using GPT
def structured_summary_with_grounding(texts):
    """Ask the chat model for a cited answer grounded in the given sources.

    Each source is numbered; the prompt contains the numbered summaries
    followed by a matching numbered citation list so the model can cite
    claims as ``[number]``.

    Args:
        texts: Sequence of dicts with ``"summary"``, ``"title"`` and
            ``"source"`` keys.

    Returns:
        The model's reply text, or a fixed notice when *texts* is empty (no
        model call is made in that case).
    """
    if not texts:
        # Nothing to ground on: skip the API call rather than invite an
        # ungrounded answer.
        return "No source texts were retrieved, so a grounded answer cannot be generated."

    joined_context = "\n".join(
        f"[{i}] {t['summary']}" for i, t in enumerate(texts)
    )
    citations = "\n".join(
        f"[{i}] {t['title']} - {t['source']}" for i, t in enumerate(texts)
    )
    # The numbered context (not the raw list of dicts) goes into the prompt so
    # the [number] markers line up with the citation list.
    messages = [
        HumanMessage(content=f"Answer the question based only on the texts below. Cite each claim using [number].\n\n{joined_context}\n\nCitations:\n{citations}")
    ]
    chat = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")
    return chat(messages).content

### 📌 Step 8: Generate a grounded scientific answer using OpenAI GPT
Generates a summary based only on retrieved article content with references.

In [None]:
# 📌 9. Visualization of Source Contributions
def visualize_results(results):
    """Show a bar chart of how many results came from each source.

    Args:
        results: Records (e.g. a list of dicts) that each carry a
            ``"source"`` key.

    Returns:
        None. Displays the chart, or prints a notice and returns without
        plotting when there is nothing to count.
    """
    df = pd.DataFrame(results)
    # An empty input yields a frame with no columns, so indexing 'source'
    # would raise KeyError; bail out before plotting.
    if df.empty or 'source' not in df.columns:
        print("No results to visualize.")
        return

    source_counts = df['source'].value_counts()
    source_counts.plot(kind='bar', color='skyblue', title='Source Distribution')
    plt.ylabel('Count')
    plt.xlabel('Source')
    plt.grid(axis='y')
    plt.tight_layout()
    plt.show()

### 📌 Step 9: Visualize the number of articles fetched from each source
Displays a bar chart showing how many items came from PubMed, arXiv, and Wikipedia.

In [None]:
# 📌 10. Interactive Workflow
if __name__ == "__main__":
    # Interactive entry point: prompt for a topic, gather sources, chart
    # them, list them, then print the model's grounded answer.
    # Trim whitespace so accidental padding is not sent to the search APIs.
    topic = input("🔍 Enter a research topic: ").strip()
    # Skip every network call when nothing was typed.
    merged_data = build_merged_report(topic) if topic else []

    if not topic:
        print("⚠️ No topic entered; nothing to search.")
    elif not merged_data:
        # Plotting and summarizing both need at least one fetched source.
        print(f"⚠️ No sources found for '{topic}'.")
    else:
        visualize_results(merged_data)
        print("\n\n📚 Sources Fetched:")
        for doc in merged_data:
            # Titles are truncated to 60 characters to keep the list compact.
            print(f"- {doc['source']}: {doc['title'][:60]}")
        print("\n\n🧠 Answer Summary:")
        final_summary = structured_summary_with_grounding(merged_data)
        print(final_summary)