In [None]:
import os
import glob
import json
import math
import random
import re
from collections import Counter
import networkx as nx
import pdfplumber
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv()

# Set the OpenAI API key from environment variable
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

# -- Document Loading ------------------------------------------------------

def load_docs_from_dir(dir_path):
    """
    Load all .pdf files from a directory and extract their full text.
    Returns a list of strings, one per PDF.
    """
    docs = []
    filenames = []
    for filepath in glob.glob(os.path.join(dir_path, '*.pdf')):
        try:
            text_pages = []
            with pdfplumber.open(filepath) as pdf:
                for page in pdf.pages:
                    text_pages.append(page.extract_text() or "")
            full_text = "\n".join(text_pages)
            docs.append(full_text)
            filenames.append(os.path.basename(filepath))
        except Exception as e:
            # skip unreadable PDFs
            print(f"Warning: could not load {filepath}: {e}")
    return docs, filenames

# -- 1. Extract topics and key concepts -----------------------------------

def extract_json_from_text(text):
    """Helper function to extract JSON content from markdown code blocks or raw text"""
    # Try to extract JSON from markdown code blocks
    json_match = re.search(r'```(?:json)?\s*(.*?)\s*```', text, re.DOTALL)
    if json_match:
        return json_match.group(1).strip()
    
    # If no code blocks, try to find JSON-like structures
    json_match = re.search(r'(\{.*\})', text, re.DOTALL)
    if json_match:
        return json_match.group(1).strip()
    
    # Return original text if no JSON structure found
    return text

def extract_concepts_from_docs(doc_texts, filenames, model="gpt-4.1-nano"):  
    """
    Call OpenAI API to extract topics and key concepts from each document.
    Returns list of dicts with keys 'topics' and 'key_concepts'.
    """
    extractions = []
    for i, text in enumerate(doc_texts):
        filename = filenames[i] if i < len(filenames) else f"doc_{i}"
        
        prompt = (
            "Extract high-level topics and key concepts from the following document. "
            f"Return JSON with keys 'topics' and 'key_concepts'.\n\n{text}"
        )
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are an expert summarizer. Return your response as a JSON object with 'topics' and 'key_concepts' as arrays."},
                {"role": "user", "content": prompt}
            ],
        )
        
        output = response.choices[0].message.content
        print("Raw output:", output)
        
        try:
            # Clean and extract JSON from the output
            json_str = extract_json_from_text(output)
            data = json.loads(json_str)
            
            # Ensure we have the expected keys
            if "topics" in data and "key_concepts" in data:
                extraction = {
                    "filename": filename,
                    "topics": data["topics"],
                    "key_concepts": data["key_concepts"]
                }
                extractions.append(extraction)
                print(f"Successfully extracted {len(data['topics'])} topics and {len(data['key_concepts'])} key concepts from {filename}")
            else:
                print(f"Warning: Parsed JSON doesn't have expected keys: {data.keys()}")
                extractions.append({
                    "filename": filename,
                    "topics": [], 
                    "key_concepts": []
                })
                
        except json.JSONDecodeError as e:
            print(f"Failed to parse JSON: {e}")
            extractions.append({
                "filename": filename,
                "topics": [], 
                "key_concepts": []
            })
    
    return extractions

# -- 2. Construct the concept graph ----------------------------------------

def build_concept_graph(extractions, eps=1e-6):
    """
    Build a unified graph G where nodes are topics + key concepts and
    edges weighted by log(freq+eps) based on co-occurrence in docs.
    Returns: G (nx.Graph), topic_nodes, kc_nodes
    """
    freq = Counter()
    all_topics, all_kcs = set(), set()

    for ex in extractions:
        nodes = ex["topics"] + ex["key_concepts"]
        for i in range(len(nodes)):
            for j in range(i+1, len(nodes)):
                u, v = sorted((nodes[i], nodes[j]))
                freq[(u, v)] += 1
        all_topics.update(ex["topics"])
        all_kcs.update(ex["key_concepts"])

    G = nx.Graph()
    for (u, v), f in freq.items():
        weight = math.log(f + eps)
        G.add_edge(u, v, weight=weight)

    print(f"Graph built with {len(all_topics)} topics and {len(all_kcs)} key concepts")
    return G, all_topics, all_kcs

# -- Helpers for sampling --------------------------------------------------

def softmax(weights):
    exps = [math.exp(w) for w in weights]
    s = sum(exps) or 1.0
    return [e/s for e in exps]


def random_walk(G, start, steps):
    """
    Random walk on graph G for given steps from 'start',
    with transition probabilities via softmax over edge weights.
    """
    path = [start]
    current = start
    for _ in range(steps):
        nbrs = list(G[current])
        if not nbrs:
            break
        weights = [G[current][n]['weight'] for n in nbrs]
        probs = softmax(weights)
        current = random.choices(nbrs, probs)[0]
        path.append(current)
    return path

# -- 3. Concept combination sampling ---------------------------------------

def sample_concept_combinations(
    G, topic_nodes, kc_nodes,
    num_samples=100,
    topic_walk_steps=(1, 2),
    kc_walk_steps=(3, 4)
):
    """
    Generate sampled sets of topics and key concepts via multi-stage random walks.
    Returns list of dicts: {'topics': set, 'key_concepts': set}
    """
    # Safety check
    if not topic_nodes:
        print("Error: No topics found. Cannot sample combinations.")
        return []
        
    G_topic = G.subgraph(topic_nodes)
    G_topic_kc = G.subgraph(topic_nodes | kc_nodes)
    G_kc = G.subgraph(kc_nodes)
    samples = []

    topics_list = list(topic_nodes)
    print(f"Sampling from {len(topics_list)} topics")
    
    for _ in range(num_samples):
        t0 = random.choice(topics_list)
        t_steps = random.choice(topic_walk_steps)
        topic_path = random_walk(G_topic, t0, t_steps)
        sampled_topics = set(topic_path)

        kc_cands = [nbr for t in sampled_topics for nbr in G_topic_kc[t] if nbr in kc_nodes]
        if kc_cands:
            k0 = random.choice(kc_cands)
            k_steps = random.choice(kc_walk_steps)
            kc_path = random_walk(G_kc, k0, k_steps)
            sampled_kcs = set(kc_path)
        else:
            sampled_kcs = set()

        samples.append({
            "topics": list(sampled_topics),  # Convert sets to lists for JSON serialization
            "key_concepts": list(sampled_kcs)
        })
    
    return samples

# -- 4. Question generation ------------------------------------------------

def generate_questions_for_samples(combos, docs, extractions, model="gpt-4o"):  
    """
    For each sampled combo, pick two docs via Jaccard on concept sets,
    then call LLM to generate questions.
    Returns list of dicts: {'sample': combo, 'questions': [...]}.
    """
    doc_concepts = [set(ex['topics'] + ex['key_concepts']) for ex in extractions]
    # doc_concepts= doc_concepts[:1]
    results = []
    max_samples = 100
    # combos = combos[:max_samples]
    for i, combo in enumerate(combos):
        combo_id = f"combo_{i+1}"
        kg = set(combo['topics']) | set(combo['key_concepts'])
        sims = []
        for idx, dc in enumerate(doc_concepts):
            inter = kg & dc
            union = kg | dc
            sims.append((len(inter) / (len(union) or 1), idx))
        sims.sort(reverse=True)
        top_idxs = [i for _, i in sims[:2]]
        refs = [docs[i] for i in top_idxs]
        ref_files = [extractions[i]["filename"] for i in top_idxs]
        System_prompt =f"""
Each question must follow these instructions:
Model a Physical Scenario: Start from a real-world or idealized setup. Avoid abstract biology problems or purely conceptual statements.
Target a Solvable Quantity: Ask for a clear symbolic expression of a physical variable (e.g., tension, acceleration, energy).
Force Multi-Step Reasoning: Ensure the question involves a sequence of biology laws, transformations, and derivations to reach the answer.
Avoid Redundancy: Exclude extraneous details or variables that do not impact the final solution.
Be Unique: Do not rephrase standard textbook problems; ensure originality and complexity.
Single solution: Expect a single symbolic expression, unambiguous, presented in LaTeX. Multiple equivalent algebraic forms are allowed. No equations or floating-point approximations.
Use rigorous, concise phrasing.
Avoid colloquial or ambiguous terminology.
Units must be consistent; symbols should follow standard notation.
"""
        prompt = (
            f"Generate a set of difficult biology questions based on the following:\n"
            
            f"Topics: {combo['topics']}\n"
            f"Key Concepts: {combo['key_concepts']}\n"
            f"Reference Doc 1:\n{refs[0]}\n"
        )
        if len(refs) > 1:
            prompt += f"Reference Doc 2:\n{refs[1]}\n"
        prompt += "Return a JSON array of questions."

        # from ollama import chat
        # from ollama import ChatResponse

        # response: ChatResponse = chat(model='qwen3:8b', 
        #                                messages=[
        #         {"role": "system", "content": System_prompt},
        #         {"role": "user", "content": prompt}
        #     ])
        # # print(response['message']['content'])
        # # or access fields directly from the response object
        # print(response.message.content)
        # content = response.message.content
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": System_prompt},
                {"role": "user", "content": prompt}
            ]
        )
        content = response.choices[0].message.content
        
        try:
            # Clean and extract JSON from the output
            json_str = extract_json_from_text(content)
            questions = json.loads(json_str)
            if not isinstance(questions, list):
                # If the output is an object with a questions key
                if isinstance(questions, dict) and "questions" in questions:
                    questions = questions["questions"]
                else:
                    questions = [str(questions)]
        except json.JSONDecodeError:
            questions = [content]

        results.append({
            "id": combo_id,
            "topics": combo['topics'],
            "key_concepts": combo['key_concepts'],
            "reference_files": ref_files,
            "questions": questions
        })
        max_samples-=1
        if max_samples == 0:
            break


    return results

# -- Save outputs to files ------------------------------------------------

def save_extractions(extractions, output_file="document_extractions.json"):
    """Save the extracted topics and key concepts for each document"""
    # Ensure the extractions are serializable (convert sets to lists)
    serializable_extractions = []
    for ex in extractions:
        serializable_extractions.append({
            "filename": ex["filename"],
            "topics": list(ex["topics"]),
            "key_concepts": list(ex["key_concepts"])
        })
    
    with open(output_file, "w") as f:
        json.dump(serializable_extractions, f, indent=2)
    
    print(f"Saved document extractions to {output_file}")

def save_questions_with_topics(questions, output_file="questions_with_topics.json"):
    """Save the generated questions with their topic combinations"""
    with open(output_file, "w") as f:
        json.dump(questions, f, indent=2)
    
    print(f"Saved questions with topic combinations to {output_file}")

# -- Main Execution --------------------------------------------------------

if __name__ == "__main__":
    # Create output directory if it doesn't exist
    output_dir = "output_biology"
    os.makedirs(output_dir, exist_ok=True)
    
    # adjust this path to where your .pdf docs live
    docs_dir = "Bio_docs/"
    
    print("Loading documents...")
    docs, filenames = load_docs_from_dir(docs_dir)
    print(f"Loaded {len(docs)} documents")
    
    if not docs:
        print("No documents found. Please check the docs directory.")
        exit(1)
    
    # 1) Extract topics & KCs
    print("\nExtracting topics and key concepts...")
    extractions = extract_concepts_from_docs(docs, filenames)
    
    # Save extractions to file
    save_extractions(extractions, os.path.join(output_dir, "document_extractions.json"))
    
    # Verify we have valid extractions
    valid_extractions = [ex for ex in extractions if ex["topics"] or ex["key_concepts"]]
    if not valid_extractions:
        print("No valid topics or key concepts extracted. Check your data and API responses.")
        exit(1)
    
    # 2) Build graph
    print("\nBuilding concept graph...")
    G, topic_nodes, kc_nodes = build_concept_graph(extractions)
    
    if not topic_nodes:
        print("No topics found in the graph. Cannot proceed.")
        exit(1)
    
    # 3) Sample combinations
    print("\nSampling concept combinations...")
    combos = sample_concept_combinations(G, topic_nodes, kc_nodes, num_samples=100)  # Reduced for testing
    
    if not combos:
        print("Failed to generate concept combinations.")
        exit(1)
    
    # Save topic combinations
    with open(os.path.join(output_dir, "topic_combinations.json"), "w") as f:
        json.dump(combos, f, indent=2)
    
#     # 4) Generate questions
#     print("\nGenerating questions for each combination...")
#     q_outputs = generate_questions_for_samples(combos, docs, extractions)
    
#     # Save questions with topics
#     save_questions_with_topics(q_outputs, os.path.join(output_dir, "questions_with_topics.json"))
    
#     # Display results
#     print("\n===== GENERATED QUESTIONS =====")
#     for idx, out in enumerate(q_outputs, 1):
#         print(f"\nSample {idx}:")
#         print(f"Topics: {out['topics']}")
#         print(f"Key Concepts: {out['key_concepts']}")
#         print(f"Reference Files: {out['reference_files']}")
#         print("Questions:")
#         for q in out['questions']:
#             print(f" - {q}")
    
#     print(f"\nAll outputs saved to directory: {output_dir}") 

# if __name__ == "__main__":
#     # Create output directory if it doesn't exist
#     # Create output directory if it doesn't exist
#     output_dir = "output_irodov"
#     os.makedirs(output_dir, exist_ok=True)
    
#     # adjust this path to where your .pdf docs live
#     docs_dir = "irodov_docs/"
    
#     print("Loading documents...")
#     docs, filenames = load_docs_from_dir(docs_dir)
#     print(f"Loaded {len(docs)} documents")
#     document_extractions = []
#     with open("output_irodov/document_extractions.json", "r") as f:
#         document_extractions = json.load(f)
#     for doc in document_extractions:
#         print(doc)
#         break

#     topic_combinations = []
#     with open("output_irodov/topic_combinations.json", "r") as f:
#         topic_combinations = json.load(f)
#     for combo in topic_combinations:
#         print(combo)
#         break

    # # generate questions 
    # print("Generating questions...")
    # q_outputs = generate_questions_for_samples(topic_combinations, docs, document_extractions)

    # # Save questions with topics
    # save_questions_with_topics(q_outputs, os.path.join(output_dir, "questions_with_topics.json"))
    
    # # Display results
    # print("\n===== GENERATED QUESTIONS =====")
    # for idx, out in enumerate(q_outputs, 1):
    #     print(f"\nSample {idx}:")
    #     print(f"Topics: {out['topics']}")
    #     print(f"Key Concepts: {out['key_concepts']}")
    #     print(f"Reference Files: {out['reference_files']}")
    #     print("Questions:")
    #     for q in out['questions']:
    #         print(f" - {q}")
    
    # print(f"\nAll outputs saved to directory: {output_dir}")



Loading documents...




Loaded 8 documents

Extracting topics and key concepts...


2025-06-06 18:11:43,354 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Raw output: {
  "topics": [
    "Environmental Impact of Waste Management",
    "Microbiome and Omics Technologies in Composting",
    "Human Excretion and Sanitation Systems",
    "History and Evolution of Waste and Excreta Management",
    "Composting Science and Processes",
    "Microbial Community Dynamics and Succession",
    "Application of Microbiome Technologies for Pathogen and Toxin Reduction",
    "Specific Composting Systems: Manure, Biosolids, Food, and Landscape Waste",
    "Innovative and Sustainable Waste Treatment Methods",
    "Global Challenges and Goals in Sanitation and Waste Recycling"
  ],
  "key_concepts": [
    "Composting as a microbial biotechnology for waste recycling and environmental sustainability",
    "Role of microbiome analysis (amplicon sequencing, metagenomics, meta-omics) in understanding composting processes",
    "Management variables influencing composting: carbon-to-nitrogen ratio, moisture, temperature, pH, oxygen",
    "Pathogen inactivation 

2025-06-06 18:11:52,154 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Raw output: {
  "topics": [
    "mRNA vaccine technology and development",
    "Lipid nanoparticle (LNP) formulation and characterization",
    "Physicochemical properties of LNP-mRNA",
    "In vitro and in vivo delivery and biodistribution of mRNA-LNP",
    "Immunogenicity and immune response evaluation",
    "Antigen expression and protein production from LNP-mRNA",
    "Application of LNP-mRNA in infectious diseases (dengue, leishmaniasis)",
    "Comparison with traditional vaccine platforms (recombinant protein, plasmid DNA)",
    "Stability, storage, and freeze-thaw resilience of LNP-mRNA",
    "Innate immune activation and cytokine response to LNP-mRNA"
  ],
  "key_concepts": [
    "LNP composition including DODMA, DSPC, DMG-PEG-2000, and cholesterol",
    "Particle size (~100 nm), polydispersity index (<0.3), and zeta potential of LNPs",
    "Encapsulation efficiency (>85%) of mRNA in LNPs",
    "Microfluidic particle production method for reproducibility and scalability",
    "

2025-06-06 18:12:02,190 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Raw output: {
  "topics": [
    "Spherical Metal Oxides (SMOs) in Environmental Remediation",
    "Photocatalytic Degradation of Antibiotics",
    "Synthesis Strategies of SMOs (Template Methods, Template-Free Methods)",
    "Microstructures and Morphologies of SMOs (Hollow, Porous, Yolk-Shell, Multi-Shelled)",
    "Photocatalytic Mechanisms and Role of Reactive Oxygen Species (ROS)",
    "Structure-Activity Relationships in SMOs Catalysts",
    "Degradation Pathways of Specific Antibiotics (Tetracyclines, Quinolones, Sulfonamides)",
    "Challenges and Future Directions in Water Purification Technologies"
  ],
  "key_concepts": [
    "Unique properties of SMOs including high surface area, strong light absorption, and stability",
    "Design and controlled synthesis of various microstructures (hollow, porous, yolk-shell, multi-shell)",
    "Template methods: hard-template, soft-template, self-template, and template-free synthesis techniques",
    "Enhanced light utilization through hol

2025-06-06 18:12:10,898 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Raw output: {
  "topics": [
    "Marine Macroalgae and Ecosystem Function",
    "Fucoidan Structure and Recalcitrance",
    "Microbial Degradation of Fucoidan",
    "Planctomycetota and Marine Bacterial Enzymes",
    "Polysaccharide Utilization Loci (PULs)",
    "Fucose-Processing Enzymes and GH168 Family",
    "Enzymatic Mechanisms for Fucoidan Breakdown",
    "Structural Biology of Endo-Fucanases",
    "Genomic and Transcriptomic Analysis of Marine Bacteria",
    "Environmental Distribution and Ecological Role of Fucoidan-Degrading Microbes"
  ],
  "key-concepts": [
    "Recalcitrant Polysaccharide Fucoidan in Marine Algae",
    "Macroalgal Carbon Storage and Climate Regulation",
    "Diversity and Structural Complexity of Fucoidan (homofucans and heterofucans)",
    "Mechanisms of Fucoidan Degradation by Marine Bacteria",
    "Genomic Islands (PULs) Encoding Fucoidan-Degrading Enzymes",
    "GH168 Endo-Fucanases and Their Structural Features",
    "Substrate Specificity and Enzymati

2025-06-06 18:12:20,313 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Raw output: {
  "topics": [
    "Biodistribution and Pharmacokinetics of mRNA Lipid Nanoparticle Vaccines in Humans",
    "Mechanisms of mRNA and Lipid Decay in Blood Post-Vaccination",
    "Immunogenicity and Antibody Response to mRNA Vaccines",
    "Anti-PEG Antibody Development and Its Impact on Vaccine and Nanoparticle Clearance",
    "Interactions Between Lipid Nanoparticles and Human Immune Cells",
    "Measurement and Quantification Techniques for mRNA, Lipids, and Antibodies",
    "Effects of Pre-existing Anti-PEG and Anti-Spike Antibodies on Vaccine Response",
    "Vaccine mRNA Integrity and Stability In Vivo",
    "Role of Phagocytes in Nanoparticle Clearance and Immunogenicity",
    "Implications for Future mRNA Vaccine Design and Safety"
  ],
  "key_concepts": [
    "Detection of vaccine-derived mRNA and ionizable lipids in human blood post-vaccination",
    "Decay kinetics of intact versus degraded mRNA and lipid components",
    "Recirculation and circulation duration (~1

2025-06-06 18:12:32,197 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Raw output: {
  "topics": [
    "Advances in Metal Oxide Nanomaterials in Agriculture and Food Industry",
    "Applications of Metal Oxide Nanomaterials in Food Quality Control and Preservation",
    "Development of Nanosensors and Biosensors for Food Safety and Pathogen Detection",
    "Smart Food Packaging Technologies Using Metal Oxide Nanomaterials",
    "Use of Metal Oxide Nanomaterials for Agrochemical Delivery, Fertilizers, and Pest Control",
    "Environmental Remediation of Soil and Water Contaminants with Metal Oxide Nanostructures",
    "Photocatalytic Processes for Pesticide Degradation using Metal Oxide Nanomaterials",
    "Risks, Toxicity, and Regulatory Challenges of Metal Oxide Nanoparticles in Agriculture and Food",
    "Nano-enabled Environmental Monitoring and Soil/Water Treatment",
    "Future Perspectives, Market Barriers, and Safety Assessment of Metal Oxide Nanodevices"
  ],
  "key_concepts": [
    "Metal oxide nanomaterials (MO NMs) and their tailored physicoche

2025-06-06 18:12:40,911 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Raw output: {
  "topics": [
    "Insect Breeding and Genetic Improvement",
    "Selective Breeding Framework and Methodologies",
    "Insect Biology and Lifecycle in Breeding",
    "Phenotyping Techniques for Insects",
    "Genetic Parameter Estimation and Evaluation",
    "Breeding Strategies (Phenotypic, Pedigree, Genomic, Crossbreeding)",
    "Inbreeding, Genetic Diversity, and Long-term Sustainability",
    "Reproductive Biology and Mating Systems of Insects",
    "Environmental Effects and GxE Interactions",
    "Challenges and Opportunities in Commercial Insect Breeding"
  ],
  "key_concepts": [
    "Application of quantitative genetics and statistical models for insect breeding",
    "Breeding objectives tailored to economic traits like growth, body composition, and reproductive success",
    "Estimating genetic parameters such as heritability and genetic correlations in insects",
    "Phenotyping methods including group and individual measurements, and advanced automation",
   

2025-06-06 18:13:00,149 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Raw output: {
  "topics": [
    "Structure and internal organization of lipid nanoparticles (LNPs)",
    "Advanced characterization techniques for LNPs (SAXS, DLS, AFM)",
    "Design and modeling of nanostructured lipid carriers",
    "Role of lipids and surfactants (cetyl palmitate, polysorbate 80) in LNP stability",
    "Interactions between lipid nanoparticles and biological environment (protein corona, water binding)",
    "Implications for drug delivery and nanomedicine applications",
    "Theoretical models of nanoparticle shape, size distribution, and internal structure",
    "Novel structural insights versus classical core-shell models",
    "Thermodynamics and self-assembly of lipid surfactant systems",
    "Correlation between internal nanostructure and functional properties (drug loading, release)"
  ],
  "key_concepts": [
    "Barrel-like internal structure of lipid nanoparticles formed by stacked lipid platelets",
    "Bound water retained by polysorbate 80 polar heads in 

### doanload new data 

In [12]:
import os
import requests
import pandas as pd
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup

def download_pdfs_from_csv(csv_path: str, output_dir: str):
    """
    Reads a CSV file with a column named 'Link', visits each link (following redirects),
    finds the actual PDF (either by content‐type or by looking for a "Download PDF" link),
    and saves it under output_dir. Filenames are generated from the final URL or from
    the CSV row index if no PDF filename can be inferred.
    
    Args:
        csv_path (str): Path to the CSV file (must contain a column named 'Link').
        output_dir (str): Directory where all downloaded PDFs will be saved. Created if it doesn't exist.
    """
    # 1. Prepare output directory
    os.makedirs(output_dir, exist_ok=True)
    
    # 2. Load CSV
    df = pd.read_csv(csv_path, usecols=["Link_real"])
    session = requests.Session()
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/111.0.0.0 Safari/537.36"
    }
    
    for idx, row in df.iterrows():
        original_link = row["Link_real"].strip()
        if not original_link:
            print(f"[{idx}] Skipping empty link.")
            continue
        
        try:
            # 3. First GET to follow redirects
            resp = session.get(original_link, headers=headers, allow_redirects=True, timeout=30)
            resp.raise_for_status()
        except Exception as e:
            print(f"[{idx}] ERROR fetching '{original_link}': {e}")
            continue
        
        # 4. Check if response is already a PDF
        content_type = resp.headers.get("Content-Type", "")
        final_url = resp.url
        
        def save_pdf(binary_content: bytes, filename: str):
            out_path = os.path.join(output_dir, filename)
            with open(out_path, "wb") as f:
                f.write(binary_content)
            print(f"[{idx}] Saved PDF → {out_path}")
        
        if "application/pdf" in content_type.lower() or final_url.lower().endswith(".pdf") or final_url.lower().endswith(".epdf"):
            # We have a direct PDF response
            # Try to infer a filename from the final URL
            parsed = urlparse(final_url)
            basename = os.path.basename(parsed.path)
            if not basename.lower().endswith((".pdf", ".epdf")):
                # fallback if path ends with something weird
                basename = f"file_{idx}.pdf"
            save_pdf(resp.content, basename)
            continue
        
        # 5. Otherwise, parse HTML for a "Download PDF" link
        soup = BeautifulSoup(resp.text, "html.parser")
        pdf_link = None
        
        # Search for <a> tags whose text contains "Download PDF" or "PDF"
        for a in soup.find_all("a", href=True):
            link_text = a.get_text(separator=" ").strip().lower()
            href = a["href"]
            if "download pdf" in link_text or link_text == "pdf" or "pdf" in link_text:
                pdf_link = urljoin(final_url, href)
                break
        
        if not pdf_link:
            print(f"[{idx}] WARNING: No 'Download PDF' link found at {final_url}")
            continue
        
        # 6. GET the PDF URL
        try:
            pdf_resp = session.get(pdf_link, headers=headers, allow_redirects=True, timeout=30)
            pdf_resp.raise_for_status()
        except Exception as e:
            print(f"[{idx}] ERROR fetching PDF from '{pdf_link}': {e}")
            continue
        
        # 7. Verify content-type again
        pdf_ct = pdf_resp.headers.get("Content-Type", "").lower()
        if "application/pdf" not in pdf_ct and not pdf_link.lower().endswith((".pdf", ".epdf")):
            print(f"[{idx}] WARNING: Expected PDF but got '{pdf_ct}' from {pdf_link}")
            continue
        
        # 8. Infer filename from PDF URL (fall back to index)
        parsed_pdf = urlparse(pdf_link)
        base_pdf_name = os.path.basename(parsed_pdf.path)
        if not base_pdf_name.lower().endswith((".pdf", ".epdf")):
            base_pdf_name = f"file_{idx}.pdf"
        
        save_pdf(pdf_resp.content, base_pdf_name)

if __name__ == "__main__":
    # Example usage
    csv_path = "Research Papers - Biology_New.csv"  # Path to your CSV file with 'Link' column
    output_dir = "Bio_docs"  # Directory to save downloaded PDFs
    download_pdfs_from_csv(csv_path, output_dir)


[0] Saved PDF → Bio_docs/73835_CE%5BRa1%5D_F(IS)_QC(AN_SS)_PF1(AG_SS)_PFA(IS)_PN(IS).pdf
[1] Saved PDF → Bio_docs/77882_CE%5BRa1%5D_F(SHU)_PF1(AB_OM)_PFA(IS)_PN(IS).pdf
[2] Saved PDF → Bio_docs/76588_CE%5BRa1%5D_F(SHU)_QC(PS_SS)_PF1(AG_SS)_PFA(IS)_PB(AG_IS)PN(IS).pdf
[3] Saved PDF → Bio_docs/76047_CE%5BRa1%5D_F(IS)_QC(AN_SS)_PF1(VD_SS)_redo_PFA(IS)_PB(VD_IS)_PN(IS).pdf
[4] Saved PDF → Bio_docs/74551_CE%5BRa1%5D_F(SL)_QC(PS_SS)_PF1(AG_SS)_PFA(IS)_PB(AG_IS)_PN(OM).pdf
[5] Saved PDF → Bio_docs/70795_CE%5BRa1%5D_F(IS)_QC(PS_SS)_PF1(AG_SL)_PFA(IS)_PB(AG_IS)_PN(IS).pdf
[6] Saved PDF → Bio_docs/78107_CE%5BRa1%5D_F(SHU)_QC(PS_SS)_PF1(AG_SL)_PFA(IS)_PB(AG_IS)_PN(IS).pdf
[7] Saved PDF → Bio_docs/75142_CE%5BRa1%5D__F(IS)_QC(PS_OM)_PF1(AG_SL)_PFA_NC(IS)_PN(IS).pdf
[8] Saved PDF → Bio_docs/76833_CE%5BRa1%5D_F(SS)_QC_PF1(HJ_SS)_PFA(IS)_PN(IS).pdf
[9] Saved PDF → Bio_docs/76613_CE%5BRa1%5D_F(IS)_QC(PS_OM)_PF1(VD_SL)_redo_PFA(IS)_PFA(IS).pdf
[12] Saved PDF → Bio_docs/s41598-025-03387-9.pdf


### download data from csv

In [1]:
import os
import pandas as pd
import requests
from urllib.parse import urlparse, unquote

def download_files(
    csv_path: str = "Research Papers - Biology_New.csv",
    output_dir: str = "bio_docs"
) -> None:
    """
    Reads a CSV with a 'Link' column and downloads each URL to output_dir,
    following any redirects until the actual PDF. Saves each file with a .pdf extension.
    """
    # Load CSV
    df = pd.read_csv(csv_path)
    if 'Link_real' not in df.columns:
        raise ValueError("CSV must contain a 'Link_real' column.")
    os.makedirs(output_dir, exist_ok=True)

    # Create a session and set a browser-like User-Agent
    session = requests.Session()
    session.headers.update({
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/112.0.0.0 Safari/537.36"
        )
    })

    links = df['Link_real'].dropna().tolist()
    total = len(links)

    for idx, orig_url in enumerate(links, start=1):
        try:
            # Perform the GET request, allowing redirects until the final URL
            resp = session.get(orig_url, stream=True, timeout=30)
            resp.raise_for_status()

            # The final URL after redirects
            final_url = resp.url
            parsed_final = urlparse(final_url)
            raw_name = os.path.basename(parsed_final.path)
            raw_name = unquote(raw_name)

            # If the final URL path has an extension, use it; otherwise force ".pdf"
            name, ext = os.path.splitext(raw_name)
            if ext.lower() in ('.pdf', '.epdf'):
                # If it's ".epdf", replace with ".pdf"
                filename = f"{name}.pdf"
            else:
                filename = f"{idx}.pdf"

            out_path = os.path.join(output_dir, filename)

            # Check Content-Type to ensure it's a PDF
            content_type = resp.headers.get('Content-Type', '')
            if 'application/pdf' not in content_type.lower():
                print(
                    f"[{idx}/{total}] WARNING → "
                    f"URL did not return a PDF (Content-Type: {content_type}). "
                    f"Still saving to {filename}, but file may be invalid."
                )

            # Stream‐write to disk
            with open(out_path, 'wb') as f:
                for chunk in resp.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

            print(f"[{idx}/{total}] Downloaded → {filename}")

        except Exception as e:
            print(f"[{idx}/{total}] FAILED  → {orig_url}\n    {e}")

if __name__ == "__main__":
    download_files()




[1/41] Downloaded → 73835_CE[Ra1]_F(IS)_QC(AN_SS)_PF1(AG_SS)_PFA(IS)_PN(IS).pdf
[2/41] Downloaded → 77882_CE[Ra1]_F(SHU)_PF1(AB_OM)_PFA(IS)_PN(IS).pdf
[3/41] Downloaded → 76588_CE[Ra1]_F(SHU)_QC(PS_SS)_PF1(AG_SS)_PFA(IS)_PB(AG_IS)PN(IS).pdf
[4/41] Downloaded → 76047_CE[Ra1]_F(IS)_QC(AN_SS)_PF1(VD_SS)_redo_PFA(IS)_PB(VD_IS)_PN(IS).pdf
[5/41] Downloaded → 74551_CE[Ra1]_F(SL)_QC(PS_SS)_PF1(AG_SS)_PFA(IS)_PB(AG_IS)_PN(OM).pdf
[6/41] Downloaded → 70795_CE[Ra1]_F(IS)_QC(PS_SS)_PF1(AG_SL)_PFA(IS)_PB(AG_IS)_PN(IS).pdf
[7/41] Downloaded → 78107_CE[Ra1]_F(SHU)_QC(PS_SS)_PF1(AG_SL)_PFA(IS)_PB(AG_IS)_PN(IS).pdf
[8/41] Downloaded → 75142_CE[Ra1]__F(IS)_QC(PS_OM)_PF1(AG_SL)_PFA_NC(IS)_PN(IS).pdf
[9/41] Downloaded → 76833_CE[Ra1]_F(SS)_QC_PF1(HJ_SS)_PFA(IS)_PN(IS).pdf
[10/41] Downloaded → 76613_CE[Ra1]_F(IS)_QC(PS_OM)_PF1(VD_SL)_redo_PFA(IS)_PFA(IS).pdf
[11/41] Downloaded → s41598-025-00534-0.pdf
[12/41] Downloaded → s41598-025-99590-9.pdf
[13/41] Downloaded → 13.pdf
[14/41] Downloaded → s41598-025-0

## Author - Critique Model

In [None]:
import os
import re
import uuid
import json
from dotenv import load_dotenv
import os
import logging
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate, FewShotPromptTemplate
from langchain.chains import LLMChain
from langchain_google_genai import ChatGoogleGenerativeAI

# Load environment variables from .env
load_dotenv()

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

def author_critique_loop(
    topics: list[str],
    concepts: list[str],
    guidelines: str,
    few_shot_examples: list[dict],
    max_turns: int = 3,
) -> list[dict]:
    """
    Runs an iterative author-critic loop to generate and refine biology questions.

    Logs the entire multi-turn conversation (author + critic) to a single file.

    Returns a list of dicts with:
        unique_id: str,
        question: str,
        topics: list[str],
        concepts: list[str],
        file_conversation_log: str  # path to the full conversation log
    """
    # Create a unique run-level conversation log
    run_id = uuid.uuid4().hex
    os.makedirs('logs', exist_ok=True)
    conv_log_path = f"logs/{run_id}.log"
    conv_logger = open(conv_log_path, 'w')

    def log(msg: str):
        conv_logger.write(msg + "\n")
        conv_logger.flush()

    # Initialize LLMs (temperature locked to 1.0)
    # author_llm = ChatOpenAI(model_name="gpt-4.1-mini", temperature=1.0)
    # critic_llm = ChatOpenAI(model_name="gpt-4.1", temperature=1.0)
    gemini_api = os.getenv("GEMINI_API_KEY")
    critic_llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash-preview-05-20", temperature=1.0,google_api_key=gemini_api) 
    author_llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=1.0,google_api_key=gemini_api) 

    # Build Few-Shot prompt for author
    example_prompt = PromptTemplate(
        input_variables=["example"],
        template="Example:\n{{example}}\n---",
        template_format="jinja2"
    )
    author_fs_prompt = FewShotPromptTemplate(
        examples=few_shot_examples,
        example_prompt=example_prompt,
        prefix=(
            "You are a biology education specialist. Generate medium novel Undergraduate level biology questions. "
            "Each question must start with <question> and end with </question> "
        ),
        suffix=(
            "Now, based on the topics: {{topics}}, key concepts: {{concepts}}, and guidelines: {{guidelines}}, "
            "produce list of new, high-quality biology questions, each wrapped in <question>...</question> tags"
        ),
        input_variables=["topics", "concepts", "guidelines"],
        template_format="jinja2",
    )
    author_chain = LLMChain(llm=author_llm, prompt=author_fs_prompt)

    # Build Critic prompt
    critic_prompt = PromptTemplate(
        input_variables=["questions", "guidelines"],
        template=(
            "You are a biology assessment expert. Critique the following questions:\n"
            "{questions}\n"
            "Evaluate them against these guidelines:\n{guidelines}\n"
            "Provide concise, actionable feedback on how to improve." \
            "the markdown LaTeX syntax must be correct, and the questions must be wrapped in <question>...</question> tags.\n"
        ),
    )
    critic_chain = LLMChain(llm=critic_llm, prompt=critic_prompt)

    # Turn 0: Generation
    log("=== Turn 0: Author generates questions ===")
    questions_text = author_chain.run(
        topics=topics, concepts=concepts, guidelines=guidelines
    )
    log(questions_text)

    # Turn 0: Critic feedback
    log("=== Turn 0: Critic feedback ===")
    feedback = critic_chain.run(questions=questions_text, guidelines=guidelines)
    log(feedback)

    # Refinement turns
    for turn in range(1, max_turns):
        log(f"=== Turn {turn}: Author refines questions ===")
        refine_prompt = PromptTemplate(
            input_variables=["questions", "feedback"],
            template=(
                "Refine these questions based on the feedback:\n{questions}\n"
                "Feedback:\n{feedback}\n"
                "Return an improved numbered list of biology questions, each wrapped in <question>...</question> tags."
            ),
        )
        refine_chain = LLMChain(llm=author_llm, prompt=refine_prompt)
        questions_text = refine_chain.run(questions=questions_text, feedback=feedback)
        log(questions_text)
        if turn == max_turns - 1:
            log("=== Final questions generated ===")
            break
        log(f"=== Turn {turn}: Critic feedback ===")
        feedback = critic_chain.run(questions=questions_text, guidelines=guidelines)
        log(feedback)

    # Close the conversation log
    conv_logger.close()

    # Parse final questions into structured entries
    entries: list[dict] = []
    pattern = re.compile(r'<question>(.*?)</question>', re.DOTALL)
    import datetime

# Compute today's date in YYYYMMDD format
    date_str = datetime.date.today().strftime("%d%m%Y")

    for idx,match in enumerate(pattern.findall(questions_text)):
        question_body = match.strip()
        uid = f"BIO_{date_str}_{idx+1}"
        entries.append({
            "unique_id": uid,
            "topics": topics,
            "concepts": concepts,
            "question": question_body,
            "file_conversation_log": conv_log_path
        })
        print(f"Generated question: {question_body}")

    return entries

if __name__ == "__main__":
    # Example usage
    # topics = [
    #     "Standard Model of Particle Physics",
    #     "Electroweak Symmetry Breaking",
    #     "Elementary Particles and Forces",
    # ]
    # concepts = [
    #     "Theories like the Seesaw Mechanism, Majorana Neutrinos, and Sterile Neutrinos address neutrino masses.",
    #     "Neutrino oscillations suggest nonzero masses, contradicting the Standard Model's massless assumption.",
    #     "Radiative corrections adjust particle masses at higher energy scales, posing a fine-tuning problem.",
    #     "Quantum Field Theory is foundational to the Standard Model, explaining fundamental forces excluding gravity.",
    # ]
    # take the combination of topics and concepts from the file output_apple/topic_combinations.json

    with open ("output_bio_msc/topic_combinations.json", "r") as f:
        topic_combinations = json.load(f)

    topic_combinations = topic_combinations[:20]
    # topics = topic_combinations[0]['topics']
    guidelines = (
    r"1. Model a biology Scenario: Start from a real-world or idealized setup that requires conceptual understanding and physical reasoning."
    r"2. Expect answers of single word answer, numerical answer ."
    r"3. Force Multi-Step Reasoning: Ensure the solution involves two or more biological principles or steps "
    r"4. Avoid How based questions: Do not ask for simple definitions or explanations of concepts without a problem to solve nor give any hints in the question."
    r"5. Be Unique: Do not copy or rephrase standard textbook problems. Create novel, conceptually rich scenarios."
    r"6. Single Solution: Each question must yield only one correct result; no ambiguous answers."
    r"7. Use rigorous, concise phrasing: Formulate questions in clear, professional language appropriate for advanced learners."
    r"9. Units must be consistent; all numerical answers must include correct SI units. Symbols should follow conventional biology notation"
    r"10. Questions must not be multipart: Each question must focus on solving for a single target quantity only."
    r"11. Solution Formatting: Format the equations, numbers and variables using LaTeX syntax such that all inline math, numbers, variables are enclosed by $...$ and block math is enclosed by $$...$$. Avoid using markdown for unformatted data."
    r"12. * **Multi-Step Equations:** ALWAYS use `\begin{aligned} ... \end{aligned}`. Align with `&` (e.g., `&=`). Each step on a new line using `\\`."
)




    few_shot_examples = [
#         {"example": r"""Unique Stationary Point of a Relaxion  
# The relaxion potential is  
#  $V(φ) = g Λ^3 φ + Λ_b^4 cos(φ/f)$,  
# with the given inequality  
#  $g Λ^3 f < Λ_b^4$.  
# Show that there is exactly one solution $φ∈(0,πf)$ of  
#  $\frac{dV}{dφ} = 0$ 
# and express that $φ$ in closed form."""},
        {"example": r"""As a part of a field trip, students were asked to fill two bottles with pond water containing algae and other photosynthesizing organisms. A handheld dissolved oxygen (DO) sensor was used to measure the amount of dissolved oxygen in each bottle. It was found that the initial O2 conc.(mg/L) was $6.24\,\mathrm{mg/L}$. After that one bottle was put under a light. The second bottle was wrapped with black paper to block all light and was put under the same light. After 24 hours of incubation period, the DO was again measured in both the bottles. It was observed that the final O2 concentration was $6.39\,\mathrm{mg/L}$ in the light bottle and $6.16\,\mathrm{mg/L}$ in the dark bottle. The results obtained were recorded. From the data provided, help the students to find out the gross primary productivity (GPP) of the pond ecosystem."""},
     ]

    all_questions = []

    for combo in topic_combinations:
        topics = combo['topics']
        concepts = combo['key_concepts']
        print(f"Topics: {topics}")
        print(f"Key Concepts: {concepts}")
        final_questions = author_critique_loop(
            topics, concepts, guidelines, few_shot_examples, max_turns=3
        )
        all_questions.append(
            {
            "data":final_questions
            }
            
            )
        print(f"Final Questions:\n{final_questions}\n")
        # save the final questions to a file


    # Save all questions to a file
    with open("final_questions_biology_msc.json", "w") as f:
        json.dump(all_questions, f, indent=2)
    
    # 2) write a Markdown file you can grab directly
    with open("final_questions_biology_msc.md", "w") as md:
        for batch in all_questions:
            for q in batch["data"]:
                # Write the question with its unique ID and topics
                md.write(f"### Question ID: {q['unique_id']}\n")
                # Write the question text
                md.write("**Question:**\n")
            # q["question"] already contains things like $\theta$
                md.write(q["question"] + "\n\n")
    # logging.info("Author–Critique loop complete.")

    file_path = "questions_6june_bio_msc.csv"
    file_exists = os.path.isfile(file_path)
    is_empty = (not file_exists) or (os.path.getsize(file_path) == 0)
    import csv
    # Open with newline='' so that csv.writer can quote internal newlines correctly
    with open(file_path, "a", newline="", encoding="utf-8") as csv_file:
            writer = csv.writer(csv_file)  # default delimiter=',' and quoting=QUOTE_MINIMAL
            if is_empty:
                # Write header row only if file is new/empty
                writer.writerow(["question"])
            # Now append each question, letting csv.writer quote any embedded newline or commas
            for batch in all_questions:
                for q in batch["data"]:
                    print(f"Writing question: {q['question']}")
                    writer.writerow([q["question"]])

Topics: ['Comparison with traditional vaccine platforms (recombinant protein, plasmid DNA)', 'Immunogenicity and immune response evaluation']
Key Concepts: ['Encapsulation efficiency (>85%) of mRNA in LNPs', 'Biodistribution showing delivery to injection site, liver, and spleen', 'Immunogenicity evidenced by antigen-specific IgG and IFN-γ responses in mice', 'Microfluidic particle production method for reproducibility and scalability']
Generated question: A research team is developing an mRNA vaccine against a novel viral pathogen. They have successfully encapsulated the mRNA within LNPs using a microfluidic production method, achieving 90% encapsulation efficiency. In an *in vivo* experiment with mice, biodistribution analysis reveals that 60% of the injected LNPs are delivered to the liver, 30% remain at the injection site, and 10% are distributed to the spleen. If the initial injected dose contains $5 \times 10^{12}$ mRNA molecules, and assuming that 5% of the LNPs are lost during t

In [2]:
!pip install pyyaml

Defaulting to user installation because normal site-packages is not writeable
