In [None]:
# ---------------- INSTALL DEPENDENCIES ----------------
!pip install google-generativeai PyPDF2

In [None]:
# full_pipeline_gemini.py
# Full pipeline: PDF -> extract Methodology -> dataset zip -> scan -> iterative preprocessing -> re-scan -> model generation & 2-epoch training
# Use in Colab: upload your PDF and optional dataset.zip to /content and update paths below.

import os
import sys
import json
import zipfile
import shutil
import traceback
import subprocess
from pathlib import Path
from collections import Counter
import random
import time

import PyPDF2
import google.generativeai as genai

# ---------- CONFIG ----------
# Replace with your Gemini API key OR set environment variable GEMINI_API_KEY.
# A key written into the literal below takes precedence; the environment
# variable is only consulted while the literal is still a placeholder.
GEMINI_API_KEY = "YOUR GEMINI API KEY HERE"
# Values that mean "no real key was supplied" (includes the older placeholder spelling).
_PLACEHOLDER_API_KEYS = (None, "", "YOUR GEMINI API KEY HERE", "YOUR_API_KEY_HERE")
if GEMINI_API_KEY in _PLACEHOLDER_API_KEYS:
    GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", GEMINI_API_KEY)
if GEMINI_API_KEY in _PLACEHOLDER_API_KEYS:
    print("⚠️ Please set GEMINI_API_KEY environment variable or replace GEMINI_API_KEY in the script.")
    # Do not exit; user might replace later. But warn.
genai.configure(api_key=GEMINI_API_KEY)
MODEL = genai.GenerativeModel("gemini-2.5-flash")

# Paths (change as needed)
DEFAULT_EXTRACT_DIR = "/content/dataset_extracted"  # where the dataset zip is unpacked
PREPROCESS_OUTPUT_JSON = "/content/dataset_info_after_preprocessing.json"  # written by the generated preprocessing script
TRAINING_METADATA_JSON = "/content/training_metadata.json"  # written by the generated training script
PREPROCESSING_SCRIPT_PATH = "/content/preprocessing_generated.py"
MODEL_SCRIPT_PATH = "/content/model_generated.py"

# Threshold above which we sample exactly 10% of files
LARGE_DATASET_THRESHOLD = 1000

# Max attempts for iterative generation
MAX_ATTEMPTS_PREPROCESS = 10
MAX_ATTEMPTS_MODEL = 10

# ---------- Utilities ----------

def safe_get_text(response):
    """Safely extract text from a Gemini response, or return '' if none is available.

    The ``response.text`` quick accessor raises when the response carries no
    valid part, so it is attempted in isolation; on failure (or an empty
    result) the candidates are inspected part by part instead of giving up.

    Args:
        response: Object returned by ``MODEL.generate_content``; only the
            ``text`` and ``candidates`` attributes are read.

    Returns:
        The stripped text of the response, or '' when nothing can be extracted.
    """
    try:
        quick_text = getattr(response, "text", None)
    except Exception:
        # The accessor itself failed; fall through to the per-candidate scan.
        quick_text = None
    if quick_text:
        return quick_text.strip()

    try:
        for cand in getattr(response, "candidates", None) or []:
            content = getattr(cand, "content", None)
            parts = getattr(content, "parts", None) or []
            joined = "".join(getattr(p, "text", "") or "" for p in parts).strip()
            if joined:
                # First candidate that actually carries text wins.
                return joined
    except Exception as e:
        print("⚠️ Could not extract text from Gemini response:", e)
    return ""


def safe_write(path, text):
    """Write *text* to *path* as UTF-8, creating missing parent directories first."""
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("w", encoding="utf-8") as handle:
        handle.write(text)

def read_text(path):
    """Return the full contents of the file at *path*, decoded as UTF-8."""
    return Path(path).read_text(encoding="utf-8")

# ---------- PDF extraction ----------

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*.

    Each page that yields text contributes that text followed by a newline;
    pages for which ``extract_text()`` returns nothing are skipped.
    """
    page_chunks = []
    with open(pdf_path, "rb") as pdf_file:
        for page in PyPDF2.PdfReader(pdf_file).pages:
            extracted = page.extract_text()
            if not extracted:
                continue
            page_chunks.append(extracted + "\n")
    return "".join(page_chunks)

def extract_section_via_gemini(paper_text, section_name, chunk_size=7500):
    """
    Ask Gemini for the requested section, one fixed-size chunk of the paper at a time.

    The paper text is cut into consecutive slices of *chunk_size* characters;
    each slice is sent with a prompt asking for ONLY the *section_name*
    section. Non-empty answers are joined with newlines and the result is
    stripped before being returned.
    """
    collected = []
    for start in range(0, len(paper_text), chunk_size):
        piece = paper_text[start:start + chunk_size]
        request = f"""
You are an assistant. From the following research paper text chunk, extract ONLY the '{section_name}' section (if present).
If the section continues across chunks, return the available portion from this chunk.
Return ONLY the section text — no commentary.

Paper chunk:
\"\"\"{piece}\"\"\"
"""
        answer = safe_get_text(MODEL.generate_content(request))
        if not answer:
            # Nothing usable came back for this slice.
            continue
        collected.append(answer)
    joined = "\n".join(collected)
    return joined.strip()

def extract_methodology(paper_text):
    """Return the methodology portion of *paper_text* as extracted by Gemini.

    Several common section headings are tried in order; the first extraction
    longer than 20 characters is returned. If none qualifies, Gemini is asked
    to find whatever part of the first 24000 characters describes how the
    research was conducted.
    """
    candidate_headings = ("Methodology", "Methods", "Method", "Experimental Setup")
    for heading in candidate_headings:
        section = extract_section_via_gemini(paper_text, heading)
        if not section or len(section) <= 20:
            continue
        return section

    # No heading matched: fall back to a description-based request.
    fallback_prompt = f"""
You are an assistant. From the following research paper text, find and return ONLY the portion that explains how the research was conducted (algorithms, architectures, datasets, experimental procedure).
Return the text ONLY (no commentary).

Paper text:
\"\"\"{paper_text[:24000]}\"\"\"
"""
    return safe_get_text(MODEL.generate_content(fallback_prompt))

def extract_dataset_names_from_paper(paper_text):
    """Ask Gemini which datasets the paper uses.

    Only the first 8000 characters of *paper_text* are sent. Returns ``None``
    when the reply is empty or begins with "none" (case-insensitive);
    otherwise returns the non-empty names found by splitting the reply on
    commas and newlines.
    """
    query = f"""
From the following research paper text, list the names of datasets used (e.g., MNIST, CIFAR-10, COCO, ImageNet, IMDB).
Return only dataset names separated by commas, or 'None' if none are mentioned.

Paper text:
\"\"\"{paper_text[:8000]}\"\"\"
"""
    answer = safe_get_text(MODEL.generate_content(query))
    if not answer:
        return None
    if answer.lower().startswith("none"):
        return None
    # Every comma- or newline-separated entry of the reply is kept, minus blanks.
    cleaned = (piece.strip() for piece in re_split_commas_and_newlines(answer))
    return list(filter(None, cleaned))

def re_split_commas_and_newlines(s):
    """Split *s* on line breaks and commas, returning the stripped pieces in order.

    Empty pieces (from blank lines or consecutive commas) are kept as ''.
    """
    return [
        fragment.strip()
        for row in s.splitlines()
        for fragment in row.split(",")
    ]

# ---------- Dataset zip handling & scanning ----------

def extract_zip(zip_path, extract_to=DEFAULT_EXTRACT_DIR):
    """Unpack *zip_path* into a freshly created *extract_to* directory.

    Anything already present at *extract_to* is removed first, so the
    directory only ever contains the contents of this archive. Returns
    *extract_to*.
    """
    already_there = os.path.exists(extract_to)
    if already_there:
        # Start from a clean directory rather than merging with old contents.
        shutil.rmtree(extract_to)
    os.makedirs(extract_to, exist_ok=True)
    with zipfile.ZipFile(zip_path, "r") as archive:
        archive.extractall(extract_to)
    return extract_to

def is_image_file(fname):
    """Return True when *fname* ends with a known image extension (case-insensitive)."""
    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.gif', '.webp')
    lowered = fname.lower()
    return lowered.endswith(image_suffixes)

def is_audio_file(fname):
    """Return True when *fname* ends with a known audio extension (case-insensitive)."""
    audio_suffixes = ('.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac')
    lowered = fname.lower()
    return lowered.endswith(audio_suffixes)

def is_text_file(fname):
    """Return True when *fname* ends with a known text/tabular extension (case-insensitive)."""
    text_suffixes = ('.txt', '.csv', '.tsv', '.json', '.md')
    lowered = fname.lower()
    return lowered.endswith(text_suffixes)

def scan_dataset(root_path, max_preview_files=50):
    """Walk *root_path* and summarise what kind of dataset it contains.

    Args:
        root_path: Directory to scan.
        max_preview_files: Maximum number of sample paths kept per file type.

    Returns:
        A dict with these keys:
        - "root": absolute path of *root_path*.
        - "num_files": total number of files found by ``os.walk``.
        - "types": counts for "images", "audio", "text" and "others".
        - "has_train_test_split", "train_dir", "test_dir": set when a
          top-level directory named train/test (any letter case) exists;
          the paths use the directory's real on-disk name.
        - "class_folders": names of top-level directories containing at
          least one image, reported only when more than one such directory
          exists.
        - "sample_files": up to *max_preview_files* paths per type.
        - "total_files_by_ext": file count per lower-cased extension.
    """
    info = {
        "root": os.path.abspath(root_path),
        "num_files": 0,
        "types": {"images": 0, "audio": 0, "text": 0, "others": 0},
        "has_train_test_split": False,
        "train_dir": None,
        "test_dir": None,
        "class_folders": [],
        "sample_files": {"images": [], "audio": [], "text": [], "others": []},
        "total_files_by_ext": {}
    }

    # List the top-level directories once; a missing/unreadable root simply
    # yields an empty listing (os.walk below is equally silent in that case).
    try:
        immediate_dirs = [p for p in Path(root_path).iterdir() if p.is_dir()]
    except OSError:
        immediate_dirs = []

    # Map lower-cased name -> real name so "Train/" is detected AND the
    # reported path exists on case-sensitive filesystems.
    actual_names = {}
    for d in immediate_dirs:
        lowered = d.name.lower()
        if lowered not in actual_names or d.name == lowered:
            actual_names[lowered] = d.name
    for split in ("train", "test"):
        if split in actual_names:
            info["has_train_test_split"] = True
            info[f"{split}_dir"] = os.path.join(root_path, actual_names[split])

    # Checked in order; the first matching predicate decides the file's type.
    classifiers = (
        ("images", is_image_file),
        ("audio", is_audio_file),
        ("text", is_text_file),
    )
    ext_counter = Counter()
    for dirpath, _dirnames, filenames in os.walk(root_path):
        for fname in filenames:
            info["num_files"] += 1
            ext_counter[Path(fname).suffix.lstrip(".").lower()] += 1
            kind = next((name for name, matches in classifiers if matches(fname)), "others")
            info["types"][kind] += 1
            samples = info["sample_files"][kind]
            if len(samples) < max_preview_files:
                samples.append(os.path.join(dirpath, fname))

    info["total_files_by_ext"] = dict(ext_counter)

    # detect class folders (immediate subdirs that contain images); any() stops
    # at the first image instead of counting every file in the subtree.
    class_candidates = [
        d.name
        for d in immediate_dirs
        if any(is_image_file(entry.name) for entry in d.rglob("*"))
    ]
    if len(class_candidates) > 1:
        info["class_folders"] = class_candidates
    return info

def plan_sampling(info, threshold=LARGE_DATASET_THRESHOLD):
    """Decide whether the dataset described by *info* should be subsampled.

    When ``info["num_files"]`` exceeds *threshold* the plan asks for 10%
    sampling, with a sample count of at least 1; otherwise the plan keeps
    every file.
    """
    total_files = info.get("num_files", 0)
    if total_files > threshold:
        return {
            "use_sampling": True,
            "sample_percent": 0.10,
            "sample_count": max(1, int(round(info["num_files"] * 0.10))),
        }
    return {"use_sampling": False, "sample_percent": 1.0, "sample_count": total_files}

# ---------- Running generated Python code safely ----------

def _strip_code_fences(lines):
    """Return *lines* without a leading ```/```python line or a trailing ``` line."""
    lines = list(lines)
    first = next((i for i, text in enumerate(lines) if text.strip()), None)
    if first is not None and lines[first].strip().startswith("```"):
        del lines[first]
    last = next((i for i in range(len(lines) - 1, -1, -1) if lines[i].strip()), None)
    if last is not None and lines[last].strip() == "```":
        del lines[last]
    return lines


def run_generated_python(code, workdir="/content", timeout=1200):
    """
    Runs generated Python code in a subprocess.
    - Drops a Markdown code fence wrapped around the script, if present
    - Executes any '!pip install ...' lines first, via the current interpreter's
      pip (``sys.executable -m pip``), without going through a shell
    - Writes remaining code into a temp file under workdir and runs it with the
      same interpreter that is running this pipeline
    Returns (success:bool, stdout+stderr:str)

    Args:
        code: Source text of the generated script.
        workdir: Directory the script is written to and executed in.
        timeout: Per-subprocess time limit in seconds (each install, and the script).
    """
    import shlex  # local import: only needed here to tokenise the pip command lines

    os.makedirs(workdir, exist_ok=True)
    # Same interpreter for pip and for the script, so installs are visible to it.
    interpreter = sys.executable or "python3"
    pip_lines = []
    other_lines = []
    for line in _strip_code_fences(code.splitlines()):
        stripped = line.strip()
        if stripped.startswith("!pip"):
            pip_lines.append(stripped[1:].strip())
        else:
            other_lines.append(line)
    try:
        # Run pip installs first (if any)
        for cmd in pip_lines:
            print(f"📦 Running install: {cmd}")
            # cmd looks like "pip install foo"; keep everything after the program name.
            pip_args = shlex.split(cmd)[1:]
            subprocess.run(
                [interpreter, "-m", "pip", *pip_args],
                check=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                timeout=timeout,
            )
        # Absolute path: the child runs with cwd=workdir, so a relative path
        # would otherwise be resolved against workdir a second time.
        tmp_path = os.path.abspath(os.path.join(workdir, "_gen_code.py"))
        with open(tmp_path, "w", encoding="utf-8") as f:
            f.write("\n".join(other_lines))
        # Run the script
        proc = subprocess.run([interpreter, tmp_path], cwd=workdir, capture_output=True, text=True, timeout=timeout)
        stdout = proc.stdout or ""
        stderr = proc.stderr or ""
        success = proc.returncode == 0
        return success, stdout + ("\n---STDERR---\n" + stderr if stderr else "")
    except subprocess.CalledProcessError as e:
        return False, f"CalledProcessError: {e}\n{getattr(e, 'stderr', '')}"
    except subprocess.TimeoutExpired as e:
        return False, f"TimeoutExpired: command did not finish within {timeout} seconds: {e.cmd}"
    except Exception:
        return False, f"Exception while running generated code:\n{traceback.format_exc()}"

# ---------- Gemini helpers for iterative generation ----------

def ask_gemini(prompt, temperature=0.0, max_output_chars=20000):
    """Send *prompt* to the configured Gemini model and return its text reply.

    Args:
        prompt: Prompt text passed to ``MODEL.generate_content``.
        temperature: Sampling temperature forwarded to the model through
            ``generation_config``.
        max_output_chars: The reply is cut to at most this many characters.

    Returns:
        The reply text as produced by ``safe_get_text`` ('' when the response
        has no usable text), truncated to *max_output_chars*.
    """
    resp = MODEL.generate_content(prompt, generation_config={"temperature": temperature})
    txt = safe_get_text(resp)
    # trim if too long
    return txt[:max_output_chars]

def iterative_generate_and_run(base_prompt, check_success_condition=None, max_attempts=5, workdir="/content"):
    """
    base_prompt -> ask Gemini to produce a Python script
    Run it. On error, send both code and error to Gemini and ask for fix (repeat).
    An empty reply from Gemini counts as a failed attempt and the same prompt
    is sent again; it is never executed (an empty script would exit 0 and be
    mistaken for a success).
    check_success_condition: optional function(success, combined_output) -> bool to accept success
    Returns final_code, run_log, success_bool
    """
    prompt = base_prompt
    last_code = None
    run_log = ""
    for attempt in range(1, max_attempts + 1):
        print(f"\n--- Gemini generation attempt {attempt}/{max_attempts} ---")
        code = ask_gemini(prompt)
        if not code or not code.strip():
            # Nothing to run and nothing to correct: retry with the prompt unchanged.
            print("⚠️ Gemini returned an empty script; retrying.")
            run_log += f"\n\n--- Attempt {attempt} ---\nSuccess: False\nOutput:\nGemini returned an empty script.\n"
            continue
        last_code = code
        safe_write(os.path.join(workdir, f"attempt_{attempt}_generated.py"), code)
        print(">> Running generated code...")
        success, out = run_generated_python(code, workdir=workdir)
        run_log += f"\n\n--- Attempt {attempt} ---\nSuccess: {success}\nOutput:\n{out}\n"
        if success:
            if check_success_condition is None or check_success_condition(True, out):
                return code, run_log, True
            # consider as failure and provide more context
            error_msg = f"Script ran but check_success_condition failed.\nOutput:\n{out}"
        else:
            error_msg = out
            print(error_msg)
        # prepare correction prompt
        prompt = f"""
The following Python script was generated previously (the full script is below). When executed, it produced an error or didn't meet the required post-condition.

Please return a corrected Python script (ONLY the Python code, no surrounding explanation) that fixes the problem.

--- Previous script:
{code}

--- Observed problem / error / output:
{error_msg}

The environment is a Colab-like Linux environment. The script must be runnable as-is and must create/read any files under /content if necessary.
 Do NOT include any Markdown, backticks, or ```python fences.Return ONLY runnable Python code.
"""
    # if exhausted
    return last_code, run_log, False

# ---------- Top-level pipeline ----------

def pipeline_from_pdf_and_zip(pdf_path, dataset_zip_path=None, extract_dir=DEFAULT_EXTRACT_DIR):
    """Run the full paper-to-model pipeline and return a summary dict.

    Steps:
      1) Extract the PDF text and ask Gemini for the methodology.
      2) If a dataset zip is given, unpack and scan it.
      3) If a dataset is present, have Gemini generate and run a preprocessing script.
      4) Have Gemini generate and run a model/training script.
      5) Collect the training metadata and model artifact if they were produced.

    Args:
        pdf_path: Path of the research paper PDF.
        dataset_zip_path: Optional path of a dataset zip; when falsy the
            preprocessing step is skipped.
        extract_dir: Directory the dataset zip is unpacked into.

    Returns:
        Dict with keys "success" (whether the model script ran successfully),
        "methodology_text", "initial_dataset_info", "preprocessing_code",
        "preprocessing_run_log", "final_dataset_info", "model_code",
        "model_run_log" and "artifacts". The same dict is also written to
        /content/pipeline_result.json.
    """
    result = {
        "success": False,
        "methodology_text": None,
        "initial_dataset_info": None,
        "preprocessing_code": None,
        "preprocessing_run_log": None,
        "final_dataset_info": None,
        "model_code": None,
        "model_run_log": None,
        "artifacts": {}
    }

    # 1) Extract paper text and methodology
    print("📄 Extracting text from PDF...")
    paper_text = extract_text_from_pdf(pdf_path)
    if not paper_text or len(paper_text) < 50:
        print("⚠️ Warning: PDF text extraction returned very little content. Check PDF.")
    print("🧠 Asking Gemini to extract Methodology...")
    # The extracted text is used as-is in both prompts below (it must not be
    # replaced by a blank placeholder, or the prompts lose the paper's content).
    methodology_text = extract_methodology(paper_text) or ""
    if not methodology_text.strip():
        print("⚠️ Warning: no methodology text could be extracted; prompts will not include it.")
    result["methodology_text"] = methodology_text
    safe_write("/content/extracted_methodology.txt", methodology_text)
    print("✅ Methodology extracted (saved to /content/extracted_methodology.txt)")

    # 2) If dataset zip provided: extract + scan
    dataset_info = None
    if dataset_zip_path:
        print("📦 Extracting dataset zip...")
        dataset_root = extract_zip(dataset_zip_path, extract_to=extract_dir)
        print("🔎 Scanning dataset...")
        dataset_info = scan_dataset(dataset_root)
        result["initial_dataset_info"] = dataset_info
        safe_write("/content/initial_dataset_info.json", json.dumps(dataset_info, indent=2))
        print(f"Found {dataset_info.get('num_files',0)} files. Types: {dataset_info.get('types')}")
    else:
        print("ℹ️ No dataset zip provided. The pipeline will request Gemini to use dummy dataset if needed.")
        dataset_root = None

    # 3) Prepare and run preprocessing generation (if dataset provided)
    final_dataset_info = None
    if dataset_root:
        plan = plan_sampling(dataset_info)
        sampling_msg = ""
        if plan["use_sampling"]:
            sampling_msg = f"Dataset is large (num_files={dataset_info['num_files']}). Use EXACTLY 10% sampling (sample_count={plan['sample_count']}) for train+test."
        preprocess_prompt = f"""
You are an expert ML engineer producing Colab-ready Python code for dataset preprocessing.
Constraints:
- Output only Python code (NO MARKDOWN, NO EXPLANATION).
- The dataset root is: {dataset_root}
- The goal: produce a Python script that:
  1) Detects dataset structure (train/test folders, folder-per-class, csv listing, etc).
  2) If train/test folders don't exist, create an 80/20 split.
  3) Supports images, audio, and text/tabular CSV files.
  4) If dataset is large, {sampling_msg}
  5) Produces a JSON file at {PREPROCESS_OUTPUT_JSON} describing:
      - final num files per split,
      - paths used for train/test/val,
      - detected classes (if any),
      - summary of any transforms applied.
  6) Saves processed/organized files under /content/processed/ as needed.
  7) Keep runtime reasonable for Colab.

Here is a quick scan of the dataset (you must use this info to make decisions):
{json.dumps(dataset_info, indent=2)}

Also the research methodology to inform preprocessing decisions (short):
{methodology_text[:4000]}

Return ONLY a runnable Python script that performs preprocessing and writes the JSON at {PREPROCESS_OUTPUT_JSON}. Include any !pip install lines at the top if necessary.
"""
        # iterative generation + run
        print("🛠️ Generating preprocessing script via Gemini (iterative)...")
        code, run_log, ok = iterative_generate_and_run(preprocess_prompt, max_attempts=MAX_ATTEMPTS_PREPROCESS, workdir="/content")
        result["preprocessing_code"] = code
        result["preprocessing_run_log"] = run_log
        safe_write(PREPROCESSING_SCRIPT_PATH, code or "")
        if not ok:
            print("❌ Preprocessing generation failed after retries. Check logs.")
            # we still try to proceed using initial scan info
            final_dataset_info = dataset_info
            # Record what the model step will actually be given.
            result["final_dataset_info"] = final_dataset_info
        else:
            # if success, try load PREPROCESS_OUTPUT_JSON
            if os.path.exists(PREPROCESS_OUTPUT_JSON):
                try:
                    with open(PREPROCESS_OUTPUT_JSON, "r", encoding="utf-8") as f:
                        final_dataset_info = json.load(f)
                except (OSError, ValueError) as e:
                    # The generated script wrote something unreadable; do not
                    # abort the whole pipeline over it.
                    print(f"⚠️ Could not read {PREPROCESS_OUTPUT_JSON}: {e}. Falling back to re-scan.")
                    final_dataset_info = None
            if final_dataset_info is None:
                # fallback to re-scan
                final_dataset_info = scan_dataset(dataset_root)
            result["final_dataset_info"] = final_dataset_info
            safe_write("/content/final_dataset_info.json", json.dumps(final_dataset_info, indent=2))
            print("✅ Preprocessing finished. Final dataset info saved to /content/final_dataset_info.json")
    else:
        print("ℹ️ Skipping preprocessing - no dataset provided.")
        final_dataset_info = None

    # 4) Generate model+training code using final_dataset_info + methodology
    print("🏗️ Generating model & training script via Gemini (iterative)...")
    dataset_info_str = json.dumps(final_dataset_info, indent=2) if final_dataset_info else "None (use sklearn make_classification fallback)"
    model_prompt = f"""
You are an expert ML engineer writing Colab-ready Python code for model definition, training and evaluation.
Constraints:
- Output ONLY Python code (NO explanations).Dont include ```python``` too in the output.
- Training MUST run for EXACTLY 2 EPOCHS.
- Save a training metadata JSON to {TRAINING_METADATA_JSON} with: epochs_run, final_accuracy, dataset_summary.
- Save model artifact to /content/final_model.pt (or another file in /content).
- Use random seed for determinism where applicable.
- Use dataset paths & details from this dataset_info: {dataset_info_str}
- If dataset_info is None, use sklearn.make_classification fallback dataset and still train 2 epochs.
- Keep code runnable in Colab and include !pip installs if needed.

Methodology (for model architecture/hyperparams) summary:
{methodology_text[:6000]}

Return ONLY a runnable Python script that trains the model for exactly 2 epochs and writes {TRAINING_METADATA_JSON}.
"""
    model_code, model_run_log, ok_model = iterative_generate_and_run(model_prompt, max_attempts=MAX_ATTEMPTS_MODEL, workdir="/content")
    result["model_code"] = model_code
    result["model_run_log"] = model_run_log
    safe_write(MODEL_SCRIPT_PATH, model_code or "")

    # 5) Attempt to retrieve training metadata if produced
    if os.path.exists(TRAINING_METADATA_JSON):
        with open(TRAINING_METADATA_JSON, "r", encoding="utf-8") as f:
            try:
                training_meta = json.load(f)
                result["artifacts"]["training_metadata"] = training_meta
            except Exception:
                # Not valid JSON: keep the raw text so it can still be inspected.
                result["artifacts"]["training_metadata_raw"] = read_text(TRAINING_METADATA_JSON)
    if os.path.exists("/content/final_model.pt"):
        result["artifacts"]["model_path"] = "/content/final_model.pt"

    result["success"] = ok_model
    print("🔚 Pipeline finished. success =", result["success"])
    safe_write("/content/pipeline_result.json", json.dumps(result, indent=2))
    return result

# ---------- Example usage ----------
if __name__ == "__main__":
    # Edit these:
    pdf_path = "/content/12911_2025_Article_3027.pdf"         # <-- replace with your PDF path in Colab
    dataset_zip_path = "/content/dataset.zip"    # <-- replace with uploaded dataset.zip or set to None

    if not os.path.exists(pdf_path):
        # Without the PDF the pipeline cannot start; stop here instead of
        # failing later when the file is opened.
        print(f"⚠️ PDF not found at {pdf_path}. Please upload it to /content or change pdf_path.")
    else:
        # dataset_zip_path optional:
        if dataset_zip_path and not os.path.exists(dataset_zip_path):
            print(f"⚠️ Dataset zip not found at {dataset_zip_path}. Proceeding without dataset.")
            dataset_zip_path = None

        res = pipeline_from_pdf_and_zip(pdf_path, dataset_zip_path)
        print("\n--- Pipeline summary ---")
        print("Methodology length:", len(res.get("methodology_text") or ""))
        final_info = res.get("final_dataset_info")
        # The info may come from a generated JSON file, so it is not
        # guaranteed to be a dict.
        if final_info and isinstance(final_info, dict):
            print("Final dataset files:", final_info.get("num_files"))
        print("Preprocessing script saved to:", PREPROCESSING_SCRIPT_PATH)
        print("Model script saved to:", MODEL_SCRIPT_PATH)
        print("Result metadata saved to: /content/pipeline_result.json")


  *   **Optimizer:** The Adam optimization rule was selected for its ability to handle sparse gradients and its adaptive learning rate. This involves calculating moving averages of gradients ($m_t$) and squared gradients ($v_t$), followed by bias correction ($\hat{m}_t, \hat{v}_t$). The weights are updated using a learning rate ($\alpha$), the bias-corrected moving average of gradients, and the square root of the bias-corrected moving average of squared gradients (with a small constant $\epsilon$ for numerical stability). Hyperparameters $\beta_1$ and $\beta_2$ control the exponential decay rates for the moving averages.


📄 Extracting text from PDF...
🧠 Asking Gemini to extract Methodology...
⚠️ Could not extract text from Gemini response: Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 1.
⚠️ Could not extract text from Gemini response: Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 1.
⚠️ Could not extract text from Gemini response: Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 1.
⚠️ Could not extract text from Gemini response: Invalid operation: The `response.text` quick accessor requires the res