<a href="https://colab.research.google.com/github/SruthiM-10/Irrigation-Scheduling-Recommendations-And-Dataset/blob/main/Updated_crawler%2Bscoring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the dataset folders configured later in this
# notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive', force_remount= True)

Mounted at /content/drive


In [None]:
# Install the third-party packages used by this notebook.
# NOTE(review): versions are unpinned and the recorded output of this cell shows pip aborting
# with ResolutionImpossible (langchain vs. langchain_community) — pin a compatible set.
! pip install -q langchain_community langchain_openai faiss-cpu langchain langchain_core langgraph pinecone PyPDF2 doi2pdf

[31mERROR: Cannot install langchain==0.1.14 and langchain_community==0.0.17 because these package versions have conflicting dependencies.[0m[31m
[0m[31mERROR: ResolutionImpossible: for help visit https://pip.pypa.io/en/latest/topics/dependency-resolution/#dealing-with-dependency-conflicts[0m[31m
[0m

In [None]:
"""
Paper Crawler

This script downloads research papers based on DOIs and saves them to a specified directory.
It starts with a seed paper and follows references to download more papers.
"""

import os
import time
import uuid
import json
import requests
import pandas as pd
from doi2pdf import doi2pdf
import PyPDF2
from langchain_community.document_loaders import WebBaseLoader
from typing import Dict, List, Any, Tuple, Optional, Union
from langchain_openai import ChatOpenAI
from langchain_core.messages import SystemMessage
import pickle
from langchain_core.pydantic_v1 import Field, conlist, BaseModel
from langchain_core.output_parsers import PydanticOutputParser

# Set environment variables for LangChain, OpenAI and SERPStack.
#
# SECURITY: API keys must never be committed inside a notebook. The keys that used to be
# hard-coded in this cell are public and should be revoked. Secrets are now taken from the
# process environment or, on Colab, from the "Secrets" panel (google.colab.userdata).
import os

# Non-secret LangChain settings.
os.environ["LANGCHAIN_TRACING_V2"] = "false"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = "irrigation-crawler"


def load_secret(name: str) -> bool:
    """
    Make sure the secret `name` is available in os.environ.

    Lookup order: an already-set environment variable, then a Colab secret of the
    same name. Nothing is ever written to the notebook itself.

    Args:
        name: Environment variable / Colab secret name (e.g. "OPENAI_API_KEY")

    Returns:
        True when the variable is set after the call, False (with a warning) otherwise
    """
    if os.environ.get(name):
        return True
    try:
        # Only importable on Colab; a missing secret or denied access raises as well.
        from google.colab import userdata
        value = userdata.get(name)
    except Exception:
        value = None
    if value:
        os.environ[name] = value
        return True
    print(f"WARNING: {name} is not set; add it as an environment variable or a Colab secret.")
    return False


# LANGCHAIN_API_KEY is only needed when tracing is enabled; SERPSTACK_API_KEY enables the
# web-search fallback for locating PDFs.
for _secret_name in ("LANGCHAIN_API_KEY", "OPENAI_API_KEY", "SERPSTACK_API_KEY"):
    load_secret(_secret_name)

In [None]:
# Save point for intermediate papers (downloaded PDFs).
DOWNLOAD_DIR = "/content/drive/MyDrive/1:1_Sruthi_Manoj/Dataset/Papers (current)"

# Save point for each csv_file per seed paper.
OUTPUT_DIR = "/content/drive/MyDrive/1:1_Sruthi_Manoj/Dataset/csv_files (current)"

# Create BOTH folders up front. Previously only OUTPUT_DIR was created, yet the PDF
# downloader opens its destination file inside DOWNLOAD_DIR without creating the parent
# directory, so every download failed on a fresh Drive.
for _directory in (DOWNLOAD_DIR, OUTPUT_DIR):
    os.makedirs(_directory, exist_ok=True)

# LLM Configs

In [None]:
# Structured record for one bibliography entry. Every field is a plain string; per the
# field descriptions, a missing DOI or DOI URL is represented by the literal 'NA'.
# NOTE(review): the class docstring and Field descriptions presumably end up in the schema
# text shown to the LLM — treat them as prompt text and confirm before rewording.
class ReferenceItem(BaseModel):
    """A reference from a paper."""
    title: str = Field(description="Title of the referenced paper")
    authors: str = Field(description="Authors of the referenced paper")
    year: str = Field(description="Publication year")
    doi: str = Field(description="DOI if available, otherwise 'NA'")
    doi_url: str = Field(description="DOI URL if available, otherwise 'NA'")
    venue: str = Field(description="Publication venue (journal or conference)")

# Top-level container the reference-extraction output is parsed into.
# reference_set is a list of ReferenceItem; min_items=0 means an empty list is valid.
class ReferenceLLM(BaseModel):
    """List of references extracted from a paper."""
    reference_set: conlist(ReferenceItem, min_items=0) = Field(
        description="List of all references extracted from the paper"
    )

# One relevance judgement for a paper. Both fields have defaults, so either may be omitted.
# NOTE(review): the default score 0 lies outside the 1-10 scale named in the description;
# presumably 0 means "not scored" — confirm with the downstream consumer.
# (Deliberately documented with comments rather than a docstring: a class docstring may be
# picked up as the schema description.)
class RelevanceItem(BaseModel):
    relevance_score: int = Field(default=0, description="Score from 1-10 on relevance to irrigation scheduling")
    score_justification: str = Field(default="NA")

# Top-level container for relevance scoring: a list of RelevanceItem under `feature_set`.
class RelevanceLLM(BaseModel):
    """Model for LLM feature extraction response"""
    feature_set: List[RelevanceItem] = Field(description="Scoring of relevance to irrigation scheduling") # different description for compiled_dataset (on accident)

# Prompt template for relevance scoring. `{text}` is the only placeholder; presumably it is
# filled with the paper text via str.format, like the reference-extraction prompt — confirm
# at the call site.
# NOTE(review): the opening line and the RULES about "NA" / "data fields" read like leftovers
# from a data-extraction prompt; this prompt only asks for a score and a justification.
RELEVANCE_SCORING_INSTRUCTIONS = """
You are an assistant extracting irrigation experiment data from a research paper.

Score the paper on a scale from 1-10 (where 10 is highest) on its relevance to "Predicting Crop Quality Using Irrigation Scheduling Methods".

For the relevance_score, specifically consider:
- How central irrigation scheduling methods are to the paper
- Whether the paper includes results on: irrigation_method, number_of_irrigation_events, amount_irrigated, ET_levels, daily_drainage, fruit_quality, vitamin_levels, plant_yield, avg_fruits_per_plant, avg_fruit_weight
- The comprehensiveness of the reported results
- The paper's focus on crop quality prediction using irrigation scheduling

Provide a brief justification for your score in the score_justification field.

RULES:
- For papers without irrigation focus, provide a fair relevance score based on any related content
- Use "NA" for values not reported
- Never write full sentences in data fields

{text}
"""

# Prompt template for reference extraction. `{text}` is the only placeholder and is filled
# with the paper text via str.format, so any literal braces added here must be doubled.
# NOTE(review): the template does not describe an output format itself, although the
# reference extractor parses the reply into ReferenceLLM — confirm the parser's format
# instructions reach the model.
REFERENCE_EXTRACTION_INSTRUCTIONS = """
You are a research assistant extracting reference information from a research paper.

Extract all citations and references from the following research paper text.
Focus on identifying DOIs, titles, authors, publication year, and publication venue.
If a DOI is available, prioritize that as it's the most reliable identifier.
For each reference, include as much of the following as possible:
- Title of the paper
- Authors (as a list if possible)
- Publication year
- DOI (in format like 10.xxxx/xxxxx or as URL https://doi.org/10.xxxx/xxxxx)
- Publication venue (journal or conference)

Format each reference as a structured object.
Be accurate and comprehensive.

Paper text:
{text}
"""

# Helper Functions

In [None]:
def setup_llm() -> Optional[ChatOpenAI]:
    """
    Build the chat model used by the crawler.

    Returns:
        A ChatOpenAI client (gpt-4o-mini, temperature 0), or None when the API key is
        missing/empty or the client cannot be constructed
    """
    # .get() instead of indexing: an unset variable must reach the friendly error below
    # rather than raise KeyError.
    if not os.environ.get("OPENAI_API_KEY"):
        print("ERROR: OPENAI_API_KEY is not set. Please set your OpenAI API key.")
        return None

    try:
        # Set up the OpenAI model
        return ChatOpenAI(model="gpt-4o-mini", temperature=0)
    except Exception as e:
        print(f"Error setting up LLM: {e}")
        return None

def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Read every page of a PDF and return the text as one newline-free string.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        Concatenated page text with surrounding whitespace stripped and newlines
        replaced by spaces, or "" when the file cannot be opened or parsed
    """
    print(f"\nAttempting to extract text from PDF file: {pdf_path}")
    print(f"File exists: {os.path.exists(pdf_path)}")
    size_label = os.path.getsize(pdf_path) if os.path.exists(pdf_path) else 'N/A'
    print(f"File size: {size_label} bytes")

    try:
        collected = ""
        with open(pdf_path, 'rb') as handle:
            print(f"Successfully opened PDF file")
            document = PyPDF2.PdfReader(handle)
            page_total = len(document.pages)
            print(f"PDF has {page_total} pages")

            # Walk the pages in order, appending each page's text as it is extracted.
            for index, page in enumerate(document.pages):
                extracted = page.extract_text()
                collected += extracted
                if index == 0:
                    print(f"First page extract length: {len(extracted)} characters")

        flattened = collected.strip().replace("\n", " ")
        print(f"Total extracted text length: {len(flattened)} characters")
        return flattened
    except Exception as e:
        # Best effort: report the failure with a traceback and hand back an empty string.
        print(f"Error extracting text from {pdf_path}: {e}")
        import traceback
        traceback.print_exc()
        return ""

def extract_text_from_file(file_path: str) -> str:
    """
    Return the text content of a PDF or a UTF-8 text file.

    Args:
        file_path: Path to the file; a case-insensitive ".pdf" suffix selects the PDF reader

    Returns:
        Extracted text (stripped), or "" when a text file cannot be read
    """
    looks_like_pdf = file_path.lower().endswith('.pdf')
    if looks_like_pdf:
        return extract_text_from_pdf(file_path)

    # Anything else is treated as plain UTF-8 text.
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            contents = handle.read()
        return contents.strip()
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
        return ""

def get_final_url(initial_url: str) -> Optional[str]:
    """
    Resolve a URL to the address it ends up at after redirects.

    Args:
        initial_url: URL to request

    Returns:
        The response's final URL, or None when the request fails
    """
    try:
        # response.url holds the final URL after all redirects.
        return requests.get(initial_url, timeout=30).url
    except requests.RequestException as e:
        print(f"Request error following URL {initial_url}: {e}")
        return None
    except Exception as e:
        print(f"Unexpected error following URL {initial_url}: {e}")
        return None

def webpage_base_loader(web_page: str) -> str:
    """
    Fetch a web page through WebBaseLoader and return its text on a single line.

    Args:
        web_page: URL of the web page to load

    Returns:
        Page text joined across loaded documents, with newlines, carriage returns and
        double spaces each replaced by a single space (one pass), or "" on any error
    """
    try:
        documents = WebBaseLoader(web_page).load()
        flattened = " ".join(doc.page_content for doc in documents).strip()
        # Same replacement order as before: "\n", then "\r", then one pass over "  ".
        for needle in ("\n", "\r", "  "):
            flattened = flattened.replace(needle, " ")
        return flattened
    except Exception as e:
        print(f"Error loading web page {web_page}: {e}")
        return ""

# PDF Download Helpers

In [None]:
from urllib.parse import urljoin, quote_plus
from bs4 import BeautifulSoup
import os, requests
from typing import Dict, Any, Optional, List
from urllib.parse import urlparse, urljoin



def download_pdf_via_publisher(
    info: dict,
    doi: str,
    dest_path: str,
    *,
    timeout: int = 30,
    verbose: bool = True,
) -> bool:
    """
    Robust PDF downloader using publisher endpoints.

    Strategies, tried in order until one saves a PDF:
    1. `info['pdf_url']` when `info['available']` is truthy (joined to the landing URL
       if relative).
    2. SAGE patterns (/doi/pdf, /doi/pdfplus) when the landing host is journals.sagepub.com.
    3. PDF URLs found on the landing page (citation_pdf_url meta, <link type=application/pdf>,
       PDF-looking anchors).
    4. DOI content negotiation (Accept: application/pdf).

    A response counts as a PDF when its Content-Type mentions application/pdf or its body
    starts with the "%PDF-" magic bytes.

    Args:
        info: Availability dict (keys read here: "available", "pdf_url")
        doi: Bare DOI or a full http(s) DOI URL
        dest_path: File path the PDF is written to (parent directory is created)
        timeout: Per-request timeout in seconds
        verbose: Print progress lines when True

    Returns:
        True when a PDF was written to dest_path, False otherwise (never raises for
        network/parse errors)
    """
    # Localized imports to avoid shadowing issues
    import os, requests
    from urllib.parse import urlparse, urljoin
    from bs4 import BeautifulSoup

    # Minimal headers (don't rely on a global DEFAULT_HEADERS)
    BASE_HEADERS = {
        "User-Agent": "Mozilla/5.0",
        "Accept-Language": "en-US,en;q=0.9",
    }

    def log(*a):
        if verbose:
            print(*a)

    def _peek(resp):
        """Return (content_type, first_chunk, body_iterator) for a streamed response."""
        ctype = (resp.headers.get("Content-Type") or "").lower()
        # One iterator for both sniffing and saving, so no byte is read twice or lost.
        body = resp.iter_content(65536)
        first = next(body, b"") if resp.ok else b""
        return ctype, first, body

    def _is_pdf(resp, ctype, first) -> bool:
        """Content-Type or magic-bytes check."""
        return bool(resp.ok and (("application/pdf" in ctype) or first.startswith(b"%PDF-")))

    def _write_pdf(first, remainder) -> None:
        """Write the already-read first chunk plus the rest of the stream to dest_path."""
        parent = os.path.dirname(dest_path)
        if parent:  # dirname is "" for a bare filename; makedirs("") would raise
            os.makedirs(parent, exist_ok=True)
        try:
            with open(dest_path, "wb") as f:
                if first:
                    f.write(first)
                for chunk in remainder:
                    if chunk:
                        f.write(chunk)
        except Exception:
            # Do not leave a truncated file behind for callers that test os.path.exists.
            try:
                os.remove(dest_path)
            except OSError:
                pass
            raise

    def _try_get_and_save(sess, url: str, referer: str) -> bool:
        """GET `url`; when the response is a PDF, save it to dest_path and return True."""
        hdrs = {**BASE_HEADERS,
                "Accept": "application/pdf,application/octet-stream;q=0.9,*/*;q=0.8",
                "Referer": referer}
        r = None
        try:
            r = sess.get(url, headers=hdrs, allow_redirects=True, timeout=timeout, stream=True)
            ctype, first, body = _peek(r)
            log(f"[GET] {r.status_code} {url} | ctype={ctype} | first5={first[:5]!r}")
            if _is_pdf(r, ctype, first):
                _write_pdf(first, body)
                return True
            return False
        except Exception as e:
            log(f"[GET] ERROR {url}: {e}")
            return False
        finally:
            # Streamed responses hold their connection until closed.
            if r is not None:
                r.close()

    # Normalize DOI to a URL
    doi = doi.strip()
    doi_url = doi if doi.startswith(("http://", "https://")) else ("https://doi.org/" + doi)

    with requests.Session() as s:
        # Warm session on landing page (cookies/consent)
        try:
            r0 = s.get(doi_url, headers=BASE_HEADERS, allow_redirects=True, timeout=timeout)
            landing_url = r0.url
            host = (urlparse(landing_url).hostname or "").lower()
            log(f"[landing] {r0.status_code} {landing_url}")
        except Exception as e:
            landing_url = doi_url
            host = (urlparse(landing_url).hostname or "").lower()
            log(f"[landing] ERROR {doi_url}: {e}")

        # 1) Fast-path: info['pdf_url'] (join if relative)
        if isinstance(info, dict) and info.get("available") and info.get("pdf_url"):
            candidate = str(info["pdf_url"]).strip()
            if not candidate.startswith(("http://", "https://")):
                candidate = urljoin(landing_url, candidate)
            log(f"[info] trying info.pdf_url → {candidate}")
            if _try_get_and_save(s, candidate, referer=landing_url):
                return True
            log("[info] pdf_url failed, falling back...")

        # 2) SAGE patterns (/doi/pdf and /doi/pdfplus)
        if "journals.sagepub.com" in host:
            # SAGE URLs embed the FULL DOI ("10.1177/…"). Taking only the text after the
            # last slash (the old behaviour) produced URLs that cannot resolve.
            marker = doi_url.find("/10.")
            doi_id = doi_url[marker + 1:] if marker != -1 else doi_url.rsplit("/", 1)[-1]
            base = "https://journals.sagepub.com"
            candidates = [
                f"{base}/doi/pdf/{doi_id}?download=true",
                f"{base}/doi/pdf/{doi_id}",
                f"{base}/doi/pdfplus/{doi_id}?download=true",
                f"{base}/doi/pdfplus/{doi_id}",
            ]
            if "/doi/" in landing_url:
                pdf_try = landing_url.replace("/doi/", "/doi/pdf/")
                candidates.extend([pdf_try + "?download=true", pdf_try])
            # dict.fromkeys: drop duplicate URLs while keeping the priority order.
            for u in dict.fromkeys(candidates):
                log(f"[sage] try {u}")
                if _try_get_and_save(s, u, referer=landing_url):
                    return True
            log("[sage] patterns failed, trying parse...")

        # 3) Parse landing page for meta/link/anchor
        try:
            r = s.get(landing_url, headers={**BASE_HEADERS, "Referer": landing_url},
                      allow_redirects=True, timeout=timeout)
            if r.status_code == 200:
                soup = BeautifulSoup(r.text, "html.parser")
                # meta
                for tag in soup.find_all("meta"):
                    if (tag.get("name", "") or "").lower() == "citation_pdf_url":
                        u = tag.get("content")
                        if u:
                            u = urljoin(r.url, u)
                            log(f"[parse] meta citation_pdf_url → {u}")
                            if _try_get_and_save(s, u, referer=landing_url):
                                return True
                # link
                for link in soup.find_all("link", attrs={"type": "application/pdf"}):
                    u = link.get("href")
                    if u:
                        u = urljoin(r.url, u)
                        log(f"[parse] link[type=application/pdf] → {u}")
                        if _try_get_and_save(s, u, referer=landing_url):
                            return True
                # anchors
                for a in soup.find_all("a", href=True):
                    href = a["href"].lower()
                    if href.endswith(".pdf") or "/pdf" in href or "download" in href:
                        u = urljoin(r.url, a["href"])
                        log(f"[parse] anchor → {u}")
                        if _try_get_and_save(s, u, referer=landing_url):
                            return True
        except Exception as e:
            log(f"[parse] ERROR: {e}")

        # 4) Content negotiation last
        r = None
        try:
            hdrs = {**BASE_HEADERS, "Accept": "application/pdf", "Referer": landing_url}
            r = s.get(doi_url, headers=hdrs, allow_redirects=True, timeout=timeout, stream=True)
            ctype, first, body = _peek(r)
            log(f"[cn] {r.status_code} {r.url} | ctype={ctype} | first5={first[:5]!r}")
            if _is_pdf(r, ctype, first):
                # The sniffed bytes belong to the file. The previous code streamed only the
                # remainder, saving a PDF without its "%PDF-" header.
                _write_pdf(first, body)
                return True
        except Exception as e:
            log(f"[cn] ERROR: {e}")
        finally:
            if r is not None:
                r.close()

        return False


# ---- Simple "is this a full paper?" heuristics ----
MIN_PDF_BYTES = 250_000   # 250 KB
MIN_PAGES     = 4

def is_likely_full_paper(pdf_path: str,
                         min_bytes: int = MIN_PDF_BYTES,
                         min_pages: int = MIN_PAGES) -> bool:
    """
    Cheap plausibility check based on file size and page count.

    Args:
        pdf_path: Path to the downloaded file (must exist)
        min_bytes: Files at least this large are accepted without being opened
        min_pages: Minimum page count required of smaller files

    Returns:
        True for a large-enough file, or a small file with at least `min_pages` pages;
        False for a small file that is shorter or cannot be parsed
    """
    import os
    byte_count = os.path.getsize(pdf_path)
    print("Size of the paper", byte_count)
    print("pdf path", pdf_path)

    if byte_count < min_bytes:
        # Small file: only the page count can vouch for it; unreadable means reject.
        try:
            import PyPDF2
            with open(pdf_path, "rb") as handle:
                page_count = len(PyPDF2.PdfReader(handle).pages)
                return page_count >= min_pages
        except Exception:
            return False

    # Big enough, assume fine.
    return True


def _normalize_doi(doi: str) -> str:
    """
    Turn a DOI in any common spelling into a resolvable URL.

    Accepts a full http(s) URL (returned unchanged apart from whitespace trimming), a
    scheme-less "doi.org/…" / "dx.doi.org/…" address, a "doi:"-labelled DOI in any letter
    case, or a bare DOI.

    Args:
        doi: DOI string as found in a reference

    Returns:
        An https://doi.org/… URL, or the input URL when one was given
    """
    doi = doi.strip()
    lowered = doi.lower()
    if lowered.startswith(("http://", "https://")):
        return doi
    if lowered.startswith(("doi.org/", "dx.doi.org/")):
        return "https://" + doi
    # Remove a literal "doi:" label. str.lstrip("doi:") is NOT a prefix removal (it strips
    # any run of the characters d/o/i/:) and it missed the upper-case "DOI:" form.
    if lowered.startswith("doi:"):
        doi = doi[len("doi:"):].strip()
    # bare DOI -> https resolution
    return "https://doi.org/" + doi

def _is_pdf_response(resp: requests.Response) -> bool:
    """Return True when the response's Content-Type header mentions application/pdf (case-insensitive)."""
    content_type = resp.headers.get("Content-Type", "")
    return "application/pdf" in content_type.lower()

def _looks_like_pdf_url(url: str) -> bool:
    """Heuristic: the URL ends in ".pdf" or contains "/pdf" or "download" (case-insensitive)."""
    lowered = url.lower()
    if lowered.endswith(".pdf"):
        return True
    return any(hint in lowered for hint in ("/pdf", "download"))

def _validate_pdf_url(url: str, timeout: int = 20) -> tuple[bool, str, int]:
    """
    Check whether `url` serves a PDF.

    Uses HEAD first; falls back to GET(stream) + magic-bytes check. The HEAD probe is only
    a shortcut: when it errors, or does not clearly identify a PDF larger than 1 KB, the
    GET fallback still runs.

    Args:
        url: Candidate URL
        timeout: Per-request timeout in seconds

    Returns:
        (is_pdf, final_url, status_code); (False, url, -1) when the check itself fails
    """
    try:
        BASE_HEADERS = {
          "User-Agent": "Mozilla/5.0",
          "Accept-Language": "en-US,en;q=0.9",
        }
        h = {**BASE_HEADERS, "Accept": "application/pdf"}

        # Some servers don't support HEAD properly; allow redirects, and treat any HEAD
        # failure (including an unparsable Content-Length) as "inconclusive" rather than
        # as "not a PDF" — previously such an error skipped the GET fallback entirely.
        try:
            r = requests.head(url, headers=h, allow_redirects=True, timeout=timeout)
            if r.status_code == 200 and _is_pdf_response(r):
                # Guard against tiny HTML responses mislabeled as pdf
                clen = int(r.headers.get("Content-Length", "0") or 0)
                if clen > 1024:  # >1KB, heuristic
                    return True, r.url, r.status_code
        except Exception:
            pass  # deliberate: fall through to the GET probe below

        # Fallback to GET with stream, read a small chunk. The context manager releases
        # the connection, which a streamed response otherwise keeps open.
        with requests.get(url, headers=h, allow_redirects=True, timeout=timeout, stream=True) as r:
            if r.status_code in (200, 206):
                # Check content-type or sniff PDF header
                if _is_pdf_response(r):
                    return True, r.url, r.status_code
                first = next(r.iter_content(chunk_size=5), b"")
                if first.startswith(b"%PDF-"):
                    return True, r.url, r.status_code
            return False, r.url, r.status_code
    except Exception:
        return False, url, -1

def _extract_pdf_from_landing(landing_url: str, timeout: int = 20) -> str | None:
    try:
        BASE_HEADERS = {
          "User-Agent": "Mozilla/5.0",
          "Accept-Language": "en-US,en;q=0.9",
        }
        r = requests.get(landing_url, headers=BASE_HEADERS, allow_redirects=True, timeout=timeout)
        if r.status_code != 200:
            return None
        html = r.text
        soup = BeautifulSoup(html, "html.parser")

        # 1) High-signal meta tags used by many publishers
        for tag in soup.find_all("meta"):
            name = (tag.get("name") or "").lower()
            if name == "citation_pdf_url":
                pdf_url = tag.get("content")
                if pdf_url:
                    ok, final, _ = _validate_pdf_url(urljoin(r.url, pdf_url))
                    if ok:
                        return final

        # 2) <link type="application/pdf" ...>
        for link in soup.find_all("link", attrs={"type": "application/pdf"}):
            pdf_url = link.get("href")
            if pdf_url:
                ok, final, _ = _validate_pdf_url(urljoin(r.url, pdf_url))
                if ok:
                    return final

        # 3) Anchor tags that look like PDFs
        candidates = []
        for a in soup.find_all("a", href=True):
            href = a["href"]
            if _looks_like_pdf_url(href):
                candidates.append(urljoin(r.url, href))

        # Deduplicate & validate
        seen = set()
        for c in candidates:
            if c in seen:
                continue
            seen.add(c)
            ok, final, _ = _validate_pdf_url(c)
            if ok:
                return final
        return None
    except Exception:
        return None

def _unpaywall_best_pdf(doi_url: str, email: str, timeout: int = 15) -> str | None:
    """
    Optional: query Unpaywall for OA locations.
    You must supply a contact email per their terms.
    """
    if not email:
        return None
    try:
        doi_suffix = doi_url.rsplit("/", 1)[-1]
        api = f"https://api.unpaywall.org/v2/{doi_suffix}"
        BASE_HEADERS = {
          "User-Agent": "Mozilla/5.0",
          "Accept-Language": "en-US,en;q=0.9",
        }
        r = requests.get(api, params={"email": email}, headers=BASE_HEADERS, timeout=timeout)
        if r.status_code != 200:
            return None
        j = r.json()
        # Prefer direct PDF if available
        loc = (j.get("best_oa_location") or {}) or {}
        pdf_url = loc.get("url_for_pdf") or loc.get("url")
        if pdf_url:
            ok, final, _ = _validate_pdf_url(pdf_url)
            if ok:
                return final
        # Fallback: try all oa_locations
        for loc in (j.get("oa_locations") or []):
            for key in ("url_for_pdf", "url"):
                u = loc.get(key)
                if not u:
                    continue
                ok, final, _ = _validate_pdf_url(u)
                if ok:
                    return final
        return None
    except Exception:
        return None

def _serpstack_find_pdf(query: str, serp_api_key: str, timeout: int = 20) -> str | None:
    try:
        url = "https://api.serpstack.com/search"
        params = {
            "access_key": serp_api_key,
            "query": query,
            "num": 10,           # grab a few more
        }
        BASE_HEADERS = {
          "User-Agent": "Mozilla/5.0",
          "Accept-Language": "en-US,en;q=0.9",
        }
        r = requests.get(url, params=params, headers=BASE_HEADERS, timeout=timeout)
        if r.status_code != 200:
            return None
        j = r.json()
        results = j.get("organic_results") or []
        # Prefer direct .pdf URLs first
        candidates = []
        for res in results:
            u = (res.get("url") or "").strip()
            if not u:
                continue
            # Sometimes serpstack returns tracking redirects; we still validate
            if _looks_like_pdf_url(u):
                candidates.append(u)
        # Then consider any result URL and validate it (some link to /pdf behind)
        for res in results:
            u = (res.get("url") or "").strip()
            if u and u not in candidates:
                candidates.append(u)
        # Validate candidates
        seen = set()
        for u in candidates:
            if u in seen:
                continue
            seen.add(u)
            ok, final, _ = _validate_pdf_url(u)
            if ok:
                return final
        return None
    except Exception:
        return None

def _crossref_title_authors_year(doi_url: str, timeout: int = 12) -> tuple[str|None, str|None, str|None]:
    """
    Best-effort metadata for better SERP queries.
    """
    try:
        suffix = doi_url.rsplit("/", 1)[-1]
        api = f"https://api.crossref.org/works/{suffix}"
        BASE_HEADERS = {
          "User-Agent": "Mozilla/5.0",
          "Accept-Language": "en-US,en;q=0.9",
        }
        r = requests.get(api, headers=BASE_HEADERS, timeout=timeout)
        if r.status_code != 200:
            return None, None, None
        j = r.json().get("message", {})
        title = (j.get("title") or [None])[0]
        year = None
        for key in ("published-print", "published-online", "issued"):
            if j.get(key) and j[key].get("date-parts"):
                year = str(j[key]["date-parts"][0][0])
                break
        authors = j.get("author") or []
        first_author_last = None
        if authors:
            last = authors[0].get("family") or authors[0].get("given") or None
            if last:
                first_author_last = last
        return title, first_author_last, year
    except Exception:
        return None, None, None

def find_open_pdf_for_doi(
    doi: str,
    serp_api_key: str | None = None,
    unpaywall_email: str | None = None,
    max_serp_queries: int = 1,
) -> dict:
    """
    Try multiple strategies to find an open (non-paywalled) PDF for a DOI.

    Strategies, in order: DOI content negotiation, landing-page parsing, Unpaywall (only
    when `unpaywall_email` is given), SERPStack search (only when a key is given or
    SERPSTACK_API_KEY is set in the environment).

    Args:
        doi: Bare DOI, "doi:"-prefixed DOI, or DOI URL
        serp_api_key: SERPStack key; falls back to the SERPSTACK_API_KEY env variable
        unpaywall_email: Contact email for Unpaywall; None/"" skips that strategy
        max_serp_queries: How many times the SERPStack query is attempted

    Returns a dict:
    {
      "available": bool,
      "pdf_url": str | None,
      "source": "content-negotiation" | "landing-meta" | "unpaywall" | "serpstack" | "none",
      "reason": str
    }
    """
    doi_url = _normalize_doi(doi)

    # 1) DOI content negotiation: ask for PDF directly
    try:
        BASE_HEADERS = {
          "User-Agent": "Mozilla/5.0",
          "Accept-Language": "en-US,en;q=0.9",
        }
        h = {**BASE_HEADERS, "Accept": "application/pdf"}
        # Only headers and the first bytes are inspected, so the body is streamed; the
        # context manager closes the response, which otherwise keeps its connection open.
        with requests.get(doi_url, headers=h, allow_redirects=True, timeout=20, stream=True) as r:
            if r.status_code in (200, 206) and (_is_pdf_response(r) or next(r.iter_content(5), b"").startswith(b"%PDF-")):
                return {"available": True, "pdf_url": r.url, "source": "content-negotiation", "reason": "doi->pdf via accept header"}
    except Exception as e:
        # Best effort: report the failure and move on to the next strategy.
        print(f"Content negotiation failed for {doi_url}: {e}")

    # 2) Parse landing page for explicit PDF links or meta
    pdf = _extract_pdf_from_landing(doi_url)
    if pdf:
        return {"available": True, "pdf_url": pdf, "source": "landing-meta", "reason": "found citation_pdf_url/link/pdf anchor"}

    # 3) Unpaywall OA (optional)
    if unpaywall_email:
        pdf = _unpaywall_best_pdf(doi_url, unpaywall_email)
        if pdf:
            return {"available": True, "pdf_url": pdf, "source": "unpaywall", "reason": "best_oa_location"}

    # 4) SERPStack fallback (filetype:pdf)
    key = serp_api_key or os.getenv("SERPSTACK_API_KEY")
    if key:
        # Build a better query using Crossref if possible; fall back to the raw DOI.
        title, author, year = _crossref_title_authors_year(doi_url)
        base = f'"{title}"' if title else doi
        parts = [base, "filetype:pdf"]
        if author:
            parts.append(author)
        if year:
            parts.append(year)
        query = " ".join(parts)
        # The same query is re-issued on each pass, i.e. this loop acts as a retry count.
        for _ in range(max_serp_queries):
            pdf = _serpstack_find_pdf(query, key)
            if pdf:
                return {"available": True, "pdf_url": pdf, "source": "serpstack", "reason": f"query={query}"}

    return {"available": False, "pdf_url": None, "source": "none", "reason": "no open PDF found"}


# Main PDF Download

In [None]:
def check_doi_availability(doi: str) -> dict:
    """
    Look up whether an open PDF can be found for a DOI.

    The function returns the full availability record, not a boolean (the previous
    `-> bool` annotation was wrong); callers read the "available" and "pdf_url" keys.

    Args:
        doi: DOI of the paper

    Returns:
        The dict produced by find_open_pdf_for_doi
        (keys: "available", "pdf_url", "source", "reason")
    """
    # NOTE(review): the Unpaywall contact email is hard-coded; consider moving it to config.
    info = find_open_pdf_for_doi(doi, unpaywall_email= "n3543460@gmail.com", serp_api_key= os.getenv("SERPSTACK_API_KEY"))
    print(info)
    return info


def search_research_paper_using_doi(doi: str, download_dir: str = DOWNLOAD_DIR) -> Optional[str]:
    """
    Download a research paper using its DOI.

    The paper is saved under a random UUID file name, then checked: it must pass the
    size/page-count heuristic, must not look like an error or paywall page, and must parse
    as a PDF. Rejected files are deleted.

    Args:
        doi: DOI of the paper to download
        download_dir: Directory to save the downloaded PDF (created when missing)

    Returns:
        Path to the downloaded PDF or None if download failed
    """

    def _rejection_reason(path: str) -> Optional[str]:
        """Inspect the saved file; return the log message for rejecting it, or None to keep it."""
        # Very small files are suspicious: look for error/paywall wording in the first bytes.
        if os.path.getsize(path) < 10000:  # Less than 10KB is suspicious
            with open(path, 'rb') as f:
                head = f.read(1000).lower()  # Read first 1000 bytes
            error_indicators = [b'error', b'not found', b'paywall', b'subscribe', b'purchase', b'access denied']
            if any(indicator in head for indicator in error_indicators):
                return f"Downloaded file appears to be an error page or paywall notification. Deleting {path}"

        # Validate that it's a proper PDF file
        try:
            with open(path, 'rb') as f:
                try:
                    pdf_reader = PyPDF2.PdfReader(f)

                    if len(pdf_reader.pages) == 0:
                        return f"PDF file has no pages. Deleting {path}"

                    # No text on page 1 may just mean a scanned PDF, which is fine; it is
                    # only rejected when the file is also encrypted/restricted.
                    if not pdf_reader.pages[0].extract_text() and pdf_reader.is_encrypted:
                        return f"PDF is encrypted/restricted. Deleting {path}"

                    paywall_text_patterns = ['sign in to access', 'subscribe to access', 'purchase this article',
                                             'access to this content', 'login to view', 'create an account', 'subscription required']

                    # Check first 3 pages or as many as available
                    for i in range(min(3, len(pdf_reader.pages))):
                        page_text = (pdf_reader.pages[i].extract_text() or "").lower()
                        if any(pattern in page_text for pattern in paywall_text_patterns):
                            return f"PDF appears to be a paywall notification. Deleting {path}"
                except PyPDF2.errors.PdfReadError as e:
                    return f"Invalid PDF file: {e}. Deleting {path}"
        except Exception as e:
            print(f"Error validating PDF: {e}")
            # If we can't validate, assume it's valid rather than losing potentially good files
        return None

    try:
        info = check_doi_availability(doi)
        # First check if the paper is likely available
        if not bool(info.get("available", False)):
            print(f"Paper with DOI {doi} appears to be behind a paywall or not freely accessible. Skipping download.")
            return None

        # The downloader writes straight into this folder, so it has to exist.
        os.makedirs(download_dir, exist_ok=True)

        # Random (UUID) file name — the DOI itself is not part of the name.
        file_path = os.path.join(download_dir, str(uuid.uuid4()) + ".pdf")

        print(f"Downloading paper with DOI: {doi}")
        print("info contains: ", json.dumps(info))
        # Download the paper and save it to the specified directory
        if not download_pdf_via_publisher(info, doi, file_path):
            print(f"Direct publisher download failed for {doi}")
            return None

        if not os.path.exists(file_path):
            print(f"Download claimed to succeed but file does not exist: {file_path}")
            return None

        if not is_likely_full_paper(file_path):
            print(f"{os.path.basename(file_path)} is too small/too few pages; deleting.")
            os.remove(file_path)
            return None
        print(f"Successfully downloaded {os.path.basename(file_path)} since the size is okay")

        # Verify the downloaded PDF is valid and not just an error page or paywall notification.
        # The file is deleted here, after every handle on it is closed (the old code removed
        # it while it was still open, which fails on platforms that lock open files).
        reason = _rejection_reason(file_path)
        if reason is not None:
            print(reason)
            os.remove(file_path)
            return None

        print(f"Successfully downloaded to {file_path}")
        return file_path
    except Exception as e:
        print(f"Failed to download paper with DOI {doi}: {e}")
        return None

# Main Crawler Functions

In [None]:

def extract_references_from_pdf(text: str) -> List[Dict[str, Any]]:
    """
    Extract bibliography entries from the text of a paper using an LLM.

    Despite the function name, the argument is the paper's already-extracted
    text, not a path to a PDF file.

    Args:
        text: Full text of the paper. Fewer than 1000 characters is treated as
            a failed extraction. Text longer than 60000 characters is cut down
            to its last 60000 characters, because the reference list normally
            sits at the end of a paper and has to survive the truncation.

    Returns:
        List of dictionaries with the keys "title", "authors", "year", "doi",
        "doi_url" and "venue", restricted to references that have a usable
        "doi" or "doi_url". Returns an empty list when the text is too short
        or when the LLM call or the parsing of its output fails.
    """
    max_chars = 60000  # Approximate limit to stay within context window

    def _is_usable(identifier) -> bool:
        # "NA" is the missing-value marker this filter has always checked for;
        # None and blank strings are just as useless for locating a paper.
        return isinstance(identifier, str) and identifier.strip() not in ("", "NA")

    try:
        if not text or len(text) < 1000:  # Sanity check for minimum text length
            print("Insufficient text extracted")
            return []

        # Prepare LLM
        llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

        if len(text) > max_chars:
            # Keep the tail rather than the head so the bibliography is not cut off.
            text = text[-max_chars:]
            print("Text was truncated to fit within context window")

        system_message = SystemMessage(REFERENCE_EXTRACTION_INSTRUCTIONS.format(text=text))

        # Extract references using LLM
        print("Extracting references using LLM...")
        try:
            references_result = llm.invoke([system_message])
            parser = PydanticOutputParser(pydantic_object=ReferenceLLM)
            references = parser.parse(references_result.content).reference_set
            print(f"Extracted {len(references)} references")

            # Convert to plain dictionaries, keeping only references that can
            # actually be looked up by DOI or DOI URL.
            refs_list = [
                {
                    "title": ref.title,
                    "authors": ref.authors,
                    "year": ref.year,
                    "doi": ref.doi,
                    "doi_url": ref.doi_url,
                    "venue": ref.venue,
                }
                for ref in references
                if _is_usable(ref.doi) or _is_usable(ref.doi_url)
            ]

            print(f"Found {len(refs_list)} references with DOIs")
            return refs_list

        except Exception as e:
            print(f"Error during reference extraction: {e}")
            return []

    except Exception as e:
        print(f"Failed to extract references from PDF: {e}")
        return []

def extract_features_and_references(text: str, llm: ChatOpenAI, reference_only: bool = False,
                                  split_into_chunks: bool = False) -> Tuple[List[RelevanceItem], List[ReferenceItem]]:
    """
    Score a paper's relevance with an LLM and, if it is relevant enough,
    extract its references.

    Args:
        text: Paper text to analyze
        llm: LLM instance used for the relevance scoring
        reference_only: If True, skip relevance scoring (and the relevance
            gate) and only extract references
        split_into_chunks: If True, score the text in chunks of roughly one
            third of its length instead of in a single call

    Returns:
        Tuple (features, references). `features` holds the items of the parsed
        `RelevanceLLM.feature_set`; `references` is whatever
        `extract_references_from_pdf` returns for the text. Both are empty when
        no features could be extracted or when the relevance score of the first
        feature is below 8. With `reference_only=True`, `features` is empty.
    """
    min_relevance_score = 8  # papers scoring lower are not followed further
    features = []
    references = []

    def _score(piece: str):
        # One LLM round trip: prompt -> raw completion -> parsed feature list.
        prompt = [SystemMessage(content=RELEVANCE_SCORING_INSTRUCTIONS.format(text=piece))]
        completion = llm.invoke(prompt)
        parser = PydanticOutputParser(pydantic_object=RelevanceLLM)
        return list(parser.parse(completion.content).feature_set)

    try:
        # Extract features if requested
        if not reference_only:
            if split_into_chunks:
                # max(1, ...) keeps the slice step valid for very short texts.
                chunk_size = max(1, len(text) // 3)
                text_chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

                for chunk in text_chunks:
                    try:
                        features.extend(_score(chunk))
                    except Exception as chunk_error:
                        print(f"Error processing chunk: {chunk_error}")
            else:
                try:
                    features = _score(text)
                except Exception as feature_error:
                    print(f"Error extracting features: {feature_error}")
                    if "token" in str(feature_error).lower():
                        print("Text too long, retrying with chunking...")
                        return extract_features_and_references(text, llm, reference_only, True)

            # Without any feature there is no score to gate on; previously this
            # fell through to features[0] and surfaced as an IndexError.
            if not features:
                print("No relevance features extracted; skipping reference extraction")
                return [], []

            score = features[0].dict().get("relevance_score", 0) or 0
            if score < min_relevance_score:
                print(f"Skipping paper with low relevance score: {score}")
                return [], []

        # Extract references
        references = extract_references_from_pdf(text)

    except Exception as e:
        print(f"Error in extraction: {e}")

        # If it's a token limit error, try splitting the text
        if not split_into_chunks and "token" in str(e).lower():
            print("Retrying with text chunking...")
            return extract_features_and_references(text, llm, reference_only, True)

    return features, references

def process_paper(file_path: str, doi: str, llm: ChatOpenAI, content_source: str = "pdf", reference_only: bool = False) -> Dict[str, Any]:
    """
    Process a single paper to extract features and references.

    Args:
        file_path: Path to the paper file
        doi: DOI of the paper
        llm: LLM instance to use for extraction
        content_source: Label stored under the "content source" key of the result
        reference_only: If True, only extract references

    Returns:
        Dictionary with processing results. On success it has the keys "doi",
        "file_path", "features", "references", "processed_at" and
        "content source". On failure it has "doi", "file_path", empty
        "features"/"references" and an "error" message.
    """
    print(f"Processing paper: {file_path}")

    # Extract text from the paper
    text = extract_text_from_file(file_path)

    if not text or len(text) < 300:
        print(f"Insufficient text extracted from {file_path}")
        return {"doi": doi, "file_path": file_path, "features": [], "references": [], "error": "Insufficient text"}

    # Extract features and references
    features, references = extract_features_and_references(text, llm, reference_only)

    # An empty feature list only signals a rejected paper when features were
    # requested; a reference-only run never produces features.
    if not reference_only and len(features) == 0:
        return {"doi": doi, "file_path": file_path, "features": [], "references": [], "error": "Too low relevance"}

    # Convert Pydantic models to dictionaries and tag each feature with the paper's DOI
    feature_dicts = []
    for feature in features:
        feature_dict = feature.dict()
        feature_dict["doi_url"] = doi
        feature_dicts.append(feature_dict)

    return {
        "doi": doi,
        "file_path": file_path,
        "features": feature_dicts,
        "references": references,
        "processed_at": time.strftime("%Y-%m-%d %H:%M:%S"),
        "content source": content_source
    }

# Main Crawler

In [None]:
# Default value of crawl_papers' `max_papers` argument.
BENCHMARK = 10000

def add_seed_paper(pdf_path: str, doi: str) -> None:
    """
    Add a seed paper to start the crawling process and queue its references.

    Reads and updates the module-level `downloaded_papers`, `visited_dois` and
    `reference_queue`, and uses the module-level `llm`.

    Args:
        pdf_path: Path to the seed paper PDF
        doi: DOI of the seed paper
    """
    if not os.path.exists(pdf_path):
        print(f"Seed paper not found at {pdf_path}")
        return

    result = process_paper(pdf_path, doi, llm)
    if result.get("features") != []:
        downloaded_papers.append(result)
    references = result.get("references", [])

    # Add references to queue
    queued = 0
    for ref in references:
        # Prefer the DOI URL, fall back to the bare DOI; "NA", blanks and None
        # do not count as identifiers.
        ref_doi = next(
            (
                candidate
                for candidate in (ref.get("doi_url"), ref.get("doi"))
                if isinstance(candidate, str) and candidate.strip() not in ("", "NA")
            ),
            None,
        )
        if ref_doi is None:
            # Skip just this entry; the remaining references must still be queued.
            continue

        if ref_doi not in visited_dois:
            reference_queue.append(ref)
            queued += 1
            print(f"Added to queue: {ref_doi}")

    print(f"Added {queued} references to queue from seed paper")


def _pick_reference_doi(reference: Dict[str, Any]) -> Optional[str]:
    """Return the first usable identifier of a reference ("doi_url", then "doi"), or None."""
    for key in ("doi_url", "doi"):
        value = reference.get(key)
        # "NA", blank strings and non-strings (None, NaN) are not identifiers.
        if isinstance(value, str) and value.strip() not in ("", "NA"):
            return value.strip()
    return None


def _doi_aliases(doi: str) -> set:
    """Return the spellings (as given, bare, https://doi.org/ URL) under which a DOI is tracked."""
    bare = doi
    for prefix in ("https://doi.org/", "http://doi.org/", "https://dx.doi.org/", "http://dx.doi.org/"):
        if doi.startswith(prefix):
            bare = doi[len(prefix):]
            break
    return {doi, bare, f"https://doi.org/{bare}"}


def _already_visited(doi: str, visited) -> bool:
    """Tell whether any spelling of `doi` is recorded in `visited`."""
    return any(alias in visited for alias in _doi_aliases(doi))


def _save_crawl_snapshot(papers: list, initial_dataset: Optional[pd.DataFrame]) -> None:
    """Write the accepted papers to the compiled CSV, falling back to a local pickle on failure."""
    csv_filename = "compiled_irrigation_dataset.csv"
    save_location = os.path.join("/content/drive/MyDrive/1:1_Sruthi_Manoj/Dataset/csv_files (current)", csv_filename)
    try:
        frame = pd.DataFrame.from_dict(papers)
        if initial_dataset is not None:
            # Bring the new rows into the layout used for the previous dataset
            # before appending them to it.
            frame = frame.rename(columns={'features': 'relevance_dict'})
            frame['relevance_score'] = frame['relevance_dict'].apply(lambda x: x[0].get('relevance_score'))
            frame = pd.concat([initial_dataset, frame], ignore_index=True)
        frame.to_csv(save_location, index=False)
        print(f"Dataset saved to: {save_location}")
    except Exception as csv_error:
        # Same fallback file names as before; only the report now names the
        # file that was really written.
        fallback_path = csv_filename if initial_dataset is None else f"{csv_filename}_{uuid.uuid4()}"
        try:
            with open(fallback_path, "wb") as f:
                pickle.dump(papers, f)
            print(f"CSV save failed ({csv_error}); pickled dataset to: {os.path.abspath(fallback_path)}")
        except Exception as pickle_error:
            print(f"Error saving dataset: {pickle_error}")


def crawl_papers(initial_dataset: Optional[pd.DataFrame] = None, max_papers: int = BENCHMARK, seed_path: str = None, seed_doi: str = None, initial_reference_queue: list = None, initial_visited_dois: set = None) -> None:
    """
    Main function to crawl and download papers recursively by following references.

    The crawl state lives in module-level globals that this function resets:
    `downloaded_papers`, `visited_dois`, `reference_queue`, `count` (accepted
    papers) and `total` (papers processed, accepted or not). `START_PDF_PATH`
    is set only when a seed path or seed DOI is given. The module-level `llm`
    is used for processing.

    Args:
        initial_dataset: Previously compiled dataset; when given, snapshots are
            saved as this dataset with the newly accepted papers appended
        max_papers: Maximum number of papers to accept
        seed_path: Path to the seed paper PDF
        seed_doi: DOI of the seed paper
        initial_reference_queue: References to start from; used (and mutated)
            as the crawl queue. Defaults to a fresh empty list.
        initial_visited_dois: DOIs to treat as already handled; used (and
            mutated) as the visited set. Defaults to a fresh empty set.
    """
    global downloaded_papers
    global visited_dois
    global reference_queue
    global START_PDF_PATH
    global count
    global total
    total = 0
    count = 0
    downloaded_papers = []
    # Fresh containers per call: mutable defaults would be shared, and since
    # they are mutated below, state would leak from one crawl into the next.
    visited_dois = initial_visited_dois if initial_visited_dois is not None else set()
    reference_queue = initial_reference_queue if initial_reference_queue is not None else []

    # Start with seed paper if provided
    if seed_path:
        add_seed_paper(seed_path, seed_doi or f"seed-local:{uuid.uuid4()}") # TODO get real doi, not seed-local
        START_PDF_PATH = seed_path
    elif seed_doi:
        file_path = search_research_paper_using_doi(seed_doi)
        START_PDF_PATH = file_path
        if file_path:
            add_seed_paper(file_path, seed_doi)
    elif reference_queue:
        pass  # seeded purely by the supplied queue
    else:
        print("No seed paper provided. Please provide either a seed paper path or a DOI.")
        return

    # Process references recursively until we reach the maximum or run out of references
    print(f"Starting recursive crawling with {len(reference_queue)} references in queue")

    last_saved_len = 0  # len(downloaded_papers) at the most recent snapshot
    while count < max_papers and reference_queue:
        reference = reference_queue.pop(0)
        doi = _pick_reference_doi(reference)

        # Skip references without a usable identifier and DOIs handled before
        if doi is None or _already_visited(doi, visited_dois):
            continue

        print(f"Processing paper {count+1}/{max_papers}: {doi}")
        print(f"Reference info: {reference.get('title', 'No title')} ({reference.get('year', 'Unknown year')})")

        # Download the paper
        file_path = search_research_paper_using_doi(doi)

        # There is no web-page fallback, so processed content always comes from a PDF.
        content_source = "pdf"

        if file_path is not None:
            result = process_paper(file_path, doi, llm, content_source= content_source)

            # Mark as visited, whether or not the paper is accepted
            visited_dois.update(_doi_aliases(doi))
            total += 1

            if not result.get("features"):
                print(f"Total processed paper {total} - {count}/{max_papers}")
                continue

            downloaded_papers.append(result)
            count += 1
            print(f"Successfully processed paper {count}/{max_papers}")

            queued = 0
            for ref in result.get("references", []):
                ref_doi = _pick_reference_doi(ref)
                # An unusable entry must not end the crawl; just move on.
                if ref_doi is None or _already_visited(ref_doi, visited_dois):
                    continue
                reference_queue.append(ref)
                queued += 1
                print(f"Added to queue: {ref_doi}")

            print(f"Added {queued} references to queue from paper {count}")
            print(f"Queue now contains {len(reference_queue)} references")

            # Snapshot once per ten accepted papers (not on every failed
            # download while the counter sits on a multiple of ten).
            if count % 10 == 0:
                _save_crawl_snapshot(downloaded_papers, initial_dataset)
                last_saved_len = len(downloaded_papers)

        time.sleep(2)

    # Persist whatever was accepted since the last periodic snapshot.
    if len(downloaded_papers) != last_saved_len:
        _save_crawl_snapshot(downloaded_papers, initial_dataset)

    print(f"Crawling complete. Downloaded {count} papers.")
    print(f"Remaining references in queue: {len(reference_queue)}")
    print("Run this script again to continue downloading from the queue.")

In [None]:
import ast

prev_dataset_path = "/content/drive/MyDrive/1:1_Sruthi_Manoj/Dataset/csv_files (11 16 25)/Extra analysis/features_compiled_irrigation_dataset (1).csv"
prev_dataset = pd.read_csv(prev_dataset_path)

# test: check that relevance_score can be derived from a saved per-seed CSV.
# The frame gets its own name so the crawler's global `downloaded_papers` list
# is not replaced by a DataFrame.
seed_run_df = pd.read_csv("/content/drive/MyDrive/1:1_Sruthi_Manoj/Dataset/csv_files (11 16 25)/1-s2.0-S0378377422005297-main_irrigation_dataset.csv")

downloaded_papers_df = seed_run_df.rename(columns={'features': 'relevance_dict'})
# The CSV stores the feature list as its Python repr; literal_eval parses it
# without executing arbitrary code the way eval would.
downloaded_papers_df['relevance_score'] = downloaded_papers_df['relevance_dict'].apply(lambda x: ast.literal_eval(x)[0].get('relevance_score'))
downloaded_papers_df

Unnamed: 0,doi,file_path,relevance_dict,references,processed_at,content source,relevance_score
0,seed-local:92894d84-4051-43c0-9f4f-9d94496dc7bb,/content/drive/MyDrive/1:1_Sruthi_Manoj/Datase...,"[{'relevance_score': 9, 'score_justification':...",[{'title': 'Pre-and post-sowing irrigation sch...,2025-09-12 14:23:48,pdf,9


In [None]:
# building on previous features_compiled, trying to download more of the papers without doing more llm calls
# Load the previously compiled dataset; its "doi", "file_path" and "references"
# columns are read by the code below.
prev_dataset_path = "/content/drive/MyDrive/1:1_Sruthi_Manoj/Dataset/csv_files (11 16 25)/Extra analysis/features_compiled_irrigation_dataset (1).csv"
prev_dataset = pd.read_csv(prev_dataset_path)
prev_dataset

# dir_path holds the seed PDFs; csv_path holds the per-seed result CSVs.
dir_path = "/content/drive/MyDrive/1:1_Sruthi_Manoj/Dataset/Seed Papers"
csv_path  = "/content/drive/MyDrive/1:1_Sruthi_Manoj/Dataset/csv_files (current)"

# Lookup sets over the previous dataset's "doi" and "file_path" columns.
existing_dois = set(prev_dataset["doi"])
existing_seed_paths = set(prev_dataset["file_path"])

# seed_paths: seed PDFs whose path is not in the previous dataset.
# reference_queue: references not in the previous dataset, still to be crawled.
seed_paths = []
reference_queue = []

def filter_duplicate_reference(possible_references: list):
  """
  Drop references that point at a paper already seen earlier in the list.

  Two references are duplicates when they share a real identifier, taken from
  their "doi" or "doi_url" with any doi.org / dx.doi.org resolver prefix
  removed. Missing values ("NA", blank, None) are not identifiers, so they
  never make two otherwise different references look alike. References with no
  identifier at all cannot be told apart; only the first of them is kept.

  Args:
      possible_references: Reference dictionaries, in priority order

  Returns:
      New list with the first occurrence of every reference, order preserved
  """
  resolver_prefixes = ("https://doi.org/", "http://doi.org/", "https://dx.doi.org/", "http://dx.doi.org/")

  def _normalise(identifier):
    # Reduce an identifier to its bare form, or None when it carries no information.
    if not isinstance(identifier, str):
      return None
    cleaned = identifier.strip()
    if cleaned in ("", "NA"):
      return None
    for prefix in resolver_prefixes:
      if cleaned.startswith(prefix):
        return cleaned[len(prefix):]
    return cleaned

  seen_identifiers = set()
  kept_unidentified = False
  final_possible_references = []
  for reference in possible_references:
    identifiers = {_normalise(reference.get("doi")), _normalise(reference.get("doi_url"))} - {None}
    if identifiers:
      if identifiers & seen_identifiers:
        continue
      seen_identifiers |= identifiers
    else:
      if kept_unidentified:
        continue
      kept_unidentified = True
    final_possible_references.append(reference)
  return final_possible_references

import ast


def _crawl_key(reference):
  """Return the reference's usable identifier ("doi_url", then "doi"), or None."""
  for key in ("doi_url", "doi"):
    value = reference.get(key)
    if isinstance(value, str) and value.strip() not in ("", "NA"):
      return value
  return None


# Nothing in this loop writes to csv_path, so one listing is enough.
finished_csv_names = set(os.listdir(csv_path))

for filename in os.listdir(dir_path):
  if not filename.endswith(".pdf"):
    continue

  pdf_path = os.path.join(dir_path, filename)
  if f"{os.path.splitext(os.path.basename(pdf_path))[0]}_irrigation_dataset.csv" in finished_csv_names:
    print(f"Skipped {filename} because its in csv_path")
    continue
  SEED_PATH = pdf_path

  # Seeds missing from the previous dataset have no stored references to
  # walk; remember them in seed_paths instead.
  if SEED_PATH not in existing_seed_paths:
    print("invalid path")
    seed_paths.append(SEED_PATH)
    continue

  # The CSV stores the reference list as its Python repr; literal_eval parses
  # it without executing code.
  references = ast.literal_eval(prev_dataset.loc[prev_dataset["file_path"] == SEED_PATH, "references"].iloc[0])
  final_possible_references = filter_duplicate_reference(list(references))

  # Walk the stored citation graph: papers already in the previous dataset
  # contribute their own references, unknown papers go to the download queue.
  visited = set()
  while len(final_possible_references) != 0:
    reference = final_possible_references.pop(0)
    doi = _crawl_key(reference)
    if doi is None or doi in visited:
      # No identifier to download with, or already handled for this seed.
      continue

    visited.add(doi)

    if doi in existing_dois:
      print("exists", doi)
      future_references = ast.literal_eval(prev_dataset.loc[prev_dataset["doi"] == doi, "references"].iloc[0])
      final_possible_references.extend(future_references)
    else:
      reference_queue.append(reference)

# Deduplicate once at the end; the filter keeps first occurrences, so the
# result equals deduplicating after every seed.
reference_queue = filter_duplicate_reference(reference_queue)

exists https://doi.org/10.1016/j.agwat.2022.107982
exists https://doi.org/10.1016/j.agwat.2024.108709
exists https://doi.org/10.1038/s41598-024-52972-x
exists https://doi.org/10.3389/fpls.2023.1242074
exists https://doi.org/10.1038/s41598-021-97899-9
exists https://doi.org/10.1007/s11269-018-2087-1
exists https://doi.org/10.1007/s00271-020-00715-x
exists https://doi.org/10.1007/s00271-012-0378-5
exists https://doi.org/10.5539/jas.v4n1p71
exists https://doi.org/10.24017/science.2018.3.5
exists https://doi.org/10.1007/s11540-010-9179-5
exists https://doi.org/10.1371/journal.pone.0231831
exists https://doi.org/10.4236/jwarp.2016.83024
exists https://doi.org/10.1007/s10343-010-0222-x
exists http://dx.doi.org/10.5539/jas.v4n1p71
exists http://dx.doi.org/10.1093/jxb/erl131
exists http://dx.doi.org/10.1071/PP9740529
exists http://dx.doi.org/10.1007/BF02986910
exists https://doi.org/10.1071/AR9780897
exists https://doi.org/10.1007/s11540-014-9261-5
exists https://doi.org/10.21273/HORTSCI.33.4.

In [None]:
# Number of references waiting in the download queue.
len(reference_queue)

901

In [None]:
# Number of seed PDFs collected in seed_paths.
len(seed_paths)

84

In [None]:
import warnings

llm = setup_llm()

warnings.filterwarnings(
    "ignore",
    message="Received a Pydantic BaseModel V1 schema",
    category=UserWarning,
)
crawl_papers(initial_reference_queue=reference_queue)

# A queue-seeded crawl has no single seed PDF: crawl_papers leaves
# START_PDF_PATH untouched in that mode, so reading it here would either raise
# NameError or reuse the name of an unrelated earlier seed.
pdf_basename = "compiled"
csv_filename = f"{pdf_basename}_irrigation_dataset.csv"
SAVE_LOCATION = os.path.join("/content/drive/MyDrive/1:1_Sruthi_Manoj/Dataset/csv_files", csv_filename)
try:
  final_dataframe = pd.DataFrame.from_dict(downloaded_papers)
  final_dataframe.to_csv(SAVE_LOCATION, index = False)
  print(f"Dataset saved to: {SAVE_LOCATION}")
except Exception as e:
  print(f"Could not write CSV ({e}); falling back to a local pickle")
  try:
    with open(csv_filename, "wb") as f:
      pickle.dump(downloaded_papers, f)
    # Report the file that was actually written, not the CSV target.
    print(f"Dataset saved to: {os.path.abspath(csv_filename)}")
  except Exception as e:
    print(f"Error saving dataset: {e}")

Starting recursive crawling with 881 references in queue
Processing paper 1/10000: https://doi.org/10.2134/agronj2008.0218s
Reference info: AquaCrop —the FAO crop model to simulate yield response to water: III. Parameterization testing maize (2009)
{'available': True, 'pdf_url': 'https://pdfs.semanticscholar.org/8d56/b799ae33ab3558095755fcc6c26d27f26f1a.pdf', 'source': 'serpstack', 'reason': 'query=https://doi.org/10.2134/agronj2008.0218s filetype:pdf'}
Downloading paper with DOI: https://doi.org/10.2134/agronj2008.0218s
info contains:  {"available": true, "pdf_url": "https://pdfs.semanticscholar.org/8d56/b799ae33ab3558095755fcc6c26d27f26f1a.pdf", "source": "serpstack", "reason": "query=https://doi.org/10.2134/agronj2008.0218s filetype:pdf"}
[landing] 403 https://acsess.onlinelibrary.wiley.com/doi/10.2134/agronj2008.0218s
[info] trying info.pdf_url → https://pdfs.semanticscholar.org/8d56/b799ae33ab3558095755fcc6c26d27f26f1a.pdf
[GET] 200 https://pdfs.semanticscholar.org/8d56/b799ae33ab

KeyboardInterrupt: 

In [None]:
# running on a new seed paper
import warnings

dir_path = "/content/drive/MyDrive/1:1_Sruthi_Manoj/Dataset/Seed Papers"
csv_path  = "/content/drive/MyDrive/1:1_Sruthi_Manoj/Dataset/csv_files"
FINISHED_SEED_PATH = "/content/drive/MyDrive/1:1_Sruthi_Manoj/Dataset/Seed Papers/Effect of Deficit Irrigation on Wheat (Triticum aestivum L.) Yield and Water Use Efficiency.pdf"

warnings.filterwarnings(
    "ignore",
    message="Received a Pydantic BaseModel V1 schema",
    category=UserWarning,
)

# Each CSV written below is named after the seed being processed, and a seed
# is visited once, so a single listing up front gives the same skip decisions.
finished_csv_names = set(os.listdir(csv_path))

for filename in os.listdir(dir_path):
  if not filename.endswith(".pdf"):
    continue

  pdf_path = os.path.join(dir_path, filename)
  pdf_basename = os.path.splitext(filename)[0]
  csv_filename = f"{pdf_basename}_irrigation_dataset.csv"
  if csv_filename in finished_csv_names:
    print(f"Skipped {filename} because its in csv_path")
    continue

  SEED_PATH = pdf_path
  if SEED_PATH == FINISHED_SEED_PATH:
    continue

  llm = setup_llm()
  crawl_papers(seed_path= SEED_PATH)

  # this part wasn't in code that is currently running, so losing a bunch of papers
  # also deleting any with too low relevance + serpstack api key limit

  # The output name comes from SEED_PATH directly instead of the global
  # START_PDF_PATH that crawl_papers sets to the same value.
  SAVE_LOCATION = os.path.join(csv_path, csv_filename)
  try:
    final_dataframe = pd.DataFrame.from_dict(downloaded_papers)
    final_dataframe.to_csv(SAVE_LOCATION, index = False)
    print(f"Dataset saved to: {SAVE_LOCATION}")
  except Exception as e:
    print(f"Could not write CSV ({e}); falling back to a local pickle")
    try:
      with open(csv_filename, "wb") as f:
        pickle.dump(downloaded_papers, f)
      # Report the file that was actually written, not the CSV target.
      print(f"Dataset saved to: {os.path.abspath(csv_filename)}")
    except Exception as e:
      print(f"Error saving dataset: {e}")

Skipped 1-s2.0-S0378377422005297-main.pdf because its in csv_path
Skipped Effects of different irrigation scheduling methods on physiology, yield, and irrigation water productivity of soybean varieties.pdf because its in csv_path
Skipped Effects of irrigation scheduling on the yield and irrigation water productivity of cucumber in coconut coir culture (1).pdf because its in csv_path
Skipped fpls-14-1242074.pdf because its in csv_path
Skipped agronomy-12-00810-v3.pdf because its in csv_path
Skipped sustainability-15-10204-v2.pdf because its in csv_path
Processing paper: /content/drive/MyDrive/1:1_Sruthi_Manoj/Dataset/Seed Papers/Effectofirrigationschedulingonyieldqualityandwateruseefficiencyofpotatoplantsgrownunderdeficitirrigationconditions.pdf

Attempting to extract text from PDF file: /content/drive/MyDrive/1:1_Sruthi_Manoj/Dataset/Seed Papers/Effectofirrigationschedulingonyieldqualityandwateruseefficiencyofpotatoplantsgrownunderdeficitirrigationconditions.pdf
File exists: True
File 

Traceback (most recent call last):
  File "/tmp/ipython-input-475724992.py", line 24, in extract_text_from_pdf
    num_pages = len(reader.pages)
                ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/PyPDF2/_page.py", line 2063, in __len__
    return self.length_function()
           ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/PyPDF2/_reader.py", line 445, in _get_num_pages
    return self.trailer[TK.ROOT]["/Pages"]["/Count"]  # type: ignore
           ~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/PyPDF2/generic/_data_structures.py", line 266, in __getitem__
    return dict.__getitem__(self, key).get_object()
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/PyPDF2/generic/_base.py", line 259, in get_object
    obj = self.pdf.get_object(self)
          ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/PyPDF2/_reader.py", line 

HTTPSConnectionPool(host='research.libraries.wsu.edu', port=8443): Max retries exceeded with url: /xmlui/bitstream/handle/2376/7035/em4830.pdf?sequence=1&isAllowed=y (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7ad4d90240e0>, 'Connection to research.libraries.wsu.edu timed out. (connect timeout=20)'))
{'available': False, 'pdf_url': None, 'source': 'none', 'reason': 'no open PDF found'}
Paper with DOI https://research.libraries.wsu.edu:8443/xmlui/bitstream/handle/2376/7035/em4830.pdf?sequence=1&isAllowed=y appears to be behind a paywall or not freely accessible. Skipping download.


KeyboardInterrupt: 

In [None]:
import ast

reference_queue = []

# building upon the previous set of references (if applicable)
dir_path = "/content/drive/MyDrive/1:1_Sruthi_Manoj/Dataset/csv_files"
# sorted() makes the queue order reproducible; os.listdir order is arbitrary.
for filename in sorted(os.listdir(dir_path)):
  if not filename.endswith(".csv"):
    continue

  file_path = os.path.join(dir_path, filename)
  df = pd.read_csv(file_path)
  for _, row in df.iterrows():
    feature = ast.literal_eval(row["features"])
    if len(feature) == 0:
      continue

    # Only follow references of sufficiently relevant papers; a missing score
    # counts as not relevant instead of raising on the comparison.
    score = feature[0].get("relevance_score")
    if score is None or score < 8:
      continue

    references = ast.literal_eval(row["references"])
    if len(references) == 0:
      continue

    # Record which paper each reference was found in.
    for ref in references:
      ref["START_PDF_PATH"] = row["file_path"]
    reference_queue.extend(references)

In [None]:
# Number of references waiting in the download queue.
len(reference_queue)

838

In [None]:
# crawl_papers uses the module-level `llm`, so it has to exist before the call.
llm = setup_llm()

import warnings

# Silence UserWarnings whose message starts with this text.
warnings.filterwarnings(
    "ignore",
    message="Received a Pydantic BaseModel V1 schema",
    category=UserWarning,
)
# No seed paper is passed: the crawl starts from the references already in reference_queue.
crawl_papers(initial_reference_queue= reference_queue)

Starting recursive crawling with 838 references in queue
Processing paper 1/10000: https://doi.org/10.1016/j.agwat.2022.107982
Reference info: Pre-and post-sowing irrigation scheduling impacts on crop phenology and water productivity of cotton (Gossypium hirsutum L.) in sub-tropical north-western India (2022)
Downloading paper with DOI: https://doi.org/10.1016/j.agwat.2022.107982
Found paper:  Pre-and post-sowing irrigation scheduling impacts on crop phenology and water productivity of cotton (Gossypium hirsutum L.) in sub-tropical north-western India
Invalid PDF file: EOF marker not found. Deleting /content/drive/MyDrive/1:1_Sruthi_Manoj/Dataset/Papers/888b460c-2732-49b0-ac07-a4d6092778dd.pdf
Dataset saved to: /content/drive/MyDrive/1:1_Sruthi_Manoj/Dataset/csv_files/compiled_irrigation_dataset.csv
Processing paper 1/10000: NA
Reference info: Cotton (2012)
Error checking availability of NA: Invalid URL 'NA': No scheme supplied. Perhaps you meant https://NA?
Paper with DOI NA appears t



Successfully downloaded to /content/drive/MyDrive/1:1_Sruthi_Manoj/Dataset/Papers/1a3d9a90-500e-45b7-a588-a69ffe5b3495.pdf
Processing paper: /content/drive/MyDrive/1:1_Sruthi_Manoj/Dataset/Papers/1a3d9a90-500e-45b7-a588-a69ffe5b3495.pdf

Attempting to extract text from PDF file: /content/drive/MyDrive/1:1_Sruthi_Manoj/Dataset/Papers/1a3d9a90-500e-45b7-a588-a69ffe5b3495.pdf
File exists: True
File size: 974565 bytes
Successfully opened PDF file
PDF has 9 pages
First page extract length: 3380 characters
Total extracted text length: 43846 characters
Skipping paper with low relevance score: 3
Total processed paper 13 - 5/10000
Processing paper 6/10000: https://doi.org/10.1016/j.jplph.2017.05.001
Reference info: Potassium application regulates nitrogen metabolism and osmotic adjustment in cotton (Gossypium hirsutum L.) functional leaf under drought stress (2017)
Downloading paper with DOI: https://doi.org/10.1016/j.jplph.2017.05.001
Found paper:  Title: Potassium application regulates nitrog



Successfully downloaded to /content/drive/MyDrive/1:1_Sruthi_Manoj/Dataset/Papers/9977a069-5140-4fb1-869e-ad0abc399f37.pdf
Processing paper: /content/drive/MyDrive/1:1_Sruthi_Manoj/Dataset/Papers/9977a069-5140-4fb1-869e-ad0abc399f37.pdf

Attempting to extract text from PDF file: /content/drive/MyDrive/1:1_Sruthi_Manoj/Dataset/Papers/9977a069-5140-4fb1-869e-ad0abc399f37.pdf
File exists: True
File size: 957107 bytes
Successfully opened PDF file
PDF has 14 pages
First page extract length: 3504 characters
Total extracted text length: 63233 characters
Skipping paper with low relevance score: 3
Total processed paper 66 - 31/10000
Processing paper 32/10000: https://doi.org/10.3389/fpls.2013.00516
Reference info: Effects of fuzzless cottonseed phenotype on cottonseed nutrient composition in near isogenic cotton (Gossypium hirsutum L.) mutant lines under well-watered and water stress conditions (2013)
Downloading paper with DOI: https://doi.org/10.3389/fpls.2013.00516
Found paper:  Effects of f