In [None]:
!playwright install 

In [None]:
import os
from dotenv import load_dotenv
from pprint import pprint

load_dotenv()

api_key = os.getenv("TAVILY_KEY")
# print(api_key)


# DOWNLOAD DIRECTLY IF a PDF opens straight up

In [None]:
import requests
def download_file(url):
    local_filename = url.split('/')[-1]
    # NOTE the stream=True parameter below
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192): 
                # If you have chunk encoded response uncomment if
                # and set chunk_size parameter to None.
                #if chunk: 
                f.write(chunk)
    return local_filename

download_file(url)

---

# Corpus Collection

In [2]:
"""
llm_query_builder.py
--------------------

A tiny utility that asks an LLM (OpenAI in this example) to **randomly pick**
one to three terms from each topic bucket of your `keyword_corpus`
and return a search‑ready string suitable for Tavily / Google.

• Each run produces a different query (the LLM shuffles internally).
• You can add system instructions to tweak style—e.g., force operators
  like site:, intitle:, filetype:pdf, etc.
• The resulting query string is returned so you can feed it straight
  to Tavily’s `search()` call.
"""

import os, json, random
from openai import OpenAI

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# ---- 1) Load / define the corpus ------------------------------------------
# domain_keyword_corpus.py

keyword_corpus = {
    # 1. Coal‑(Bed/Mine) Methane – Reservoir & Drainage Technology
    "coal_methane": [
        "coal mine methane",                # CMM
        "cmm extraction",
        "cmm utilization",
        "coal‑bed methane",                 # CBM
        "high‑gas mine",
        "outburst mine",
        "fiery mine",
        "underground drainage",
        "surface drainage",
        "in‑seam borehole",
        "cross‑measure borehole",
        "gob gas drainage",
        "gob venthole",
        "stress‑relieved seam extraction",
        "multi‑branch horizontal directional well",
        "multipurpose surface well",
        "hydraulic fracturing",
        "hydraulic slotting",
        "permeability enhancement",
        "adsorption desorption",
        "methane oxidation efficiency",
        "ventilation‑air‑methane capture"   # VAM
    ],

    # 2. Economics & Policy of Methane Abatement
    "abatement_economics": [
        "abatement cost curve",
        "conserved supply curve",           # CSC
        "marginal abatement cost",          # MAC
        "carbon pricing",
        "avoidable‑emission cost",
        "extraction‑step diversity",
        "utilization‑step diversity",
        "technology standardisation",
        "infrastructure strengthening"
    ],

    # 3. Carbon Capture & Storage / CO₂‑focused Terms
    "ccs_co2": [
        "carbon capture and storage",       # CCS
        "geological storage",
        "aquifer storage",
        "depleted field storage",
        "enhanced coal‑bed methane recovery",  # ECBMR
        "leakage rate",
        "bottom‑up energy‑system model",    # MARKAL
        "enhanced oil recovery",            # EOR
        "climate‑change mitigation",
        "non‑co2 ghg reduction"
    ],

    # 4. Cross‑cutting Concepts & Metrics
    "cross_cutting": [
        "greenhouse gas synergy effect",
        "low‑concentration gas transport",
        "gas‑water two‑phase flow",
        "safety regulations",
        "gas outburst prevention",
        "borehole sealing technology",
        "methane power generation",
        "electricity from cmm",
        "electricity from cbm",
        "surface‑underground joint extraction",
        "huainan mode",
        "jincheng mode"
    ]
}

# ---- 2) Build the LLM prompt ----------------------------------------------
_SYSTEM = """You are a research search‑query generator.
When asked, you will:
1. Randomly CHOOSE 1‑3 terms from EACH topic bucket provided.
2. Shuffle the chosen terms.
3. Return ONE single‑line boolean query string wrapped in double quotes
   for multi‑word phrases and separated by spaces.
4. Do NOT add any explanation—output ONLY the query."""

def build_prompt(corpus: dict, k_per_topic: int = 3) -> str:
    # create a compact JSON block for the assistant to sample from
    compact = {k: random.sample(v, min(k_per_topic, len(v)))
               for k, v in corpus.items()}
    return (
        f"{json.dumps(compact, indent=None)}\n\n"
        "Compose the query now:"
    )

# ---- 3) Ask the LLM --------------------------------------------------------
def get_random_query(corpus: dict = keyword_corpus) -> str:
    prompt = build_prompt(corpus)
    rsp = client.chat.completions.create(
        model="gpt-4o-mini",  # or whichever model
        messages=[
            {"role": "system", "content": _SYSTEM},
            {"role": "user",   "content": prompt}
        ],
        max_tokens=50,
        temperature=0.7
    )
    return rsp.choices[0].message.content.strip()

# ---- 4) Example ------------------------------------------------------------

q = get_random_query()
q = q[1:-1] + " methane mititgation"
q = set(q.split(" "))
q = ' '.join(q)
print("Generated search string →", str(q))
    # you can now call: tavily.search(q, search_depth="advanced")



Generated search string → curve mititgation borehole in‑seam drainage coal‑bed supply conserved storage gob enhanced aquifer methane recovery gas


---

# SEARCH FACILITY

In [22]:
# !pip install -qU langchain langchain-openai langchain-tavily
from typing import Any, Dict, Optional, TypedDict, List
from pydantic import Field
from langchain_core.messages import BaseMessage
import datetime
import os

from langgraph.prebuilt import create_react_agent
from langchain_openai import ChatOpenAI
from langchain_tavily import TavilySearch
from langchain.schema import HumanMessage, SystemMessage
from langchain_community.utilities import SerpAPIWrapper
from langchain_community.tools import DuckDuckGoSearchRun

class PaperMetadata(TypedDict):
    title: str
    link: str
    snippet: str = Field(description="Short summary of the paper's content")
    abstract: str | None = Field(description="The exact abstract stated on the paper's website.")
    published_date: Optional[str] = Field(description="This field shoul contain the extact date the paper was published on")

class State(TypedDict):
    papers: List[PaperMetadata]

    
    
    
# Initialize LLM
llm = ChatOpenAI(model="gpt-4.1", temperature=0)

# Initialize Tavily Search Tool
tavily_search_tool = TavilySearch(
    max_results=5,
    topic="general",
    tavily_api_key=os.getenv("TAVILY_KEY")
)

serp_api_wrapper = SerpAPIWrapper(serpapi_api_key=os.getenv("SERP_API_KEY"))

def serp_tool(query: str) -> str:
    """
    Use this tool to search the web for the most relevant papers and research articles 
    based on the keywords. The date today is {today}. 
    It is not necessary that all keywords must exist in the paper, 
    but should be related to methane mitigation
    """
    return serp_api_wrapper.run(query)

def duckduckgo_tool(query: str) -> str:
    """
    Use this tool to search the web for the most relevant papers and research articles 
    based on the keywords. The date today is {today}. 
    It is not necessary that all keywords must exist in the paper, 
    but should be related to methane mitigation
    """
    return DuckDuckGoSearchRun().run(query)

# Set up Prompt with 'agent_scratchpad'
today = datetime.datetime.today().strftime("%D")
prompt =  f"""You are a helpful reaserch assistant, 
    you will be given a query and you will need to
    search the web for the most relevant papers and research articles 
    based on the keywords. The date today is {today}. 
    It is not necessary that all keywords must exist in the paper, 
    but should be related to methane mitigation"""

# Create an agent that can use tools
agent = create_react_agent(
    model=llm,
    tools=[tavily_search_tool],
    prompt=prompt,
    response_format=State,
)

user_input =  q
print("User Input: ", user_input)

# Construct input properly as a dictionary
# response = agent.stream({"messages": [HumanMessage(content=user_input)]}, stream_mode="values")
# for i in response:
#     pprint(i)


response = agent.invoke({"messages": [HumanMessage(content=user_input)]})



User Input:  curve mititgation borehole in‑seam drainage coal‑bed supply conserved storage gob enhanced aquifer methane recovery gas


In [23]:
response['structured_response']['papers']

[{'title': 'ENHANCED RECOVERY OF COAL BED METHANE WITH ...',
  'link': 'https://www.adv-res.com/Coal-Seq_Consortium/ECBM_Sequestration_Knowledge_Base/EIA%20Reports%20and%20Presentations/Ph3_34%20ECBM_20demo.pdf',
  'snippet': 'demonstration pilot, the financing could come from international development agencies or bilateral funding agencies. When the project goes to Stage 3 - commercial demonstration - CUCBM might continue to finance it or international oil and gas companies might provide financing in exchange for a share of the commercial revenues.',
  'abstract': 'This report discusses demonstration projects for enhanced coal bed methane (ECBM) recovery, including technical, economic, and financing aspects. It covers staged project development, government and industry involvement, and expert reviews. The focus is on methane recovery and storage in coal seams, relevant to mitigation and supply.',
  'published_date': ''},
 {'title': 'Prediction of Coal Bed Methane Recovery Rate and Its

---

# BASIC FILTERATION

In [24]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import torch
from ipymarkup import show_span_box_markup
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("Using device:", device)
model_name = "nicolauduran45/specter-climate-change-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model     = AutoModelForTokenClassification.from_pretrained(model_name)
ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

Using device: mps


Device set to use mps:0


In [25]:
def predict_with_proper_aggregation(text):
    # Get the raw predictions
    raw_entities = ner(text)
    
    # Aggregate subword pieces into complete entities
    aggregated_entities = []
    current_entity = None
    
    for entity in raw_entities:
        # Check if this is a continuation token (starts with ##)
        is_continuation = entity["word"].startswith("##")
        
        if is_continuation and current_entity:
            # Update the current entity by removing ## and appending
            current_entity["word"] += entity["word"][2:]
            current_entity["end"] = entity["end"]
            
            # Update the score (average or keep the minimum)
            current_entity["score"] = min(current_entity["score"], entity["score"])
            
            # If entity types differ, use the one with higher confidence
            if entity["entity_group"] != current_entity["entity_group"] and entity["score"] > current_entity["score"]:
                current_entity["entity_group"] = entity["entity_group"]
                current_entity["score"] = entity["score"]
        else:
            # If we have a previous entity, add it to results
            if current_entity:
                aggregated_entities.append(current_entity)
            
            # Start a new entity
            current_entity = entity.copy()
    
    # Don't forget the last entity
    if current_entity:
        aggregated_entities.append(current_entity)
    
    # Further aggregation: detect split entities that might not use ## notation
    # but should be merged based on adjacent positions
    i = 0
    while i < len(aggregated_entities) - 1:
        current = aggregated_entities[i]
        next_entity = aggregated_entities[i + 1]
        
        # Check if entities are adjacent and should be merged
        if (current["end"] == next_entity["start"] and 
            current["entity_group"] == next_entity["entity_group"]):
            # Merge entities
            current["word"] += next_entity["word"]
            current["end"] = next_entity["end"]
            current["score"] = (current["score"] + next_entity["score"]) / 2
            # Remove the next entity as it's now merged
            aggregated_entities.pop(i + 1)
        else:
            i += 1
    
    
    spans = [(s['start'], s['end'], s['entity_group'])for s in aggregated_entities]
    show_span_box_markup(text, spans)
    # pprint(aggregated_entities)
    return aggregated_entities


In [None]:
entities = []
for i in response['structured_response']['papers']:
    entities.extend(predict_with_proper_aggregation(i['abstract']))

In [34]:
def select_best_paper(papers, entities_list):
    """Select the best paper based on NER entity scores"""
    
    def calculate_paper_score(entities):
        if not entities:
            return 0
        
        weights = {
            'climate-mitigations': 2.0,
            'climate-hazards': 1.5,
            'climate-problem-origins': 1.2,
            'climate-nature': 1.0,
            'climate-assets': 0.8,
            'climate-properties': 0.6,
            'climate-impacts': 0.6
        }
        
        weighted_score = 0
        for entity in entities:
            weight = weights.get(entity['entity_group'], 1.0)
            weighted_score += entity['score'] * weight
        
        return weighted_score / len(entities) if entities else 0
    
    # Process entities for each paper separately
    paper_scores = []
    for i, paper in enumerate(papers):
        # Process entities for this specific paper
        paper_entities = predict_with_proper_aggregation(paper['abstract'])
        score = calculate_paper_score(paper_entities)
        
        papers[i]['score'] = score
        paper_scores.append({
            'index': i,
            'paper': papers[i],
            'score': score,
            'entity_count': len(paper_entities)
        })
    
    # Sort and display
    paper_scores.sort(key=lambda x: x['score'], reverse=True)
    
    print("All papers ranked by score (descending):")
    print("-" * 50)
    for i, item in enumerate(paper_scores, 1):
        print(f"{i}. Score: {item['score']:.3f}")
        print(f"   Title: {item['paper']['title'][:60]}...")
        print(f"   Entities: {item['entity_count']}")
        print()
    
    return paper_scores[0]['paper']

# Apply the selection
papers = response['structured_response']['papers']
best_paper = select_best_paper(papers, entities)
print(f"Selected paper: {best_paper['title']}")
print(f"Score: {best_paper['score']}")

All papers ranked by score (descending):
--------------------------------------------------
1. Score: 1.262
   Title: Mitigation of coal spontaneous combustion and enhanced coalb...
   Entities: 7

2. Score: 1.154
   Title: 4 Coalbed Methane Produced Water Management and Beneficial ....
   Entities: 5

3. Score: 1.080
   Title: ENHANCED RECOVERY OF COAL BED METHANE WITH ......
   Entities: 7

4. Score: 1.063
   Title: Investigation into the variation characteristics and influen...
   Entities: 7

5. Score: 0.608
   Title: Prediction of Coal Bed Methane Recovery Rate and Its Improve...
   Entities: 6

Selected paper: Mitigation of coal spontaneous combustion and enhanced coalbed ...
Score: 1.2619621753692627


---