In [None]:
import os

# LangSmith Configuration
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = "absa-agent-ddg"

# The API key is expected to be exported in the environment before the kernel
# starts. Assigning os.getenv(...) back into os.environ is a no-op when the key
# is present and raises an opaque TypeError when it is missing, so fail with a
# readable message instead.
if not os.getenv("LANGCHAIN_API_KEY"):
    raise RuntimeError(
        "LANGCHAIN_API_KEY is not set. Export it before starting the notebook kernel."
    )

from langsmith import Client
client = Client()
try:
    # list_projects() returns a lazy iterator; pulling the first item forces an
    # actual API request, so an invalid key or unreachable endpoint is reported
    # here rather than silently passing.
    next(iter(client.list_projects()), None)
    print("Connection to LangSmith successful! Using project:", os.environ["LANGCHAIN_PROJECT"])
except Exception as e:
    # Best-effort check: report the problem but let the notebook continue.
    print("Error connecting to LangSmith: Check your API key!")
    print(e)

Connection to LangSmith successful! Using project: absa-agent-ddg


In [94]:
import operator
import json
import requests
from typing import Annotated, TypedDict, List, Optional
from pydantic import BaseModel, Field
from bs4 import BeautifulSoup
from langchain_ollama import ChatOllama
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
from langgraph.graph import StateGraph, END
from langsmith import traceable

In [95]:
# Model name passed to ChatOllama as `model`.
LLM_MODEL = "gemma3:27b"
# Base URL passed to ChatOllama as `base_url`.
LLM_URL = "http://132.199.138.16:3333"
# Sampling temperature passed to ChatOllama as `temperature`.
LLM_TEMPERATURE = 0

# `max_results` value passed to the DuckDuckGo search in the retrieval node.
RETRIEVER_MAX_RESULTS = 50

In [None]:
# LLM Configuration (Ollama API)
# Single chat-model instance shared by the extraction and summarization nodes.
# NOTE(review): format="json" applies to every call made through `llm`,
# including the plain `llm.invoke` used for the summary — confirm that a
# JSON-constrained summary is intended.
llm = ChatOllama(
    model=LLM_MODEL,
    base_url=LLM_URL,
    temperature=LLM_TEMPERATURE,
    format="json" # Ensure the model supports JSON mode if possible
)

# Search wrapper for structured results
# Queried by the retrieval node through `search_wrapper.results(...)`.
search_wrapper = DuckDuckGoSearchAPIWrapper()

# Helper to load prompts
def load_prompt(filename):
    """Read a prompt template from the ``prompt_template/`` directory.

    Args:
        filename: Name of the template file, relative to ``prompt_template/``.

    Returns:
        The full file content as a string.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # Decode explicitly as UTF-8 so templates containing non-ASCII characters
    # load the same way regardless of the platform's default locale encoding.
    with open(f"prompt_template/{filename}", "r", encoding="utf-8") as f:
        return f.read()

# Define Pydantic models for structured output
# One extracted review; the item type of `ExtractionResult.reviews`.
# Documented with `#` comments rather than a class docstring on purpose:
# NOTE(review): a docstring would presumably end up in the schema generated
# for structured output — verify before adding one.
class Review(BaseModel):
    # Optional headline of the review; defaults to None.
    review_title: Optional[str] = Field(description="Title of the review", default=None)
    # Required body text of the review.
    review_text: str = Field(description="Content of the review")
    # Optional integer star rating; defaults to None.
    stars: Optional[int] = Field(description="Star rating", default=None)

# Container schema passed to `llm.with_structured_output(...)` in the
# extraction node; its `reviews` list is iterated there.
class ExtractionResult(BaseModel):
    # All reviews extracted from one page's text.
    reviews: List[Review] = Field(description="List of extracted reviews")

In [97]:
# Graph State Definition
class GraphState(TypedDict):
    """State dictionary passed between the nodes of the LangGraph workflow."""
    # Search query; read by the retrieval and summarization nodes.
    query: str
    # Search-result dicts kept by the retrieval node, each tagged with an "id".
    retrieved_content: List[dict]
    # Ids ("result_<n>") of the results the extraction node should fetch.
    relevant_ids: List[str]
    # Review dicts produced by the extraction node (model fields plus "website_url").
    reviews: List[dict]
    # Text written by the summarization node.
    summary: str

In [None]:
# Node 1: Retrieval
@traceable(run_type="retriever")
def retrieval_node(state: GraphState):
    """Search the web for the query and tag each usable result with an id.

    Args:
        state: Graph state; only ``state["query"]`` is read.

    Returns:
        A partial state update with:
        - ``retrieved_content``: the kept result dicts, each extended in place
          with an ``"id"`` key of the form ``"result_<n>"`` (1-based).
        - ``relevant_ids``: the list of those ids, in result order.
    """
    print("--- RETRIEVAL ---")
    query = state["query"]
    # Guard against a wrapper that hands back nothing at all.
    results = search_wrapper.results(query, max_results=RETRIEVER_MAX_RESULTS) or []

    filtered_results = []
    for result in results:
        link = result.get("link")
        # Entries without a link cannot be fetched later; presumably some
        # wrapper versions emit such placeholder entries when nothing is
        # found — skipping them avoids a KeyError here and downstream.
        if not link:
            continue
        # Filter out TripAdvisor results
        if "tripadvisor" in link.lower():
            continue
        filtered_results.append(result)

    relevant_ids = []
    for idx, result in enumerate(filtered_results, start=1):
        rid = f"result_{idx}"
        result["id"] = rid
        relevant_ids.append(rid)
    return {"retrieved_content": filtered_results, "relevant_ids": relevant_ids}

# Node 3: Extraction - Fetches content and extracts reviews using structured output
@traceable(run_type="llm")
def extract_reviews_node(state: GraphState):
    """Download each selected search result and extract reviews from its text.

    For every id in ``state["relevant_ids"]`` that matches an entry in
    ``state["retrieved_content"]``, the page is fetched, its raw HTML is saved
    to ``_html/<id>.html``, the visible text (truncated to 8000 characters) is
    inserted into the ``extract_reviews.txt`` prompt, and the LLM is asked for
    an ``ExtractionResult``.

    Failures for a single URL are printed and skipped so one bad page does not
    abort the whole run.

    Args:
        state: Graph state; reads ``relevant_ids`` and ``retrieved_content``.

    Returns:
        A partial state update ``{"reviews": [...]}`` where each item is the
        dumped review model plus a ``"website_url"`` key.
    """
    print("--- EXTRACT REVIEWS ---")
    relevant_ids = state.get("relevant_ids", [])
    results = state["retrieved_content"]
    all_reviews = []

    id_map = {r['id']: r for r in results}
    template = load_prompt("extract_reviews.txt")
    structured_llm = llm.with_structured_output(ExtractionResult)

    # Ensure _html directory exists; exist_ok avoids the check-then-create
    # race and keeps re-running the cell harmless.
    os.makedirs("_html", exist_ok=True)

    for rid in relevant_ids:
        entry = id_map.get(rid)
        if entry is None:
            continue
        url = entry['link']
        print(f"Fetching: {url}...")

        try:
            res = requests.get(url, timeout=10, headers={"User-Agent": "Mozilla/5.0"})
            if res.status_code != 200:
                print(f"Error loading {url}: Status {res.status_code}")
                continue

            # Save HTML content
            html_filename = f"_html/{rid}.html"
            with open(html_filename, "w", encoding="utf-8") as f:
                f.write(res.text)

            soup = BeautifulSoup(res.text, "html.parser")
            # Extract text, limit to ~8000 characters for context window
            page_text = soup.get_text(separator=" ", strip=True)[:8000]

            # A page without any visible text gives the model nothing to
            # extract from; skip the LLM call instead of prompting on nothing.
            if not page_text:
                print(f"Error at {url}: page contains no text")
                continue

            prompt = template.format(page_text=page_text)
            response = structured_llm.invoke(prompt)

            # Depending on the structured-output mode the result may
            # presumably be None when nothing parseable came back; report
            # that explicitly rather than via an AttributeError message.
            if response is None:
                print(f"Error at {url}: no structured output returned")
                continue

            for rev in response.reviews:
                # Convert Pydantic model to dict and add URL
                rev_dict = rev.model_dump()
                rev_dict["website_url"] = url
                all_reviews.append(rev_dict)

        except Exception as e:
            # Deliberate best-effort: log and continue with the next URL.
            print(f"Error at {url}: {e}")

    return {"reviews": all_reviews}

# Node 4: Summarization
@traceable(run_type="llm")
def summarize_reviews_node(state: GraphState):
    """Condense the extracted reviews into one summary via the LLM.

    Args:
        state: Graph state; reads ``reviews`` (optional) and ``query``.

    Returns:
        ``{"summary": <text>}``. When there are no reviews, a fixed
        placeholder message is returned without calling the LLM.
    """
    print("--- SUMMARIZE ---")
    collected = state.get("reviews", [])
    query = state["query"]

    if not collected:
        return {"summary": "No reviews found to summarize."}

    def _render(position, review):
        # The rating is shown only when a truthy star value is present.
        rating = f"({review['stars']} stars)" if review.get('stars') else ""
        return f"Review {position} {rating}: {review['review_text']}\n\n"

    reviews_text = "".join(
        _render(position, review)
        for position, review in enumerate(collected, start=1)
    )

    template = load_prompt("summarize_reviews.txt")
    prompt = template.format(query=query, reviews_text=reviews_text)

    # Plain invocation (no structured-output wrapper) for the summary.
    # NOTE(review): the shared `llm` is configured elsewhere in this notebook;
    # confirm its output format matches what the summary prompt expects.
    response = llm.invoke(prompt)

    return {"summary": response.content}

# Graph Construction
workflow = StateGraph(GraphState)

# Register the three processing steps under their node names.
for _node_name, _node_fn in (
    ("retrieval", retrieval_node),
    ("extract", extract_reviews_node),
    ("summarize", summarize_reviews_node),
):
    workflow.add_node(_node_name, _node_fn)

# Linear pipeline: retrieval -> extract -> summarize -> END.
workflow.set_entry_point("retrieval")
for _edge_from, _edge_to in (
    ("retrieval", "extract"),
    ("extract", "summarize"),
    ("summarize", END),
):
    workflow.add_edge(_edge_from, _edge_to)

app = workflow.compile()
print("LangGraph with Extraction and Summarization created (Evaluation removed).")

LangGraph with Evaluation, Extraction and Summarization created.


In [99]:
# Test run with LangSmith configuration
# Tags and metadata are passed along with the run via `config`.
config = {
    "tags": ["dev-test", "absa-summary"],
    "metadata": {
        "user": "nils",
        "model": LLM_MODEL
    }
}

# Query for reviews
inputs = {"query": "customer reviews for l'osteria pizza at regensburg"}

print("Starting agent run...")
final_reviews = []
final_summary = ""

# Stream node-by-node so progress is visible and the results of the
# extraction and summarization nodes can be captured as they arrive.
for output in app.stream(inputs, config=config):
    for key, value in output.items():
        print(f"\n--- Node '{key}' finished ---")
        if key == "extract" and "reviews" in value:
            final_reviews = value["reviews"]
        if key == "summarize" and "summary" in value:
            final_summary = value["summary"]

print("\n" + "="*50)
print("SUMMARY:")
print(final_summary)

print("\n" + "="*50)
print(f"EXTRACTED REVIEWS ({len(final_reviews)} found):")
for i, rev in enumerate(final_reviews[:5], start=1): # Show first 5
    # The review dicts always carry these keys (with None when missing), so a
    # .get() default never applies; fall back on the value instead.
    title = rev.get('review_title') or 'No Title'
    stars = rev.get('stars')
    stars_label = stars if stars is not None else 'N/A'
    text = rev.get('review_text') or ''
    # Only mark the preview as truncated when text was actually cut off.
    preview = text[:150] + ('...' if len(text) > 150 else '')
    print(f"\n[{i}] {title}")
    print(f"Stars: {stars_label} | URL: {rev.get('website_url')}")
    print(f"Text: {preview}")

print("\nDone!")

Starting agent run...
--- RETRIEVAL ---

--- Node 'retrieval' finished ---
--- EVALUATE ---
Relevant IDs found: ['result_2', 'result_3', 'result_7', 'result_8', 'result_9', 'result_13', 'result_14', 'result_15', 'result_18', 'result_20', 'result_21', 'result_22', 'result_23', 'result_24', 'result_33', 'result_35', 'result_37']

--- Node 'evaluate' finished ---
--- EXTRACT REVIEWS ---
Fetching: https://de.restaurantguru.com/LOsteria-Regensburg...
Fetching: https://community.ricksteves.com/travel-forum/germany/italian-restaurants-in-regensburg...
Fetching: https://push.bildshopdirekt.de/losteria-regensburg-regensburg/...
Error loading https://push.bildshopdirekt.de/losteria-regensburg-regensburg/: Status 520
Fetching: https://schmuck-wendt.de/losteria-regensburg-losteria-neutraubling-speisekarte/...
Error loading https://schmuck-wendt.de/losteria-regensburg-losteria-neutraubling-speisekarte/: Status 520
Fetching: https://fr.restaurantguru.com/LOsteria-Regensburg...
Fetching: https://mapp

In [100]:
# Display every review dict captured from the "extract" node of the run above.
final_reviews

[{'review_title': None,
  'review_text': 'Essen super lecker! Mitarbeiter sehr freundlich und aufmerksam!',
  'stars': 5,
  'website_url': 'https://de.restaurantguru.com/LOsteria-Regensburg'},
 {'review_title': None,
  'review_text': 'Das Essen ist super lecker und der Service ist sehr aufmerksam.',
  'stars': 5,
  'website_url': 'https://de.restaurantguru.com/LOsteria-Regensburg'},
 {'review_title': None,
  'review_text': 'Essen gut. Service bemüht sich. Im Winter nicht im Eingangsbereich sitzen. Wheelchair accessibility: Nicht geeignet da die WCs nur über eine enge Treppe nach unten erreicht werden können. Und im Lokal ist es zu eng.',
  'stars': 5,
  'website_url': 'https://de.restaurantguru.com/LOsteria-Regensburg'},
 {'review_title': None,
  'review_text': 'Hi Can anyone recommend a good Italian restaurant in Regensburg? TIA',
  'stars': None,
  'website_url': 'https://community.ricksteves.com/travel-forum/germany/italian-restaurants-in-regensburg'},
 {'review_title': None,
  'rev