In [1]:
import wikipedia

print(wikipedia.summary("Python (programming language)", sentences=2))

Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation.


In [2]:
import sys
import torch

print(f"Python: {sys.version}")
print(f"Torch:  {torch.__version__}")
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("CUDA device:", torch.cuda.get_device_name(0))
    print("Compute capability:", torch.cuda.get_device_capability(0))

Python: 3.13.7 (main, Aug 15 2025, 12:34:02) [GCC 15.2.1 20250813]
Torch:  2.8.0+cu129
CUDA available: True
CUDA device: NVIDIA GeForce GTX 1660 Ti
Compute capability: (7, 5)


In [None]:
# -*- coding: utf-8 -*-
"""
City trip diary summarizer using:
- wikipedia Python library (no manual HTTP calls)
- Ollama local LLM server with the 'phi3:mini' model

Quick start (terminal):
  pip install wikipedia ollama

Ollama:
  - Ensure the local server is running (http://localhost:11434)
  - Pull model:  ollama pull phi3:mini

Notebook-friendly:
  - Call run_city_diary(CITY="Opole", NUM_SITES=3) directly in a cell
  - Or pass SPECIFIC_LOCATIONS=["Opole Town Hall", ...] to lock pages

"""

from __future__ import annotations
import os
import random
import textwrap
from datetime import date
from typing import Iterable, List, Optional, Tuple

import wikipedia
import ollama  # pip install ollama

# -------------------- Defaults (easy to tweak) --------------------
MODEL_NAME = "mistral:7b"    # change to any local Ollama model you like
WIKI_LANG = "en"
MAX_ARTICLE_CHARS = 2400    # truncate Wikipedia article text per page
MAX_NEW_TOKENS = 256        # LLM budget for the final diary entry
OLLAMA_HOST = os.environ.get("OLLAMA_HOST", "http://localhost:11434")

# Seed for reproducibility (optional)
random.seed(42)
wikipedia.set_lang(WIKI_LANG)


# -------------------- Wikipedia helpers --------------------
def _is_probably_article(title: str) -> bool:
    """Filter out likely non-article titles (lists, categories, templates)."""
    t = title.strip().lower()
    if t.startswith(("category:", "template:")) or t.startswith("list of"):
        return False
    if "disambiguation" in t:
        return False
    return True


def search_city_candidates(city: str, max_results_per_query: int = 50) -> List[str]:
    """
    Build a diverse candidate pool of likely POIs (articles) for the given city.
    """
    city_lc = city.strip().lower()
    queries = [
        f"Buildings and structures in {city}",
        f"{city} monuments",
        f"{city} landmarks",
        f"{city} architecture",
        f"{city} church",
        f"{city} tower",
        f"{city} museum",
        f"{city} amphitheatre",
        f"{city} historic sites",
    ]

    candidates: List[str] = []
    for q in queries:
        try:
            results = wikipedia.search(q, results=max_results_per_query)
            for r in results:
                if _is_probably_article(r) and city_lc in r.lower():
                    candidates.append(r)
        except Exception:
            # Ignore individual query failures and keep going
            pass

    # De-duplicate while preserving order
    unique = []
    seen = set()
    for c in candidates:
        if c not in seen:
            unique.append(c)
            seen.add(c)
    print(unique)
    return unique


def fetch_wikipedia_extract(title: str, max_chars: int = MAX_ARTICLE_CHARS) -> str:
    """
    Get clean article content for a title, with basic disambiguation handling.
    """
    try:
        page = wikipedia.page(title, auto_suggest=False, preload=False)
        text = page.content or ""
        return text[:max_chars]
    except wikipedia.DisambiguationError as e:
        # Choose a random option and recurse
        options = [opt for opt in e.options if _is_probably_article(opt)]
        if not options:
            options = e.options
        new_title = random.choice(options)
        return fetch_wikipedia_extract(new_title, max_chars=max_chars)
    except wikipedia.PageError:
        # Try a fallback suggestion from search
        try:
            alt = wikipedia.search(title)
            if alt:
                return fetch_wikipedia_extract(alt[0], max_chars=max_chars)
        except Exception:
            pass
        return ""
    except Exception:
        # Any unexpected error -> empty
        return ""


def choose_sites(
    city: str,
    num_sites: int,
    specific_locations: Optional[Iterable[str]] = None
) -> List[str]:
    """
    Decide which site article titles to use.
    - If specific_locations are provided, use up to num_sites of those (in given order).
    - Else pick randomly from search results; if empty, fall back to a curated list per city.
    """
    if specific_locations:
        titles = [t for t in specific_locations if isinstance(t, str) and t.strip()]
        return titles[:num_sites]

    pool = search_city_candidates(city)

    if not pool:
        raise RuntimeError(
            f"No candidate articles found for city '{city}'. "
            "Try providing SPECIFIC_LOCATIONS."
        )

    random.shuffle(pool)
    return pool[:num_sites]


# -------------------- Prompting / LLM --------------------
def build_messages_for_trip(
    city: str,
    sites_and_texts: List[Tuple[str, str]],
    style: str = "diary"
):
    """
    Build a system+user message pair for Ollama chat, injecting current date.
    style: "diary" or "bullets" (feel free to add more styles if you like)
    """
    today = date.today().strftime("%B %d, %Y")

    if style == "diary":
        system = (
            "You are a concise diary-writing assistant for tourists. "
            "The current date is {today}. Write very concisely, but cover all visited sites"
        )
        instruction = (
            "Write a diary-like entry that summarizes a guided city trip in {city}. "
            "The group visited the following sites today. For each site, capture what makes it unique "
            "(key history, architecture, dates, measurements, notable facts). Avoid fluff; keep it tight."
        )
    else:  # "bullets"
        system = (
            "You are a concise, factual assistant. The current date is {today}. "
            "Summarize clearly in bullet points."
        )
        instruction = (
            "Summarize a guided city trip in {city} as ~6-10 concise bullet points. "
            "For each visited site include key dates, names, measurements, or counts where relevant."
        )

    # Join site texts
    parts = []
    for title, content in sites_and_texts:
        parts.append(f"### {title}\n{content}\n")
    sites_block = "\n".join(parts)

    user = (
        instruction.format(city=city) +
        "\n\nVisited sites (Wikipedia excerpts, possibly truncated):\n\n" +
        sites_block
    )

    return [
        {"role": "system", "content": system.format(today=today)},
        {"role": "user", "content": user},
    ]


def generate_with_ollama(
    model: str,
    messages: List[dict],
    max_new_tokens: int = MAX_NEW_TOKENS,
    temperature: float = 0.6,
    top_p: float = 0.9,
    repeat_penalty: float = 1.05,
    stream: bool = False
) -> str:
    """
    Call the local Ollama server using the Python client.
    """
    # Allow overriding OLLAMA_HOST via env; ollama lib picks it up automatically.
    # (If you're pointing to a remote host, set OLLAMA_HOST before importing ollama.)
    _ = ollama.list()  # quick connectivity check; raises if server is down

    resp = ollama.chat(
        model=model,
        messages=messages,
        options={
            "num_predict": max_new_tokens,
            "temperature": temperature,
            "top_p": top_p,
            "repeat_penalty": repeat_penalty,
        },
        stream=stream,
    )

    if stream:
        chunks = []
        for chunk in resp:
            if "message" in chunk and "content" in chunk["message"]:
                chunks.append(chunk["message"]["content"])
        return "".join(chunks).strip()

    return resp["message"]["content"].strip()


# -------------------- Main entry (Notebook-friendly) --------------------
def run_city_diary(
    CITY: str = "Opole",
    NUM_SITES: int = 5,
    SPECIFIC_LOCATIONS: Optional[Iterable[str]] = None,
    MODEL: str = MODEL_NAME,
    STYLE: str = "diary",
    verbose: bool = True,
) -> Tuple[List[str], str]:
    """
    Orchestrates the full flow:
      1) Pick site titles (from SPECIFIC_LOCATIONS or search/fallback)
      2) Fetch Wikipedia excerpts for each site
      3) Ask the model to produce a diary/bullets summary

    Returns: (selected_titles, summary_text)
    """
    # pick sites
    titles = choose_sites(CITY, NUM_SITES, SPECIFIC_LOCATIONS)

    # fetch extracts
    sites_and_texts: List[Tuple[str, str]] = []
    for t in titles:
        txt = fetch_wikipedia_extract(t, max_chars=MAX_ARTICLE_CHARS)
        if not txt.strip():
            # skip empty pages; try to keep count at least >0
            if verbose:
                print(f"[warn] Empty content for '{t}', skipping.")
            continue
        sites_and_texts.append((t, txt))

    if not sites_and_texts:
        raise RuntimeError("No article content retrieved. Try different SPECIFIC_LOCATIONS or city.")

    # build messages & generate
    messages = build_messages_for_trip(CITY, sites_and_texts, style=STYLE)
    summary = generate_with_ollama(MODEL, messages, max_new_tokens=MAX_NEW_TOKENS)

    if verbose:
        print("\n==== CITY ====\n", CITY)
        print("\n==== VISITED SITES ====")
        for t, _ in sites_and_texts:
            print(" -", t)
        print("\n==== SUMMARY ====\n", textwrap.fill(summary, 100))

    # Return only the titles we actually used (with content)
    used_titles = [t for t, _ in sites_and_texts]
    return used_titles, summary


# -------------------- Example usage --------------------

titles, diary = run_city_diary(
     CITY="Opole",
     NUM_SITES=5,
     SPECIFIC_LOCATIONS=None,  # or ["Opole Town Hall", "Piast Tower (Opole)"]
     MODEL="mistral:7b",
     STYLE="diary",            # or "bullets"
     verbose=True,
 )
print(diary)


['Opole Zoo', 'Opole Cathedral', 'Opole Główne railway station', 'Opole-Kamień Śląski Airport', 'Opole', 'New Synagogue (Opole)', 'Opole Voivodeship', 'Opole Town Hall', 'Moszna, Opole Voivodeship', 'Dębowiec, Opole Voivodeship', 'Czyżowice, Opole Voivodeship', 'Tułowice, Opole Voivodeship', 'Biała, Opole Voivodeship', 'Jaryszów, Opole Voivodeship', 'Opole University of Technology', 'Lubomirski Palace (Opole Lubelskie)', 'Opole Lubelskie', 'Vladislaus II of Opole', 'Diocese of Opole', 'Casimir I of Opole', 'Kępa, Opole Voivodeship', 'Przechód, Opole Voivodeship', 'Vladislaus I of Opole', 'Bolko I of Opole', 'Bolko IV of Opole', 'Prędocin, Opole Voivodeship', 'Duchy of Opole and Racibórz', 'Duchy of Opole', 'Smolarnia, Opole Voivodeship', 'Markowice, Opole Voivodeship', 'National Festival of Polish Song in Opole']

==== CITY ====
 Opole

==== VISITED SITES ====
 - Casimir I of Opole
 - Opole University of Technology
 - Czyżowice, Opole Voivodeship
 - Bolko IV of Opole
 - Duchy of Opole


In [4]:
# -*- coding: utf-8 -*-
"""
Random Opole monument summarizer using:
- wikipedia Python library (no manual HTTP calls)
- Ollama local LLM server with the 'phi3:mini' model

Requires:
  - Ollama running locally (http://localhost:11434)
  - Model pulled: `ollama pull phi3:mini`
  - Python deps: `pip install wikipedia ollama`
"""

import os
import random
import textwrap
import wikipedia
from datetime import date

# If you run Ollama on a different host/port, set OLLAMA_HOST, e.g.:
# os.environ["OLLAMA_HOST"] = "http://127.0.0.1:11434"

import ollama  # Python client for Ollama

# -------------------- Config --------------------
MODEL_NAME = "phi3:mini"
WIKI_LANG = "en"
MAX_ARTICLE_CHARS = 2400
MAX_NEW_TOKENS = 256

# Seed for reproducibility across runs (optional)
random.seed(42)

# -------------------- Wikipedia helpers --------------------
wikipedia.set_lang(WIKI_LANG)

_FALLBACK_OPOLE_TITLES = [
    "Opole Town Hall",
    "Cathedral of the Holy Cross, Opole",
    "Piast Tower (Opole)",
    "Amphitheatre Tysiąclecia in Opole",
    "Church of the Exaltation of the Holy Cross, Opole",
    "Museum of Opole Silesia",
]

def is_probably_article(title: str) -> bool:
    """Filter out likely non-article titles (lists, categories, templates)."""
    t = title.strip().lower()
    if t.startswith("category:") or t.startswith("list of") or t.startswith("template:"):
        return False
    if "disambiguation" in t:
        return False
    return True

def get_random_opole_monument() -> str:
    """
    Use wikipedia.search with several queries likely to surface actual buildings/structures in Opole.
    If that fails, fall back to a small curated list.
    """
    queries = [
        "Buildings and structures in Opole",
        "Opole landmarks",
        "Opole architecture",
        "Opole church",
        "Opole tower",
        "Opole museum",
        "Opole amphitheatre",
    ]
    candidates = []
    for q in queries:
        try:
            results = wikipedia.search(q)
            for r in results:
                if is_probably_article(r) and "opole" in r.lower():
                    candidates.append(r)
        except Exception:
            # Ignore and try next query
            pass

    # Unique & filtered
    candidates = [c for i, c in enumerate(candidates) if c not in candidates[:i]]

    if not candidates:
        candidates = _FALLBACK_OPOLE_TITLES[:]

    return random.choice(candidates)

def fetch_wikipedia_extract(title: str, max_chars: int = MAX_ARTICLE_CHARS) -> str:
    """
    Get clean article content for a title, with basic disambiguation handling.
    """
    try:
        page = wikipedia.page(title, auto_suggest=False, preload=False)
        text = page.content or ""
        return text[:max_chars]
    except wikipedia.DisambiguationError as e:
        # Choose a random option and recurse
        options = [opt for opt in e.options if is_probably_article(opt)]
        if not options:
            options = e.options
        new_title = random.choice(options)
        return fetch_wikipedia_extract(new_title, max_chars=max_chars)
    except wikipedia.PageError:
        # Try a fallback suggestion from search
        try:
            alt = wikipedia.search(title)
            if alt:
                return fetch_wikipedia_extract(alt[0], max_chars=max_chars)
        except Exception:
            pass
        return ""
    except Exception:
        # Any unexpected error -> empty
        return ""

# -------------------- Prompting --------------------
def build_phi_style_messages(page_title: str, page_text: str):
    """
    Use Ollama's chat format (system + user messages).
    Phi-style models work well with a simple, explicit instruction.
    """
    today = date.today().strftime("%B %d, %Y")
    system = f"You are a concise diary writing assistant, responsible for a memorable note for tourists about their city trip today {today}."
    user = (
        "Write a dairy-like entry that summarizes a city trip that tourists made with a tourguide"
        "past the following sites mentioned in the Wikipedia article below. "
        "Icnclude key details why it is a unique site. "
        "Avoid fluff.\n\n"
        f"Article title: {page_title}\n\n"
        f"Article text:\n{page_text}"
    )
    return [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]

def summarize_with_ollama(messages, max_new_tokens=MAX_NEW_TOKENS):
    """
    Call the local Ollama server using the Python client.
    """
    resp = ollama.chat(
        model=MODEL_NAME,
        messages=messages,
        options={
            "num_predict": max_new_tokens,
            "temperature": 0.6,
            "top_p": 0.9,
            "repeat_penalty": 1.05,
        },
        stream=False,  # set True to stream tokens; then iterate resp as a generator
    )
    # resp looks like {"model": ..., "message": {"role": "assistant", "content": "..."} , ...}
    return resp["message"]["content"].strip()

# -------------------- Main --------------------
def main():
    # Quick connectivity check (optional): list models; will raise if server is down
    try:
        _ = ollama.list()
    except Exception as e:
        raise SystemExit(
            "Could not reach the local Ollama server at http://localhost:11434.\n"
            "Start it and ensure the model is pulled:\n"
            "  1) ollama serve  (if not already running)\n"
            "  2) ollama pull phi3:mini\n"
            f"Error: {e}"
        )

    title = get_random_opole_monument()
    raw = fetch_wikipedia_extract(title, max_chars=MAX_ARTICLE_CHARS)

    if not raw.strip():
        raise RuntimeError(f"Could not fetch article content for '{title}'.")

    messages = build_phi_style_messages(title, raw)
    summary = summarize_with_ollama(messages, max_new_tokens=MAX_NEW_TOKENS)

    print("\n==== RANDOM OPOLE MONUMENT ====\n", title)
    print("\n==== SUMMARY ====\n", textwrap.fill(summary, 100))

if __name__ == "__main__":
    main()



==== RANDOM OPOLE MONUMENT ====
 New Synagogue (Opole)

==== SUMMARY ====
 September 18, 2025: Visited the hauntingly beautiful ruins of the New Synagogue in Opole, now
Poland. Founded as a Reform Jewish congregation and built between 1893-1897 by Felix Henry, it was a
striking Moorish Revival style structure that replaced the Old Synagogue dating back to 1841.
However, on November 9, 1938, during Kristallnacht, this symbol of Jewish faith and community was
mercilessly destroyed by Nazis under the orders of rabbi Hans Hirschberg. Standing amidst its ashes
today is a stark reminder of the atrocities of the Holoca0023-165473-209807-5  The New Synagogue was
not only a place of worship but also an integral part of Jewish life and culture in Opole. It's
heartbreaking to see such a significant site reduced to rubble, yet the resilience of the spirit is
evident as the Old Synagogue still stands nearby, repurposed for commercial use since 1897.


In [8]:
# --- Setup: pip install wikipedia if needed ---
# %pip install wikipedia pandas tqdm

import time
from typing import Dict, List, Optional, Tuple
import pandas as pd
from tqdm.auto import tqdm
import wikipedia

def _try_get_page(title: str) -> Tuple[Optional[wikipedia.WikipediaPage], str]:
    """Attempt an exact page fetch without autosuggest; return (page_or_None, note)."""
    try:
        page = wikipedia.page(title, auto_suggest=False, preload=False)
        return page, "exact"
    except wikipedia.DisambiguationError as e:
        # Heuristic: prefer options that mention the city in parentheses or after a comma
        return None, f"disambiguation:{len(e.options)}"
    except wikipedia.PageError:
        return None, "not_found"
    except Exception as e:
        return None, f"error:{type(e).__name__}"

def _resolve_disambiguation(options: List[str], city: str, name: str) -> List[str]:
    """Order disambiguation options by how well they match the intended page."""
    city_l = city.lower()
    name_l = name.lower()
    scored = []
    for opt in options:
        o_l = opt.lower()
        score = 0
        if city_l in o_l: score += 3
        if name_l in o_l: score += 2
        # parentheses with city often indicate the right one
        if f"({city_l})" in o_l: score += 1
        scored.append((score, opt))
    scored.sort(reverse=True)
    return [opt for _, opt in scored]

def _search_titles(query: str, max_results: int = 20) -> List[str]:
    try:
        return wikipedia.search(query, results=max_results) or []
    except Exception:
        return []

def find_wikipedia_for_monument(
    name_monument: str,
    city: str,
    languages: List[str] = ("en",),
    polite_delay: float = 0.2,
    max_search_results: int = 20,
) -> Dict[str, Optional[str]]:
    """
    Try to find a Wikipedia article for a given (monument, city).
    Tries multiple languages in order; returns metadata about the best match found.
    """
    name = (name_monument or "").strip()
    cty  = (city or "").strip()
    if not name or not cty:
        return {
            "wiki_lang": None, "found": False, "match_title": None, "url": None,
            "method": None, "note": "missing_name_or_city", "pageid": None
        }

    # Title candidates (common patterns)
    title_candidates = [
        name,                                     # "Piast Tower"
        f"{name} ({cty})",                        # "Cathedral ... (Opole)"
        f"{name}, {cty}",                         # "Town Hall, Opole"
        f"{name} in {cty}",                       # "Amphitheatre ... in Opole"
    ]

    # Search queries
    search_queries = [
        f"{name} {cty}",
        f"{name} {cty} site",
        f"{name} {cty} landmark",
        f"{name} {cty} building",
    ]

    for lang in languages:
        wikipedia.set_lang(lang)
        # 1) Try exact-ish candidates first
        for title in title_candidates:
            page, note = _try_get_page(title)
            if page:
                return {
                    "wiki_lang": lang, "found": True, "match_title": page.title,
                    "url": page.url, "method": "title", "note": note, "pageid": page.pageid
                }
            if note.startswith("error"):
                # transient issues: brief delay and continue
                time.sleep(polite_delay)

        # 2) Search and then try best-looking results
        for q in search_queries:
            results = _search_titles(q, max_results=max_search_results)
            # Prefer titles containing the city or the full name
            ranked = []
            c_l = cty.lower(); n_l = name.lower()
            for r in results:
                r_l = r.lower()
                score = 0
                if c_l in r_l: score += 2
                if n_l in r_l: score += 1
                ranked.append((score, r))
            ranked.sort(reverse=True)
            for _, rtitle in ranked:
                try:
                    page = wikipedia.page(rtitle, auto_suggest=False, preload=False)
                    return {
                        "wiki_lang": lang, "found": True, "match_title": page.title,
                        "url": page.url, "method": "search", "note": q, "pageid": page.pageid
                    }
                except wikipedia.DisambiguationError as e:
                    for opt in _resolve_disambiguation(e.options, cty, name):
                        try:
                            page = wikipedia.page(opt, auto_suggest=False, preload=False)
                            return {
                                "wiki_lang": lang, "found": True, "match_title": page.title,
                                "url": page.url, "method": "search_disamb", "note": q, "pageid": page.pageid
                            }
                        except Exception:
                            continue
                except Exception:
                    continue
            time.sleep(polite_delay)

    # Nothing found in any language
    return {
        "wiki_lang": None, "found": False, "match_title": None, "url": None,
        "method": None, "note": "not_found_any_lang", "pageid": None
    }

def annotate_csv_with_wikipedia(
    csv_path: str,
    out_path: Optional[str] = None,
    languages: List[str] = ("en", "pl"),
    polite_delay: float = 0.2,
) -> pd.DataFrame:
    """
    Reads a CSV with columns: name_monument, city.
    Adds columns describing the match found on Wikipedia.
    Saves to out_path if provided; returns the annotated DataFrame.
    """
    df = pd.read_csv(csv_path)
    required = {"name_monument", "city", "number_reviews", "wheelchair_accesability"}
    missing = required - set(df.columns.str.lower())
    # Try case-insensitive mapping
    col_map = {c.lower(): c for c in df.columns}
    if missing:
        raise ValueError(f"CSV must contain columns {required}; got {list(df.columns)}")

    results = []
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Checking Wikipedia"):
        name = row[col_map.get("name_monument", "name_monument")]
        city = row[col_map.get("city", "city")]
        meta = find_wikipedia_for_monument(
            name_monument=name,
            city=city,
            languages=languages,
            polite_delay=polite_delay
        )
        results.append(meta)

    meta_df = pd.DataFrame(results)
    out_df = pd.concat([df.reset_index(drop=True), meta_df], axis=1)

    if out_path:
        out_df.to_csv(out_path, index=False)

    return out_df

csv_in  = "cultural_demoset.csv"
csv_out = "wiki_cultural_demoset.csv"

annotated = annotate_csv_with_wikipedia(
    csv_path=csv_in,
    out_path=csv_out,         # or None to skip saving
    languages=["en"],   # try English then Polish
    polite_delay=0.2
)

annotated.head()




  lis = BeautifulSoup(html).find_all('li')
Checking Wikipedia: 100%|██████████| 303/303 [06:39<00:00,  1.32s/it]


Unnamed: 0,name_monument,city,number_reviews,wheelchair_accesability,wiki_lang,found,match_title,url,method,note,pageid
0,Rijksmuseum,Amsterdam,106747,True,en,True,Rijksmuseum,https://en.wikipedia.org/wiki/Rijksmuseum,title,exact,26230
1,Vondelpark,Amsterdam,57536,True,en,True,Vondelpark,https://en.wikipedia.org/wiki/Vondelpark,title,exact,3231819
2,Dam Square,Amsterdam,44864,True,en,True,Dam Square,https://en.wikipedia.org/wiki/Dam_Square,title,exact,638647
3,NEMO Science Museum,Amsterdam,35107,True,en,True,List of tourist attractions in Amsterdam,https://en.wikipedia.org/wiki/List_of_tourist_...,search,NEMO Science Museum Amsterdam,5024484
4,ARTIS,Amsterdam,34963,True,en,True,Wereldmuseum Amsterdam,https://en.wikipedia.org/wiki/Wereldmuseum_Ams...,search,ARTIS Amsterdam,7148085


In [3]:
print(len(annotated[annotated["method"] == "title"]))

83


In [6]:
from typing import Optional, Iterable
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import wikipedia
from sentence_transformers import SentenceTransformer

# ---------------------------------------------
# Config (easy to tweak)
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # swap as desired
MAX_ARTICLE_CHARS = 4000                                 # more text = slower, but often better
BATCH_SIZE = 32                                          # embedding batch size
# ---------------------------------------------

def _fetch_article_text(title: str, lang: Optional[str], max_chars: int) -> str:
    """Fetch plain text for a page title in a given language (safe truncation)."""
    if lang:
        try:
            wikipedia.set_lang(lang)
        except Exception:
            pass  # keep whatever default if lang invalid
    try:
        page = wikipedia.page(title, auto_suggest=False, preload=False)
        return (page.content or "")[:max_chars]
    except Exception:
        return ""

def embed_texts(texts: Iterable[str], model: SentenceTransformer, batch_size: int = BATCH_SIZE) -> np.ndarray:
    """Embed a list of texts using Sentence-Transformers (L2-normalized)."""
    # model.encode returns numpy array; normalize for cosine via dot
    vecs = model.encode(list(texts), batch_size=batch_size, convert_to_numpy=True, show_progress_bar=True, normalize_embeddings=True)
    return vecs

def rank_wikipedia_by_query(
    df: pd.DataFrame,
    query: str = "world war 2",
    text_col_limit: int = MAX_ARTICLE_CHARS,
    model_name: str = EMBED_MODEL,
    top_k: Optional[int] = None,
    verbose: bool = True,
) -> pd.DataFrame:
    """
    From an annotated DataFrame (with columns like: name_monument, city, method, match_title, wiki_lang, url),
    compute embeddings for entries with method=='title' and rank by cosine similarity to the query phrase.

    Returns a new DataFrame with an added 'similarity' column, sorted desc by similarity.
    """
    # Basic checks
    needed_cols = {"method", "match_title", "wiki_lang", "url"}
    missing = needed_cols - set(df.columns)
    if missing:
        raise ValueError(f"DataFrame missing required columns: {missing}")

    # Filter to "title" hits only
    sub = df[df["method"].astype(str).str.lower() == "title"].copy()
    if sub.empty:
        raise ValueError("No rows with method == 'title' to score.")

    # Fetch article texts (with simple in-memory cache)
    cache = {}
    texts = []
    titles = []
    langs = []
    urls = []

    if verbose:
        print(f"Fetching article text for {len(sub)} rows...")

    for _, row in tqdm(sub.iterrows(), total=len(sub)):
        title = str(row["match_title"])
        lang = None if pd.isna(row["wiki_lang"]) else str(row["wiki_lang"])
        url  = None if pd.isna(row["url"]) else str(row["url"])

        key = (title, lang)
        if key not in cache:
            cache[key] = _fetch_article_text(title, lang, text_col_limit)
        text = cache[key]

        titles.append(title)
        langs.append(lang)
        urls.append(url)
        texts.append(text)

    sub = sub.assign(_article_text=texts, _title=titles, _lang=langs, _url=urls)

    # Drop any truly empty texts (no content could be fetched)
    sub = sub[sub["_article_text"].str.len() > 0]
    if sub.empty:
        raise ValueError("All 'title' rows had empty fetched content; cannot rank.")

    # Load embedding model
    if verbose:
        print(f"Loading embedding model: {model_name}")
    st_model = SentenceTransformer(model_name)

    # Embed documents and the query (normalized)
    if verbose:
        print("Embedding documents...")
    doc_vecs = embed_texts(sub["_article_text"].tolist(), st_model)

    if verbose:
        print(f"Embedding query: '{query}'")
    query_vec = embed_texts([query], st_model)[0]  # shape (dim,)

    # Cosine similarity (dot product since vectors are L2-normalized)
    sims = doc_vecs @ query_vec

    # Attach and sort
    ranked = sub.copy()
    ranked["similarity"] = sims
    ranked = ranked.sort_values("similarity", ascending=False).reset_index(drop=True)

    # Select useful columns for display
    display_cols = [
        "similarity",
        "match_title",
        "wiki_lang",
        "url",
    ]
    # Include original monument & city if present
    for c in ("name_monument", "city", "method", "note", "pageid"):
        if c in ranked.columns:
            display_cols.append(c)

    return ranked[display_cols]

# -------------------------
# Example usage in notebook
# -------------------------
# Suppose you have `annotated` from the prior step:
ranked = rank_wikipedia_by_query(
    annotated,
    query="Art",                  # <-- flexible: change to any theme, e.g. "gothic architecture"
    text_col_limit=4000,            # how much article text to embed
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    top_k=None,                     # set an int to slice head()
)
ranked.head(10)


Fetching article text for 124 rows...


100%|██████████| 124/124 [01:24<00:00,  1.47it/s]


Loading embedding model: sentence-transformers/all-MiniLM-L6-v2
Embedding documents...


Batches: 100%|██████████| 4/4 [00:02<00:00,  1.41it/s]


Embedding query: 'Art'


Batches: 100%|██████████| 1/1 [00:00<00:00, 63.83it/s]


Unnamed: 0,similarity,match_title,wiki_lang,url,name_monument,city,method,note,pageid
0,0.316342,Museum of Art Collections,en,https://en.wikipedia.org/wiki/Museum_of_Art_Co...,Museum of Art Collections,Bucharest,title,exact,3201895
1,0.301075,Museum of Decorative Arts in Prague,en,https://en.wikipedia.org/wiki/Museum_of_Decora...,Museum of Decorative Arts in Prague,Prague,title,exact,23548614
2,0.292459,Hungarian National Gallery,en,https://en.wikipedia.org/wiki/Hungarian_Nation...,Hungarian National Gallery,Budapest,title,exact,7645878
3,0.270278,Musée d'Art Moderne de Paris,en,https://en.wikipedia.org/wiki/Mus%C3%A9e_d%27A...,Musée d'Art Moderne de Paris,Paris,title,exact,15614518
4,0.262694,Stedelijk Museum Amsterdam,en,https://en.wikipedia.org/wiki/Stedelijk_Museum...,Stedelijk Museum Amsterdam,Amsterdam,title,exact,931845
5,0.260603,Little Princess statue,en,https://en.wikipedia.org/wiki/Little_Princess_...,Little Princess Statue,Budapest,title,exact,38595315
6,0.254313,Palais de la Porte Dorée,en,https://en.wikipedia.org/wiki/Palais_de_la_Por...,Palais de la Porte Dorée,Paris,title,exact,17470851
7,0.252783,Vatican Museums,en,https://en.wikipedia.org/wiki/Vatican_Museums,Vatican Museums,Rome,title,exact,229765
8,0.252783,Vatican Museums,en,https://en.wikipedia.org/wiki/Vatican_Museums,Museo Gregoriano Profano,Rome,title,exact,229765
9,0.252783,Vatican Museums,en,https://en.wikipedia.org/wiki/Vatican_Museums,Braccio Nuovo,Rome,title,exact,229765


In [7]:
ranked.to_csv("art_culture_ranked.csv")