In [None]:
from pathlib import Path
import sys

# Later cells import project modules (pipeline, data_prep, embedding_utils) by
# bare name, so the working directory is treated as the project root.
PROJECT_DIR = Path.cwd()

if (PROJECT_DIR / "pipeline.py").exists():
    # Prepend the project root to sys.path only when it is not already there,
    # so re-running this cell never adds duplicate entries.
    if str(PROJECT_DIR) not in sys.path:
        sys.path.insert(0, str(PROJECT_DIR))
else:
    # Fail fast, before any project import is attempted in later cells.
    raise FileNotFoundError("Run this notebook from the modular folder.")


In [None]:
from pipeline import ensure_models_downloaded

# Run the pipeline's model check against the project root with automatic
# downloading enabled.
# NOTE(review): presumably this fetches any missing model files into
# PROJECT_DIR -- confirm against pipeline.ensure_models_downloaded.
ensure_models_downloaded(
    auto_download=True,
    base_dir=PROJECT_DIR,
)


In [None]:
# Optional: regenerate cleaned_search_history.csv from raw JSON.
#
# This step is controlled by REGENERATE_CLEANED_CSV so that "Restart & Run All"
# can skip it without deleting or commenting out the cell:
#   True  (default) -> call clean_search_history_json on every run, exactly as
#                      this cell did before the switch existed.
#   False           -> leave any existing cleaned_search_history.csv untouched.
# Because the call now sits inside an `if`, its return value is no longer
# echoed as the cell's output.
from data_prep import clean_search_history_json

REGENERATE_CLEANED_CSV = True

if REGENERATE_CLEANED_CSV:
    clean_search_history_json("search_history.json", output_csv="cleaned_search_history.csv")


In [None]:
from embedding_utils import embed_texts
from pipeline import load_search_history

# Load the cleaned search history from the CSV in the working directory.
df = load_search_history(
    path="cleaned_search_history.csv",
)

# Embed the "title" column, passed to embed_texts as a plain Python list.
# NOTE(review): presumably one embedding is returned per title, in row order --
# confirm against embedding_utils.embed_texts.
title_embeddings = embed_texts(
    df["title"].tolist(),
)


In [None]:
from pipeline import (
    GRAPH_MODEL_NAME,
    MAX_THEMES_PER_QUERY,
    QUESTION_GRAPH_CHUNK_OVERLAP,
    QUESTION_GRAPH_CHUNK_SIZE,
    run_iterative_build,
)

# Run the build over the loaded history. The call returns three values; the
# second and third are kept as the final nodes and edges tables.
profiles, final_nodes_df, final_edges_df = run_iterative_build(
    # Data inputs: the history frame and the title embeddings computed from it.
    df=df,
    title_embeddings=title_embeddings,
    # The sample size is set to the full length of df.
    sample_size=len(df),
    window_days=7,
    # Question-graph extraction is switched on and configured from the
    # constants that pipeline itself exports.
    extract_question_graphs=True,
    graph_model_name=GRAPH_MODEL_NAME,
    graph_chunk_size=QUESTION_GRAPH_CHUNK_SIZE,
    graph_chunk_overlap=QUESTION_GRAPH_CHUNK_OVERLAP,
    max_themes=MAX_THEMES_PER_QUERY,
)


In [None]:
# Preview the first five rows of the nodes frame returned by
# run_iterative_build (five is head()'s default, spelled out here).
final_nodes_df.head(n=5)