# Rebuilding the agent in Langchain

In [1]:
! pip install langsmith openai
! pip install -qU requests bs4 lxml chromadb langchain langchain-text-splitters langchain-openai
! pip install -qU duckduckgo-search langchain-community ddgs

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m59.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.8/20.8 MB[0m [31m55.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m19.1 MB/s[0m eta [36m0:00

In [2]:
import os

os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGSMITH_API_KEY"] = ""
os.environ["LANGSMITH_PROJECT"] = ""
os.environ["OPENAI_API_KEY"] = ""

In [3]:
# kb_en_to_chroma.py  — minimal & direct
import os, re, time, requests
from urllib.parse import urljoin, urldefrag
from bs4 import BeautifulSoup

BASE = "https://www.kapitalbank.az"
START = f"{BASE}/en"
UA = {"User-Agent": "kb-minicrawl/0.2"}
TIMEOUT = 15
MAX_PAGES = 50

def clean_url(u):
    u = urldefrag(u)[0]
    if not u: return None
    if not u.startswith("http"): u = urljoin(BASE, u)
    if not u.startswith(START): return None
    if re.search(r"\.(pdf|jpe?g|png|gif|svg|mp4|zip|docx?|xlsx?)$", u, re.I): return None
    return u

def extract_text(html):
    s = BeautifulSoup(html, "lxml")
    for t in s(["script","style","noscript","svg","footer","nav","header"]): t.decompose()
    n = s.select_one("main") or s.select_one("article") or s.body or s
    return " ".join((n.get_text(" ", strip=True) if n else s.get_text(" ", strip=True)).split())

visited, queue, pages = set(), [START], []
while queue and len(visited) < MAX_PAGES:
    url = queue.pop(0)
    if url in visited: continue
    try:
        r = requests.get(url, headers=UA, timeout=TIMEOUT)
        if r.ok and "text/html" in r.headers.get("Content-Type",""):
            txt = extract_text(r.text)
            if len(txt) > 200:
                pages.append({"url": url, "text": txt})
            s = BeautifulSoup(r.text, "lxml")
            for a in s.find_all("a", href=True):
                u = clean_url(a["href"])
                if u and u not in visited:
                    queue.append(u)
        visited.add(url); time.sleep(0.15)
    except requests.RequestException:
        visited.add(url)

import json

# Save the crawled pages data to a file for later use
pages_outfile = "kapitalbank_pages.json"
with open(pages_outfile, "w", encoding="utf-8") as f:
    json.dump(pages, f, indent=2, ensure_ascii=False)
print(f"Saved {len(pages)} pages to {pages_outfile}")

# Load crawled pages from JSON file to make them available for Chroma processing
with open(pages_outfile, "r", encoding="utf-8") as f:
    pages = json.load(f)
print(f"Loaded {len(pages)} pages from {pages_outfile}")

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# ---- LangChain chunking ----
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=120)
docs, metas = [], []
for p in pages:
    for chunk in splitter.split_text(p["text"]):
        docs.append(chunk)
        metas.append({"url": p["url"]})

# ---- OpenAI embeddings -> Chroma ----
persist_dir = "chroma_kapitalbank"
emb = OpenAIEmbeddings(model="text-embedding-3-small")  # cheap & solid
vs = Chroma.from_texts(
    texts=docs,
    embedding=emb,
    persist_directory=persist_dir,
    collection_name="kapitalbank_en",
    metadatas=metas,
)
vs.persist()
print(f"Indexed pages={len(pages)} chunks={len(docs)} into {persist_dir}/ (collection 'kapitalbank_en')")

from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

persist_dir = "chroma_kapitalbank"
collection_name = "kapitalbank_en"
emb = OpenAIEmbeddings(model="text-embedding-3-small")

# Load the existing/persisted Chroma vector store
vs = Chroma(
    persist_directory=persist_dir,
    embedding_function=emb,
    collection_name=collection_name
)

Saved 39 pages to kapitalbank_pages.json
Loaded 39 pages from kapitalbank_pages.json
Indexed pages=39 chunks=161 into chroma_kapitalbank/ (collection 'kapitalbank_en')


  vs.persist()
  vs = Chroma(


In [4]:
# from langsmith import Client
# from langchain_core.prompts import ChatPromptTemplate

# client = Client()
# prompt = ChatPromptTemplate.from_template("Answer the user's question using the tools you have. Always check internal knowledge first, then use the web.")
# url = client.push_prompt("general-agent-prompt", object=prompt)
# # url is a link to the prompt in the UI
# print(url)

In [5]:
from typing import Any, Dict
from langsmith import Client, traceable
from langchain.chat_models import init_chat_model
from langchain_core.tools import create_retriever_tool, tool, render_text_description
from langchain_community.tools import DuckDuckGoSearchRun
from langchain.agents import create_agent

# --- assume your vector store exists as `vs` ---
retriever = vs.as_retriever(search_kwargs={"k": 3})
retrieve_tool = create_retriever_tool(
    retriever=retriever,
    name="retrieve",
    description="Search the internal vector store for passages relevant to the user's question."
)

_ddg = DuckDuckGoSearchRun()
@tool("duckduckgo_search")
def duckduckgo_search(query: str) -> str:
    """Search the web with DuckDuckGo and return a brief summary of top results."""
    return _ddg.run(query)

model  = init_chat_model("openai:gpt-4o-mini", temperature=0)
client = Client()
TOOLS  = [retrieve_tool, duckduckgo_search]

@traceable(name="agentic_solution", run_type="chain", tags=["rag","tools"])
def agentic_solution(inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]:
    q: str = inputs["question"]

    # 1) Pull + render your LangSmith prompt to a STRING
    prompt_tpl = client.pull_prompt("general-agent-prompt")
    system_prompt = prompt_tpl.format(tools=render_text_description(TOOLS))

    # 2) One-liner agent creation (v1)
    agent = create_agent(
        model,                    # or "openai:gpt-4o-mini"
        tools=TOOLS,
        system_prompt=system_prompt,  # system prompt accepted directly
    )

    # 3) Invoke
    result = agent.invoke({"messages": [{"role": "user", "content": q}]})
    return {"answer": result["messages"][-1].content.strip()}


In [6]:
out = agentic_solution({"question": 'Where Kapitalbank is located?'})
print(out)

{'answer': 'Kapitalbank is primarily located in Azerbaijan, where it serves as a significant financial institution. It has multiple branches across the country. Additionally, there are references to Kapital Bank branches in Washington, DC, USA, indicating that there may be a connection or similar naming convention. \n\nIf you are looking for a specific branch or more detailed information, please specify the location or context you are interested in.'}


In [7]:
print(out['answer'])

Kapitalbank is primarily located in Azerbaijan, where it serves as a significant financial institution. It has multiple branches across the country. Additionally, there are references to Kapital Bank branches in Washington, DC, USA, indicating that there may be a connection or similar naming convention. 

If you are looking for a specific branch or more detailed information, please specify the location or context you are interested in.
