# Prompt Engineering

In [1]:
! pip install langsmith openai
! pip install -qU requests bs4 lxml chromadb langchain langchain-text-splitters langchain-openai
! pip install -qU duckduckgo-search langchain-community ddgs

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m69.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.8/20.8 MB[0m [31m97.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m27.6 MB/s[0m eta [36m0:00

In [2]:
import os

os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGSMITH_API_KEY"] = ""
os.environ["LANGSMITH_PROJECT"] = ""
os.environ["OPENAI_API_KEY"] = ""

## Rebuilding baseline

In [3]:
# kb_en_to_chroma.py  — minimal & direct
import os, re, time, requests
from urllib.parse import urljoin, urldefrag
from bs4 import BeautifulSoup

BASE = "https://www.kapitalbank.az"
START = f"{BASE}/en"
UA = {"User-Agent": "kb-minicrawl/0.2"}
TIMEOUT = 15
MAX_PAGES = 50

def clean_url(u):
    u = urldefrag(u)[0]
    if not u: return None
    if not u.startswith("http"): u = urljoin(BASE, u)
    if not u.startswith(START): return None
    if re.search(r"\.(pdf|jpe?g|png|gif|svg|mp4|zip|docx?|xlsx?)$", u, re.I): return None
    return u

def extract_text(html):
    s = BeautifulSoup(html, "lxml")
    for t in s(["script","style","noscript","svg","footer","nav","header"]): t.decompose()
    n = s.select_one("main") or s.select_one("article") or s.body or s
    return " ".join((n.get_text(" ", strip=True) if n else s.get_text(" ", strip=True)).split())

visited, queue, pages = set(), [START], []
while queue and len(visited) < MAX_PAGES:
    url = queue.pop(0)
    if url in visited: continue
    try:
        r = requests.get(url, headers=UA, timeout=TIMEOUT)
        if r.ok and "text/html" in r.headers.get("Content-Type",""):
            txt = extract_text(r.text)
            if len(txt) > 200:
                pages.append({"url": url, "text": txt})
            s = BeautifulSoup(r.text, "lxml")
            for a in s.find_all("a", href=True):
                u = clean_url(a["href"])
                if u and u not in visited:
                    queue.append(u)
        visited.add(url); time.sleep(0.15)
    except requests.RequestException:
        visited.add(url)

import json

# Save the crawled pages data to a file for later use
pages_outfile = "kapitalbank_pages.json"
with open(pages_outfile, "w", encoding="utf-8") as f:
    json.dump(pages, f, indent=2, ensure_ascii=False)
print(f"Saved {len(pages)} pages to {pages_outfile}")

# Load crawled pages from JSON file to make them available for Chroma processing
with open(pages_outfile, "r", encoding="utf-8") as f:
    pages = json.load(f)
print(f"Loaded {len(pages)} pages from {pages_outfile}")

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# ---- LangChain chunking ----
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=120)
docs, metas = [], []
for p in pages:
    for chunk in splitter.split_text(p["text"]):
        docs.append(chunk)
        metas.append({"url": p["url"]})

# ---- OpenAI embeddings -> Chroma ----
persist_dir = "chroma_kapitalbank"
emb = OpenAIEmbeddings(model="text-embedding-3-small")  # cheap & solid
vs = Chroma.from_texts(
    texts=docs,
    embedding=emb,
    persist_directory=persist_dir,
    collection_name="kapitalbank_en",
    metadatas=metas,
)
vs.persist()
print(f"Indexed pages={len(pages)} chunks={len(docs)} into {persist_dir}/ (collection 'kapitalbank_en')")

from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

persist_dir = "chroma_kapitalbank"
collection_name = "kapitalbank_en"
emb = OpenAIEmbeddings(model="text-embedding-3-small")

# Load the existing/persisted Chroma vector store
vs = Chroma(
    persist_directory=persist_dir,
    embedding_function=emb,
    collection_name=collection_name
)

Saved 39 pages to kapitalbank_pages.json
Loaded 39 pages from kapitalbank_pages.json
Indexed pages=39 chunks=161 into chroma_kapitalbank/ (collection 'kapitalbank_en')


  vs.persist()
  vs = Chroma(


In [4]:
from langsmith import Client
from langchain_core.prompts import ChatPromptTemplate

client = Client()
prompt = ChatPromptTemplate.from_template("Answer the user's question using only the provided information below \n\n {documents}")
url = client.push_prompt("rag-prompt", object=prompt)
# url is a link to the prompt in the UI
print(url)

https://smith.langchain.com/prompts/rag-prompt/f6048092?organizationId=ae0ed3f3-9d18-4222-863a-b38e47905e23


In [5]:
from openai import OpenAI
from langsmith.wrappers import wrap_openai
from langsmith import traceable

openai_client = wrap_openai(OpenAI())

# Assume vs, emb, and persist_dir are already defined and initialized above

def retriever(query: str, k: int = 3):
    """
    Retrieve top-k most relevant chunks from the Chroma vector store for a query.
    """
    results = vs.similarity_search(query, k=k)
    # `results` is usually a list of Document objects with `page_content` and `metadata`
    return [doc.page_content for doc in results]

@traceable
def rag_traceble_load_prompt(question: str) -> str:

    docs = retriever(question)
    prompt = client.pull_prompt("rag-prompt")
    prompt_value = prompt.format(documents=docs)

    # This call is not traced yet
    resp = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": prompt_value},
            {"role": "user", "content": question},
        ],
    )

    return resp.choices[0].message.content

In [6]:
print(rag_traceble_load_prompt("Where Kapitalbank is located?"))

Kapital Bank is located in Azerbaijan and has the largest service network in the country, with 119 branches and 52 departments all over Azerbaijan.


## Versioning and updating prompts

In [7]:
from langsmith import Client
from langchain_core.prompts import ChatPromptTemplate

client = Client()
prompt = ChatPromptTemplate.from_template(
    """
        Review the user's question and the provided information below.\n
        {documents}\n
        Before answering this question, think if the information below is relevant and enough to answer the question.
        Then, think step by step from the first principles and your intuition about this question.
        Finally, answer the question combining the information below and your thoughts.
    """
)
url = client.push_prompt("rag-prompt", object=prompt)
# url is a link to the prompt in the UI
print(url)

https://smith.langchain.com/prompts/rag-prompt/417b14ef?organizationId=ae0ed3f3-9d18-4222-863a-b38e47905e23


In [8]:
@traceable
def rag_traceble_load_prompt(question: str, prompt: str) -> str:

    docs = retriever(question)
    prompt = client.pull_prompt(prompt)
    prompt_value = prompt.format(documents=docs)

    # This call is not traced yet
    resp = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": prompt_value},
            {"role": "user", "content": question},
        ],
    )

    return resp.choices[0].message.content

In [9]:
rag_traceble_load_prompt(
    "Where Kapitalbank is located?",
    "rag-prompt:1cd9f4ee"
)

'Kapital Bank has the largest service network in Azerbaijan, with 119 branches and 52 departments located all over the country.'

In [10]:
rag_traceble_load_prompt(
    "Where Kapitalbank is located?",
    "rag-prompt:f4715ff6"
)

'Kapital Bank is located in Azerbaijan and has the largest service network in the country, with 119 branches and 52 departments spread across various locations. For specific branch locations or addresses, you may refer to their official website or contact their Call Centre.'