# Datasets with Langfuse

In [None]:
! pip install langsmith openai langfuse pandas
! pip install -qU requests bs4 lxml chromadb langchain langchain-text-splitters langchain-openai
! pip install -qU duckduckgo-search langchain-community ddgs
! pip install unstructured jq
! pip install deepeval
! pip install ragas==0.3.7

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
deepeval 3.6.9 requires posthog<7.0.0,>=6.3.0, but you have posthog 5.4.0 which is incompatible.
google-adk 1.17.0 requires opentelemetry-api<=1.37.0,>=1.37.0, but you have opentelemetry-api 1.38.0 which is incompatible.
google-adk 1.17.0 requires opentelemetry-sdk<=1.37.0,>=1.37.0, but you have opentelemetry-sdk 1.38.0 which is incompatible.
opentelemetry-exporter-otlp-proto-http 1.37.0 requires opentelemetry-exporter-otlp-proto-common==1.37.0, but you have opentelemetry-exporter-otlp-proto-common 1.38.0 which is incompatible.
opentelemetry-exporter-otlp-proto-http 1.37.0 requires opentelemetry-proto==1.37.0, but you have opentelemetry-proto 1.38.0 which is incompatible.
opentelemetry-exporter-otlp-proto-http 1.37.0 requires opentelemetry-sdk~=1.37.0, but you have opentelemetry-sdk 1.38.0 which is incompatib

In [None]:
import os

# Non-secret Langfuse settings: always applied.
os.environ["LANGFUSE_TRACING"] = "true"
os.environ["LANGFUSE_HOST"] = "https://cloud.langfuse.com"

# Credentials: keep any non-empty value that is already exported in the
# environment instead of clobbering it with an empty string. To set a key
# from the notebook, replace the trailing "" with the value (do not commit it).
os.environ["LANGFUSE_SECRET_KEY"] = os.environ.get("LANGFUSE_SECRET_KEY") or ""
os.environ["LANGFUSE_PUBLIC_KEY"] = os.environ.get("LANGFUSE_PUBLIC_KEY") or ""
os.environ["OPENAI_API_KEY"] = os.environ.get("OPENAI_API_KEY") or ""

# Surface missing credentials here rather than as an opaque auth error later.
_missing_credentials = [
    key
    for key in ("LANGFUSE_SECRET_KEY", "LANGFUSE_PUBLIC_KEY", "OPENAI_API_KEY")
    if not os.environ[key]
]
if _missing_credentials:
    print("Missing credentials: " + ", ".join(_missing_credentials))

In [None]:
# kb_en_to_chroma.py  — minimal & direct
import os, re, time, requests
from urllib.parse import urljoin, urldefrag
from bs4 import BeautifulSoup

# Crawl configuration.
BASE = "https://www.kapitalbank.az"
START = f"{BASE}/en"  # crawl root; only URLs under this prefix are kept
UA = {"User-Agent": "kb-minicrawl/0.2"}  # headers sent with every request
TIMEOUT = 15  # passed as `timeout=` to requests.get
MAX_PAGES = 50  # upper bound on the number of visited URLs

def clean_url(u, base=BASE):
    """Normalise a link and decide whether it belongs to the crawl.

    Args:
        u: Raw href (absolute or relative), possibly carrying a #fragment.
        base: URL that relative links are resolved against. Defaults to the
            site root; pass the URL of the page the link was found on to
            resolve page-relative links correctly.

    Returns:
        The absolute, fragment-free URL when it lies under START and does not
        point at a binary/document asset; otherwise None.
    """
    u = urldefrag(u)[0]
    if not u:
        return None
    if not u.startswith("http"):
        u = urljoin(base, u)
    # Require a boundary after START so siblings such as /english-... are not
    # mistaken for the /en section.
    if u != START and not u.startswith((START + "/", START + "?")):
        return None
    # Test the extension on the path only, so "file.pdf?v=2" is filtered too.
    path = u.split("?", 1)[0]
    if re.search(r"\.(pdf|jpe?g|png|gif|svg|mp4|zip|docx?|xlsx?)$", path, re.I):
        return None
    return u

def extract_text(html):
    """Return the readable text of an HTML page as one whitespace-normalised string.

    Script, style and page-chrome elements are removed first; the text is then
    taken from <main>, else <article>, else <body>, else the whole document.
    """
    soup = BeautifulSoup(html, "lxml")
    for tag in soup.find_all(["script", "style", "noscript", "svg", "footer", "nav", "header"]):
        tag.decompose()
    root = soup.select_one("main") or soup.select_one("article") or soup.body or soup
    raw_text = root.get_text(" ", strip=True)
    # Collapse every run of whitespace into a single space.
    return " ".join(raw_text.split())

from collections import deque

# Breadth-first crawl starting at START, visiting at most MAX_PAGES URLs.
visited, pages = set(), []
queue = deque([START])  # deque gives O(1) pops from the left
enqueued = {START}  # every URL ever queued, so each link is queued only once
while queue and len(visited) < MAX_PAGES:
    url = queue.popleft()
    if url in visited:
        continue
    visited.add(url)
    try:
        r = requests.get(url, headers=UA, timeout=TIMEOUT)
    except requests.RequestException as exc:
        # Best-effort crawl: report the failure and move on.
        print(f"Skipping {url}: {exc}")
        continue
    if r.ok and "text/html" in r.headers.get("Content-Type", ""):
        txt = extract_text(r.text)
        if len(txt) > 200:  # ignore near-empty pages
            pages.append({"url": url, "text": txt})
        s = BeautifulSoup(r.text, "lxml")
        for a in s.find_all("a", href=True):
            # Resolve against the fetched page so page-relative links work.
            u = clean_url(urljoin(r.url, a["href"]))
            if u and u not in enqueued:
                enqueued.add(u)
                queue.append(u)
    time.sleep(0.15)  # small delay between successful requests

import json

# Write the crawl result to disk so later cells can reuse it.
pages_outfile = "kapitalbank_pages.json"
with open(pages_outfile, "w", encoding="utf-8") as out_fh:
    out_fh.write(json.dumps(pages, indent=2, ensure_ascii=False))
print(f"Saved {len(pages)} pages to {pages_outfile}")

# Read the pages back from the JSON file for the Chroma indexing step.
with open(pages_outfile, "r", encoding="utf-8") as in_fh:
    pages = json.loads(in_fh.read())
print(f"Loaded {len(pages)} pages from {pages_outfile}")

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# ---- LangChain chunking ----
# `docs` holds the chunk texts and `metas` the source URL of each chunk, in the same order.
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=120)
docs, metas = [], []
for page in pages:
    page_chunks = splitter.split_text(page["text"])
    docs.extend(page_chunks)
    metas.extend({"url": page["url"]} for _ in page_chunks)

# ---- OpenAI embeddings -> Chroma ----
persist_dir = "chroma_kapitalbank"
emb = OpenAIEmbeddings(model="text-embedding-3-small")  # cheap & solid
# Stable per-chunk ids: re-running this cell overwrites the stored chunks
# instead of appending a second copy of every chunk to the collection.
# NOTE(review): a collection written by an earlier run with auto-generated ids
# still holds those entries; delete `persist_dir` once before re-indexing.
chunk_ids = [f"kapitalbank_en-{i}" for i in range(len(docs))]
vs = Chroma.from_texts(
    texts=docs,
    embedding=emb,
    metadatas=metas,
    ids=chunk_ids,
    persist_directory=persist_dir,
    collection_name="kapitalbank_en",
)
# No explicit vs.persist(): the call is deprecated (see the warning in this
# cell's output) and current Chroma releases persist to `persist_directory`
# automatically.
print(f"Indexed pages={len(pages)} chunks={len(docs)} into {persist_dir}/ (collection 'kapitalbank_en')")

from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Re-open the persisted Chroma collection with the same embedding model.
persist_dir, collection_name = "chroma_kapitalbank", "kapitalbank_en"
emb = OpenAIEmbeddings(model="text-embedding-3-small")

vs = Chroma(
    collection_name=collection_name,
    embedding_function=emb,
    persist_directory=persist_dir,
)

Saved 39 pages to kapitalbank_pages.json
Loaded 39 pages from kapitalbank_pages.json


ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given
  vs.persist()
  vs = Chroma(
ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


Indexed pages=39 chunks=161 into chroma_kapitalbank/ (collection 'kapitalbank_en')


In [None]:
from openai import OpenAI
import pandas as pd

# Shared OpenAI client used by generate_banking_qa_pairs below.
# NOTE(review): constructed without arguments — presumably picks up the
# OPENAI_API_KEY environment variable set in the configuration cell; confirm.
client = OpenAI()

def generate_banking_qa_pairs(num_pairs=20, model="gpt-4o", openai_client=None):
    """Generate synthetic customer questions and matching answers.

    For each pair, one chat completion produces a customer question and a
    second completion answers that question.

    Args:
        num_pairs: Number of question/answer pairs to generate.
        model: Chat model name used for both completions.
        openai_client: Optional object exposing ``chat.completions.create``;
            defaults to the module-level ``client``.

    Returns:
        A ``(questions, answers)`` tuple of two equally long lists of strings.
    """
    api = client if openai_client is None else openai_client

    def _complete(messages, temperature):
        """Run one chat completion and return its stripped text."""
        completion = api.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
        )
        # A missing (None) message content is treated as an empty string
        # instead of crashing on `.strip()`.
        return (completion.choices[0].message.content or "").strip()

    question_prompt = [
        {
            "role": "system",
            "content": (
                "You are a helpful customer service chatbot for Kapitalbank in Azerbaijan. "
                "Please generate a short, realistic question from a customer."
            ),
        }
    ]
    answer_system_message = {
        "role": "system",
        "content": (
            "You are a knowledgeable Kapitalbank customer support assistant providing clear and concise answers to customer queries."
        ),
    }

    questions = []
    answers = []
    for _ in range(num_pairs):
        # High temperature for varied questions, lower temperature for answers.
        question_text = _complete(question_prompt, temperature=1)
        answer_text = _complete(
            [answer_system_message, {"role": "user", "content": question_text}],
            temperature=0.7,
        )
        questions.append(question_text)
        answers.append(answer_text)
    return questions, answers

# Produce ten synthetic question/answer pairs.
banking_questions, banking_answers = generate_banking_qa_pairs(num_pairs=10)

# Tabulate them: one row per pair, columns "Question" and "Answer".
df = pd.DataFrame(
    list(zip(banking_questions, banking_answers)),
    columns=["Question", "Answer"],
)

In [None]:
# Display the generated question/answer DataFrame.
df

Unnamed: 0,Question,Answer
0,Salam. Mənim Kapitalbank mobil tətbiqetməsində...,Salam! Kapitalbank mobil tətbiqetməsində əsas ...
1,"Salam, Mənim Kapitalbank kartımın limitini art...",Salam! Kapitalbank kartınızın kredit limitini ...
2,"Salam, mənim Kapitalbank mobil tətbiqində edil...",Salam! Kapitalbank mobil tətbiqində köçürmələr...
3,"Hello, can you tell me what documents I need t...",Hello! To open a savings account with Kapitalb...
4,How can I update my contact information for my...,To update your contact information for your Ka...
5,Can you help me understand how to activate my ...,Certainly! Activating your new Kapitalbank deb...
6,Salam! Kapitalbank mobil tətbiqində hesabımı y...,Salam! Kapitalbank mobil tətbiqinə daxil ola b...
7,"Salam, mən Kapitalbank-ın onlayn bank tətbiqin...",Salam! Kapitalbank-ın onlayn bank tətbiqinə qo...
8,"Salam, Kapitalbankın onlayn bankçılıq xidmətin...",Salam! Kapitalbankın onlayn bankçılıq xidmətin...
9,How can I reset my internet banking password i...,If you've forgotten your internet banking pass...


In [None]:
from langfuse import get_client

langfuse = get_client()

# Create a new dataset in Langfuse
dataset_name = "openai_synthetic_dataset"
langfuse.create_dataset(
    name=dataset_name,
    description="Synthetic Q&A dataset generated via OpenAI in a loop",
    metadata={"approach": "openai_loop", "category": "mixed"}
)

# Upload each Q&A pair as a dataset item. The reference answer goes in
# `expected_output`. Reuse `dataset_name` so the items always land in the
# dataset created above.
for _, row in df.iterrows():
    langfuse.create_dataset_item(
        dataset_name=dataset_name,
        input=row["Question"],
        expected_output=row["Answer"]
    )

### Creating a dataset with RAGAS

In [None]:
from langchain_community.document_loaders import JSONLoader

# Load kapitalbank_pages.json using JSONLoader
json_path = "kapitalbank_pages.json"


def _page_metadata(record, metadata):
    """Add the page URL from the JSON record to the loader-provided metadata."""
    metadata["url"] = record.get("url")
    return metadata


# Use the page text itself as the document content. Without `content_key`,
# each document's content is the whole record serialised as JSON (URL key
# included, non-ASCII characters escaped), which pollutes test-set generation.
json_loader = JSONLoader(
    file_path=json_path,
    jq_schema='.[]',
    content_key="text",
    metadata_func=_page_metadata,
)
kapitalbank_docs = json_loader.load()

In [None]:
import ragas
# Show the installed RAGAS version (the install cell pins ragas==0.3.7).
ragas.__version__

'0.3.7'

In [None]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

# Wrap the LangChain chat model and embeddings for use by RAGAS.
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())
generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))

from ragas.testset import TestsetGenerator

# Build the generator and synthesise a 10-sample test set from the loaded pages.
generator = TestsetGenerator(
    embedding_model=generator_embeddings,
    llm=generator_llm,
)
dataset = generator.generate_with_langchain_docs(
    kapitalbank_docs,
    testset_size=10,
)

# Convert the generated `dataset` to a pandas DataFrame for inspection.
df = dataset.to_pandas()

  generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
  generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())


Applying HeadlinesExtractor:   0%|          | 0/26 [00:00<?, ?it/s]

Applying HeadlineSplitter:   0%|          | 0/39 [00:00<?, ?it/s]

Applying SummaryExtractor:   0%|          | 0/34 [00:00<?, ?it/s]



Applying CustomNodeFilter:   0%|          | 0/43 [00:00<?, ?it/s]

Applying EmbeddingExtractor:   0%|          | 0/34 [00:00<?, ?it/s]



Applying ThemesExtractor:   0%|          | 0/39 [00:00<?, ?it/s]

Applying NERExtractor:   0%|          | 0/39 [00:00<?, ?it/s]

Applying CosineSimilarityBuilder:   0%|          | 0/1 [00:00<?, ?it/s]

Applying OverlapScoreBuilder:   0%|          | 0/1 [00:00<?, ?it/s]

Generating personas:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/12 [00:00<?, ?it/s]

In [None]:
from langfuse import get_client

langfuse = get_client()

# Push the RAGAS-generated testset to Langfuse
ragas_dataset_name = "ragas_generated_testset"
langfuse.create_dataset(
    name=ragas_dataset_name,
    description="Synthetic RAG test set (RAGAS)",
    metadata={"source": "RAGAS", "docs_used": len(kapitalbank_docs)}
)

for _, row in df.iterrows():
    langfuse.create_dataset_item(
        dataset_name=ragas_dataset_name,
        input=row["user_input"],
        # Upload the RAGAS reference answer so items can be scored against it.
        # `.get` yields None if the column is absent.
        expected_output=row.get("reference"),
        # Keyed metadata object rather than a bare list.
        metadata={"reference_contexts": row["reference_contexts"]},
    )

### Creating a dataset with DeepEval

In [None]:
from deepeval.synthesizer import Synthesizer

# DeepEval synthesizer created with its default settings; used below to
# generate goldens from the crawled pages.
synthesizer = Synthesizer()



In [None]:
import json

# Convert kapitalbank_pages.json into a single markdown file, one section per page.
with open('kapitalbank_pages.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

page_sections = [
    f'# URL: {page.get("url", "")}\n\n{page.get("text", "")}\n\n---\n\n'
    for page in data
]
with open('kapitalbank_pages.md', 'w', encoding='utf-8') as md_file:
    md_file.writelines(page_sections)

# Generate goldens (with expected outputs) from the markdown file.
synthesizer.generate_goldens_from_docs(
    include_expected_output=True,
    document_paths=['kapitalbank_pages.md'],
)
print(synthesizer.synthetic_goldens)

Output()

ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given
ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event CollectionAddEvent: capture() takes 1 positional argument but 3 were given
ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event CollectionGetEvent: capture() takes 1 positional argument but 3 were given
ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given


[Golden(input='If cyber fraud cases suddenly doubled, how might Kapital Bank’s awareness initiatives help reduce risks?', actual_output=None, expected_output='If cyber fraud cases suddenly doubled, Kapital Bank’s awareness initiatives—such as promoting the “Stop, think, verify!” campaign—would help reduce risks by educating customers to recognize and avoid common fraud tactics. The bank urges individuals to be cautious with suspicious calls, never share sensitive information like card details or OTPs, and use the Birbank app’s verification feature to confirm bank representatives. These measures empower customers to act prudently, making it harder for fraudsters to succeed, even amid a surge in cyber fraud cases.', context=[" prevent others from becoming victims of cyber fraud as well. Kapital Bank, the country's first bank, is part of PASHA Holding. Kapital Bank has the largest service network in Azerbaijan with 119 branches and 52 departments all over the country. For more detailed in

In [None]:
from langfuse import get_client
langfuse = get_client()

# Create a Langfuse dataset for the DeepEval goldens
deepeval_dataset_name = "deepeval_synthetic_data"
langfuse.create_dataset(
    name=deepeval_dataset_name,
    description="Synthetic DeepEval dataset",
    # These goldens are question/answer pairs generated from the crawled
    # pages, not text-to-SQL samples.
    metadata={"approach": "deepeval", "task": "question-answering"}
)

# Upload the goldens, keeping the source context each one was generated from
for golden in synthesizer.synthetic_goldens:
    langfuse.create_dataset_item(
        dataset_name=deepeval_dataset_name,
        input=golden.input,
        expected_output=golden.expected_output,
        metadata={"context": golden.context},
    )