# Prerequisites

## (Optional) Self-hosting
Follow [this blog](https://community.aws/content/2klaPSIl7P4aAd2IJRWeS0NyVtn/deploying-langfuse-on-amazon-ecs-with-aws-fargate-a-step-by-step-guide-using-aws-cdk) to deploy Langfuse v2 on Amazon ECS with AWS Fargate.

## Project Setup
1. Go to your Langfuse domain
2. Create a new project
3. Create new API credentials in the project settings and save the API keys.

Define them directly in the code:

In [None]:
import os

# Langfuse credentials and endpoint, set in-process in a single update.
os.environ.update(
    {
        "LANGFUSE_SECRET_KEY": "sk-lf-...",  # Your Langfuse project secret key
        "LANGFUSE_PUBLIC_KEY": "pk-lf-...",  # Your Langfuse project public key
        "LANGFUSE_HOST": "https://xx.cloud.langfuse.com",  # Region-specific Langfuse domain
    }
)

   Or use the `.env` file:

   ```bash
   LANGFUSE_SECRET_KEY=sk-lf-... # Your Langfuse project secret key
   LANGFUSE_PUBLIC_KEY=pk-lf-... # Your Langfuse project public key
   LANGFUSE_HOST=https://xxx.xxx.awsapprunner.com # App Runner domain
   ```

In [None]:
# Load variables from the `.env` file (alternative to setting them in code).
# load_dotenv() is called without arguments, so python-dotenv's default file lookup applies.
from dotenv import load_dotenv
load_dotenv()

See Langfuse documentation for more details: https://langfuse.com/docs

## Python Dependencies

We will use the `langfuse`, `boto3` and `litellm` Python packages. Specifically, we will use:

- The `langfuse` SDK, together with the public cloud or a self-hosted deployment, to debug and improve LLM applications by tracing model invocations, managing prompt and model configurations, and running evaluations.
- The `boto3` SDK to interact with models on Amazon Bedrock or Amazon SageMaker.
- (Optional) The `litellm` SDK to route requests to different LLMs with advanced load balancing and fallback, and to standardize responses for chat, streaming, function calling and more.

Note that you can also use other frameworks like LangChain or implement your own proxy instead of using `litellm`.

Run the following command to install the required Python SDKs:

```bash
pip install langfuse==2.54.1 boto3==1.35.70 litellm==1.52.16
```


# LLM Gateway Options
Choose one of the following options to invoke the LLMs.

## Option 1: Bedrock Converse API
Use this option when you only need Amazon Bedrock models.

In [None]:
import boto3

# used to access Bedrock configuration
bedrock = boto3.client("bedrock", region_name="us-west-2")

# used to invoke the Bedrock Converse API
bedrock_runtime = boto3.client("bedrock-runtime", region_name="us-west-2")

# List the inference profiles returned for your account, one "<name> - <id>" per line
models = bedrock.list_inference_profiles()
for model in models["inferenceProfileSummaries"]:
    print(f'{model["inferenceProfileName"]} - {model["inferenceProfileId"]}')

In [None]:
from typing import List, Dict, Optional, Any

from langfuse import Langfuse
from langfuse.client import PromptClient
from langfuse.decorators import observe, langfuse_context
from botocore.exceptions import ClientError

# Langfuse client, constructed without arguments; presumably configured by the
# LANGFUSE_* environment variables set earlier in this file.
langfuse = Langfuse()

@observe(as_type="generation", name="Bedrock Converse")
def fn(
    messages: List[Dict[str, Any]],
    prompt: Optional[PromptClient] = None,
    **kwargs,
) -> str | None:
    """Invoke a Bedrock model through the Converse API and trace it as a Langfuse generation.

    Args:
        messages: Chat messages as ``{"role": ..., "content": ...}`` dicts.
            Messages with role ``"system"`` are sent as Converse system
            prompts. For all other messages, a list ``content`` is passed
            through unchanged and any other ``content`` is wrapped as
            ``[{"text": content}]``.
        prompt: Optional Langfuse prompt to link to this generation.
        **kwargs: Recorded as observation metadata and forwarded verbatim to
            ``bedrock_runtime.converse``.

    Returns:
        The text of the first text block in the model reply, or ``None`` when
        the call fails or the reply contains no text block.
    """
    # 1. extract model metadata
    model_id = "anthropic.claude-3-haiku-20240307-v1:0"
    inference_config = {"maxTokens": 500, "temperature": 0.1}
    additional_model_request_fields = {"top_k": 250}

    langfuse_context.update_current_observation(
        input=messages,
        model=model_id,
        model_parameters={**inference_config, **additional_model_request_fields},
        prompt=prompt,
        metadata=kwargs,
    )

    # Extract the system prompts from the messages and convert them to the
    # format expected by the Bedrock Converse API
    system_prompts = [
        {"text": message["content"]}
        for message in messages
        if message["role"] == "system"
    ]

    # Convert the remaining (non-system) messages to the Converse format
    converse_messages = [
        {
            "role": message["role"],
            "content": (
                message["content"]
                if isinstance(message["content"], list)
                else [{"text": message["content"]}]
            ),
        }
        for message in messages
        if message["role"] != "system"
    ]

    # 2. model call with error handling
    try:
        response = bedrock_runtime.converse(
            modelId=model_id,
            messages=converse_messages,
            system=system_prompts,
            inferenceConfig=inference_config,
            additionalModelRequestFields=additional_model_request_fields,
            **kwargs,
        )
    except Exception as e:
        # Deliberately broad (botocore's ClientError is an Exception too):
        # the failure is recorded on the observation and reported as None.
        error_message = f"ERROR: Can't invoke '{model_id}'. Reason: {e}"
        langfuse_context.update_current_observation(
            level="ERROR", status_message=error_message
        )
        print(error_message)
        return None

    # 3. extract response metadata
    # Take the first block that carries text rather than assuming block 0 does.
    content_blocks = response["output"]["message"]["content"]
    response_text = next(
        (block["text"] for block in content_blocks if "text" in block),
        None,
    )

    token_usage = response["usage"]
    langfuse_context.update_current_observation(
        output=response_text,
        usage={
            "input": token_usage["inputTokens"],
            "output": token_usage["outputTokens"],
            "total": token_usage["totalTokens"],
        },
        # Re-send kwargs with the response metadata so this update cannot drop
        # the metadata recorded before the call.
        metadata={
            **kwargs,
            "ResponseMetadata": response["ResponseMetadata"],
        },
    )

    return response_text

## Option 2: LiteLLM Proxy
Use this option when using or evaluating multiple model providers.

In [None]:
from typing import List, Dict, Optional

from langfuse import Langfuse
from langfuse.client import PromptClient
from langfuse.decorators import langfuse_context, observe

import litellm
import litellm.types
import litellm.types.utils

# langfuse client
langfuse = Langfuse()

# Register "langfuse" as the LiteLLM callback for both successful and failed calls
litellm.success_callback = ["langfuse"]
litellm.failure_callback = ["langfuse"]


@observe(name="example_function")
def fn(
    messages: List[Dict[str, str]],
    prompt: Optional[PromptClient] = None,
    generation_id: Optional[str] = None,
) -> str | None:
    """Run a non-streaming LiteLLM completion and return the reply text.

    Args:
        messages: Chat messages passed to ``litellm.completion``.
        prompt: Optional Langfuse prompt; added to the call metadata when truthy.
        generation_id: Optional Langfuse generation ID; added to the call
            metadata when truthy.

    Returns:
        ``content`` of the first choice's message.
    """
    call_metadata = {
        "generation_name": "test-generation",  # set langfuse Generation Name
        "existing_trace_id": langfuse_context.get_current_trace_id(),  # link to parent trace
        # GitHub issue for nested traces: https://github.com/langfuse/langfuse/issues/2238
    }

    # Optional entries are only attached when a truthy value was supplied.
    optional_entries = {
        "generation_id": generation_id,  # override langfuse Generation ID
        "prompt": prompt,
    }
    call_metadata.update(
        {key: value for key, value in optional_entries.items() if value}
    )

    completion = litellm.completion(
        model="bedrock/anthropic.claude-3-haiku-20240307-v1:0",
        messages=messages,
        metadata=call_metadata,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content


@observe(name="example_streaming_function")
def streaming_fn(
    messages: List[Dict[str, str]],
    prompt: Optional[PromptClient] = None,
    generation_id: Optional[str] = None,
) -> litellm.utils.CustomStreamWrapper:
    """Start a streaming LiteLLM completion and return the stream object.

    Args:
        messages: Chat messages passed to ``litellm.completion``.
        prompt: Optional Langfuse prompt; added to the call metadata when truthy.
        generation_id: Optional Langfuse generation ID; added to the call
            metadata when truthy.

    Returns:
        Whatever ``litellm.completion(..., stream=True)`` returns, unmodified.
    """
    call_metadata = {
        "generation_name": "test-generation",  # set langfuse Generation name
        "existing_trace_id": langfuse_context.get_current_trace_id(),  # link to parent trace
    }

    # Optional entries are only attached when a truthy value was supplied.
    optional_entries = {
        "generation_id": generation_id,  # override langfuse Generation ID
        "prompt": prompt,
    }
    call_metadata.update(
        {key: value for key, value in optional_entries.items() if value}
    )

    return litellm.completion(
        model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
        messages=messages,
        stream=True,
        metadata=call_metadata,
    )


@observe(name="example_tool_use_function")
def tool_use_fn(
    messages: List[Dict[str, str]],
    tools: List[dict],
    tool_choice: str = "auto",
    prompt: Optional[PromptClient] = None,
    generation_id: Optional[str] = None,
) -> Optional[List[litellm.types.utils.ChatCompletionMessageToolCall]]:
    """Run a LiteLLM completion with tools and return the requested tool calls.

    Args:
        messages: Chat messages passed to ``litellm.completion``.
        tools: Tool specifications passed through to ``litellm.completion``.
            Annotated as plain dicts because tool specs are nested structures,
            not flat string-to-string mappings.
        tool_choice: Passed through to ``litellm.completion``.
        prompt: Optional Langfuse prompt; added to the call metadata when truthy.
        generation_id: Optional Langfuse generation ID; added to the call
            metadata when truthy.

    Returns:
        ``tool_calls`` of the first choice's message; presumably ``None`` when
        the model did not request any tool call.
    """
    metadata = {
        "generation_name": "test-generation",  # set langfuse Generation name
        "existing_trace_id": langfuse_context.get_current_trace_id(),  # link to parent trace
    }

    if generation_id:
        metadata["generation_id"] = generation_id  # override langfuse Generation ID
    if prompt:
        metadata["prompt"] = prompt

    response = litellm.completion(
        model="bedrock/anthropic.claude-3-haiku-20240307-v1:0",
        messages=messages,
        tools=tools,
        tool_choice=tool_choice,
        metadata=metadata,
    )
    return response.choices[0].message.tool_calls

# LLM Application Examples

## RAG Example

In [None]:
@observe(name="retrieve_context")
def retrieve_context(city: str) -> str:
    """Return a fixed weather snippet; dummy retriever that does not use ``city``."""
    readings = (
        "21st November 2024",
        "Sydney: 24 degrees celcius.",
        "New York: 13 degrees celcius.",
        "Tokyo: 11 degrees celcius.",
    )
    return "\n".join(readings)

In [None]:
import uuid
from typing import Tuple

@observe(name="example_rag")
def call_rag_api(
    query: str,
    user_id: Optional[str] = None,
    session_id: Optional[str] = None,
) -> Tuple[Optional[str], Optional[str], str]:
    """Answer ``query`` with retrieved context and return IDs for later scoring.

    Args:
        query: The user question; also passed to ``retrieve_context``.
        user_id: Optional user ID recorded on the current trace.
        session_id: Optional session ID recorded on the current trace.

    Returns:
        A ``(answer, trace_id, generation_id)`` tuple: the value returned by
        ``fn``, the current Langfuse trace ID, and the freshly generated ID
        handed to ``fn`` for its observation.
    """
    langfuse_context.update_current_trace(
        user_id=user_id,
        session_id=session_id,
        tags=["dev"],
    )

    retrieved_context = retrieve_context(query)
    # without langfuse prompt manager
    messages = [
        {
            "content": f"Context: {retrieved_context}\nBased on the context above, answer the following question:",
            "role": "system",
        },
        {"content": query, "role": "user"},
    ]

    # with langfuse prompt manager
    # qa_with_context_prompt = langfuse.get_prompt("qa-with-context", version=1)
    # messages = qa_with_context_prompt.compile(
    #     retrieved_context=retrieved_context,
    #     query=query,
    # )

    trace_id = langfuse_context.get_current_trace_id()
    generation_id = uuid.uuid4().hex

    answer = fn(
        messages,
        # prompt=qa_with_context_prompt, # uncomment to link the prompt
        # if using LiteLLM functions, pass it down to LiteLLM completion
        # generation_id=generation_id,
        # if not using LiteLLM, auto-overrides id for functions wrapped with @observe
        langfuse_observation_id=generation_id,
    )

    # return ids for async scoring
    return answer, trace_id, generation_id

In [None]:
# call_rag_api returns (answer, trace_id, generation_id); print only the answer
print(call_rag_api(query="What is the temperature in Sydney?", user_id="tenant1-user1")[0])

### Prompt Management

In [None]:
# Create the "qa-with-context" chat prompt in Langfuse.
# The message strings must NOT be f-strings: an f-string would collapse
# "{{retrieved_context}}" to "{retrieved_context}", and the stored prompt
# would lose its {{...}} template variable.
langfuse.create_prompt(
    name="qa-with-context",
    type="chat",
    prompt=[
        {
            "role": "system",
            "content": "Context: {{retrieved_context}}\nBased on the context above, answer the following question:",
        },
        {"role": "user", "content": "{{query}}"},
    ],
    config={
        "model": "anthropic.claude-3-haiku-20240307-v1:0",
        "temperature": 0.1,
    },  # optionally, add configs (e.g. model parameters or model tools) or tags
)

In [None]:
# Fetch version 1 of the "qa-with-context" prompt and fill in its two variables
qa_with_context_prompt = langfuse.get_prompt("qa-with-context", version=1)
messages = qa_with_context_prompt.compile(
    retrieved_context="<context>",
    query="<query>",
)
# Bare expression: displayed as the cell output when run in a notebook
messages

### Scoring

#### Scoring from backend

In [None]:
import random

# Run the RAG call once and keep the ids needed to attach scores afterwards
output, trace_id, generation_id = call_rag_api(query="What is the temperature in Sydney?", user_id="tenant1-user1")

# Score the trace from outside the trace context using the low-level SDK
# auto evals, score against both observation and trace
# (the random value is a placeholder for a real evaluation result)
langfuse.score(
    trace_id=trace_id,
    observation_id=generation_id,
    name="accuracy",
    value=random.uniform(0, 1),
)

# user feedback: a boolean score attached to the trace only (no observation_id)
langfuse.score(
    trace_id=trace_id,
    name="like",
    data_type="BOOLEAN",
    value=True,
    comment="I like how detailed the notes are"
)

#### Scoring from frontend

Web SDK example for scoring:
* https://langfuse.com/docs/scores/user-feedback#example-using-langfuseweb
* https://langfuse.com/docs/sdk/typescript/guide-web

```tsx
import { LangfuseWeb } from "langfuse";
 
/**
 * Thumbs up / thumbs down buttons that record a "user_feedback" score
 * (1 or 0) on the trace identified by `props.traceId`.
 */
export function UserFeedbackComponent(props: { traceId: string }) {
  // Only the public key is passed to the web client
  const langfuseWeb = new LangfuseWeb({
    publicKey: env.NEXT_PUBLIC_LANGFUSE_PUBLIC_KEY,
  });
 
  // Sends the given value as the "user_feedback" score for this trace
  const handleUserFeedback = async (value: number) =>
    await langfuseWeb.score({
      traceId: props.traceId,
      name: "user_feedback",
      value,
    });
 
  return (
    <div>
      <button onClick={() => handleUserFeedback(1)}>👍</button>
      <button onClick={() => handleUserFeedback(0)}>👎</button>
    </div>
  );
}
```

### Evaluation

Run the dataset-creation code in the following cell only **ONCE**. It is commented out by default, so uncomment it for the first run.

In [None]:
# Name of the Langfuse dataset; also read by run_experiment
dataset_name = "city_temperature"

# Uncomment the following code to create a dataset and upload items to it
# langfuse.create_dataset(name=dataset_name)

# context = retrieve_context("What's the temperature?")
# # example items, could also be json instead of strings
# local_items = [
#     {"input": {"context": context, "city": "Sydney"}, "expected_output": "24 degrees celcius"},
#     {"input": {"context": context, "city": "New York"}, "expected_output": "13 degrees celcius"},
#     {"input": {"context": context, "city": "Tokyo"}, "expected_output": "11 degrees celcius"},
# ]

# # Upload to Langfuse
# for item in local_items:
#     langfuse.create_dataset_item(
#         dataset_name=dataset_name,
#         # any python object or value
#         input=item["input"],
#         # any python object or value, optional
#         expected_output=item["expected_output"],
#     )

In [None]:
import random
from langfuse.model import DatasetStatus

def custom_evaluate(context, query, expected_output, output) -> Tuple[float, str]:
    """Placeholder evaluator: return a random score in [0, 1] and a fixed comment.

    None of the arguments are inspected yet.
    """
    # TODO: define any custom evaluation logic here
    # For example, rule-based, LLM-as-judge
    placeholder_score = random.uniform(0, 1)
    placeholder_comment = "This is a dummy LLM evaluation"
    return placeholder_score, placeholder_comment

def run_experiment(run_name: str, user_prompt: str) -> None:
    """Run the RAG pipeline over every active dataset item and score each trace.

    Args:
        run_name: Name of the dataset run passed to ``item.observe``.
        user_prompt: Template formatted with ``city=<item city>`` to build the
            query for each item.
    """
    dataset = langfuse.get_dataset(dataset_name)

    for item in dataset.items:
        # Check the status BEFORE entering item.observe, so that skipped items
        # never pass through the observe context (which presumably attaches
        # the item to the run).
        if item.status != DatasetStatus.ACTIVE:
            print(f"Skipping {item.id} of status {item.status}")
            continue

        with item.observe(run_name=run_name) as trace_id:
            print(item.input)
            context = item.input["context"]
            city = item.input["city"]
            query = user_prompt.format(city=city)
            expected_output = item.expected_output

            output, _, _ = call_rag_api(query=query, user_id="evals")

            # evaluation logic
            score, comment = custom_evaluate(context, query, expected_output, output)

            # surface the score and comment at trace level
            langfuse.score(
                trace_id=trace_id,
                name="accuracy",
                data_type="NUMERIC",
                value=score,
                comment=comment,
            )

In [None]:
from datetime import datetime
from langfuse.decorators import langfuse_context
 
# Two runs over the same dataset with different prompt wordings; each run name
# carries a timestamp suffix.
run_experiment(
    run_name=f"generic_ask_{datetime.now().strftime('%Y%m%d%H%M%S')}",
    user_prompt="What is the temperature in {city}?"
)
run_experiment(
    run_name=f"precise_ask_{datetime.now().strftime('%Y%m%d%H%M%S')}",
    user_prompt="What is the temperature in {city}? Respond with the temperature only."
)

# Flush both the decorator context and the client so pending events are sent to the Langfuse API
langfuse_context.flush()
langfuse.flush()