# API-Usage-LLM-Interaction

Let's use a `.env` file and the `dotenv` library. This allows you to keep your API key secure and separated from your code.

Here’s how to set up your project so the OpenAI API key is loaded from a `.env` file:

### Step 1: Install Dependencies
Install `python-dotenv` (used to read `.env` files) together with the LangSmith, OpenAI, and LangChain OpenAI packages used in the cells below:
```bash
pip install python-dotenv
pip install -U langsmith
pip install openai
pip install langchain_openai
```

### Step 2: Create a `.env` File
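In your project directory, create a `.env` file with the following content. Note that the test cells in Step 3 also read `LANGCHAIN_API_KEY`, `LANGCHAIN_TRACING_V2`, and `TAVILY_API_KEY` through `get_env_var`, which raises a `ValueError` for any variable that is missing, so add those entries as well if you run the cells unchanged: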
```
OPENAI_API_KEY=your-api-key-here
```

### Step 3: Test

In [1]:
import os
import openai
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Function to load environment variables or raise an error if not found
def get_env_var(var: str) -> str:
    """Return the value of environment variable ``var``, failing loudly if unusable.

    Args:
        var: Name of the environment variable to read via ``os.getenv``.

    Returns:
        The variable's value exactly as stored (no stripping is applied).

    Raises:
        ValueError: If the variable is not set, or is set to an empty or
            whitespace-only string (for example a blank value in ``.env``).
    """
    value = os.getenv(var)
    if value is None:
        raise ValueError(f"{var} not found in environment variables. Make sure it is set in your .env file.")
    # A blank entry would otherwise only surface later, when the key is first used.
    if not value.strip():
        raise ValueError(f"{var} is set but empty. Make sure it has a value in your .env file.")
    return value

# Load API keys from the environment.
# Each lookup goes through get_env_var, which raises ValueError when the
# variable is missing, so all four variables must be defined for this cell to run.
langchain_api_key = get_env_var("LANGCHAIN_API_KEY")
langchain_tracing_v2 = get_env_var("LANGCHAIN_TRACING_V2")
openai_api_key = get_env_var("OPENAI_API_KEY")
tavily_api_key = get_env_var("TAVILY_API_KEY")

# Now, you can use the keys in our setup as needed.
# Two chat models are created with temperature=0, both using the same OpenAI key.
from langchain_openai import ChatOpenAI
gpt4o_chat = ChatOpenAI(model="gpt-4o", temperature=0, openai_api_key=openai_api_key)
gpt35_chat = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0, openai_api_key=openai_api_key)

In [1]:
import os
import openai
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Function to load environment variables or raise an error if not found
def get_env_var(var: str) -> str:
    """Return the value of environment variable ``var``, failing loudly if unusable.

    Args:
        var: Name of the environment variable to read via ``os.getenv``.

    Returns:
        The variable's value exactly as stored (no stripping is applied).

    Raises:
        ValueError: If the variable is not set, or is set to an empty or
            whitespace-only string (for example a blank value in ``.env``).
    """
    value = os.getenv(var)
    if value is None:
        raise ValueError(f"{var} not found in environment variables. Make sure it is set in your .env file.")
    # A blank entry would otherwise only surface later, when the key is first used.
    if not value.strip():
        raise ValueError(f"{var} is set but empty. Make sure it has a value in your .env file.")
    return value

# Load API keys from the environment.
# Every lookup goes through get_env_var, so all four variables are required:
# a missing one raises ValueError, even though only openai_api_key is used in this cell.
langchain_api_key = get_env_var("LANGCHAIN_API_KEY")  # LangChain API key (required here)
langchain_tracing_v2 = get_env_var("LANGCHAIN_TRACING_V2")  # LangChain tracing V2 setting (required here)
openai_api_key = get_env_var("OPENAI_API_COURSE_KEY")  # OpenAI API key; NOTE(review): other cells read OPENAI_API_KEY, confirm this name matches your .env
tavily_api_key = get_env_var("TAVILY_API_KEY")  # Tavily API key (required here)

# Import and configure LangChain OpenAI integration
from langchain_openai import ChatOpenAI

# Initialize ChatOpenAI with the desired models and temperature settings
# (both models use temperature=0 and the same OpenAI key).
gpt4o_chat = ChatOpenAI(model="gpt-4o", temperature=0, openai_api_key=openai_api_key)
gpt4o_mini_chat = ChatOpenAI(model="gpt-4o-mini", temperature=0, openai_api_key=openai_api_key)

# Import LangChain message classes
from langchain_core.messages import HumanMessage

# Create a message using HumanMessage
msg = HumanMessage(content="Hello world", name="Joseph")  # Message text plus a sender name

# Create a list of messages (a single-element conversation)
messages = [msg]

# Invoke the GPT-4o model with the message list and print only the reply text
response_gpt4o = gpt4o_chat.invoke(messages)
print("Response from GPT-4o:", response_gpt4o.content)

# Invoke the GPT-4o-mini model with the same message list for comparison
response_gpt4o_mini = gpt4o_mini_chat.invoke(messages)
print("Response from GPT-4o-mini:", response_gpt4o_mini.content)


Response from GPT-4o: Hello! How can I assist you today?
Response from GPT-4o-mini: Hello, Joseph! How can I assist you today?


In [2]:
from langchain_core.messages import HumanMessage

# Create a message with text content and a sender name
msg = HumanMessage(content="Hello world", name="Joseph")

# Message list (a single-element conversation)
messages = [msg]

# Invoke the model with a list of messages.
# As the last expression of the cell, the returned message object is displayed in full.
gpt4o_chat.invoke(messages)

AIMessage(content='Hello! How can I assist you today?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 9, 'prompt_tokens': 11, 'total_tokens': 20, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_4691090a87', 'finish_reason': 'stop', 'logprobs': None}, id='run-8f88359a-5575-4258-af8f-485cf4fe7ad2-0', usage_metadata={'input_tokens': 11, 'output_tokens': 9, 'total_tokens': 20, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [3]:
# Invoke GPT-4o with a plain string instead of a message list.
gpt4o_chat.invoke("hello world")

AIMessage(content='Hello! How can I assist you today?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 10, 'prompt_tokens': 9, 'total_tokens': 19, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_50cad350e4', 'finish_reason': 'stop', 'logprobs': None}, id='run-9a184456-aa97-4429-ac4b-2a23866c11f6-0', usage_metadata={'input_tokens': 9, 'output_tokens': 10, 'total_tokens': 19, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [4]:
# Invoke GPT-4o-mini with the same plain-string prompt for comparison.
gpt4o_mini_chat.invoke("hello world")

AIMessage(content='Hello! How can I assist you today?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 10, 'prompt_tokens': 9, 'total_tokens': 19, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_bd83329f63', 'finish_reason': 'stop', 'logprobs': None}, id='run-a3ff6f77-dc45-4182-bf7d-b79b295c0acc-0', usage_metadata={'input_tokens': 9, 'output_tokens': 10, 'total_tokens': 19, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [5]:
# Set up Tavily using the API key from the environment.
# tavily_api_key must already be defined by an earlier cell; max_results=3 is
# presumably the cap on the number of returned search hits (verify against the tool docs).
from langchain_community.tools.tavily_search import TavilySearchResults
tavily_search = TavilySearchResults(max_results=3, api_key=tavily_api_key)

In [6]:
# Perform a search with Tavily.
# The bare `search_docs` expression makes the notebook display the result;
# the captured output shows a list of dicts with 'url' and 'content' keys.
search_docs = tavily_search.invoke("What is LangGraph?")
search_docs

[{'url': 'https://langchain-ai.github.io/langgraph/',
  'content': 'Overview¶. LangGraph is a library for building stateful, multi-actor applications with LLMs, used to create agent and multi-agent workflows. Compared to other LLM frameworks, it offers these core benefits: cycles, controllability, and persistence. LangGraph allows you to define flows that involve cycles, essential for most agentic architectures, differentiating it from DAG-based solutions.'},
 {'url': 'https://www.datacamp.com/tutorial/langgraph-tutorial',
  'content': 'LangGraph is a library within the LangChain ecosystem designed to tackle these challenges head-on. LangGraph provides a framework for defining, coordinating, and executing multiple LLM agents (or chains) in a structured manner.'},
 {'url': 'https://medium.com/@cplog/introduction-to-langgraph-a-beginners-guide-14f9be027141',
  'content': 'LangGraph is a versatile tool for building complex, stateful applications with LLMs. By understanding its core concep

In [7]:
import os
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.callbacks import get_openai_callback

# Load environment variables from .env file
load_dotenv()

# Function to load environment variables or raise an error if not found
def get_env_var(var: str) -> str:
    """Return the value of environment variable ``var``, failing loudly if unusable.

    Args:
        var: Name of the environment variable to read via ``os.getenv``.

    Returns:
        The variable's value exactly as stored (no stripping is applied).

    Raises:
        ValueError: If the variable is not set, or is set to an empty or
            whitespace-only string (for example a blank value in ``.env``).
    """
    value = os.getenv(var)
    if value is None:
        raise ValueError(f"{var} not found in environment variables. Make sure it is set in your .env file.")
    # A blank entry would otherwise only surface later, when the key is first used.
    if not value.strip():
        raise ValueError(f"{var} is set but empty. Make sure it has a value in your .env file.")
    return value

# Load API keys from the environment
# (get_env_var raises ValueError when OPENAI_API_KEY is not set).
openai_api_key = get_env_var("OPENAI_API_KEY")

# Initialize the OpenAI LLM via LangChain
gpt_chat = ChatOpenAI(model="gpt-3.5-turbo", temperature=0, openai_api_key=openai_api_key)

# Define a simple prompt template for conversation;
# {user_input} is the only placeholder and is filled in on each invoke call.
prompt_template = PromptTemplate(
    input_variables=["user_input"],
    template="You are a helpful assistant. User asks: {user_input}. How would you respond?"
)

# Instead of using RunnableSequence, directly chain the prompt and model using the | operator
conversation_chain = prompt_template | gpt_chat

# Any exception raised inside the block below is caught, printed, and not re-raised.
try:
    # Start the OpenAI callback; `cb` is read at the end for token and cost totals.
    # NOTE(review): the "tracing" wording in the printed messages presumably refers
    # to this usage accounting rather than LangSmith tracing; confirm.
    with get_openai_callback() as cb:
        print("LangChain tracing started successfully.")

        # Example conversation; this call runs inside the callback context
        user_input = "What are the benefits of using transformers in natural language processing?"
        response = conversation_chain.invoke({"user_input": user_input})

        # Prints the whole response object (the captured output starts with content='...')
        print("Agent's Response:")
        print(response)

        # Second, independent call: the chain is given only the new question,
        # not the previous exchange.
        follow_up = "Can you explain the difference between GPT and BERT?"
        follow_up_response = conversation_chain.invoke({"user_input": follow_up})

        print("Follow-up Response:")
        print(follow_up_response)

        # Output callback information accumulated on `cb`
        print(f"\nTotal Tokens Used: {cb.total_tokens}")
        print(f"Total Cost (USD): ${cb.total_cost}")

    print("LangChain tracing ended successfully.")

except Exception as e:
    print(f"Error with LangChain tracing: {e}")


LangChain tracing started successfully.
Agent's Response:
content='Transformers have revolutionized natural language processing (NLP) by significantly improving the performance of various NLP tasks. Some of the key benefits of using transformers in NLP include:\n\n1. Enhanced performance: Transformers have shown superior performance compared to traditional NLP models, especially in tasks such as language translation, text generation, and sentiment analysis.\n\n2. Attention mechanism: Transformers use an attention mechanism that allows the model to focus on relevant parts of the input sequence, leading to better understanding and representation of the text.\n\n3. Parallel processing: Transformers can process input sequences in parallel, making them more efficient and faster compared to sequential models.\n\n4. Transfer learning: Transformers can be fine-tuned on specific tasks with relatively small amounts of data, making them highly adaptable and versatile for various NLP applications.

In [11]:
#pip install deepeval

In [13]:
#pip install pytest

In [31]:
# GPT-4 Evaluation
import os
import json
import re
from dotenv import load_dotenv
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
from langchain_openai import ChatOpenAI

# Load environment variables.
# os.getenv returns None when OPENAI_API_KEY is unset; no validation happens here.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

# Initialize the OpenAI LLM via LangChain
gpt4o_chat = ChatOpenAI(model="gpt-4o", temperature=0, openai_api_key=openai_api_key)

# Define a prompt for the LLM
prompt = "Explain the benefits of using transformers in natural language processing."

# Invoke the model and capture the response;
# fall back to the raw response object if it has no `content` attribute.
response = gpt4o_chat.invoke(prompt)
response_text = response.content if hasattr(response, 'content') else response

# Clean up any Markdown bold formatting: **text** becomes text.
# Bold spans that contain a '*' inside them do not match and are left as-is.
cleaned_response_text = re.sub(r'\*\*([^*]+)\*\*', r'\1', response_text)

# Initialize the Answer Relevancy Metric with a 0.5 threshold
answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.5)

# Create a test case pairing the prompt with the cleaned model output.
# NOTE(review): confirm whether AnswerRelevancyMetric actually reads retrieval_context.
test_case = LLMTestCase(
    input=prompt,
    actual_output=cleaned_response_text,
    retrieval_context=["Transformers enable handling long-range dependencies, parallel processing, and fine-tuning for various NLP tasks."]
)

# Evaluate using evaluate function to capture results
evaluation_results = evaluate([test_case], [answer_relevancy_metric])

print("Evaluation Summary:")

# Printing and JSON logging share one try block: any failure in either step
# is reported by the except clause below and not re-raised.
try:
    # Assuming `evaluation_results` is a single EvaluationResult object with a list of TestResult objects
    for result in evaluation_results.test_results:
        print("\nTest Case:")
        print(f"  - Input: {result.input}")
        print(f"  - Actual Output: {result.actual_output}")

        # Loop through metric data in the test result
        for metric_data in result.metrics_data:
            print(f"\nMetric: {metric_data.name}")
            print(f"  - Score: {metric_data.score}")
            print(f"  - Success: {'✅' if metric_data.success else '❌'}")
            print(f"  - Threshold: {metric_data.threshold}")
            print(f"  - Reason: {metric_data.reason}")
            print(f"  - Evaluation Model: {metric_data.evaluation_model}")
            print(f"  - Evaluation Cost: ${metric_data.evaluation_cost:.5f}")

    # Log the results to a JSON file ('w' mode overwrites any existing file of that name)
    with open('evaluation_summary.json', 'w') as file:
        json.dump(evaluation_results.dict(), file, indent=4)
    print("Evaluation results have been logged to 'evaluation_summary.json'")

except Exception as e:
    print("An error occurred while processing evaluation results:", str(e))


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |█|100% (1/1) [Time Taken: 00:14, 14.06s/test



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the explanation perfectly covers the benefits of using transformers in natural language processing without any irrelevant statements. Great job!, error: None)

For test case:

  - input: Explain the benefits of using transformers in natural language processing.
  - actual output: Transformers have revolutionized the field of natural language processing (NLP) due to their unique architecture and capabilities. Here are some of the key benefits of using transformers in NLP:

1. Parallelization and Efficiency: Unlike recurrent neural networks (RNNs), which process data sequentially, transformers can process entire sequences of data simultaneously. This parallelization significantly speeds up training and inference, making transformers more efficient, especially when dealing with large datasets.

2. Handling Long-Range Dependencies: Transformers u




Evaluation Summary:

Test Case:
  - Input: Explain the benefits of using transformers in natural language processing.
  - Actual Output: Transformers have revolutionized the field of natural language processing (NLP) due to their unique architecture and capabilities. Here are some of the key benefits of using transformers in NLP:

1. Parallelization and Efficiency: Unlike recurrent neural networks (RNNs), which process data sequentially, transformers can process entire sequences of data simultaneously. This parallelization significantly speeds up training and inference, making transformers more efficient, especially when dealing with large datasets.

2. Handling Long-Range Dependencies: Transformers use a mechanism called self-attention, which allows them to weigh the importance of different words in a sentence, regardless of their position. This capability enables transformers to capture long-range dependencies and contextual relationships more effectively than traditional models.

3.

In [32]:
import os
import json
from dotenv import load_dotenv
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
from langchain_openai import ChatOpenAI

# Load environment variables.
# os.getenv returns None when OPENAI_API_KEY is unset; no validation happens here.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

# Initialize both models (gpt-4o and gpt-3.5-turbo) with temperature=0
gpt4o_chat = ChatOpenAI(model="gpt-4o", temperature=0, openai_api_key=openai_api_key)
gpt35_chat = ChatOpenAI(model="gpt-3.5-turbo", temperature=0, openai_api_key=openai_api_key)

# Define a prompt for the LLMs (the same prompt is sent to both)
prompt = "Explain the benefits of using transformers in natural language processing."

# Get responses from both models, keeping only the reply text.
# Unlike the single-model cell, Markdown bold markers are not stripped here.
gpt4_response = gpt4o_chat.invoke(prompt).content
gpt35_response = gpt35_chat.invoke(prompt).content

# Initialize the Answer Relevancy Metric; one instance is shared by both evaluations
answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.5)

# Create test cases for each model response (identical input and retrieval_context)
gpt4_test_case = LLMTestCase(
    input=prompt,
    actual_output=gpt4_response,
    retrieval_context=["Transformers enable handling long-range dependencies, parallel processing, and fine-tuning for various NLP tasks."]
)

gpt35_test_case = LLMTestCase(
    input=prompt,
    actual_output=gpt35_response,
    retrieval_context=["Transformers enable handling long-range dependencies, parallel processing, and fine-tuning for various NLP tasks."]
)

# Evaluate both test cases in separate evaluate() calls so the results stay per-model
evaluation_results_gpt4 = evaluate([gpt4_test_case], [answer_relevancy_metric])
evaluation_results_gpt35 = evaluate([gpt35_test_case], [answer_relevancy_metric])

# Function to process and print results
def print_results(model_name, evaluation_results):
    """Print a per-test-case, per-metric report for one model's evaluation.

    Args:
        model_name: Label shown in the report header.
        evaluation_results: Object exposing ``test_results``; each test result
            is read for ``input``, ``actual_output`` and ``metrics_data``, and
            each metric entry for ``name``, ``score``, ``success``, ``reason``,
            ``evaluation_model`` and ``evaluation_cost``.

    Returns:
        None. The report is written to standard output.
    """
    print(f"\n=== {model_name} Evaluation Results ===")
    for result in evaluation_results.test_results:
        print(f"\nInput: {result.input}")
        print(f"Output: {result.actual_output}\n")
        # Treat a missing metrics list as "no metrics" instead of crashing.
        for metric_data in result.metrics_data or ():
            # The cost may be unavailable (None); applying the float format
            # spec to None would raise TypeError, so render it as N/A.
            cost = metric_data.evaluation_cost
            cost_text = "N/A" if cost is None else f"${cost:.5f}"
            print(f"Metric: {metric_data.name}")
            print(f" - Score: {metric_data.score}")
            print(f" - Success: {'✅' if metric_data.success else '❌'}")
            print(f" - Reason: {metric_data.reason}")
            print(f" - Evaluation Model: {metric_data.evaluation_model}")
            print(f" - Evaluation Cost: {cost_text}")
        print("\n" + "="*50)

# Print results for both models.
# The "GPT-4" label refers to the gpt-4o model configured earlier in this cell.
print_results("GPT-4", evaluation_results_gpt4)
print_results("GPT-3.5", evaluation_results_gpt35)

# Log both result sets to one JSON file, keyed by model label
# ('w' mode overwrites any existing file of that name).
results_to_log = {
    "GPT-4": evaluation_results_gpt4.dict(),
    "GPT-3.5": evaluation_results_gpt35.dict()
}
with open('evaluation_summary_comparison.json', 'w') as f:
    json.dump(results_to_log, f, indent=4)


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |█|100% (1/1) [Time Taken: 00:15, 15.28s/test



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the explanation provided is perfectly relevant and directly addresses the benefits of using transformers in natural language processing without any irrelevant information. Excellent job!, error: None)

For test case:

  - input: Explain the benefits of using transformers in natural language processing.
  - actual output: Transformers have revolutionized the field of natural language processing (NLP) due to several key benefits:

1. **Parallelization**: Unlike recurrent neural networks (RNNs), which process data sequentially, transformers allow for parallel processing of data. This is achieved through the self-attention mechanism, which enables the model to consider all words in a sentence simultaneously. This parallelization significantly speeds up training and inference times.

2. **Handling Long-Range Dependencies**: Transformers are partic




Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |█|100% (1/1) [Time Taken: 00:07,  7.33s/test



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the response is perfectly relevant and effectively addresses the benefits of using transformers in natural language processing. Great job!, error: None)

For test case:

  - input: Explain the benefits of using transformers in natural language processing.
  - actual output: Transformers are a type of deep learning model that has revolutionized natural language processing (NLP) tasks. One of the main benefits of using transformers in NLP is their ability to capture long-range dependencies in text data. Traditional models like recurrent neural networks struggle with this because they process text sequentially, which can lead to information loss over long distances. Transformers, on the other hand, use self-attention mechanisms to weigh the importance of each word in a sentence, allowing them to capture relationships between words that are far a





=== GPT-4 Evaluation Results ===

Input: Explain the benefits of using transformers in natural language processing.
Output: Transformers have revolutionized the field of natural language processing (NLP) due to several key benefits:

1. **Parallelization**: Unlike recurrent neural networks (RNNs), which process data sequentially, transformers allow for parallel processing of data. This is achieved through the self-attention mechanism, which enables the model to consider all words in a sentence simultaneously. This parallelization significantly speeds up training and inference times.

2. **Handling Long-Range Dependencies**: Transformers are particularly effective at capturing long-range dependencies in text. The self-attention mechanism allows the model to weigh the importance of different words in a sentence, regardless of their distance from each other. This is crucial for understanding context and meaning in complex sentences.

3. **Scalability**: Transformers can be scaled up to c

In [33]:
import os
import re
import json
from dotenv import load_dotenv
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
from langchain_openai import ChatOpenAI

# Load environment variables.
# os.getenv returns None when OPENAI_API_KEY is unset; no validation happens here.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

# Initialize the OpenAI LLMs via LangChain (gpt-4o and gpt-3.5-turbo, temperature=0)
gpt4o_chat = ChatOpenAI(model="gpt-4o", temperature=0, openai_api_key=openai_api_key)
gpt35_chat = ChatOpenAI(model="gpt-3.5-turbo", temperature=0, openai_api_key=openai_api_key)

# Define a prompt for the LLM (the same prompt is sent to both models)
prompt = "Explain the benefits of using transformers in natural language processing."

# Retrieve responses from both models, keeping only the reply text
gpt4_response = gpt4o_chat.invoke(prompt).content
gpt35_response = gpt35_chat.invoke(prompt).content

# Clean up any Markdown bold formatting: **text** becomes text.
# Bold spans that contain a '*' inside them do not match and are left as-is.
cleaned_gpt4_response = re.sub(r'\*\*([^*]+)\*\*', r'\1', gpt4_response)
cleaned_gpt35_response = re.sub(r'\*\*([^*]+)\*\*', r'\1', gpt35_response)

# Initialize the Answer Relevancy Metric; one instance is shared by both evaluations
answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.5)

# Create test cases for each model's response (identical input and retrieval_context)
gpt4_test_case = LLMTestCase(
    input=prompt,
    actual_output=cleaned_gpt4_response,
    retrieval_context=["Transformers enable handling long-range dependencies, parallel processing, and fine-tuning for various NLP tasks."]
)

gpt35_test_case = LLMTestCase(
    input=prompt,
    actual_output=cleaned_gpt35_response,
    retrieval_context=["Transformers enable handling long-range dependencies, parallel processing, and fine-tuning for various NLP tasks."]
)

# Evaluate both test cases in separate evaluate() calls so the results stay per-model
evaluation_results_gpt4 = evaluate([gpt4_test_case], [answer_relevancy_metric])
evaluation_results_gpt35 = evaluate([gpt35_test_case], [answer_relevancy_metric])

# Function to print evaluation summary
def print_evaluation_summary(model_name, evaluation_results):
    """Print a per-test-case, per-metric summary for one model's evaluation.

    Args:
        model_name: Label shown in the summary header.
        evaluation_results: Object exposing ``test_results``; each test result
            is read for ``input``, ``actual_output`` and ``metrics_data``, and
            each metric entry for ``name``, ``score``, ``success``, ``reason``,
            ``evaluation_model`` and ``evaluation_cost``.

    Returns:
        None. The summary is written to standard output.
    """
    print(f"\n=== {model_name} Evaluation Results ===\n")
    for result in evaluation_results.test_results:
        print(f"Input: {result.input}")
        print(f"Output: {result.actual_output}\n")

        # Treat a missing metrics list as "no metrics" instead of crashing.
        for metric_data in result.metrics_data or ():
            # The cost may be unavailable (None); applying the float format
            # spec to None would raise TypeError, so render it as N/A.
            cost = metric_data.evaluation_cost
            cost_text = "N/A" if cost is None else f"${cost:.5f}"
            print(f"Metric: {metric_data.name}")
            print(f" - Score: {metric_data.score}")
            print(f" - Success: {'✅' if metric_data.success else '❌'}")
            print(f" - Reason: {metric_data.reason}")
            print(f" - Evaluation Model: {metric_data.evaluation_model}")
            print(f" - Evaluation Cost: {cost_text}")
        print("=" * 50)

# Print evaluation summaries for both models
print_evaluation_summary("GPT-4o", evaluation_results_gpt4)
print_evaluation_summary("GPT-3.5-turbo", evaluation_results_gpt35)

# Save the results to a JSON file, keyed by model name.
# 'w' mode overwrites any existing evaluation_summary.json.
output_data = {
    "gpt-4o": evaluation_results_gpt4.dict(),
    "gpt-3.5-turbo": evaluation_results_gpt35.dict()
}
with open('evaluation_summary.json', 'w') as f:
    json.dump(output_data, f, indent=4)

print("\nEvaluation results have been logged to 'evaluation_summary.json'")


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |█|100% (1/1) [Time Taken: 00:11, 11.16s/test



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the response is perfectly relevant and directly addresses the benefits of using transformers in natural language processing without any irrelevant information. Great job!, error: None)

For test case:

  - input: Explain the benefits of using transformers in natural language processing.
  - actual output: Transformers have revolutionized the field of natural language processing (NLP) due to their unique architecture and capabilities. Here are some of the key benefits of using transformers in NLP:

1. Parallelization and Efficiency: Unlike recurrent neural networks (RNNs), which process data sequentially, transformers can process entire sequences of data simultaneously. This parallelization significantly speeds up training and inference, making transformers more efficient, especially when dealing with large datasets.

2. Handling Long-Range De




Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |█|100% (1/1) [Time Taken: 00:06,  6.34s/test



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the response is perfectly relevant and fully addresses the benefits of using transformers in natural language processing without any irrelevant information. Great job!, error: None)

For test case:

  - input: Explain the benefits of using transformers in natural language processing.
  - actual output: Transformers are a type of deep learning model that has revolutionized natural language processing (NLP) tasks. One of the main benefits of using transformers in NLP is their ability to capture long-range dependencies in text data. Traditional models like recurrent neural networks (RNNs) struggle with this because they process text sequentially, which can lead to information loss over long distances. Transformers, on the other hand, use self-attention mechanisms to weigh the importance of each word in a sentence, allowing them to capture relati





=== GPT-4o Evaluation Results ===

Input: Explain the benefits of using transformers in natural language processing.
Output: Transformers have revolutionized the field of natural language processing (NLP) due to their unique architecture and capabilities. Here are some of the key benefits of using transformers in NLP:

1. Parallelization and Efficiency: Unlike recurrent neural networks (RNNs), which process data sequentially, transformers can process entire sequences of data simultaneously. This parallelization significantly speeds up training and inference, making transformers more efficient, especially when dealing with large datasets.

2. Handling Long-Range Dependencies: Transformers use a mechanism called self-attention, which allows them to weigh the importance of different words in a sentence, regardless of their position. This capability enables transformers to capture long-range dependencies and contextual relationships more effectively than traditional models like RNNs or LS

In [35]:
import os
import re
import json
from dotenv import load_dotenv
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric  # Import available metrics only
from deepeval.test_case import LLMTestCase
from langchain_openai import ChatOpenAI

# Load environment variables.
# os.getenv returns None when OPENAI_API_KEY is unset; no validation happens here.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

# Initialize the OpenAI LLMs via LangChain (gpt-4o and gpt-3.5-turbo, temperature=0)
gpt4o_chat = ChatOpenAI(model="gpt-4o", temperature=0, openai_api_key=openai_api_key)
gpt35_chat = ChatOpenAI(model="gpt-3.5-turbo", temperature=0, openai_api_key=openai_api_key)

# Define a prompt for the LLM (the same prompt is sent to both models)
prompt = "Explain the benefits of using transformers in natural language processing."

# Retrieve responses from both models, keeping only the reply text
gpt4_response = gpt4o_chat.invoke(prompt).content
gpt35_response = gpt35_chat.invoke(prompt).content

# Clean up any Markdown bold formatting: **text** becomes text.
# Bold spans that contain a '*' inside them do not match and are left as-is.
cleaned_gpt4_response = re.sub(r'\*\*([^*]+)\*\*', r'\1', gpt4_response)
cleaned_gpt35_response = re.sub(r'\*\*([^*]+)\*\*', r'\1', gpt35_response)

# Define the metric(s) to evaluate; the same list is passed to both evaluate() calls
metrics_list = [AnswerRelevancyMetric(threshold=0.5)]

# Create test cases for each model's response (identical input and retrieval_context)
gpt4_test_case = LLMTestCase(
    input=prompt,
    actual_output=cleaned_gpt4_response,
    retrieval_context=["Transformers enable handling long-range dependencies, parallel processing, and fine-tuning for various NLP tasks."]
)

gpt35_test_case = LLMTestCase(
    input=prompt,
    actual_output=cleaned_gpt35_response,
    retrieval_context=["Transformers enable handling long-range dependencies, parallel processing, and fine-tuning for various NLP tasks."]
)

# Evaluate both test cases with the metrics list, one evaluate() call per model
evaluation_results_gpt4 = evaluate([gpt4_test_case], metrics_list)
evaluation_results_gpt35 = evaluate([gpt35_test_case], metrics_list)

# Function to print evaluation summaries
def print_evaluation_summary(model_name, evaluation_results):
    """Print a per-test-case, per-metric summary for one model's evaluation.

    Args:
        model_name: Label shown in the summary header.
        evaluation_results: Object exposing ``test_results``; each test result
            is read for ``input``, ``actual_output`` and ``metrics_data``, and
            each metric entry for ``name``, ``score``, ``success``, ``reason``,
            ``evaluation_model`` and ``evaluation_cost``.

    Returns:
        None. The summary is written to standard output.
    """
    print(f"\n=== {model_name} Evaluation Results ===\n")
    for result in evaluation_results.test_results:
        print(f"Input: {result.input}")
        print(f"Output: {result.actual_output}\n")

        # Treat a missing metrics list as "no metrics" instead of crashing.
        for metric_data in result.metrics_data or ():
            # The cost may be unavailable (None); applying the float format
            # spec to None would raise TypeError, so render it as N/A.
            cost = metric_data.evaluation_cost
            cost_text = "N/A" if cost is None else f"${cost:.5f}"
            print(f"Metric: {metric_data.name}")
            print(f" - Score: {metric_data.score}")
            print(f" - Success: {'✅' if metric_data.success else '❌'}")
            print(f" - Reason: {metric_data.reason}")
            print(f" - Evaluation Model: {metric_data.evaluation_model}")
            print(f" - Evaluation Cost: {cost_text}")
        print("=" * 50)

# Print evaluation summaries for both models
print_evaluation_summary("GPT-4o", evaluation_results_gpt4)
print_evaluation_summary("GPT-3.5-turbo", evaluation_results_gpt35)

# Save the results to a JSON file, keyed by model name.
# 'w' mode overwrites any existing evaluation_summary.json.
output_data = {
    "gpt-4o": evaluation_results_gpt4.dict(),
    "gpt-3.5-turbo": evaluation_results_gpt35.dict()
}
with open('evaluation_summary.json', 'w') as f:
    json.dump(output_data, f, indent=4)

print("\nEvaluation results have been logged to 'evaluation_summary.json'")


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |█|100% (1/1) [Time Taken: 00:11, 11.03s/test



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the answer perfectly addresses the question about the benefits of using transformers in natural language processing without any irrelevant statements. Great job!, error: None)

For test case:

  - input: Explain the benefits of using transformers in natural language processing.
  - actual output: Transformers have revolutionized the field of natural language processing (NLP) due to their unique architecture and capabilities. Here are some of the key benefits of using transformers in NLP:

1. Parallelization: Unlike recurrent neural networks (RNNs), transformers do not require sequential data processing, which allows for parallelization during training. This significantly speeds up the training process and makes it feasible to train on large datasets.

2. Handling Long-Range Dependencies: Transformers use self-attention mechanisms that allow t




Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |█|100% (1/1) [Time Taken: 00:07,  7.55s/test



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the output perfectly addresses the input with no irrelevant statements, providing a clear and focused explanation of the benefits of using transformers in natural language processing. Great job!, error: None)

For test case:

  - input: Explain the benefits of using transformers in natural language processing.
  - actual output: Transformers are a type of deep learning model that has revolutionized natural language processing (NLP) tasks. One of the main benefits of using transformers in NLP is their ability to capture long-range dependencies in text data. Traditional models like recurrent neural networks (RNNs) struggle with this because they process text sequentially, which can lead to information loss over long distances. Transformers, on the other hand, use self-attention mechanisms to weigh the importance of each word in a sentence, allo





=== GPT-4o Evaluation Results ===

Input: Explain the benefits of using transformers in natural language processing.
Output: Transformers have revolutionized the field of natural language processing (NLP) due to their unique architecture and capabilities. Here are some of the key benefits of using transformers in NLP:

1. Parallelization: Unlike recurrent neural networks (RNNs), transformers do not require sequential data processing, which allows for parallelization during training. This significantly speeds up the training process and makes it feasible to train on large datasets.

2. Handling Long-Range Dependencies: Transformers use self-attention mechanisms that allow them to consider the entire context of a sentence or document when making predictions. This is particularly useful for capturing long-range dependencies in text, which is a limitation of traditional RNNs and LSTMs.

3. Scalability: The architecture of transformers is highly scalable. Models like BERT, GPT, and T5 have

In [36]:
import os
import re
import json
from dotenv import load_dotenv
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric  # Import any available metrics one-by-one
from deepeval.test_case import LLMTestCase
from langchain_openai import ChatOpenAI

# Purpose of this cell section: ask GPT-4o and GPT-3.5-turbo the same question,
# strip Markdown bold markers from the replies, and score each reply with a
# single answer-relevancy metric.

# Load environment variables from .env. os.getenv returns None when the key is
# absent; no check is performed here, so a missing key only surfaces later.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

# Initialize the OpenAI LLMs via LangChain (both with temperature=0)
gpt4o_chat = ChatOpenAI(model="gpt-4o", temperature=0, openai_api_key=openai_api_key)
gpt35_chat = ChatOpenAI(model="gpt-3.5-turbo", temperature=0, openai_api_key=openai_api_key)

# Define the single prompt sent to both models
prompt = "Explain the benefits of using transformers in natural language processing."

# Query each model once with the same prompt; `.content` is the reply text
# that the regex below operates on.
gpt4_response = gpt4o_chat.invoke(prompt).content
gpt35_response = gpt35_chat.invoke(prompt).content

# Remove Markdown bold markers: every `**text**` span whose inner text contains
# no asterisk is replaced by the inner text alone. Other Markdown is untouched.
cleaned_gpt4_response = re.sub(r'\*\*([^*]+)\*\*', r'\1', gpt4_response)
cleaned_gpt35_response = re.sub(r'\*\*([^*]+)\*\*', r'\1', gpt35_response)

# Only one metric is configured in this cell: answer relevancy with a 0.5
# threshold. No other metrics are imported or appended here.
metrics_list = [AnswerRelevancyMetric(threshold=0.5)]

# Create one test case per model. Both share the prompt and the same
# one-sentence retrieval context; only the actual output differs.
# NOTE(review): retrieval_context is supplied although the only metric above is
# answer relevancy -- presumably unused by it; confirm against deepeval docs.
gpt4_test_case = LLMTestCase(
    input=prompt,
    actual_output=cleaned_gpt4_response,
    retrieval_context=["Transformers enable handling long-range dependencies, parallel processing, and fine-tuning for various NLP tasks."]
)

gpt35_test_case = LLMTestCase(
    input=prompt,
    actual_output=cleaned_gpt35_response,
    retrieval_context=["Transformers enable handling long-range dependencies, parallel processing, and fine-tuning for various NLP tasks."]
)

# Evaluate each test case in its own call so the two models get separate
# result objects (each call receives a one-element list of test cases).
evaluation_results_gpt4 = evaluate([gpt4_test_case], metrics_list)
evaluation_results_gpt35 = evaluate([gpt35_test_case], metrics_list)

# Function to print evaluation summaries
def print_evaluation_summary(model_name, evaluation_results):
    """Print a detailed per-metric evaluation report for one model.

    A banner naming the model is printed first. For every entry in
    ``evaluation_results.test_results`` the input and output are shown,
    followed by name, score, success mark, reason, evaluation model and
    evaluation cost for each metric, and a closing 50-character rule.

    Args:
        model_name: Label used in the banner line.
        evaluation_results: Object exposing a ``test_results`` iterable.
            Each item is read for ``input``, ``actual_output`` and
            ``metrics_data``; each metric for ``name``, ``score``,
            ``success``, ``reason``, ``evaluation_model`` and
            ``evaluation_cost``.
    """
    print(f"\n=== {model_name} Evaluation Results ===\n")
    for result in evaluation_results.test_results:
        print(f"Input: {result.input}")
        print(f"Output: {result.actual_output}\n")

        # Tolerate a missing (None) metrics list instead of failing on iteration.
        for metric_data in result.metrics_data or []:
            print(f"Metric: {metric_data.name}")
            print(f" - Score: {metric_data.score}")
            print(f" - Success: {'✅' if metric_data.success else '❌'}")
            print(f" - Reason: {metric_data.reason}")
            print(f" - Evaluation Model: {metric_data.evaluation_model}")

            # The cost may be absent; applying the ".5f" format spec to None
            # would raise TypeError, so fall back to a placeholder.
            cost = metric_data.evaluation_cost
            cost_text = "N/A" if cost is None else f"${cost:.5f}"
            print(f" - Evaluation Cost: {cost_text}")
        print("=" * 50)

# Emit the console report for each model, GPT-4o first.
for model_label, model_results in (
    ("GPT-4o", evaluation_results_gpt4),
    ("GPT-3.5-turbo", evaluation_results_gpt35),
):
    print_evaluation_summary(model_label, model_results)

# Gather both result objects as plain dicts keyed by model id, then write
# them to the working directory as indented JSON.
output_data = {}
output_data["gpt-4o"] = evaluation_results_gpt4.dict()
output_data["gpt-3.5-turbo"] = evaluation_results_gpt35.dict()

summary_path = 'evaluation_summary.json'
with open(summary_path, mode='w') as summary_file:
    json.dump(output_data, summary_file, indent=4)

print("\nEvaluation results have been logged to 'evaluation_summary.json'")


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |█|100% (1/1) [Time Taken: 00:09,  9.96s/test



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the explanation was perfectly relevant and concise, with no irrelevant statements. Great job!, error: None)

For test case:

  - input: Explain the benefits of using transformers in natural language processing.
  - actual output: Transformers have revolutionized the field of natural language processing (NLP) due to their unique architecture and capabilities. Here are some of the key benefits of using transformers in NLP:

1. Parallelization: Unlike recurrent neural networks (RNNs), which process data sequentially, transformers can process entire sequences of data in parallel. This parallelization significantly speeds up training and inference times, making transformers more efficient and scalable for large datasets.

2. Attention Mechanism: The self-attention mechanism in transformers allows the model to weigh the importance of different word




Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |█|100% (1/1) [Time Taken: 00:06,  6.26s/test



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the answer is perfectly relevant with no irrelevant statements. Great job!, error: None)

For test case:

  - input: Explain the benefits of using transformers in natural language processing.
  - actual output: Transformers are a type of deep learning model that has revolutionized natural language processing (NLP) tasks. One of the main benefits of using transformers in NLP is their ability to capture long-range dependencies in text data. This means that transformers can understand the context of a word or phrase in relation to the entire sentence or document, rather than just the words immediately surrounding it.

Additionally, transformers are highly parallelizable, which allows for faster training and inference times compared to traditional sequential models. This makes them well-suited for processing large amounts of text data efficiently





=== GPT-4o Evaluation Results ===

Input: Explain the benefits of using transformers in natural language processing.
Output: Transformers have revolutionized the field of natural language processing (NLP) due to their unique architecture and capabilities. Here are some of the key benefits of using transformers in NLP:

1. Parallelization: Unlike recurrent neural networks (RNNs), which process data sequentially, transformers can process entire sequences of data in parallel. This parallelization significantly speeds up training and inference times, making transformers more efficient and scalable for large datasets.

2. Attention Mechanism: The self-attention mechanism in transformers allows the model to weigh the importance of different words in a sentence, regardless of their position. This ability to focus on relevant parts of the input sequence helps in capturing long-range dependencies and contextual information more effectively than traditional models.

3. Handling Long-Range Depen

In [42]:
import os
import re
import json
from dotenv import load_dotenv
from deepeval import evaluate
from deepeval.test_case import LLMTestCase
from langchain_openai import ChatOpenAI

# Purpose of this cell section: build the list of deepeval metrics that can be
# imported, query both models with one prompt, and evaluate each reply against
# a fixed reference context.

# Each metric is imported and constructed inside its own try block so that a
# failing import of one metric does not prevent the others from being used.
# Failures are recorded as "<MetricName>: <error>" strings and reported later.
# Only ImportError is caught; any other exception raised while constructing a
# metric propagates.
available_metrics = []
metric_import_errors = []

try:
    from deepeval.metrics import AnswerRelevancyMetric
    available_metrics.append(AnswerRelevancyMetric(threshold=0.5))
except ImportError as e:
    metric_import_errors.append(f"AnswerRelevancyMetric: {e}")

try:
    from deepeval.metrics import ContextualRelevancyMetric
    available_metrics.append(ContextualRelevancyMetric(threshold=0.5))
except ImportError as e:
    metric_import_errors.append(f"ContextualRelevancyMetric: {e}")

try:
    from deepeval.metrics import HallucinationMetric
    available_metrics.append(HallucinationMetric(threshold=0.5))
except ImportError as e:
    metric_import_errors.append(f"HallucinationMetric: {e}")

try:
    from deepeval.metrics import FaithfulnessMetric
    available_metrics.append(FaithfulnessMetric(threshold=0.5))
except ImportError as e:
    metric_import_errors.append(f"FaithfulnessMetric: {e}")

# Load environment variables from .env. os.getenv returns None when the key is
# absent; no check is performed here.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

# Initialize the OpenAI LLMs via LangChain (both with temperature=0)
gpt4o_chat = ChatOpenAI(model="gpt-4o", temperature=0, openai_api_key=openai_api_key)
gpt35_chat = ChatOpenAI(model="gpt-3.5-turbo", temperature=0, openai_api_key=openai_api_key)

# Define a prompt for the LLM and the context it will be evaluated against
prompt = "Explain the benefits of using transformers in natural language processing."
context = "Transformers enable handling long-range dependencies, parallel processing, and fine-tuning for various NLP tasks."

# Query each model once with the same prompt; `.content` is the reply text
# that the regex below operates on.
gpt4_response = gpt4o_chat.invoke(prompt).content
gpt35_response = gpt35_chat.invoke(prompt).content

# Remove Markdown bold markers: every `**text**` span whose inner text contains
# no asterisk is replaced by the inner text alone.
cleaned_gpt4_response = re.sub(r'\*\*([^*]+)\*\*', r'\1', gpt4_response)
cleaned_gpt35_response = re.sub(r'\*\*([^*]+)\*\*', r'\1', gpt35_response)

# Create one test case per model. The same context sentence is passed both as
# `retrieval_context` and as `context`, each wrapped in a one-element list.
# NOTE(review): which metric reads which of the two fields is not visible
# here -- confirm against the deepeval metric documentation.
gpt4_test_case = LLMTestCase(
    input=prompt,
    actual_output=cleaned_gpt4_response,
    retrieval_context=[context],  # list of strings for retrieval-based metrics
    context=[context]  # the context string wrapped in a list
)

gpt35_test_case = LLMTestCase(
    input=prompt,
    actual_output=cleaned_gpt35_response,
    retrieval_context=[context],
    context=[context]
)


# Evaluate each test case in its own call so the two models get separate
# result objects; every metric that imported successfully is applied.
evaluation_results_gpt4 = evaluate([gpt4_test_case], available_metrics)
evaluation_results_gpt35 = evaluate([gpt35_test_case], available_metrics)

def print_evaluation_summary(model_name, evaluation_results):
    """Write a compact evaluation report for one model to stdout.

    A banner naming the model comes first. Then, for every entry in
    ``evaluation_results.test_results``: the model name, the input, one
    score/success line plus one reason line per metric, and finally the
    model output framed by two 50-character ``=`` rules.

    Args:
        model_name: Label used in the banner and the section headings.
        evaluation_results: Object with a ``test_results`` iterable; each
            item is read for ``input``, ``metrics_data`` and
            ``actual_output``, and each metric for ``name``, ``score``,
            ``success`` and ``reason``.
    """
    rule = "=" * 50
    badge = {True: '✅', False: '❌'}

    print(f"\n=== {model_name} Evaluation Results ===\n")

    for case in evaluation_results.test_results:
        # Heading lines for this test case.
        print(f"Model: {model_name}\n")
        print(f"Input: {case.input}\n")
        print("Metrics Summary:")

        # One headline and one reason line per metric.
        for metric in case.metrics_data:
            print(f" - {metric.name}: Score = {metric.score}, Success = {badge[bool(metric.success)]}")
            print(f"   Reason: {metric.reason}")

        # The model's answer, framed by horizontal rules.
        print(rule)
        print(f"\nOutput from {model_name}:\n{case.actual_output}\n")
        print(rule)

# Emit the console report for each model, GPT-4o first.
for model_label, model_results in (
    ("GPT-4o", evaluation_results_gpt4),
    ("GPT-3.5-turbo", evaluation_results_gpt35),
):
    print_evaluation_summary(model_label, model_results)

# Report metrics that could not be imported: a heading followed by one
# bullet line per recorded failure.
if metric_import_errors:
    print(
        "\nMetric Import Errors:",
        *(f" - {error}" for error in metric_import_errors),
        sep="\n",
    )

# Gather both result objects as plain dicts keyed by model id, then write
# them to the working directory as indented JSON.
output_data = {}
output_data["gpt-4o"] = evaluation_results_gpt4.dict()
output_data["gpt-3.5-turbo"] = evaluation_results_gpt35.dict()

summary_path = 'evaluation_summary.json'
with open(summary_path, mode='w') as summary_file:
    json.dump(output_data, summary_file, indent=4)

print("\nEvaluation results have been logged to 'evaluation_summary.json'")


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: | |  0% (0/1) [Time Taken: 00:00, ?test case/

None


Evaluating 1 test case(s) in parallel: |█|100% (1/1) [Time Taken: 00:11, 11.53s/test



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the response was perfectly relevant and addressed the question thoroughly with no irrelevant statements. Great job!, error: None)
  - ✅ Contextual Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the relevant statement perfectly aligns with the input, highlighting key benefits of transformers in NLP like handling long-range dependencies, parallel processing, and fine-tuning. Great job!, error: None)
  - ✅ Hallucination (score: 0.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The score is 0.00 because there are no factual alignments or contradictions found, indicating perfect alignment between the actual output and the context., error: None)
  - ✅ Faithfulness (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 beca




Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: | |  0% (0/1) [Time Taken: 00:00, ?test case/

None


Evaluating 1 test case(s) in parallel: |█|100% (1/1) [Time Taken: 00:06,  6.98s/test



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the output is perfectly relevant and directly addresses the benefits of using transformers in natural language processing with no irrelevant information. Great job!, error: None)
  - ✅ Contextual Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the relevant statement perfectly aligns with the input, highlighting key benefits of transformers in NLP. Great job!, error: None)
  - ✅ Hallucination (score: 0.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The score is 0.00 because there are no factual alignments present, and the listed contradiction indicates that the actual output does not introduce any false information or discrepancies with the context, suggesting complete factual consistency., error: None)
  - ✅ Faithfulness (score: 1.0, threshold: 0.5, strict:





=== GPT-4o Evaluation Results ===

Model: GPT-4o

Input: Explain the benefits of using transformers in natural language processing.

Metrics Summary:
 - Answer Relevancy: Score = 1.0, Success = ✅
   Reason: The score is 1.00 because the response was perfectly relevant and addressed the question thoroughly with no irrelevant statements. Great job!
 - Contextual Relevancy: Score = 1.0, Success = ✅
   Reason: The score is 1.00 because the relevant statement perfectly aligns with the input, highlighting key benefits of transformers in NLP like handling long-range dependencies, parallel processing, and fine-tuning. Great job!
 - Hallucination: Score = 0.0, Success = ✅
   Reason: The score is 0.00 because there are no factual alignments or contradictions found, indicating perfect alignment between the actual output and the context.
 - Faithfulness: Score = 1.0, Success = ✅
   Reason: The score is 1.00 because there are no contradictions, indicating that the actual output is perfectly aligne

In [43]:
import os
import re
import json
from dotenv import load_dotenv
from deepeval import evaluate
from deepeval.test_case import LLMTestCase
from langchain_openai import ChatOpenAI

# Purpose of this cell section: build the list of deepeval metrics that can be
# imported, query both models with one prompt, and evaluate each reply against
# a fixed reference context.

# Each metric is imported and constructed inside its own try block so that a
# failing import of one metric does not prevent the others from being used.
# Failures are recorded as "<MetricName>: <error>" strings and reported later.
# Only ImportError is caught; any other exception raised while constructing a
# metric propagates.
available_metrics = []
metric_import_errors = []

try:
    from deepeval.metrics import AnswerRelevancyMetric
    available_metrics.append(AnswerRelevancyMetric(threshold=0.5))
except ImportError as e:
    metric_import_errors.append(f"AnswerRelevancyMetric: {e}")

try:
    from deepeval.metrics import ContextualRelevancyMetric
    available_metrics.append(ContextualRelevancyMetric(threshold=0.5))
except ImportError as e:
    metric_import_errors.append(f"ContextualRelevancyMetric: {e}")

try:
    from deepeval.metrics import HallucinationMetric
    available_metrics.append(HallucinationMetric(threshold=0.5))
except ImportError as e:
    metric_import_errors.append(f"HallucinationMetric: {e}")

try:
    from deepeval.metrics import FaithfulnessMetric
    available_metrics.append(FaithfulnessMetric(threshold=0.5))
except ImportError as e:
    metric_import_errors.append(f"FaithfulnessMetric: {e}")

# Load environment variables from .env. os.getenv returns None when the key is
# absent; no check is performed here.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

# Initialize the OpenAI LLMs via LangChain (both with temperature=0)
gpt4o_chat = ChatOpenAI(model="gpt-4o", temperature=0, openai_api_key=openai_api_key)
gpt35_chat = ChatOpenAI(model="gpt-3.5-turbo", temperature=0, openai_api_key=openai_api_key)

# Define a prompt for the LLM and the context it will be evaluated against
prompt = "Explain the benefits of using transformers in natural language processing."
context = "Transformers enable handling long-range dependencies, parallel processing, and fine-tuning for various NLP tasks."

# Query each model once with the same prompt; `.content` is the reply text
# that the regex below operates on.
gpt4_response = gpt4o_chat.invoke(prompt).content
gpt35_response = gpt35_chat.invoke(prompt).content

# Remove Markdown bold markers: every `**text**` span whose inner text contains
# no asterisk is replaced by the inner text alone.
cleaned_gpt4_response = re.sub(r'\*\*([^*]+)\*\*', r'\1', gpt4_response)
cleaned_gpt35_response = re.sub(r'\*\*([^*]+)\*\*', r'\1', gpt35_response)

# Create one test case per model. The same context sentence is passed both as
# `retrieval_context` and as `context`, each wrapped in a one-element list.
# NOTE(review): which metric reads which of the two fields is not visible
# here -- confirm against the deepeval metric documentation.
gpt4_test_case = LLMTestCase(
    input=prompt,
    actual_output=cleaned_gpt4_response,
    retrieval_context=[context],  # list of strings for retrieval-based metrics
    context=[context]  # the context string wrapped in a list
)

gpt35_test_case = LLMTestCase(
    input=prompt,
    actual_output=cleaned_gpt35_response,
    retrieval_context=[context],
    context=[context]
)

# Evaluate each test case in its own call so the two models get separate
# result objects; every metric that imported successfully is applied.
evaluation_results_gpt4 = evaluate([gpt4_test_case], available_metrics)
evaluation_results_gpt35 = evaluate([gpt35_test_case], available_metrics)

def print_evaluation_summary(model_name, evaluation_results):
    """Print the metric summary and model output for the first test case.

    A banner naming the model is printed first. Only the first entry of
    ``evaluation_results.test_results`` is reported: one score/success line
    and one reason line per metric, then the model output framed by two
    50-character ``=`` rules. When there are no test results a short notice
    is printed instead of raising ``IndexError``.

    Args:
        model_name: Label used in the banner and the output heading.
        evaluation_results: Object with a ``test_results`` sequence; the
            first item is read for ``metrics_data`` and ``actual_output``,
            and each metric for ``name``, ``score``, ``success`` and
            ``reason``.
    """
    print(f"\n=== {model_name} Evaluation Results ===\n")

    test_results = evaluation_results.test_results
    if not test_results:
        # Indexing [0] on an empty (or missing) result list would crash the
        # report; say so explicitly and stop.
        print("No test results available.")
        return

    # Only the first test case is reported by this function.
    result = test_results[0]

    print("Metrics Summary:")
    # Tolerate a missing (None) metrics list instead of failing on iteration.
    for metric_data in result.metrics_data or []:
        print(f" - {metric_data.name}: Score = {metric_data.score}, Success = {'✅' if metric_data.success else '❌'}")
        print(f"   Reason: {metric_data.reason}")
    print("=" * 50)

    print(f"\nOutput from {model_name}:\n{result.actual_output}\n")
    print("=" * 50)


# Emit the console report for each model, GPT-4o first.
for model_label, model_results in (
    ("GPT-4o", evaluation_results_gpt4),
    ("GPT-3.5-turbo", evaluation_results_gpt35),
):
    print_evaluation_summary(model_label, model_results)

# Report metrics that could not be imported: a heading followed by one
# bullet line per recorded failure.
if metric_import_errors:
    print(
        "\nMetric Import Errors:",
        *(f" - {error}" for error in metric_import_errors),
        sep="\n",
    )

# Gather both result objects as plain dicts keyed by model id, then write
# them to the working directory as indented JSON.
output_data = {}
output_data["gpt-4o"] = evaluation_results_gpt4.dict()
output_data["gpt-3.5-turbo"] = evaluation_results_gpt35.dict()

summary_path = 'evaluation_summary.json'
with open(summary_path, mode='w') as summary_file:
    json.dump(output_data, summary_file, indent=4)

print("\nEvaluation results have been logged to 'evaluation_summary.json'")


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: | |  0% (0/1) [Time Taken: 00:00, ?test case/

None


Evaluating 1 test case(s) in parallel: |█|100% (1/1) [Time Taken: 00:11, 11.94s/test



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the output perfectly addresses the benefits of using transformers in natural language processing without any irrelevant information. Great job!, error: None)
  - ✅ Contextual Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the retrieval context perfectly addresses the input by highlighting how transformers handle long-range dependencies, enable parallel processing, and are adaptable for various NLP tasks. Great job!, error: None)
  - ✅ Hallucination (score: 0.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The score is 0.00 because there are no factual alignments and the contradictions indicate that the actual output aligns well with the context, suggesting no hallucination., error: None)
  - ✅ Faithfulness (score: 1.0, threshold: 0.5, strict: False, evaluat




Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: | |  0% (0/1) [Time Taken: 00:00, ?test case/

None


Evaluating 1 test case(s) in parallel: |█|100% (1/1) [Time Taken: 00:09,  9.13s/test



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the output is fully relevant and effectively addresses the benefits of using transformers in natural language processing. Great job!, error: None)
  - ✅ Contextual Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the context perfectly aligns with the input, highlighting the benefits of transformers in NLP such as handling long-range dependencies and parallel processing. Great job!, error: None)
  - ✅ Hallucination (score: 0.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The score is 0.00 because there are no factual alignments or explicit contradictions identified, indicating that the actual output aligns well with the context provided., error: None)
  - ✅ Faithfulness (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The score is





=== GPT-4o Evaluation Results ===

Metrics Summary:
 - Answer Relevancy: Score = 1.0, Success = ✅
   Reason: The score is 1.00 because the output perfectly addresses the benefits of using transformers in natural language processing without any irrelevant information. Great job!
 - Contextual Relevancy: Score = 1.0, Success = ✅
   Reason: The score is 1.00 because the retrieval context perfectly addresses the input by highlighting how transformers handle long-range dependencies, enable parallel processing, and are adaptable for various NLP tasks. Great job!
 - Hallucination: Score = 0.0, Success = ✅
   Reason: The score is 0.00 because there are no factual alignments and the contradictions indicate that the actual output aligns well with the context, suggesting no hallucination.
 - Faithfulness: Score = 1.0, Success = ✅
   Reason: The score is 1.00 because there are no contradictions, indicating that the actual output is perfectly aligned with the retrieval context. Great job!

Output 

In [44]:
import os
import re
import json
from dotenv import load_dotenv
from deepeval import evaluate
from deepeval.test_case import LLMTestCase
from langchain_openai import ChatOpenAI

# Purpose of this cell section: build the list of deepeval metrics that can be
# imported, query GPT-4o with one prompt, and evaluate its reply against a
# fixed reference context.

# Each metric is imported and constructed inside its own try block so that a
# failing import of one metric does not prevent the others from being used.
# Failures are recorded as "<MetricName>: <error>" strings and reported later.
# Only ImportError is caught; any other exception raised while constructing a
# metric propagates.
available_metrics = []
metric_import_errors = []

try:
    from deepeval.metrics import AnswerRelevancyMetric
    available_metrics.append(AnswerRelevancyMetric(threshold=0.5))
except ImportError as e:
    metric_import_errors.append(f"AnswerRelevancyMetric: {e}")

try:
    from deepeval.metrics import ContextualRelevancyMetric
    available_metrics.append(ContextualRelevancyMetric(threshold=0.5))
except ImportError as e:
    metric_import_errors.append(f"ContextualRelevancyMetric: {e}")

try:
    from deepeval.metrics import HallucinationMetric
    available_metrics.append(HallucinationMetric(threshold=0.5))
except ImportError as e:
    metric_import_errors.append(f"HallucinationMetric: {e}")

try:
    from deepeval.metrics import FaithfulnessMetric
    available_metrics.append(FaithfulnessMetric(threshold=0.5))
except ImportError as e:
    metric_import_errors.append(f"FaithfulnessMetric: {e}")

# Load environment variables from .env. os.getenv returns None when the key is
# absent; no check is performed here.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

# Initialize GPT-4o LLM via LangChain (temperature=0)
gpt4o_chat = ChatOpenAI(model="gpt-4o", temperature=0, openai_api_key=openai_api_key)

# Define a prompt for the LLM and the context it will be evaluated against
prompt = "Explain the benefits of using transformers in natural language processing."
context = "Transformers enable handling long-range dependencies, parallel processing, and fine-tuning for various NLP tasks."

# Query GPT-4o once; `.content` is the reply text that the regex below
# operates on.
gpt4_response = gpt4o_chat.invoke(prompt).content

# Remove Markdown bold markers: every `**text**` span whose inner text contains
# no asterisk is replaced by the inner text alone.
cleaned_gpt4_response = re.sub(r'\*\*([^*]+)\*\*', r'\1', gpt4_response)

# Create the test case. The same context sentence is passed both as
# `retrieval_context` and as `context`, each wrapped in a one-element list.
# NOTE(review): which metric reads which of the two fields is not visible
# here -- confirm against the deepeval metric documentation.
gpt4_test_case = LLMTestCase(
    input=prompt,
    actual_output=cleaned_gpt4_response,
    retrieval_context=[context],  # For retrieval-based metrics
    context=[context]  # Wrapping the context string in a list
)

# Evaluate the single test case with every metric that imported successfully
evaluation_results_gpt4 = evaluate([gpt4_test_case], available_metrics)

# Print evaluation summary for GPT-4o only
def print_evaluation_summary(model_name, evaluation_results):
    """Print a condensed evaluation report for the given model.

    Output starts with a banner naming the model. Each entry of
    ``evaluation_results.test_results`` then contributes the model name,
    the input, a score/success line and a reason line for every metric,
    and the model's answer enclosed between two rules of fifty ``=``.

    Args:
        model_name: Label shown in the banner and section headings.
        evaluation_results: Object providing ``test_results``; every item
            supplies ``input``, ``metrics_data`` and ``actual_output``,
            and every metric ``name``, ``score``, ``success`` and
            ``reason``.
    """
    separator = "=" * 50
    print(f"\n=== {model_name} Evaluation Results ===\n")

    for outcome in evaluation_results.test_results:
        # The three heading lines are emitted with a single print call; the
        # newline separator reproduces the blank lines between them.
        print(
            f"Model: {model_name}\n",
            f"Input: {outcome.input}\n",
            "Metrics Summary:",
            sep="\n",
        )

        for item in outcome.metrics_data:
            label, value = item.name, item.score
            if item.success:
                status = '✅'
            else:
                status = '❌'
            print(
                f" - {label}: Score = {value}, Success = {status}",
                f"   Reason: {item.reason}",
                sep="\n",
            )

        # Model answer framed by a rule above and below.
        print(
            separator,
            f"\nOutput from {model_name}:\n{outcome.actual_output}\n",
            separator,
            sep="\n",
        )

# Emit the console report for GPT-4o.
print_evaluation_summary("GPT-4o", evaluation_results_gpt4)

# Report metrics that could not be imported: a heading followed by one
# bullet line per recorded failure.
if metric_import_errors:
    print(
        "\nMetric Import Errors:",
        *(f" - {error}" for error in metric_import_errors),
        sep="\n",
    )

# Store the GPT-4o result object as a plain dict under its model id and
# write it to the working directory as indented JSON.
output_data = {}
output_data["gpt-4o"] = evaluation_results_gpt4.dict()

summary_path = 'evaluation_summary.json'
with open(summary_path, mode='w') as summary_file:
    json.dump(output_data, summary_file, indent=4)

print("\nEvaluation results have been logged to 'evaluation_summary.json'")


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: | |  0% (0/1) [Time Taken: 00:00, ?test case/

None


Evaluating 1 test case(s) in parallel: |█|100% (1/1) [Time Taken: 00:10, 10.56s/test



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the explanation perfectly addressed the benefits of using transformers in natural language processing without including any irrelevant information. Great job!, error: None)
  - ✅ Contextual Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the relevant statement perfectly addresses the input by highlighting key benefits of transformers in NLP such as handling long-range dependencies and enabling parallel processing., error: None)
  - ✅ Hallucination (score: 0.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The score is 0.00 because there are no factual alignments provided and the contradictions indicate that the actual output aligns with the context, suggesting no hallucination in the output., error: None)
  - ✅ Faithfulness (score: 1.0, threshold: 0.5, strict





=== GPT-4o Evaluation Results ===

Model: GPT-4o

Input: Explain the benefits of using transformers in natural language processing.

Metrics Summary:
 - Answer Relevancy: Score = 1.0, Success = ✅
   Reason: The score is 1.00 because the explanation perfectly addressed the benefits of using transformers in natural language processing without including any irrelevant information. Great job!
 - Contextual Relevancy: Score = 1.0, Success = ✅
   Reason: The score is 1.00 because the relevant statement perfectly addresses the input by highlighting key benefits of transformers in NLP such as handling long-range dependencies and enabling parallel processing.
 - Hallucination: Score = 0.0, Success = ✅
   Reason: The score is 0.00 because there are no factual alignments provided and the contradictions indicate that the actual output aligns with the context, suggesting no hallucination in the output.
 - Faithfulness: Score = 1.0, Success = ✅
   Reason: The score is 1.00 because there are no cont

In [None]:
import os
import re
import json
from dotenv import load_dotenv
from deepeval import evaluate
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from langchain_openai import ChatOpenAI

# Purpose of this cell section: query GPT-4o once, score the reply with a
# custom G-Eval "Correctness" metric against a fixed context sentence, print
# the score and reason, and save both to a JSON file.

# Load environment variables from .env. os.getenv returns None when the key is
# absent; no check is performed here.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

# Initialize GPT-4o LLM via LangChain (temperature=0)
gpt4o_chat = ChatOpenAI(model="gpt-4o", temperature=0, openai_api_key=openai_api_key)

# Define a prompt and context for the evaluation
prompt = "Explain the benefits of using transformers in natural language processing."
context = "Transformers enable handling long-range dependencies, parallel processing, and fine-tuning for various NLP tasks."

# Retrieve the response from GPT-4o, then strip Markdown bold markers: every
# `**text**` span whose inner text contains no asterisk is replaced by the
# inner text alone.
gpt4_response = gpt4o_chat.invoke(prompt).content
cleaned_gpt4_response = re.sub(r'\*\*([^*]+)\*\*', r'\1', gpt4_response)

# Define a custom GEval metric for Correctness. The metric is told to look at
# the test case's input, actual output and context, and is created with a 0.7
# threshold, async mode and verbose mode enabled.
correctness_metric = GEval(
    name="Correctness",
    criteria="Evaluate if the actual output accurately covers the benefits of transformers in NLP based on the context provided.",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.CONTEXT],
    threshold=0.7,
    async_mode=True,
    verbose_mode=True
)

# Create a test case carrying the three fields named in evaluation_params;
# the context string is wrapped in a one-element list.
test_case = LLMTestCase(
    input=prompt,
    actual_output=cleaned_gpt4_response,
    context=[context]
)

# Measure the custom GEval metric directly on the test case. The metric's
# `score` and `reason` attributes are read afterwards, so the return value of
# measure() is not used.
correctness_metric.measure(test_case)

# Output the results for this custom metric
print("G-Eval Correctness Metric Results:")
print(f" - Score: {correctness_metric.score}")
print(f" - Reason: {correctness_metric.reason}")

# Save the score and reason to a JSON file in the working directory
output_data = {
    "G-Eval Correctness Metric": {
        "Score": correctness_metric.score,
        "Reason": correctness_metric.reason
    }
}

with open('g_eval_correctness_summary.json', 'w') as f:
    json.dump(output_data, f, indent=4)

print("\nEvaluation results have been logged to 'g_eval_correctness_summary.json'")
