In [2]:
pip install -q umap-learn scikit-learn langchain-google-genai faiss-cpu ragas


Note: you may need to restart the kernel to use updated packages.


In [4]:
# Read the Gemini API key from the Kaggle secret named "GEMINI_KEY" so the key
# itself never appears in the notebook.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
# GEMINI_API_KEY is read by the later cells that build the Gemini clients.
GEMINI_API_KEY = user_secrets.get_secret("GEMINI_KEY")

In [None]:
import os
import numpy as np
import pandas as pd
import umap
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
from sklearn.metrics.pairwise import cosine_similarity
from kaggle_secrets import UserSecretsClient
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.prompts import ChatPromptTemplate
from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from bs4 import BeautifulSoup as Soup

# GEMINI_API_KEY is not fetched in this cell: it must already be defined by the
# earlier cell that reads the Kaggle secret named "GEMINI_KEY".

# Shared Gemini clients used by the rest of this cell:
#   embeddings - embeds texts for clustering and for the FAISS index
#   model      - chat model (temperature=0) used for summaries and RAG answers
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GEMINI_API_KEY)
model = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0, google_api_key=GEMINI_API_KEY)

# Loading Documentation
def load_docs(url, depth=1):
    """Crawl `url` with a recursive loader and return the loaded documents.

    `depth` is forwarded as the loader's `max_depth`. Each fetched page is
    parsed with BeautifulSoup's "html.parser" and reduced to its text.
    """
    def html_to_text(raw_html):
        return Soup(raw_html, "html.parser").text

    crawler = RecursiveUrlLoader(url=url, max_depth=depth, extractor=html_to_text)
    documents = crawler.load()
    return documents

docs = load_docs(url="https://python.langchain.com/docs/expression_language/", depth=2)
# Text body of every loaded page, in the order load_docs returned them.
docs_texts = [page.page_content for page in docs]

# Function to compute cosine similarity thresholding
def compute_similarity(embeddings):
    """Return each embedding's mean cosine similarity to the whole set.

    Args:
        embeddings: Sequence or 2-D array with one embedding vector per row.

    Returns:
        1-D numpy array with one score per input row. Each score is the mean
        of that row of the pairwise cosine-similarity matrix. With fewer than
        two rows there is nothing to compare, so every row present gets the
        default score 1.0 (an empty input yields an empty array).
    """
    n_rows = len(embeddings)
    if n_rows < 2:
        # One score per row: the earlier fixed `[1.0]` was out of step with an
        # empty input.
        return np.ones(n_rows)
    sim_matrix = cosine_similarity(embeddings)
    return np.mean(sim_matrix, axis=1)  # Mean similarity per node

# Function for weighted summarization
def weighted_summary(texts, weights):
    """Ask the chat model for one summary of `texts`, each tagged with its weight.

    Every text is rendered on its own line as "(weight) text", with the weight
    shown to two decimals, and the joined block is sent to the model. An empty
    `texts` short-circuits to an empty string without calling the model.
    """
    if not texts:
        return ""

    tagged_lines = []
    for txt, w in zip(texts, weights):
        tagged_lines.append(f"({w:.2f}) {txt}")
    context_block = "\n".join(tagged_lines)

    summarize_prompt = ChatPromptTemplate.from_template(
        """Summarize the following text, giving more importance to lower-ranked nodes:
        {context}"""
    )
    summarizer = summarize_prompt | model | StrOutputParser()
    return summarizer.invoke({"context": context_block})

# Recursive Clustering + Summarization
def recursive_embed_cluster_summarize(texts, level=1, max_levels=3):
    """Build the summary levels of a cluster-and-summarise tree over `texts`.

    At each level the texts are embedded, reduced with UMAP, clustered with a
    Gaussian mixture, filtered by mean cosine similarity, and summarised one
    cluster at a time. The cluster summaries become the next level's inputs.

    Args:
        texts: List of strings to summarise at this level.
        level: Depth of this call; stored in the "level" column.
        max_levels: Deepest level that may be produced.

    Returns:
        Dict mapping level -> DataFrame with columns "summary", "level" and
        "cluster", one row per summarised cluster. The dict is empty when
        `texts` is empty or every text is filtered out.
    """
    results = {}

    if not texts:
        return results

    # A single text has nothing to be clustered with; it is its own summary.
    if len(texts) == 1:
        results[level] = pd.DataFrame({"summary": texts, "level": level, "cluster": [0]})
        return results

    n_texts = len(texts)
    text_embeddings = np.array(embeddings.embed_documents(texts))

    # UMAP needs fewer output dimensions than samples, so the component and
    # neighbour counts are clamped to the input size. Inputs too small to
    # reduce are clustered on the raw embeddings instead.
    if n_texts > 3:
        reducer = umap.UMAP(
            n_components=min(10, n_texts - 2),
            n_neighbors=min(15, n_texts - 1),
            metric="cosine",
        )
        cluster_features = reducer.fit_transform(text_embeddings)
    else:
        cluster_features = text_embeddings

    # At most two clusters, and always fewer clusters than texts, so every
    # level is strictly smaller than its input and the recursion converges.
    n_clusters = min(2, n_texts - 1)
    gmm = GaussianMixture(n_components=n_clusters, random_state=224)
    try:
        cluster_labels = np.asarray(gmm.fit_predict(cluster_features))
    except ValueError:
        # Put everything in one cluster if the mixture cannot be fitted.
        cluster_labels = np.zeros(n_texts, dtype=int)

    # Drop the texts whose mean similarity falls below the 30th percentile.
    # `>=` (not `>`) keeps at least the best-scoring text, so tied scores,
    # which always occur with exactly two texts, no longer discard everything.
    similarity_scores = compute_similarity(text_embeddings)
    threshold = np.percentile(similarity_scores, 30)
    relevant_indices = [i for i, score in enumerate(similarity_scores) if score >= threshold]

    if not relevant_indices:
        return results

    # Every text on the same level carries the same weight, 1 / (level + 1).
    node_weight = 1 / (level + 1)

    # One summary row per cluster. Building a single summary next to a
    # per-text cluster column produced columns of different lengths.
    rows = []
    for cluster_id in sorted(set(cluster_labels.tolist())):
        members = [i for i in relevant_indices if cluster_labels[i] == cluster_id]
        if not members:
            continue  # every member of this cluster was filtered out
        rows.append({
            "summary": weighted_summary(
                [texts[i] for i in members],
                [node_weight] * len(members),
            ),
            "level": level,
            "cluster": int(cluster_id),
        })

    df_summary = pd.DataFrame(rows, columns=["summary", "level", "cluster"])
    results[level] = df_summary

    # Recurse on this level's summaries until one remains or the depth limit
    # is reached.
    if level < max_levels and len(df_summary) > 1:
        new_texts = df_summary["summary"].tolist()
        results.update(recursive_embed_cluster_summarize(new_texts, level + 1, max_levels))

    return results

# Build tree with weighted summarization
results = recursive_embed_cluster_summarize(texts=docs_texts, level=1, max_levels=3)

# Index the raw pages together with every level's summaries in one FAISS store.
all_texts = docs_texts + [
    summary_text
    for level_frame in results.values()
    for summary_text in level_frame["summary"].tolist()
]
vectorstore = FAISS.from_texts(texts=all_texts, embedding=embeddings)
retriever = vectorstore.as_retriever()

# RAG prompt template: it is filled with two variables, `context` (the text of
# the retrieved documents) and `question` (the user's query).
prompt = ChatPromptTemplate.from_template("Answer based on context:\n{context}\nQuestion: {question}")

def format_docs(docs):
    """Concatenate the `page_content` of each document, separated by a blank line."""
    page_texts = (document.page_content for document in docs)
    return "\n\n".join(page_texts)

# RAG pipeline. The chain's input (the question string) is routed two ways:
#   "context"  - through the retriever, then format_docs, to get one text block
#   "question" - passed through unchanged
# The resulting dict fills `prompt`, which is sent to `model`; the reply is
# reduced to a plain string by StrOutputParser.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

# Query Example
response = rag_chain.invoke("How does LangChain handle hierarchical expression trees?")
print(response)



The provided text doesn't describe how LangChain handles hierarchical expression trees.  It focuses on the LangChain Expression Language (LCEL), which uses `RunnableSequence` and `RunnableParallel` as primary composition primitives to build chains of runnables.  While these primitives allow for sequential and parallel execution of multiple runnables, they don't explicitly address the concept of hierarchical expression trees in the way a more general-purpose expression language might.  The text suggests that for complex scenarios beyond simple sequential or parallel chains, LangGraph is the recommended approach.


In [44]:
retrieved_docs = retriever.invoke("How does LangChain handle hierarchical expression trees?")
# Show the raw retrieved text, one blank line between documents.
print("\n\n".join(doc.page_content for doc in retrieved_docs))







LangChain Expression Language (LCEL) | 🦜️🔗 LangChain






Skip to main contentJoin us at  Interrupt: The Agent AI Conference by LangChain on May 13 & 14 in San Francisco!IntegrationsAPI ReferenceMoreContributingPeopleError referenceLangSmithLangGraphLangChain HubLangChain JS/TSv0.3v0.3v0.2v0.1üí¨SearchIntroductionTutorialsBuild a Question Answering application over a Graph DatabaseTutorialsBuild a simple LLM application with chat models and prompt templatesBuild a ChatbotBuild a Retrieval Augmented Generation (RAG) App: Part 2Build an Extraction ChainBuild an AgentTaggingBuild a Retrieval Augmented Generation (RAG) App: Part 1Build a semantic search engineBuild a Question/Answering system over SQL dataSummarize TextHow-to guidesHow-to guidesHow to use tools in a chainHow to use a vectorstore as a retrieverHow to add memory to chatbotsHow to use example selectorsHow to add a semantic layer over graph databaseHow to invoke runnables in parallelHow to stream chat model resp

In [6]:
pip install -q deepeval


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m565.1/565.1 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.9/55.9 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.7/118.7 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.0/65.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.4/177.4 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m319.7/319.7 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.1/46.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.6/40.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:

## Trying RAGAS for evaluation (needs an OpenAI LLM)

In [None]:
# from ragas import evaluate
# from ragas.metrics import (
#     answer_relevancy,
#     faithfulness,
#     context_precision,
#     context_recall
# )
# from langchain_google_genai import ChatGoogleGenerativeAI

# # # Load Gemini API Key
# # from kaggle_secrets import UserSecretsClient
# # secrets = UserSecretsClient()
# # GEMINI_API_KEY = secrets.get_secret("GEMINI_API_KEY")

# # Set Gemini as the LLM for Ragas
# gemini_llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", google_api_key=GEMINI_API_KEY)

# # Define test dataset for evaluation
# test_data = [
#     {"question": "How does LangChain handle hierarchical expression trees?", "ground_truth": "LangChain structures expression trees using modular components to allow hierarchical processing."},
#     {"question": "What are the core features of LangChain's RAG pipeline?", "ground_truth": "LangChain provides vector-based retrieval, prompt chaining, and API integrations for LLMs."}
# ]

# # Retrieve answers using RAPTOR RAG
# for sample in test_data:
#     sample["generated_answer"] = rag_chain.invoke(sample["question"])

# # Convert test dataset to DataFrame
# df_test = pd.DataFrame(test_data)

# # Pass `llm=gemini_llm` to use Gemini instead of OpenAI
# results = evaluate(
#     dataset=df_test,
#     metrics=[answer_relevancy, faithfulness, context_precision, context_recall],
#     llm=gemini_llm  # Use Gemini instead of OpenAI
# )

# # Print evaluation scores
# print("Evaluation Results:\n", results)


OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

## Trying DeepEval for eval

In [None]:
from deepeval.models.base_model import DeepEvalBaseLLM
from langchain_google_genai import ChatGoogleGenerativeAI
from pydantic import BaseModel
import json

# Pydantic models describing the structured replies expected from the LLM.
# NOTE(review): a later cell imports `Claims` and `Statements` from deepeval,
# which rebinds these two names; confirm these local definitions are still needed.
class Claims(BaseModel):
    """Structured reply holding a list of claim strings."""
    claims: list[str]

class Statements(BaseModel):
    """Structured reply holding a list of statement strings."""
    statements: list[str]

# Creating custom Gemini wrapper
class GeminiLLM(DeepEvalBaseLLM):
    """DeepEval model wrapper that answers prompts with a Gemini chat model.

    `generate` / `a_generate` send the prompt to Gemini and convert the reply
    into the pydantic `schema` that was requested, or into a plain dict when
    no schema is given.
    """

    # Schemas (matched by class name) whose payload is a single list field.
    # A key missing from the reply is replaced by an empty list. Matching by
    # name means this class does not need the schema classes to be imported.
    _LIST_FIELD_BY_SCHEMA = {
        "Claims": "claims",
        "Statements": "statements",
        "Truths": "truths",
        "Verdicts": "verdicts",
    }

    def __init__(self, api_key, model_name="gemini-1.5-flash", verbose=True):
        """Create the Gemini client.

        Args:
            api_key: Google API key passed to the Gemini client.
            model_name: Gemini model identifier.
            verbose: When True (the default), print the raw reply and every
                parsing step; pass False to silence the debug output.
        """
        self.llm = ChatGoogleGenerativeAI(model=model_name, google_api_key=api_key)
        self.verbose = verbose

    def load_model(self):
        """Return the underlying chat model."""
        return self.llm

    def generate(self, prompt, schema=None, **kwargs):
        """Send `prompt` to Gemini synchronously and return the formatted reply.

        Returns an instance of `schema` when one is given, otherwise the parsed
        dict. Extra keyword arguments are accepted and ignored.
        """
        response = self.llm.invoke(prompt)
        return self._format_response(response, schema)

    async def a_generate(self, prompt, schema=None, **kwargs):
        """Asynchronous counterpart of `generate`."""
        response = await self.llm.ainvoke(prompt)
        return self._format_response(response, schema)

    def get_model_name(self):
        """Return the display name reported for this model."""
        return "Gemini"

    def _log(self, *lines):
        """Print each item on its own line unless verbose output is disabled."""
        if getattr(self, "verbose", True):
            for line in lines:
                print(line)

    @staticmethod
    def _strip_code_fence(text):
        """Remove a surrounding markdown code fence (optionally tagged json).

        `str.strip("```json")` is not used here because it strips a *set of
        characters*, which also eats leading or trailing j/s/o/n letters of an
        unfenced reply.
        """
        text = text.strip()
        if text.startswith("```"):
            text = text[3:]
            if text[:4].lower() == "json":
                text = text[4:]
        if text.endswith("```"):
            text = text[:-3]
        return text.strip()

    def _format_response(self, response, schema=None):
        """Convert a Gemini reply into `schema`, or into a dict when `schema` is None.

        The reply text is unwrapped from any markdown code fence and parsed as
        JSON. If it is not a JSON object, the text is split on ". " and offered
        as both "claims" and "statements".

        Args:
            response: Reply object exposing `.content`, or a plain string.
            schema: Pydantic model class to build, or None.

        Returns:
            An instance of `schema`, or the parsed/fallback dict when `schema`
            is None.

        Raises:
            ValueError: If the reply cannot be converted into `schema`.
                Earlier versions returned a `Statements` object here even for
                unrelated schemas, which only failed later at the call site.
        """
        self._log(
            "\n🔹 Received Raw Response from LLM:",
            "-------------------------------------------------",
            response,
            "-------------------------------------------------\n",
        )

        # Accept both message objects (with `.content`) and plain strings, so
        # the text is always defined before parsing starts.
        content = getattr(response, "content", response)
        if not isinstance(content, str):
            content = str(content)
        response_content = self._strip_code_fence(content)

        try:
            response_data = json.loads(response_content)
            if not isinstance(response_data, dict):
                # Schemas are built from keyword arguments, so only a JSON
                # object is usable; anything else takes the fallback path.
                raise ValueError("reply is valid JSON but not a JSON object")
            self._log(
                "\n Successfully Parsed JSON Response:",
                json.dumps(response_data, indent=2),
            )
        except ValueError as e:  # json.JSONDecodeError is a ValueError
            self._log(
                "\n JSON Parsing Failed! Using Fallback Mechanism.",
                f" Error: {e}",
            )
            response_data = {
                "claims": response_content.split(". "),
                "statements": response_content.split(". "),
            }
            self._log(
                "\n🔹 Fallback Data Structure:",
                json.dumps(response_data, indent=2),
            )

        self._log(
            f"\n🔹 Applying Schema: {schema}",
            "-------------------------------------------------",
        )

        if schema is None:
            return response_data

        list_field = self._LIST_FIELD_BY_SCHEMA.get(getattr(schema, "__name__", ""))
        try:
            if list_field is not None:
                return schema(**{list_field: response_data.get(list_field, [])})
            return schema(**response_data)
        except Exception as e:
            self._log("\n Schema Conversion Failed!", f"Error: {e}")
            raise ValueError(
                f"Gemini reply could not be converted to {schema!r}: {e}"
            ) from e


In [None]:
import nest_asyncio
from deepeval import evaluate
from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval.metrics import (
    AnswerRelevancyMetric,
    FaithfulnessMetric,
    ContextualRelevancyMetric,
    HallucinationMetric
)
from deepeval.test_case import LLMTestCase
from langchain_google_genai import ChatGoogleGenerativeAI
from pydantic import BaseModel
import json
from deepeval.metrics.faithfulness.schema import Claims, Truths
from deepeval.metrics.answer_relevancy.schema import Statements
from deepeval.metrics.contextual_relevancy.schema import ContextualRelevancyVerdicts
from deepeval.metrics.hallucination.schema import Verdicts



# # Fix Async Issues
# nest_asyncio.apply()

# # Define Response Models
# class Claims(BaseModel):
#     claims: list[str]

# class Statements(BaseModel):
#     statements: list[str]

# # Custom Gemini Wrapper for DeepEval
# class GeminiLLM(DeepEvalBaseLLM):
#     def __init__(self, api_key, model_name="gemini-1.5-flash"):
#         self.llm = ChatGoogleGenerativeAI(model=model_name, google_api_key=api_key)

#     def load_model(self):
#         return self.llm

#     def generate(self, prompt, schema=None, **kwargs):
#         response = self.llm.invoke(prompt)
#         return self._format_response(response, schema)

#     async def a_generate(self, prompt, schema=None, **kwargs):
#         response = await self.llm.ainvoke(prompt)
#         return self._format_response(response, schema)

#     def get_model_name(self):
#         return "Gemini"

#     def _format_response(self, response, schema=None):
#         """
#         Fix: Correctly extracts claims/statements while handling both JSON & text responses.
#         """
#         try:
#             response_data = json.loads(response.content)
#         except (json.JSONDecodeError, AttributeError):
#             response_data = {"claims": response.content.split(". "), "statements": response.content.split(". ")}

#         if schema == Claims:
#             return Claims(claims=response_data.get("claims", response.content.split(". ")))
#         elif schema == Statements:
#             return Statements(statements=response_data.get("statements", response.content.split(". ")))
#         return response_data  # Default raw response if schema is not defined

# Gemini-backed judge model that is passed to every DeepEval metric in this cell.
gemini_llm = GeminiLLM(api_key=GEMINI_API_KEY)

def _rag_case(question, answer, retrieved, reference):
    """Build one LLMTestCase with a single retrieval-context and context entry."""
    return LLMTestCase(
        input=question,
        actual_output=answer,
        retrieval_context=[retrieved],
        context=[reference],
    )

# Hand-written evaluation samples; each one sets input, actual_output,
# retrieval_context and context.
test_cases = [
    _rag_case(
        "How does LangChain handle hierarchical expression trees?",
        "LangChain structures expression trees using modular components to allow hierarchical processing.",
        "LangChain enables hierarchical compositions using modular components for structured processing.",
        "LangChain structures workflows using modular components, allowing hierarchical execution of tasks.",
    ),
    _rag_case(
        "What are the core features of LangChain's RAG pipeline?",
        "LangChain provides vector-based retrieval, prompt chaining, and API integrations for LLMs.",
        "LangChain integrates FAISS for retrieval and enables prompt chaining across multiple models.",
        "LangChain’s RAG pipeline includes vector retrieval, document chunking, and LLM chaining.",
    ),
    _rag_case(
        "What is the role of FAISS in LangChain?",
        "FAISS is used in LangChain for structured prompting.",
        "FAISS is integrated into LangChain to enhance document retrieval and improve search accuracy.",
        "LangChain leverages FAISS for optimized vector search, enabling efficient retrieval for LLM queries.",
    ),
    _rag_case(
        "How does LangChain handle long-form text retrieval?",
        "LangChain employs hybrid search to retrieve long-form text efficiently.",
        "LangChain combines keyword-based and dense retrieval models for handling long-form content.",
        "Hybrid search in LangChain integrates sparse and dense retrieval to improve long-text processing.",
    ),
]

# DeepEval metrics, all judged by `gemini_llm`. Each metric is given its own
# `threshold` (0.7 for the first three, 0.5 for hallucination), and
# `include_reason=True` asks each metric for a textual explanation as well.
metrics = [
    AnswerRelevancyMetric(model=gemini_llm, threshold=0.7, include_reason=True),
    FaithfulnessMetric(model=gemini_llm, threshold=0.7, include_reason=True),
    ContextualRelevancyMetric(model=gemini_llm, threshold=0.7, include_reason=True),
    HallucinationMetric(model=gemini_llm, threshold=0.5, include_reason=True)
]

# Run every metric against every hand-written test case.
evaluation_results = evaluate(test_cases, metrics)

# Print the raw result object returned by `evaluate`.
print("DeepEval Evaluation Results:\n", evaluation_results)


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 4 test case(s) in parallel: |          |  0% (0/4) [Time Taken: 00:00, ?test case/s]


🔹 Received Raw Response from LLM:
-------------------------------------------------
content='```json\n{\n  "claims": [\n    "FAISS is used in LangChain for structured prompting."\n  ]\n}\n```' additional_kwargs={} response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': []} id='run-51bfc562-b7a9-4943-ad69-b3bcd07bc7b9-0' usage_metadata={'input_tokens': 306, 'output_tokens': 32, 'total_tokens': 338, 'input_token_details': {'cache_read': 0}}
-------------------------------------------------


✅ Successfully Parsed JSON Response:
{
  "claims": [
    "FAISS is used in LangChain for structured prompting."
  ]
}

🔹 Applying Schema: <class 'deepeval.metrics.faithfulness.schema.Claims'>
-------------------------------------------------

🔹 Received Raw Response from LLM:
-------------------------------------------------
content='```json\n{\n  "claims": [\n    "LangChain employs hybrid search to retrieve long-form text efficient

Evaluating 4 test case(s) in parallel: |█████     | 50% (2/4) [Time Taken: 00:02,  1.01test case/s]


🔹 Received Raw Response from LLM:
-------------------------------------------------
content='```json\n{\n  "reason": "The score is 1.00 because there are no contradictions between the actual output and the retrieval context.  Fantastic work!"\n}\n```' additional_kwargs={} response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': []} id='run-18229783-6969-4bf4-bbf9-dc174c14229e-0' usage_metadata={'input_tokens': 236, 'output_tokens': 41, 'total_tokens': 277, 'input_token_details': {'cache_read': 0}}
-------------------------------------------------


✅ Successfully Parsed JSON Response:
{
  "reason": "The score is 1.00 because there are no contradictions between the actual output and the retrieval context.  Fantastic work!"
}

🔹 Applying Schema: <class 'deepeval.metrics.faithfulness.schema.Reason'>
-------------------------------------------------

🔹 Received Raw Response from LLM:
---------------------------------------

Evaluating 4 test case(s) in parallel: |██████████|100% (4/4) [Time Taken: 00:02,  1.49test case/s]


🔹 Received Raw Response from LLM:
-------------------------------------------------
content='```json\n{\n  "reason": "The score is 0.67 because the response, while partially addressing the query about LangChain, includes irrelevant discussion of modularity.  The explanation lacks a direct answer to how LangChain specifically handles hierarchical expression trees."\n}\n```' additional_kwargs={} response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': []} id='run-ae846d81-e1ec-41d2-992b-6465bf9b43eb-0' usage_metadata={'input_tokens': 237, 'output_tokens': 60, 'total_tokens': 297, 'input_token_details': {'cache_read': 0}}
-------------------------------------------------


✅ Successfully Parsed JSON Response:
{
  "reason": "The score is 0.67 because the response, while partially addressing the query about LangChain, includes irrelevant discussion of modularity.  The explanation lacks a direct answer to how LangChain speci




DeepEval Evaluation Results:
 test_results=[TestResult(name='test_case_3', success=True, metrics_data=[MetricData(name='Answer Relevancy', threshold=0.7, success=True, score=1.0, reason='The score is 1.00 because the response perfectly addresses how LangChain handles long-form text retrieval, with no irrelevant information.', strict_mode=False, evaluation_model='Gemini', error=None, evaluation_cost=None, verbose_logs='Statements:\n[\n    "LangChain employs hybrid search.",\n    "LangChain retrieves long-form text efficiently."\n] \n \nVerdicts:\n[\n    {\n        "verdict": "idk",\n        "reason": null\n    },\n    {\n        "verdict": "yes",\n        "reason": null\n    }\n]'), MetricData(name='Faithfulness', threshold=0.7, success=True, score=1.0, reason='The score is 1.00 because there are no contradictions between the actual output and the retrieval context.  Fantastic work!', strict_mode=False, evaluation_model='Gemini', error=None, evaluation_cost=None, verbose_logs='Truths (l

In [21]:
# type(response)


str