# Implementing Basic RAG QA with Langchain
This notebook demonstrates how to implement a basic Retrieval-Augmented Generation (RAG) chain using Turkish legislation data. The overall approach is as follows:
1. Load Legislation Data & Structure Metadata
2. Create Embeddings for the Legislation Data using Amazon's Titan Embeddings model, and save these embeddings locally using Chroma.
3. Create a Question Answering (QA) chain which retrieves context based on the embeddings saved in Chroma, serving these as context to the Amazon Titan Express LLM to answer the provided user prompt.
4. Format the response so that source materials can be cited.

This implementation mostly follows these Langchain tutorials:
- https://python.langchain.com/docs/modules/data_connection/document_loaders/json#using-jsonloader

In [1]:
# Define metadata extraction function so we can return links as sources
def metadata_func(record: dict, metadata: dict) -> dict:
    metadata["legislation_title"] = record.get("title")
    metadata["source"] = record.get("url")
    
    return metadata

In [4]:
# Import JSON FAQ File using JSONLoader
from langchain_community.document_loaders import JSONLoader
from pprint import pprint

file_path='../data/processed/legislation_data/National_Legislation_Content_TR.json'

loader = JSONLoader(
    file_path=file_path,
    jq_schema=".[]",
    content_key="content",
    text_content=False, # Need this because content is a list, not string
    metadata_func=metadata_func
)

data = loader.load()

In [11]:
from langchain_community.embeddings import BedrockEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_docs(documents,chunk_size=4000,chunk_overlap=20):
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
  docs = text_splitter.split_documents(documents)
  return docs

docs = split_docs(data)


embeddings = BedrockEmbeddings(
    model_id="amazon.titan-embed-text-v1", region_name="us-east-1"
)

vectorstore = Chroma.from_documents(documents=docs, embedding=embeddings, persist_directory='./data/processed/legislation_data/vectordata')
vectorstore.persist()

In [4]:
from langchain import hub
from langchain_community.llms import Bedrock
from langchain_community.embeddings import BedrockEmbeddings
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_community.vectorstores import Chroma


# Retrieve and generate answers using relevant FAQs

embeddings = BedrockEmbeddings(
    model_id="amazon.titan-embed-text-v1", region_name="us-east-1"
)

vectorstore = Chroma(persist_directory="./data/processed/legislation_data/vectordata", embeddings=embeddings)
retriever = vectorstore.as_retriever()

prompt_template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer from the provided context, just say that your training materials don't include this information, don't try to make up an answer.
Keep the answer as concise as possible.

{context}

Question: {question}

Helpful Answer:"""

prompt_template_TR = """Aşağıdaki bilgilerle kulanarak soran soruyu cevaplayın. Eğer mecvut bilgileriyle soruyu cevaplamak mümkün değilse, mevcut bilgilerde sorunun cevabı bulunmadğını açıklayın.

{context}

Soru: {question}

Yardımcı cevabı:"""
custom_rag_prompt = PromptTemplate.from_template(prompt_template_TR)

# Instantiate Claude 2.1 with parameters passed via the CreateInferenceModifier helper
from utils import CreateInferenceModifier # Import the function from utils.py

# Define the universal set of modifier parameters
modifiers = {"max_tokens": 20000,
    "temperature": 0.5,
    "top_k": 250,
    "top_p": 1,
    "stop_sequences": ["\n\nHuman"],
    }

llm = Bedrock(model_id="anthropic.claude-v2:1", model_kwargs=CreateInferenceModifier("claude", modifiers))

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

from langchain_core.runnables import RunnableParallel

rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    | custom_rag_prompt
    | llm
    | StrOutputParser()
)

rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

NameError: name 'BedrockEmbeddings' is not defined

In [3]:
# Define a function to extract unique URLs used in the retrieved source materials.
def extract_unique_urls(response):
    unique_urls = set()  # Use a set to store unique URLs
    
    # Iterate through each document in the 'context'
    for document in response['context']:
        source_url = document.metadata['source']  # Extract the 'source' URL
        unique_urls.add(source_url)  # Add the URL to the set
    
    # Convert the set of unique URLs to a string
    urls_string = '; '.join(unique_urls)
    
    return urls_string

# Invoke the chain and print the response and sources.
response = rag_chain_with_source.invoke("Türkiye'de sığınmacı çocukları ile ilgi mevzuatı hangi kanunları ve bunların hangi bölümleri kapsar?")
print(response["answer"])
print(f"Sources: {extract_unique_urls(response)}")

ValueError: You must provide embeddings or a function to compute them