In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQA
from langchain.chains import RetrievalQAWithSourcesChain

import pinecone
from tqdm.auto import tqdm
from uuid import uuid4
from datasets import load_dataset
import tiktoken

## Creating the Knowledge Base

We have two primary types of knowledge for LLMs:
1. `parametric knowledge`: Refers to everything the LLM learned during training and acts as a frozen snapshot of the world for the LLM.

2. `source knowledge`: This knowledge covers any information fed into the LLM via the input prompt.

When we talk about retrieval augmentation, we’re talking about giving the LLM valuable source knowledge.

In our example, we will use a subset of Wikipedia. To get that data, we will use Hugging Face datasets like so:

In [None]:
# Load the first 10,000 records of the train split of the "20220301.simple"
# configuration of the Hugging Face "wikipedia" dataset.
data = load_dataset("wikipedia", "20220301.simple", split='train[:10000]')
# Display the dataset summary as the cell output.
data

In [None]:
# Inspect the first record to see which fields each article carries.
data[0]

## Creating Chunks

Most datasets will contain records that include a lot of text. Because of this, our first task is usually to build a preprocessing pipeline that chunks those long bits of text into more concise chunks. Splitting our text into smaller chunks is essential for several reasons:
* `Improve embedding accuracy` — this will improve the relevance of results later.

* `Reduce the amount of text` fed into our LLM as source knowledge. Limiting input improves the LLM's ability to follow instructions, reduces generation costs, and helps us get faster responses.

* Provide users with more `precise information sources` as we can narrow down the information source to a smaller chunk of text.

* In the case of very long chunks of text, we will exceed the maximum context window of our embedding or completion models. Splitting these chunks makes it possible to `add these longer documents` to our knowledge base.

To create these chunks, we first need a way of measuring the length of our text. LLMs don't measure text by word or character — they measure it by `tokens`.

In [None]:
## `gpt-3.5-turbo` (our LLM) tokenizes text with the `cl100k_base` encoding, so we
## ask tiktoken for the encoding registered for that model instead of hard-coding
## `p50k_base`, which belongs to the older Codex / text-davinci models and would
## give token counts that do not match what the model actually sees.

tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")

tokenizer

In [None]:
## Token-based length function (used to size chunks in tokens rather than characters)

def tiktoken_len(text):
    """Return the number of tokens `text` encodes to with the notebook's `tokenizer`.

    `disallowed_special=()` switches off tiktoken's special-token check, so text
    that happens to contain a special-token string is encoded as ordinary text
    instead of raising.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

In [None]:
# Sanity check: count the tokens of a sample sentence.
tiktoken_len("hello I am a chunk of text and using the tiktoken_len function we can find the length of this chunk of text in tokens")

In [None]:
## Splitting the Text into Chunks

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,  # upper bound on a chunk's length, as measured by `length_function`
    chunk_overlap=20,  # maximum length shared between two neighbouring chunks
    length_function=tiktoken_len,  # measure length in tokens, not characters
    separators=["\n\n", "\n", " ", ""],  # candidate split points, coarsest first
)

In [None]:
# Preview (up to) the first three chunks of the first article.
chunks = text_splitter.split_text(data[0]["text"])[:3]

# Print one token count per chunk. Unlike indexing chunks[0], chunks[1] and
# chunks[2] directly, this does not raise IndexError when the article yields
# fewer than three chunks; for three chunks the printed output is the same.
print(*map(tiktoken_len, chunks))
chunks

## Creating Embeddings

We take the chunks of text we'd like to store in our knowledge base and encode each chunk into a `vector embedding`.

We then create the embeddings with `another AI` language model that has learned to translate human-readable text into AI-readable embeddings.

Finally, we store these embeddings in our `vector database` (more on this soon) and can find text chunks with similar meanings by calculating the distance between embeddings in vector space.



In [None]:
## We are going to use `ada` for our embeddings (default)

# Read the key inside a `with` block so the file handle is closed, and strip
# surrounding whitespace: a trailing newline left by an editor would otherwise
# be sent as part of the API key.
# NOTE(review): the chat-model cell further down reads the OpenAI key from
# "openai_api.txt" instead of "api_key.txt" — confirm which file is intended.
with open("api_key.txt", "r") as key_file:
    openai_api_key = key_file.read().strip()

embed = OpenAIEmbeddings(openai_api_key=openai_api_key)

In [None]:
# Embed two sample sentences to check the embedding model and learn the
# vector size; `res` is reused later to set the index dimension.
texts = [
    "this is the first chunk of text",
    "then another second chunk of text is here"
]

res = embed.embed_documents(texts)
# (number of embeddings returned, length of the first embedding)
len(res), len(res[0])

From this, we get two embeddings as we passed in two chunks of text. Each embedding is a 1536-dimensional vector.

## Vector Database

A vector database is a type of knowledge base that allows us to scale the search of similar embeddings to billions of records, manage our knowledge base by adding, updating, or removing records, and even do things like filtering.

In [None]:
## Creating Database Index

index_name = "langchain-retrieval-augmentation"

# Close the key file deterministically and drop any trailing newline so it is
# not sent as part of the API key.
with open("pinecone_api.txt", "r") as key_file:
    pinecone_api_key = key_file.read().strip()

pinecone.init(api_key=pinecone_api_key, environment="asia-northeast1-gcp")

# Only create the index when it does not exist yet, so the cell can be re-run
# without failing on an "already exists" error.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        name=index_name,
        metric="dotproduct",
        dimension=len(res[0]),  # 1536 in our case
    )

In [None]:
## Connecting to the index

# Handle used for all subsequent reads/writes against `index_name`.
index = pinecone.Index(index_name)

# Display the index statistics as the cell output.
index.describe_index_stats()

We can clearly see that the index is empty.

The indexing process consists of us iterating through the data we'd like to add to our knowledge base, creating IDs, embeddings, and metadata — then adding these to the index.

In [None]:
# Minimum number of chunks to accumulate before embedding and upserting them.
batch_limit = 100

texts = []
metadatas = []


def _upsert_batch(batch_texts, batch_metadatas):
    """Embed one batch of chunks and upsert it into the Pinecone index.

    Each chunk gets a fresh random UUID as its vector ID. Does nothing when
    the batch is empty, so it is safe to call for the final flush.
    """
    if not batch_texts:
        return
    ids = [str(uuid4()) for _ in batch_texts]
    embeds = embed.embed_documents(batch_texts)
    index.upsert(vectors=zip(ids, embeds, batch_metadatas))


for record in tqdm(data):
    # Metadata shared by every chunk of this record
    metadata = {
        "wiki-id": str(record["id"]),
        "source": record["url"],
        "title": record["title"]
    }

    # Splitting `text` into Chunks
    record_texts = text_splitter.split_text(record["text"])

    # One metadata dict per chunk: its position, its raw text, and the record metadata
    record_metadatas = [{"chunk": j, "text": text, **metadata} for j, text in enumerate(record_texts)]

    # Appending to Current Batches
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)

    # Once `batch_limit` chunks have accumulated, embed and upsert them
    if len(texts) >= batch_limit:
        _upsert_batch(texts, metadatas)
        texts = []
        metadatas = []

# Flush whatever is left after the last record: without this, the final
# partial batch (fewer than `batch_limit` chunks) would never reach the index.
_upsert_batch(texts, metadatas)
texts = []
metadatas = []

In [None]:
# Check the index statistics again after the upsert loop has run.
index.describe_index_stats()

## LangChain Vectorstore and Querying

We construct our index independently of LangChain. That's because it's a straightforward process, and it is faster to do this with the Pinecone client directly. However, we're about to jump back into LangChain, so we should reconnect to our index via the LangChain library.

In [None]:
# Metadata key under which the raw chunk text was stored during indexing.
text_field = "text"

# Connecting to our Index
index = pinecone.Index(index_name)

# Creating the LangChain Pinecone's Object: it gets the index handle, the
# function used to embed queries, and the metadata key holding the text.
vectorstore = Pinecone(index, embed.embed_query, text_field)

In [None]:
## Testing Index

query = "who was Benito Mussolini?"

# Retrieve the 3 documents most relevant to our search query.
vectorstore.similarity_search(query, k=3)

All of these are relevant results, telling us that the retrieval component of our system is `functioning`. The next step is adding our LLM to generatively answer our question using the information provided in these retrieved contexts.

## Generative Question Answering

In generative question-answering (`GQA`), we pass our question to the LLM but instruct it to base the answer on the information returned from our `knowledge base`.

In [None]:
## Initializing the LLM

# Read the key inside a `with` block so the file handle is closed, and strip
# surrounding whitespace so a trailing newline is not sent as part of the key.
# NOTE(review): the embeddings cell reads the OpenAI key from "api_key.txt"
# rather than "openai_api.txt" — confirm which file is intended.
with open("openai_api.txt") as key_file:
    openai_api_key = key_file.read().strip()

chatGPT = ChatOpenAI(
    openai_api_key=openai_api_key,
    model_name="gpt-3.5-turbo",
    temperature=0.0,  # keep answers as deterministic as possible
)

In [None]:
## Initializing the GQA Object

# Retrieval-backed QA chain: documents come from the vector store's retriever
# and are handed to the chat model using the "stuff" chain type.
qa = RetrievalQA.from_chain_type(
    llm=chatGPT,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
)

In [None]:
## Testing on our query

qa.run(query)

In [None]:
## We can also create a chain that links every claim to its source

qa_with_sources = RetrievalQAWithSourcesChain.from_chain_type(
    llm=chatGPT,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
)

In [None]:
# Run the sources-aware chain on the same query.
qa_with_sources(query)