In [None]:
%pip install langchain_community
%pip install langchain_text_splitters
%pip install langchain-openai 
%pip install langchainhub 
%pip install chromadb 
%pip install langchain
%pip install beautifulsoup4
%pip install python-dotenv

# Additional packages for the Doc2Vec (gensim) and BERT (transformers, torch) examples
%pip install gensim --user
%pip install transformers
%pip install torch

In [None]:
import os
from langchain_community.document_loaders import WebBaseLoader
import bs4
import openai
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
import chromadb
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.runnables import RunnableParallel
from dotenv import load_dotenv, find_dotenv
from langchain_core.prompts import PromptTemplate

In [None]:
# variables
# Load variables from the local env.txt file into the process environment.
_ = load_dotenv(dotenv_path='env.txt')

# Fail fast with a clear message when the key is missing: assigning None to
# os.environ would otherwise raise an opaque "str expected, not NoneType" TypeError.
openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to env.txt or export it before running this notebook."
    )
os.environ['OPENAI_API_KEY'] = openai_api_key
openai.api_key = openai_api_key

# Embedding client, chat model, and the example question used by the cells below.
embedding_function = OpenAIEmbeddings()
llm = ChatOpenAI(model_name="gpt-4o-mini")
user_query = "What are the advantages of using RAG?"

In [4]:
# Obtain embedding for user query
# Only the first five values are printed to keep the cell output readable.
question_embedding = embedding_function.embed_query(user_query)
first_5_numbers = question_embedding[:5]
print(f"User question embedding (first 5 dimensions): {first_5_numbers}")

User question embedding (first 5 dimensions): [-0.006319054113595048, -0.0023517232115089787, 0.015498643243434815, -0.02267445873596028, 0.017820641897159206]


In [5]:
# Obtain the size of the user query embedding
# (the number of values in the vector returned by embed_query above)
embedding_size = len(question_embedding)
print(f"Embedding size: {embedding_size}")

Embedding size: 1536


In [None]:
#### INDEXING ####

In [6]:
# Load Documents
# Fetch the source web page; the SoupStrainer restricts parsing to elements
# whose CSS class is one of the three names listed below.
loader = WebBaseLoader(
    web_paths=("https://kbourne.github.io/chapter1.html",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
# docs is the collection of loaded documents that the splitter consumes next.
docs = loader.load()

In [None]:
# Split
# Break the loaded documents into chunks with a target size of 1000 characters
# (measured with len) and an overlap setting of 200 between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)
# splits is the chunk list reused by every embedding technique in this notebook.
splits = text_splitter.split_documents(docs)

In [None]:
# USING TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Extract the text content from the splits
tfidf_documents = [split.page_content for split in splits]

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Generate TF-IDF matrix (one row per chunk, one column per vocabulary term)
tfidf_matrix = tfidf_vectorizer.fit_transform(tfidf_documents)

# Get the vocabulary, the dense TF-IDF weights, and the corresponding IDF values.
# NOTE: despite its name, tf_values is the TF-IDF matrix converted to a dense
# array, not raw term counts.
vocab = tfidf_vectorizer.get_feature_names_out()
tf_values = tfidf_matrix.toarray()
idf_values = tfidf_vectorizer.idf_

# Create a list of tuples containing word, its weight summed over all chunks
# (printed under the "TF" heading below), and its IDF value
word_stats = list(zip(vocab, tf_values.sum(axis=0), idf_values))

# Sort the list by IDF values in descending order
word_stats.sort(key=lambda x: x[2], reverse=True)

# Print the grid of top 10 words, TF, and IDF values
print("Word\t\tTF\t\tIDF")
print("----\t\t--\t\t---")
for word, tf, idf in word_stats[:10]:
    print(f"{word:<12}\t{tf:.2f}\t\t{idf:.2f}")

Word		TF		IDF
----		--		---
000         	0.16		2.95
1024        	0.04		2.95
123         	0.02		2.95
13          	0.04		2.95
15          	0.01		2.95
16          	0.07		2.95
192         	0.06		2.95
1m          	0.08		2.95
200         	0.08		2.95
2024        	0.01		2.95


In [None]:
# TF-IDF scoring for user query
# Wrap the query in a list because transform() is handed a collection of texts;
# the already-fitted vectorizer is reused so the query shares the documents' vocabulary.
tfidf_user_query = [user_query]
new_tfidf_matrix = tfidf_vectorizer.transform(tfidf_user_query)

# Calculate cosine similarity between the user query and the original documents
tfidf_similarity_scores = cosine_similarity(new_tfidf_matrix, tfidf_matrix)

# Find the index of the document with the highest similarity score
# (assumes the score array has a single query row, so the flat argmax is the document index)
tfidf_top_doc_index = tfidf_similarity_scores.argmax()

# Print the text of the top document
print("TF-IDF Top Document:\n", tfidf_documents[tfidf_top_doc_index])

TF-IDF Top Document:
 Can you imagine what you could do with all of the benefits mentioned above, but combined with all of the data within your company, about everything your company has ever done, about your customers and all of their interactions, or about all of your products and services combined with a knowledge of what a specific customer’s needs are? You do not have to imagine it, that is what RAG does! Even smaller companies are not able to access much of their internal data resources very effectively. Larger companies are swimming in petabytes of data that is not readily accessible or is not being fully utilized. Prior to RAG, most of the services you saw that connected customers or employees with the data resources of the company were really just scratching the surface of what is possible compared to if they could access ALL of the data in the company. With the advent of RAG and generative AI in general, corporations are on the precipice of something really, really big. Compa

In [None]:
# CREATING AND SAVING DOC2VEC MODEL
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity

# Extract the text content from the splits
doc2vec_documents = [split.page_content for split in splits]

# Tokenize the documents (lower-case, then split on whitespace; punctuation stays attached to words)
doc2vec_tokenized_documents = [doc.lower().split() for doc in doc2vec_documents]

# Create tagged documents for Doc2Vec; each chunk is tagged with its list index as a string,
# which is the key later used to look its vector up again
doc2vec_tagged_documents = [TaggedDocument(words=doc, tags=[str(i)]) for i, doc in enumerate(doc2vec_tokenized_documents)]

# Train the Doc2Vec model
# Use this version first.
# NOTE(review): no seed is passed and workers=4, so results may differ between runs — confirm if reproducibility matters.
doc2vec_model = Doc2Vec(doc2vec_tagged_documents, vector_size=100, window=5, min_count=1, workers=4)

# After running the previous version of model, comment the previous line out and uncomment this one. Try it with 1536D vectors.
# doc2vec_model = Doc2Vec(doc2vec_tagged_documents, vector_size=1536, window=5, min_count=1, workers=4)

# Save the trained model to a file (written to the current working directory)
doc2vec_model.save("doc2vec_model.bin")

In [None]:
# USING DOC2VEC SAVED MODEL

# Load the saved model (the file written by the training cell)
loaded_doc2vec_model = Doc2Vec.load("doc2vec_model.bin")

# Look up the trained vector of every document by its string tag "0", "1", ...
doc2vec_document_vectors = [loaded_doc2vec_model.dv[str(i)] for i in range(len(doc2vec_documents))]

# User query for embedding
doc2vec_user_query = [user_query]

# Tokenize the user query the same way the documents were tokenized
doc2vec_tokenized_user_query = [content.lower().split() for content in doc2vec_user_query]

# Infer the vector for the user query (it was not part of training, so it has no stored vector)
doc2vec_user_query_vector = loaded_doc2vec_model.infer_vector(doc2vec_tokenized_user_query[0])

# Calculate cosine similarity between the user query vector and the document vectors
doc2vec_similarity_scores = cosine_similarity([doc2vec_user_query_vector], doc2vec_document_vectors)

# Find the index of the document with the highest similarity score
doc2vec_top_doc_index = doc2vec_similarity_scores.argmax()

# Print the text of the top document
print("\nDoc2Vec Top Document:\n", doc2vec_documents[doc2vec_top_doc_index])


Doc2Vec Top Document:
 Both! All vector databases are vector stores, but not all vector stores are vector databases. Ok, while you get out your chalkboard to draw the Vinn diagram, I continue to explain this statement. There are ways to store vectors that are not full databases. They are simply storage devices for vectors. So to encompass all possible ways to store vectors, LangChain calls them all vector stores. Let’s do the same! Just know that not all of the “vector stores” that LangChain connects with are officially considered vector databases, but in general, most of them are and many people refer to all of them as vector databases. Phew, glad we cleared that up!


In [None]:
# USING BERT
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Extract the text content from the splits
bert_documents = [split.page_content for split in splits]

# Load BERT tokenizer and model
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Get the vector size of the BERT embeddings
bert_vector_size = bert_model.config.hidden_size
print(f"Vector size of BERT (base-uncased) embeddings: {bert_vector_size}\n")

# Tokenize the documents; inputs longer than 512 tokens are truncated
bert_tokenized_documents = [bert_tokenizer(doc, return_tensors='pt', max_length=512, truncation=True) for doc in bert_documents]

# Calculate the document embeddings.
# Gradients are not needed for inference, so they are disabled while embedding.
bert_document_embeddings = []
with torch.no_grad():
    for doc in bert_tokenized_documents:
        bert_outputs = bert_model(**doc)
        # Use the hidden state at the first token position of the single
        # sequence in the batch as the embedding for the whole text.
        bert_doc_embedding = bert_outputs.last_hidden_state[0, 0, :].numpy()
        bert_document_embeddings.append(bert_doc_embedding)

# User query for embedding
bert_user_query = [user_query]

# Tokenize the user query with the same settings as the documents
bert_tokenized_user_query = bert_tokenizer(bert_user_query[0], return_tensors='pt', max_length=512, truncation=True)

# Calculate the embedding for the user query (same first-token pooling as the documents)
with torch.no_grad():
    bert_outputs = bert_model(**bert_tokenized_user_query)
    bert_user_query_embedding = bert_outputs.last_hidden_state[0, 0, :].numpy()

# Calculate cosine similarity between the user query embedding and the document embeddings
bert_similarity_scores = cosine_similarity([bert_user_query_embedding], bert_document_embeddings)

# Find the index of the document with the highest similarity score
bert_top_doc_index = bert_similarity_scores.argmax()

# Print the text of the top document
print("BERT Top Document:\n", bert_documents[bert_top_doc_index])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Vector size of BERT (base-uncased) embeddings: 768

BERT Top Document:
 Or if you are developing in a legal field, you may want it to sound more like a lawyer. Vector Store or Vector Database?


In [None]:
# Embed
# Build a Chroma vector store from the chunks, embedding them with the OpenAI
# embedding function configured earlier.
vectorstore = Chroma.from_documents(documents=splits,
                                    embedding=embedding_function)

# Expose the store as a retriever so it can be plugged into the chain below.
retriever = vectorstore.as_retriever()

In [None]:
# Retrieve the first result for the user query.
# invoke() replaces the deprecated get_relevant_documents() and matches how
# the chains in this notebook are called.
result = retriever.invoke(user_query)[0]

# Print the retrieved document
print("\nRetrieved Document:\n", result.page_content)

In [None]:
#### RETRIEVAL and GENERATION ####

In [None]:
# Prompt
# Pull the answer-generation prompt from the LangChain hub (requires network access).
prompt = hub.pull("jclemens24/rag-prompt")

In [None]:
# Relevance check prompt
# Asks the model to grade how relevant the retrieved context is to the question
# on a 1-5 scale and to reply with the number only, so the reply can be parsed
# as a score. Template variables: {question} and {retrieved_context}.
relevance_prompt_template = PromptTemplate.from_template(
    """
    Given the following question and retrieved context, determine if the context is relevant to the question.
    Provide a score from 1 to 5, where 1 is not at all relevant and 5 is highly relevant.
    Return ONLY the numeric score, without any additional text or explanation.

    Question: {question}
    Retrieved Context: {retrieved_context}

    Relevance Score:"""
)

In [None]:
# Post-processing
def format_docs(docs):
    """Join the ``page_content`` of every document into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The page contents in their original order, separated by a blank line.
    """
    page_texts = [document.page_content for document in docs]
    blank_line = "\n\n"
    return blank_line.join(page_texts)

In [None]:
def extract_score(llm_output):
    """Parse the numeric relevance score out of the grader model's reply.

    The reply is first parsed as a bare number. If that fails (for example
    "Relevance Score: 4" or "4/5"), the first number found in the text is used
    instead, so a slightly chatty reply is not silently treated as irrelevant.

    Args:
        llm_output: Text returned by the relevance-grading model. ``None`` and
            other non-string values are tolerated.

    Returns:
        The score as a float, or 0 when no finite number can be extracted.
    """
    import math
    import re

    text = str(llm_output).strip()
    try:
        score = float(text)
    except ValueError:
        # Fall back to the first integer or decimal number embedded in the reply.
        number_match = re.search(r"\d+(?:\.\d+)?", text)
        if number_match is None:
            return 0
        score = float(number_match.group())

    # "nan" and "inf" parse as floats; NaN would slip past a `score < 4` gate
    # because every comparison with NaN is False.
    if not math.isfinite(score):
        return 0
    return score

# Return the generated answer only when the relevance score is high enough
def conditional_answer(x):
    """Choose the final reply based on the graded relevance of the context.

    Args:
        x: Mapping with a 'relevance_score' entry (raw grader output) and an
            'answer' entry (the generated answer).

    Returns:
        "I don't know." when the parsed score is below 4, otherwise the
        generated answer.
    """
    parsed_score = extract_score(x['relevance_score'])
    return "I don't know." if parsed_score < 4 else x['answer']

In [None]:
# Generation sub-chain. Its input is a dict with "context" (the retrieved
# documents) and "question"; its output is a dict holding "relevance_score",
# "answer" and "final_answer".
rag_chain_from_docs = (
    # Replace the documents under "context" with one formatted string.
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    # Feed the same input to the relevance grader and to the answer generator.
    | RunnableParallel(
        {"relevance_score": (
            RunnablePassthrough()
            # Fill the relevance prompt with the question and the formatted context.
            | (lambda x: relevance_prompt_template.format(question=x['question'], retrieved_context=x['context']))
            | llm
            | StrOutputParser()
        ), "answer": (
            RunnablePassthrough()
            | prompt
            | llm
            | StrOutputParser()
        )}
    )
    # Add "final_answer": the generated answer, or "I don't know." when the
    # relevance score is too low.
    | RunnablePassthrough().assign(final_answer=conditional_answer)
)

In [None]:
# Top-level chain: the retriever fills "context", the raw input is passed
# through as "question", and the generation sub-chain's output dict is
# attached under "answer".
rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

In [None]:
# Question - relevant question
result = rag_chain_with_source.invoke(user_query)
# result['answer'] is the dict produced by rag_chain_from_docs; relevance_score
# here is the grader's raw text output, not the parsed number.
relevance_score = result['answer']['relevance_score']
final_answer = result['answer']['final_answer']

print(f"Relevance Score: {relevance_score}")
print(f"Final Answer:\n{final_answer}")