<a href="https://colab.research.google.com/github/Sidy3143/llm-projects/blob/main/Lex_Fridman_Karpathy_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Building a RAG system using the Mistral-7B model to answer questions based on the Lex Fridman and Andrej Karpathy podcast transcript.**

Load the Mistral-7B model with 4-bit quantization

In [None]:
!pip install -q fsspec==2025.3.0 gcsfs datasets transformers trl peft bitsandbytes accelerate

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "mistralai/Mistral-7B-Instruct-v0.2"

In [None]:
!huggingface-cli login # you need token access to mistral model from hugging face

In [None]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Mistral is supported natively by transformers, so the tokenizer can be built
# without trust_remote_code, which would allow code shipped in the model
# repository to be executed locally.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Quantization settings: 4-bit weights with double quantization, computing in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype="bfloat16",
)

# Load the instruct model with the quantization config above, mapped to device 0.
model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map={"": 0}, quantization_config=bnb_config
)

Load the transcript of the Lex Fridman–Andrej Karpathy podcast

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
url = 'https://podscript.ai/podcasts/lex-fridman-podcast/333-andrej-karpathy-tesla-ai-self-driving-optimus-aliens-and-agi'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from parsing an HTTP error page as if it were
# the transcript.
response = requests.get(url, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
# Print the class attribute of each <div> on the page to locate the one
# that wraps the transcript.
all_divs = soup.find_all('div')
for div_tag in all_divs:
    div_classes = div_tag.get('class')
    print(div_classes)

In [None]:
# 'post-content' is the div containing the actual transcript.
transcript_div = soup.find('div', class_='post-content')
if transcript_div is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise ValueError("No <div class='post-content'> found; the page layout may have changed.")

# Extract the text, putting a newline between the text of separate elements.
transcript_text = transcript_div.get_text(separator='\n')

# Show only a preview; the full transcript is far too long to dump as cell output.
print(f"{len(transcript_text):,} characters extracted")
transcript_text[:1000]

We clip the unnecessary parts in the intro (sponsors etc.) and the weird text at the bottom (episode info).

In [None]:
# Text that immediately precedes the conversation, and the heading that
# starts the trailing episode-info section.
start_marker = "And now, dear friends, here’s Andrej Karpathy."
end_marker = "\nEpisode Info"

# str.find returns -1 when the marker is missing; slicing with -1 would
# silently produce a wrong transcript, so fail loudly instead.
start_idx = transcript_text.find(start_marker)
if start_idx == -1:
    raise ValueError(f"Start marker not found in transcript: {start_marker!r}")
content_start = start_idx + len(start_marker)

# Look for the end marker only after the start of the conversation.
end_idx = transcript_text.find(end_marker, content_start)
if end_idx == -1:
    raise ValueError(f"End marker not found in transcript: {end_marker!r}")

cleaned_transcript = transcript_text[content_start:end_idx]

# Preview only; the full text is saved to disk in the next cell.
cleaned_transcript[:1000]

In [None]:
# Persist the cleaned transcript as a UTF-8 text file in the working directory.
transcript_path = 'karpathy_lex_transcript_cleaned.txt'
with open(transcript_path, mode='w', encoding='utf-8') as transcript_file:
    transcript_file.write(cleaned_transcript)

print("Transcript saved successfully!")

In [None]:
!pip install -q -U langchain_community

In [None]:
import langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings

In [None]:
!pip install -q -U langchain_huggingface

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS

# Embedding model used to vectorize the chunks; any other Hugging Face
# embedding model name can be substituted here.
langchain_embedder = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
)

In [None]:
!pip install -q faiss-cpu # or faiss-gpu if using gpu

In [None]:
# Customize text splitting with conversational separators.
# The speaker headers come first so that chunks preferably break between
# speakers and do not mix them. The generic fallbacks after them are needed
# because the splitter keeps a piece whole once it runs out of separators:
# without them, any single speaker turn longer than chunk_size would become
# one oversized chunk.
text_splitter_conv = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=25,
    separators=[
        "\nLex Fridman\n\n\n",
        "\nAndrej Karpathy\n\n\n",
        "\n\n",
        "\n",
        " ",
        "",
    ],
)

chunks_conv = text_splitter_conv.split_text(cleaned_transcript)

print(f"{len(chunks_conv)} chunks")

In [None]:
# Wrap every transcript chunk in a Document so the vector store can index it.
faiss_documents = []
for chunk_text in chunks_conv:
    faiss_documents.append(Document(page_content=chunk_text))

In [None]:
# Embed the chunk documents and index them in a FAISS vector store.
faiss_vectorstore = FAISS.from_documents(
    documents=faiss_documents,
    embedding=langchain_embedder,
)

# Similarity-search retriever that returns the 5 closest chunks per query.
faiss_retriever = faiss_vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5},
)

Without RAG

In [None]:
# Baseline: ask the model directly, without any retrieved context.
system_prompt = "Answer the following question appropriately."

query = "What did Andrej karpathy learn from Elon Musk?"

message_template = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": query}
]

# Token ids of the chat-formatted prompt. Stored under descriptive names so
# the builtin `input` is not shadowed by this cell.
prompt_token_ids = tokenizer.apply_chat_template(message_template)
input_ids = torch.tensor([prompt_token_ids], device=model.device)

# max_new_tokens bounds only the generated answer, whereas max_length also
# counts the prompt tokens. The explicit attention mask (all ones: a single,
# unpadded sequence) and pad token id avoid generate() having to guess them.
output = model.generate(
    input_ids,
    attention_mask=torch.ones_like(input_ids),
    max_new_tokens=1024,
    pad_token_id=tokenizer.eos_token_id,
)

# Decoded text of the full sequence (prompt followed by the generated answer).
decoded = tokenizer.decode(output[0])

In [None]:
decoded

This gives a good answer but not specific to what Andrej said during the conversation. So let's give it some context with the FAISS retriever.

In [None]:
# Same question as before, this time used to fetch the most similar
# transcript chunks from the FAISS retriever.
query = "What did Andrej karpathy learn from Elon Musk?"
faiss_results = faiss_retriever.invoke(query)

In [None]:
# Build the context from whatever the retriever actually returned (indexing
# a fixed range(5) raises IndexError if fewer chunks come back), and put a
# blank line between chunks so they do not run into each other.
context = "Context:\n" + "\n\n".join(doc.page_content for doc in faiss_results)

In [None]:
context

In [None]:
# System prompt: instructions, then the retrieved context, a blank line, and
# the lead-in to the user's question. The wording is kept exactly as before.
system_prompt = (
    "Your an AI assistant. Your goal is to provide answers about the lex fridman and Andrej Karpathy podcasts, "
    "given the provided relevant context with timestamps. Use the context if it is relevant to the question. "
    "If you don't know the answer, just say you don't know.\n"
    f"{context}\n"
    "\n"
    "Please respond to the following question:"
)

In [None]:
# Chat messages: the context-carrying system prompt followed by the question.
system_message = {"role": "system", "content": system_prompt}
user_message = {"role": "user", "content": query}
message_template = [system_message, user_message]

# Token ids of the chat-formatted prompt; the next cell reads them from `input`.
input = tokenizer.apply_chat_template(message_template)

In [None]:
# `input` holds the prompt token ids produced by the previous cell. The batch
# tensor gets its own name so this cell can be re-run safely; rebinding
# `input` to the tensor would make a second run wrap a tensor in a list.
input_ids = torch.tensor([input], device=model.device)

# The prompt now contains the retrieved context, so it can be long.
# max_new_tokens reserves the budget for the answer itself, whereas
# max_length would also count the prompt tokens. The attention mask is all
# ones because the batch is a single, unpadded sequence.
output = model.generate(
    input_ids,
    attention_mask=torch.ones_like(input_ids),
    max_new_tokens=1024,
    pad_token_id=tokenizer.eos_token_id,
)

# Decoded text of the full sequence (prompt followed by the generated answer).
decoded = tokenizer.decode(output[0])

In [None]:
# Only show what the model generated: keep the text after the "[/INST]"
# marker that closes the prompt. Slicing from the marker's index would keep
# the marker itself, and a missing marker (find() == -1) would leave only the
# last character, so fall back to the whole decoded text in that case.
_, inst_marker, generated_text = decoded.partition("[/INST]")

response = generated_text.strip() if inst_marker else decoded
response

This gives an answer that is specific and accurate to what Andrej said during the episode.

We can also use **LlamaIndex**

In [None]:
!pip -q install llama-index
!pip -q install llama-index-embeddings-huggingface

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.schema import Document
from llama_index.core.postprocessor import SimilarityPostprocessor

In [None]:
# Global LlamaIndex settings used by the index and retriever built below.
# Embedding model used to vectorize the transcript.
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
# Settings.embed_model = HuggingFaceEmbedding(model_name="thenlper/gte-large") # alternative model

# No LlamaIndex LLM is configured: the answer is generated further down with
# the Mistral model loaded at the top of the notebook.
Settings.llm = None
# Same chunk size / overlap values as the LangChain splitter above.
# NOTE(review): the unit LlamaIndex applies to these values may differ from
# LangChain's character count — confirm.
Settings.chunk_size = 512
Settings.chunk_overlap = 25

In [None]:
# The whole cleaned transcript goes in as a single document.
transcript_document = Document(text=cleaned_transcript)
documents = [transcript_document]

In [None]:
# Build the vector index from the transcript document(s).
index = VectorStoreIndex.from_documents(
    documents,
)

In [None]:
# Number of most-similar nodes to fetch per query.
top_k = 3

# Retriever over the vector index built above.
retriever = VectorIndexRetriever(index=index, similarity_top_k=top_k)

In [None]:
# Post-processor configured with a similarity cutoff of 0.5.
similarity_filter = SimilarityPostprocessor(similarity_cutoff=0.5)

# Query engine combining the retriever with the similarity post-processor.
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    node_postprocessors=[similarity_filter],
)

In [None]:
# Run the same question through the LlamaIndex query engine; the retrieved
# source nodes are read from `response` in the next cell.
query = "What did Andrej karpathy learn from Elon Musk?"

response = query_engine.query(query)

In [None]:
# Iterate over the nodes that were actually returned: the similarity cutoff
# can leave fewer than top_k source nodes, and indexing range(top_k) would
# then raise IndexError. Each node's text is followed by a blank line.
context = "Context:\n" + "".join(
    source_node.text + "\n\n" for source_node in response.source_nodes
)

In [None]:
context

In [None]:
# System prompt: instructions, then the retrieved context, a blank line, and
# the lead-in to the user's question. The wording is kept exactly as before.
system_prompt = (
    "Your an AI assistant. Your goal is to provide answers about the lex fridman and Andrej Karpathy podcasts, "
    "given the provided relevant context with timestamps. Use the context if it is relevant to the question. "
    "If you don't know the answer, just say you don't know.\n"
    f"{context}\n"
    "\n"
    "Please respond to the following question:"
)

In [None]:
# Chat messages: the context-carrying system prompt followed by the question.
system_message = {"role": "system", "content": system_prompt}
user_message = {"role": "user", "content": query}
message_template = [system_message, user_message]

# Token ids of the chat-formatted prompt, consumed by the generation cell.
input_tokens = tokenizer.apply_chat_template(message_template)

In [None]:
# Batch of one prompt on the model's device. Named `input_ids` so the
# builtin `input` is not shadowed.
input_ids = torch.tensor([input_tokens], device=model.device)

# max_new_tokens bounds only the generated answer, whereas max_length also
# counts the (context-heavy) prompt tokens. The attention mask is all ones
# because the batch is a single, unpadded sequence.
output = model.generate(
    input_ids,
    attention_mask=torch.ones_like(input_ids),
    max_new_tokens=1024,
    pad_token_id=tokenizer.eos_token_id,
)

# Decoded text of the full sequence (prompt followed by the generated answer).
decoded = tokenizer.decode(output[0])

In [None]:
decoded

In [None]:
# Only extract the model's answer: keep the text after the "[/INST]" marker
# that closes the prompt. Slicing from the marker's index would keep the
# marker itself, and a missing marker (find() == -1) would leave only the
# last character, so fall back to the whole decoded text in that case.
_, inst_marker, answer_text = decoded.partition("[/INST]")

output = answer_text.strip() if inst_marker else decoded

In [None]:
output

Again, we get a good answer related to what he said in the podcast.