In [13]:
# Cell 1: imports & environment (no Google API needed for HF embeddings)
import os
from dotenv import load_dotenv
from huggingface_hub import login

# LangChain integrations: Hugging Face embeddings wrapper and Chroma vector store
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# Load variables from a local .env file into the process environment.
load_dotenv()

# Read the Hugging Face token; os.getenv returns None when HF_TOKEN is unset.
hf_token = os.getenv("HF_TOKEN")

# Authenticate only when a token is actually available. Calling login() with
# None falls back to an interactive prompt/widget, which an unattended
# "Restart & Run All" cannot answer.
if hf_token:
    login(token=hf_token)
else:
    print("HF_TOKEN is not set; skipping Hugging Face login.")

# Initialize HF embeddings (sentence-transformers MiniLM model, referenced by hub id).
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [14]:
# Cell 2: Load data from text, web, and PDF
from langchain_community.document_loaders import TextLoader, WebBaseLoader, PyPDFLoader
import bs4

# --- Source 1: local plain-text file ---
loader = TextLoader("speech.txt")
text_documents = loader.load()

# --- Source 2: web page (example) ---
# The strainer is handed to BeautifulSoup as `parse_only`, restricting parsing
# to elements carrying one of the listed CSS classes.
post_sections = bs4.SoupStrainer(
    class_=("post-title","post-content","post-header")
)
web_loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={"parse_only": post_sections},
)
web_documents = web_loader.load()

# --- Source 3: local PDF file ---
pdf_loader = PyPDFLoader("attention.pdf")
pdf_docs = pdf_loader.load()

# Combine all sources into one list. Concatenation already builds a new list,
# so no element-by-element copy is needed. Note that nothing here validates
# the element type; the items are whatever the three loaders returned.
documents = text_documents + web_documents + pdf_docs

# Quick sanity check: total count plus the first 200 characters of the first
# document (skipped when nothing was loaded, to avoid indexing an empty list).
print(f"Total documents loaded: {len(documents)}")
if documents:
    print(f"First document preview: {documents[0].page_content[:200]}...")

Total documents loaded: 17
First document preview: The world must be made safe for democracy. Its peace must be planted upon the tested foundations of political liberty. We have no selfish ends to serve. We desire no conquest, no dominion. We seek no ...


In [15]:
# Cell 3: Chunk documents into smaller pieces
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter settings, named so they are easy to find and tune.
CHUNK_SIZE = 800
CHUNK_OVERLAP = 100

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# NOTE: this rebinds `documents` from the loaded documents to their chunks;
# the vector-store cell further down consumes the chunked list.
documents = text_splitter.split_documents(documents)

# Display the chunk count together with the first three chunks.
(len(documents), documents[:3])

(152,
 [Document(metadata={'source': 'speech.txt'}, page_content='The world must be made safe for democracy. Its peace must be planted upon the tested foundations of political liberty. We have no selfish ends to serve. We desire no conquest, no dominion. We seek no indemnities for ourselves, no material compensation for the sacrifices we shall freely make. We are but one of the champions of the rights of mankind. We shall be satisfied when those rights have been made as secure as the faith and the freedom of nations can make them.'),
  Document(metadata={'source': 'speech.txt'}, page_content='Just because we fight without rancor and without selfish object, seeking nothing for ourselves but what we shall wish to share with all free peoples, we shall, I feel confident, conduct our operations as belligerents without passion and ourselves observe with proud punctilio the principles of right and of fair play we profess to be fighting for.\n\n…'),
  Document(metadata={'source': 'speech.txt'}

In [16]:
# Create vector store AFTER documents are chunked.
#
# The store is persisted to ./chroma_db, so it survives kernel restarts. When
# no ids are supplied, new random ids are generated on every call, and each
# re-run of this cell would append another full copy of the chunks to the
# persisted collection (duplicate hits in similarity_search). Stable,
# position-based ids make a re-run overwrite the same entries instead.
#
# Caveat: entries written by earlier runs under other ids are not removed;
# delete ./chroma_db once (and whenever the chunking changes) for a clean index.
chunk_ids = [f"chunk-{position:06d}" for position in range(len(documents))]
db = Chroma.from_documents(
    documents,
    embeddings,
    ids=chunk_ids,
    persist_directory="./chroma_db",
)

In [17]:
# Run a query against the vector store and show the best-ranked chunk.
query = "Who are the authors of Attention is All You Need?"
retrieved_results = db.similarity_search(query)
print("Q:", query)
# Guard the [0] lookup: an empty result list (e.g. an empty collection)
# would otherwise raise IndexError.
if retrieved_results:
    print("A:", retrieved_results[0].page_content)
else:
    print("A: (no matching chunks found)")

Q: Who are the authors of Attention is All You Need?
A: Attention Visualizations
Input-Input Layer5
It
is
in
this
spirit
that
a
majority
of
American
governments
have
passed
new
laws
since
2009
making
the
registration
or
voting
process
more
difficult
.
<EOS>
<pad>
<pad>
<pad>
<pad>
<pad>
<pad>
It
is
in
this
spirit
that
a
majority
of
American
governments
have
passed
new
laws
since
2009
making
the
registration
or
voting
process
more
difficult
.
<EOS>
<pad>
<pad>
<pad>
<pad>
<pad>
<pad>
Figure 3: An example of the attention mechanism following long-distance dependencies in the
encoder self-attention in layer 5 of 6. Many of the attention heads attend to a distant dependency of
the verb ‘making’, completing the phrase ‘making...more difficult’. Attentions here shown only for


In [18]:
# Query the vector store and show the best-ranked chunk.
query = "What is Transformer architecture?"
retrieved_results = db.similarity_search(query)
print("Q:", query)
# Guard the [0] lookup: an empty result list (e.g. an empty collection)
# would otherwise raise IndexError.
if retrieved_results:
    print("A:", retrieved_results[0].page_content)
else:
    print("A: (no matching chunks found)")

Q: What is Transformer architecture?
A: Figure 1: The Transformer - model architecture.
The Transformer follows this overall architecture using stacked self-attention and point-wise, fully
connected layers for both the encoder and decoder, shown in the left and right halves of Figure 1,
respectively.
3.1 Encoder and Decoder Stacks
Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two
sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position-
wise fully connected feed-forward network. We employ a residual connection [11] around each of
the two sub-layers, followed by layer normalization [ 1]. That is, the output of each sub-layer is
LayerNorm(x + Sublayer(x)), where Sublayer(x) is the function implemented by the sub-layer
