In [2]:
import requests
from bs4 import BeautifulSoup
import json
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.docstore.document import Document


In [3]:
os.makedirs("data", exist_ok=True)
os.makedirs("vectorstore", exist_ok=True)


In [6]:
url = "https://highspark.co/famous-persuasive-speeches/"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

speeches = []

for h2 in soup.find_all("h2"):
    title = h2.get_text(strip=True)
    blockquote = h2.find_next("blockquote")
    background_p = blockquote.find_next("p") if blockquote else None

    if blockquote:
        speech_text = blockquote.get_text(strip=True)
        background = background_p.get_text(strip=True) if background_p else ""
        speeches.append({
            "title": title,
            "text": speech_text,
            "background": background
        })

with open("data/speeches.json", "w") as f:
    json.dump(speeches, f, indent=2)

print(f"Saved {len(speeches)} speeches.")


Saved 40 speeches.


In [7]:
with open("data/speeches.json") as f:
    speeches = json.load(f)

documents = []
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

for speech in speeches:
    full_text = f"{speech['text']}\n\nBackground:\n{speech['background']}"
    chunks = splitter.split_text(full_text)
    for chunk in chunks:
        documents.append(Document(page_content=chunk, metadata={"title": speech["title"]}))


In [8]:
from langchain.vectorstores import FAISS
from langchain.embeddings import OllamaEmbeddings

# Initialize Ollama Mistral embedding
embedding = OllamaEmbeddings(model="mistral")

# Build FAISS vector store from documents
db = FAISS.from_documents(documents, embedding)

# Save vector store locally
db.save_local("vectorstore/faiss_index")

print("Vector store created and saved successfully.")


  embedding = OllamaEmbeddings(model="mistral")


Vector store created and saved successfully.


In [12]:
query = "What did Queen Elizabeth I say about her role in the war against Spain?"
results = db.similarity_search_with_score(query, k=3)

for doc, score in results:
    print(f"Title: {doc.metadata['title']}")
    print(f"Content: {doc.page_content}\n")
    print(f"Score: {score}\n")


Title: 30. Black Power Address at UC Berkeley by Stokely Carmichael
Content: to sanction Black Power. We’re tired waiting; every time black people move in this country, they’re forced to defend their position before they move. It’s time that the people who are supposed to be defending their position do that. That’s white people. They ought to start defending themselves as to why they have oppressed and exploited us.”

Score: 133587.265625

Title: 35. Questioning the Universe by Stephen Hawking
Content: show that we have made remarkable progress in the last hundred years. But if we want to continue beyond the next hundred years, our future is in space. That is why I am in favor of manned — or should I say, personned — space flight.”

Score: 135536.296875

Title: 21. June 9 Speech to Martial Law Units by Deng Xiaoping
Content: Perhaps this bad thing will enable us to go ahead with reform and the open policy at a steadier and better — even a faster — pace, more speedily correct our mistak