In [26]:
import os, warnings, textwrap
from dotenv import load_dotenv
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import WikipediaLoader

In [27]:
# ------------------------------------------------------------------------
# 1) Load Wikipedia content using Wikipedia API
# ------------------------------------------------------------------------
# Read settings from a .env file into the process environment before any
# other step runs.
load_dotenv()

# Fetch article text through the Wikipedia API via WikipediaLoader.
wiki_loader_options = {
    "query": "2023 Cricket World Cup",
    "load_max_docs": 2,              # upper bound on documents returned for the query
    "doc_content_chars_max": 50000,  # per-document character limit
}
loader = WikipediaLoader(**wiki_loader_options)

# `docs` is the list of loaded documents; each one exposes `page_content`
# and `metadata`, which can be inspected here before moving on.
docs = loader.load()

In [28]:
# ------------------------------------------------------------------------
# 2) Prepare documents for splitting
# ------------------------------------------------------------------------
# Always rebind `html_docs`, even when nothing was loaded, so a value left
# in the kernel by an earlier run can never be picked up by later cells.
# The name is historical: the content comes from WikipediaLoader, not from
# parsed HTML.
html_docs = docs

if html_docs:
    # The Wikipedia documents are used as loaded; no extra cleaning step.
    print("Document ready for processing")
    print(f"Content preview:\n{html_docs[0].page_content[:500]}\n")
else:
    # An empty list is falsy, so the truthiness guard in the splitting step
    # still refuses to run on it.
    print("No documents were loaded; check the Wikipedia query and network access.")

Document ready for processing
Content preview:
The 2023 ICC Men's Cricket World Cup was the 13th edition of the ICC Men's Cricket World Cup, a quadrennial One Day International (ODI) cricket tournament organized by the International Cricket Council (ICC). It was hosted from 5 October to 19 November 2023 across ten venues in India. This was the fourth World Cup held in India, but the first where India was the sole host.
The tournament was contested by ten national teams, maintaining the same format used in 2019. After six weeks of round-robin



In [29]:
# ------------------------------------------------------------------------
# 3) Split with RecursiveCharacterTextSplitter
# ------------------------------------------------------------------------
# Bind `text_chunks` up front so the name is always defined by this cell:
# without it, the "no documents" path would leave later cells to fail with
# NameError or to silently reuse chunks from a previous run.
text_chunks = []

if 'html_docs' in locals() and html_docs:
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,  # Larger chunks for better context
        chunk_overlap=100,  # Shared text between neighbouring chunks
        separators=["\n\n", "\n", ". ", " ", ""],  # Listed coarsest first
    )
    # Split all Wikipedia documents
    text_chunks = text_splitter.split_documents(html_docs)
    print(f"The number of chunks created: {len(text_chunks)}")
    if text_chunks:
        print(f"Preview of first chunk:\n{text_chunks[0].page_content[:300]}...")
    # No preview when the splitter produced nothing; indexing [0] would raise.
else:
    print("Error: No documents to split. Run step 2 first.")

The number of chunks created: 28
Preview of first chunk:
The 2023 ICC Men's Cricket World Cup was the 13th edition of the ICC Men's Cricket World Cup, a quadrennial One Day International (ODI) cricket tournament organized by the International Cricket Council (ICC). It was hosted from 5 October to 19 November 2023 across ten venues in India. This was the f...


In [30]:
# ------------------------------------------------------------------------
# 5) NagaAI Embeddings
# ------------------------------------------------------------------------
# The API key and endpoint come from the NAGA_API_KEY / NAGA_BASE_URL
# environment variables and are passed straight to OpenAIEmbeddings.
# NOTE(review): os.getenv yields None for an unset variable - confirm how
# OpenAIEmbeddings behaves when either value is None.
naga_client_settings = dict(
    model="text-embedding-3-small",
    api_key=os.getenv("NAGA_API_KEY"),
    base_url=os.getenv("NAGA_BASE_URL"),
)
embeddings = OpenAIEmbeddings(**naga_client_settings)

# Raw text of every chunk, in chunk order.
texts = [chunk.page_content for chunk in text_chunks]

# One vector per text -> List[List[float]]
openai_embeddings = embeddings.embed_documents(texts)
print(f"Created {len(openai_embeddings)} embeddings")

Created 28 embeddings


In [31]:
# ------------------------------------------------------------------------
# 6) Check embedding dimensions
# ------------------------------------------------------------------------
# Report the length of one vector and the overall rows x columns size,
# or say so when there is nothing to measure.
if not openai_embeddings:
    print("No embeddings produced (empty input).")
else:
    vector_count = len(openai_embeddings)
    vector_length = len(openai_embeddings[0])  # length of the first vector
    print(f"The length of the embeddings vector is {vector_length}")
    print(f"The embeddings array shape is {vector_count} x {vector_length}")

The length of the embeddings vector is 1536
The embeddings array shape is 28 x 1536


In [32]:
# ------------------------------------------------------------------------
# 7) FAISS from precomputed vectors
# ------------------------------------------------------------------------
# Fail with a clear message instead of an IndexError on `[0]` below.
if not openai_embeddings:
    raise ValueError(
        "No embeddings available; run the embedding step on a non-empty "
        "set of chunks before building the index."
    )

# `texts`/`openai_embeddings` and `text_chunks` are produced by different
# cells. If one was re-run without the other, zip() would silently truncate
# and the metadata would no longer line up with the texts.
if not (len(texts) == len(openai_embeddings) == len(text_chunks)):
    raise ValueError(
        "Length mismatch: "
        f"{len(texts)} texts, {len(openai_embeddings)} embeddings, "
        f"{len(text_chunks)} chunks. Re-run the splitting and embedding steps."
    )

dim = len(openai_embeddings[0])  # kept in case later cells read it
text_embeddings = list(zip(texts, openai_embeddings))  # (text, vector) pairs
vector_store = FAISS.from_embeddings(
    text_embeddings=text_embeddings,
    embedding=embeddings,  # used for query embedding only
    metadatas=[d.metadata for d in text_chunks],
)

# Save the vector store - one level up to stay in project root.
# The path is relative, so it resolves against the kernel's current working
# directory (expected to be demo/).
folder_path = "../Assets/Data"  # From demo/ to project root
os.makedirs(folder_path, exist_ok=True)
vector_store.save_local(folder_path=folder_path, index_name="CWC_index")
print(f"Vector store saved to: {os.path.abspath(folder_path)}")

Vector store saved to: D:\AI-Engineer\prototyping\pull-commit-push-github\9781633435858\Assets\Data
