
### Installing prerequisites


In [2]:

# If you already installed these in your virtual environment, you can skip this cell.
%pip install -q \
  langchain>=0.2.0 \
  langchain-community \
  langchain-text-splitters \
  faiss-cpu>=1.7.4 \
  pandas>=2.0 \
  openpyxl>=3.1 \
  python-docx>=1.1 \
  pypdf>=4.2 \
  tqdm \
  python-dotenv \
  openai \
  langchain-openai

Note: you may need to restart the kernel to use updated packages.



### Imports & configuration

In [None]:

from pathlib import Path
from tqdm import tqdm
import pandas as pd
from dotenv import load_dotenv

# LangChain
from langchain.schema import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

# *** OpenAI embeddings only ***
from langchain_openai import OpenAIEmbeddings


# Read a local .env file into the process environment before the client is built
# (presumably this is where OPENAI_API_KEY comes from — confirm against your setup).
load_dotenv()
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Sanity check to verify OpenAI is being used
# NOTE(review): this looks like a live embedding request, so it needs valid
# credentials and network access every time the cell runs.
vec = embeddings.embed_query("hello from openai")
print("OpenAI embedding vector length:", len(vec))


# Paths are derived from the kernel's current working directory: `data/` is
# expected to sit next to the folder the notebook is run from.
ROOT = Path.cwd().parent        
DATA_DIR = ROOT / "data"        
# The vector store lives inside the data folder; it is created up front so the
# save step later in the notebook has somewhere to write.
SAVE_DIR = DATA_DIR / "cs-standards-vector-store"
SAVE_DIR.mkdir(parents=True, exist_ok=True)

# Bare tuple as the last expression so the cell displays both resolved paths.
DATA_DIR, SAVE_DIR



### Load documents from `data/`


In [None]:

def _mkdoc(text: str, meta: dict):
    text = (text or "").strip()
    if not text:
        return None
    return Document(page_content=text, metadata=meta)

def _cell(row, col):
    """Return the value of ``col`` in ``row``, with missing values mapped to ``""``.

    pandas represents empty cells as NaN/None/NaT; without this they would be
    stored in metadata as floats and rendered into the text as the string "nan".
    """
    if col is None:
        return ""
    value = row.get(col, "")
    # Duplicate column labels make row.get return a Series, for which pd.isna is
    # element-wise; only scalar values are tested for missingness.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return value


def _table_docs(path: Path) -> list:
    """Build one document per non-blank row of an ``.xlsx`` or ``.csv`` file."""
    df = pd.read_excel(path) if path.suffix.lower() == ".xlsx" else pd.read_csv(path)

    # Case-insensitive header lookup. str()/strip() keep this working for
    # non-string labels (e.g. numeric headers) and headers with stray spaces.
    cols = {str(c).strip().lower(): c for c in df.columns}

    def pick(*names):
        """Return the original label of the first known alias present, else None."""
        for name in names:
            if name in cols:
                return cols[name]
        return None

    state_col = pick("state")
    grade_col = pick("grade")
    strand_col = pick("strand")
    cluster_col = pick("cluster", "content cluster", "content_cluster")
    id_col = pick("standard_id", "id", "code")
    text_col = pick("standard_text", "standard", "description", "text")

    docs = []
    for _, row in df.iterrows():
        # A fully blank spreadsheet row carries no content; skip it instead of
        # emitting a header-only document.
        if row.isna().all():
            continue

        meta = {
            "source": str(path),
            "state": _cell(row, state_col),
            "grade": _cell(row, grade_col),
            "strand": _cell(row, strand_col),
            "cluster": _cell(row, cluster_col),
            "standard_id": _cell(row, id_col),
        }
        if text_col is not None:
            body = str(_cell(row, text_col))
        else:
            # No recognised text column: fall back to dumping every column.
            body = " | ".join(f"{c}: {_cell(row, c)}" for c in df.columns)

        text = (
            f"State: {meta['state']} | Grade: {meta['grade']} | Strand: {meta['strand']} | "
            f"Cluster: {meta['cluster']} | StandardID: {meta['standard_id']}\n{body}"
        )
        doc = _mkdoc(text, meta)
        if doc:
            docs.append(doc)
    return docs


def load_documents(data_dir: Path) -> list[Document]:
    """Recursively load every supported file under ``data_dir`` as documents.

    Supported inputs:
      * ``.xlsx`` / ``.csv`` — one document per non-blank row; well-known columns
        (state, grade, strand, cluster, standard id) are copied into metadata and
        prefixed to the text. Empty cells become ``""`` rather than ``nan``.
      * ``.docx`` — all paragraphs joined with newlines, one document per file.
      * ``.pdf`` — extracted text of all pages joined with newlines, one document per file.
      * ``.txt`` / ``.md`` — file contents decoded as UTF-8, ignoring undecodable bytes.

    Files with any other extension are ignored, as are names starting with
    ``~$`` (treated here as Office lock files that are not valid documents).
    Files whose content is empty after stripping yield no document.

    Args:
        data_dir: Root folder to scan.

    Returns:
        Documents in sorted path order; each carries at least a ``source``
        metadata entry holding the originating file path.
    """
    docs = []
    for f in sorted(data_dir.glob("**/*")):
        if not f.is_file() or f.name.startswith("~$"):
            continue
        ext = f.suffix.lower()

        if ext in (".xlsx", ".csv"):
            docs.extend(_table_docs(f))
            continue

        if ext == ".docx":
            # Imported lazily so the dependency is only needed when such files exist.
            from docx import Document as Docx
            text = "\n".join(p.text for p in Docx(f).paragraphs)
        elif ext == ".pdf":
            from pypdf import PdfReader
            reader = PdfReader(str(f))
            # extract_text() may return None; treat that as an empty page.
            text = "\n".join((page.extract_text() or "") for page in reader.pages)
        elif ext in (".txt", ".md"):
            text = f.read_text(encoding="utf-8", errors="ignore")
        else:
            # Ignore other extensions by default
            continue

        doc = _mkdoc(text, {"source": str(f)})
        if doc:
            docs.append(doc)
    return docs

# Walk DATA_DIR once and keep the unsplit documents in `raw_docs`; the printed
# count is a quick check that the expected files were picked up.
print("Scanning:", DATA_DIR)
raw_docs = load_documents(DATA_DIR)
print(f"Loaded {len(raw_docs)} raw docs")



### Chunk documents
Uses a `RecursiveCharacterTextSplitter` with `chunk_size=1000` and `chunk_overlap=200`.


In [None]:

# Split the loaded documents into chunks of at most 1000 characters with a
# 200-character overlap between neighbours. The separator list runs from coarse
# (blank line) to fine (empty string); presumably the splitter prefers the
# earliest separator that yields small enough pieces — see the splitter docs.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", ". ", " ", ""],
)
chunks = splitter.split_documents(raw_docs)
print(f"Total chunks: {len(chunks)}")



### Build FAISS index (batch size = 16) and save

We initialize with the first batch (up to 16 chunks), then add the remaining chunks in steps of 16.  
Finally, we save the FAISS store to `SAVE_DIR` (`data/cs-standards-vector-store/`).


In [None]:
# Number of chunks handed to the vector store per call. Defined before first use
# so the seed batch and the follow-up batches always share the same size.
BATCH = 16

if not chunks:
    raise ValueError("No chunks found. Make sure your data directory contains supported files.")

# The index is created from the first batch; slicing already handles the case
# where fewer than BATCH chunks exist, so no explicit length check is needed.
first_batch = chunks[:BATCH]
vector_store = FAISS.from_documents(first_batch, embedding=embeddings)

# Add the remaining chunks in BATCH-sized steps, starting right after the seed batch.
for i in tqdm(range(len(first_batch), len(chunks), BATCH), desc="Indexing"):
    vector_store.add_documents(chunks[i:i+BATCH])

vector_store.save_local(str(SAVE_DIR))
print("Saved FAISS index to:", SAVE_DIR)




### Quick query test

Load the saved index and run a similarity search.  
> **Note:** Use the **same** embedding model class and pass `allow_dangerous_deserialization=True`.


In [None]:
# Same FAISS class as imported at the top of the notebook, re-imported under an
# alias so this cell reads as a separate "load" step.
from langchain_community.vectorstores import FAISS as FAISSLoader

# Reload the index from disk with the same `embeddings` object that built it.
# NOTE(review): allow_dangerous_deserialization=True is an explicit opt-in; only
# load index folders you created yourself.
vs = FAISSLoader.load_local(str(SAVE_DIR), embeddings=embeddings, allow_dangerous_deserialization=True)
docs = vs.similarity_search("1st grade fractions standards in Texas", k=5)
# Show each hit's metadata and the first 400 characters of its content.
for j, d in enumerate(docs, 1):
    print(f"\n[{j}] {d.metadata}")
    print(d.page_content[:400], "...")
