In [1]:
# Mock transcript fixture: one dict per spoken segment. Every segment belongs
# to the same meeting, so the meeting id and date are stated once below and
# only the per-segment fields are listed in the rows.
mock_transcript = [
    dict(
        speaker=speaker,
        start_time=start_time,
        end_time=end_time,
        text=text,
        meeting_id="M001",
        date="2026-02-17",
    )
    for speaker, start_time, end_time, text in (
        ("Rahul", "00:00:05", "00:00:15", "We should deploy the model by Friday."),
        ("Anita", "00:00:16", "00:00:25", "We need additional GPU resources for training."),
    )
]


In [2]:
from langchain_core.documents import Document

# Wrap each transcript segment in a LangChain Document: the spoken text is the
# page content, and every other segment field is carried along as metadata.
metadata_fields = ("speaker", "start_time", "end_time", "meeting_id", "date")

documents = [
    Document(
        page_content=segment["text"],
        metadata={field: segment[field] for field in metadata_fields},
    )
    for segment in mock_transcript
]

# Display the document count as the cell output (one per transcript segment).
len(documents)


2

In [4]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# Embedding model used for both indexing and querying.
# NOTE(review): newer LangChain releases appear to deprecate the
# langchain_community HuggingFaceEmbeddings class in favour of the separate
# langchain_huggingface package; confirm against the installed version.
embedding_device = "cpu"  # change to "cuda" if a GPU is available

embeddings = HuggingFaceEmbeddings(
    model_kwargs=dict(device=embedding_device),
    model_name="BAAI/bge-small-en-v1.5",
)


Loading weights: 100%|██████████| 199/199 [00:00<00:00, 920.70it/s, Materializing param=pooler.dense.weight]                               
BertModel LOAD REPORT from: BAAI/bge-small-en-v1.5
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [6]:
from langchain_community.vectorstores import FAISS

# Build the FAISS vector store from the transcript documents, embedding each
# one with the `embeddings` model configured earlier in the notebook.
vector_store = FAISS.from_documents(
    documents=documents,
    embedding=embeddings,
)


In [7]:
# Expose the vector store as a retriever that uses the "mmr" search type and
# returns two documents per query.
# NOTE(review): the mock transcript has only two segments, so k=2 hands back
# the whole corpus for any query; revisit k once real transcripts are indexed.
retriever = vector_store.as_retriever(search_kwargs=dict(k=2), search_type="mmr")

In [9]:
# Off-topic probe: nothing in the mock transcript mentions body building.
# The retriever is configured only with k, so it still returns segments even
# when none of them is relevant to the question.
query = "What was discussed about body building?"
results = retriever.invoke(query)

# Print each retrieved segment with its metadata, separated by a 40-dash rule.
separator = "-" * 40
for hit in results:
    print("Content:", hit.page_content)
    print("Metadata:", hit.metadata)
    print(separator)


Content: We need additional GPU resources for training.
Metadata: {'speaker': 'Anita', 'start_time': '00:00:16', 'end_time': '00:00:25', 'meeting_id': 'M001', 'date': '2026-02-17'}
----------------------------------------
Content: We should deploy the model by Friday.
Metadata: {'speaker': 'Rahul', 'start_time': '00:00:05', 'end_time': '00:00:15', 'meeting_id': 'M001', 'date': '2026-02-17'}
----------------------------------------
