In [2]:
#install langchain
%pip install langchain
%pip install qdrant-client sentence-transformers torch
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.22.0%2Bcu118-cp313-cp313-win_amd64.whl.metadata (6.3 kB)
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cu118/torchaudio-2.7.0%2Bcu118-cp313-cp313-win_amd64.whl.metadata (6.8 kB)
Collecting torch
  Downloading https://download.pytorch.org/whl/cu118/torch-2.7.0%2Bcu118-cp313-cp313-win_amd64.whl.metadata (29 kB)
Downloading https://download.pytorch.org/whl/cu118/torchvision-0.22.0%2Bcu118-cp313-cp313-win_amd64.whl (5.5 MB)
   ---------------------------------------- 0.0/5.5 MB ? eta -:--:--
   ---------------------------------------- 5.5/5.5 MB 97.4 MB/s eta 0:00:00
Downloading https://download.pytorch.org/whl/cu118/torch-2.7.0%2Bcu118-cp313-cp313-win_amd64.whl (2908.4 MB)
   ---------------------------------------- 0.0/2.9 GB ? eta -:--:--
   ---------------------------------------- 0.0/2.9 GB 117.4 MB/s eta 0:00:25
  

  You can safely remove it manually.

[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
import os
import glob
import json
from langchain_text_splitters import RecursiveCharacterTextSplitter

INPUT_DIRS_10K = [
    "10-K_sectioned",
    "10-K-sectioned_newer"]
INPUT_DIR_10Q = "10-Q_sectioned"
INPUT_DIR_EARNINGS = "processed_transcripts_json"

OUTPUT_10K_JSONL = "chuncks/processed_10k_chunks.jsonl"
OUTPUT_10Q_JSONL = "chuncks/processed_10q_chunks.jsonl"
OUTPUT_EARNINGS_JSONL = "chuncks/processed_earnings_chunks.jsonl"

CHUNK_SIZE = 1000
CHUNK_OVERLAP = 150

sec_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = CHUNK_SIZE,
    chunk_overlap = CHUNK_OVERLAP,
    length_function = len

)
earnings_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,  # Example: maybe slightly smaller for conversational text
    chunk_overlap=100, # Example
    length_function=len,
    # Consider separators that might be common in transcripts if defaults aren't enough
    # separators=["\n\n", "\n", ". ", "? ", "! ", "; ", " ", ""]
)
global_chunk_id_counter = 0

In [8]:
def process_sec_file(filepath, filename_only, text_splitter, output_file_handle, filing_category):
    global global_chunk_id_counter
    print(f"Processing SEC ({filing_category}) file: {filepath}")
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)

        global_metadata = data.get("metadata", {})
        ticker = global_metadata.get("ticker", "UNKNOWN_TICKER")
        year = global_metadata.get("year", "UNKNOWN_YEAR")
        cik = global_metadata.get("cik", "UNKNOWN_CIK")
        date = global_metadata.get("date", "UNKNOWN_DATE")
        # Use the filing_category passed to the function for consistency
        filing_type_from_meta = global_metadata.get("filing_type", filing_category.upper())


        for section_data in data.get("sections", []):
            section_text = section_data.get("text", "")
            if not section_text:
                continue

            item = section_data.get("item", "UNKNOWN_ITEM")
            section_name_from_json = section_data.get("section", "UNKNOWN_SECTION")
            chunks = text_splitter.split_text(section_text)

            for i, chunk_content in enumerate(chunks):
                chunk_metadata = {
                    "source_type": "sec_filing",
                    "filing_category": filing_category, # e.g., "10k" or "10q"
                    "original_file_name": filename_only,
                    "ticker": ticker,
                    "year": year,
                    "cik": cik,
                    "date": date,
                    "filing_type": filing_type_from_meta, # Use what's in file, or fallback
                    "item": item,
                    "section_name": section_name_from_json,
                    "chunk_sequence_in_section": i,
                    "global_chunk_id": f"sec_{global_chunk_id_counter}"
                }
                global_chunk_id_counter += 1
                chunk_json_object = {"text": chunk_content, "metadata": chunk_metadata}
                output_file_handle.write(json.dumps(chunk_json_object) + "\n")

    except json.JSONDecodeError:
        print(f"Error decoding JSON from SEC file: {filepath}")
    except Exception as e:
        print(f"An error occurred while processing SEC file {filepath}: {e}")

In [9]:
# --- Function for Earnings Transcript ---
def process_earnings_file(filepath, filename_only, text_splitter, output_file_handle):
    global global_chunk_id_counter
    print(f"Processing Earning Transcript file: {filepath}")

    try:
        with open(filepath, "r", encoding="utf-8") as f:
            earnings_data = json.load(f)

        # --- Extract Global Document Metadata ---
        doc_meta = earnings_data.get("document_metadata", {})
        ticker = doc_meta.get("ticker", "UNKNOWN_TICKER")
        year = doc_meta.get("year", "UNKNOWN_YEAR")
        quarter = doc_meta.get("quarter", "UNKNOWN_QUARTER")
        original_date = doc_meta.get("original_date_col_value", "UNKNOWN_DATE")
        company_name = doc_meta.get("company_name", "UNKNOWN_COMPANY")
        exchange = doc_meta.get("exchange", "UNKNOWN_EXCHANGE")
        # You can add more fields like 'exchange', 'extracted_participants' if useful for RAG

        # --- Iterate through speaker turns ---
        speaker_turns = earnings_data.get("speaker_turns", [])
        if not speaker_turns:
            print(f"  No speaker turns found in {filename_only}. Skipping.")
            return

        for turn_index, turn_data in enumerate(speaker_turns):
            turn_text = turn_data.get("text", "")
            if not turn_text:
                continue # Skip turns with no text

            # Extract turn-specific metadata
            turn_id = turn_data.get("turn_id", f"turn_{turn_index}")
            speaker_full = turn_data.get("speaker_full", "UNKNOWN_SPEAKER")
            speaker_simple = turn_data.get("speaker_simple_name", "UNKNOWN_SPEAKER")
            # speaker_title = turn_data.get("speaker_title_affiliation", "") # Optional
            turn_section = turn_data.get("section", "UNKNOWN_SECTION") # "Prepared Remarks", "Q&A"

            # Chunk the text of the current speaker turn
            chunks = text_splitter.split_text(turn_text)

            for chunk_index, chunk_content in enumerate(chunks):
                chunk_metadata = {
                    "source_type": "earnings_transcript",
                    "original_file_name": filename_only, # The JSON filename of the transcript
                    "ticker": ticker,
                    "year": year,
                    "quarter": quarter,
                    "date": original_date, # The date of the earnings call
                    "company_name": company_name,
                    "exchange": exchange,
                    "turn_id": turn_id, # ID of the speaker turn
                    "turn_speaker_full": speaker_full,
                    "turn_speaker_simple": speaker_simple,
                    "turn_section": turn_section, # e.g., "Prepared Remarks" or "Q&A"
                    # "turn_speaker_title": speaker_title, # Optional
                    "chunk_sequence_in_turn": chunk_index,
                    "global_chunk_id": f"earn_{global_chunk_id_counter}"
                }
                global_chunk_id_counter += 1

                chunk_json_object = {"text": chunk_content, "metadata": chunk_metadata}
                output_file_handle.write(json.dumps(chunk_json_object) + "\n")

    except json.JSONDecodeError:
        print(f"Error decoding JSON from earnings file: {filepath}")
    except FileNotFoundError:
        print(f"Error: Earnings file not found: {filepath}")
    except Exception as e:
        print(f"An error occurred while processing earnings file {filepath}: {e}")

In [10]:
def main():
    # Keep track of chunks per category for final summary
    chunks_by_category = {
        "10k": 0,
        "10q": 0,
        "earnings": 0
    }
    initial_global_chunk_id = global_chunk_id_counter # Store before processing starts

    # --- Process 10-K Filings ---
    print("--- Processing 10-K Filings ---")
    with open(OUTPUT_10K_JSONL, "w", encoding="utf-8") as outfile_10k:
        start_chunks_for_10k = global_chunk_id_counter
        for input_dir_10k_base in INPUT_DIRS_10K:
            if not os.path.isdir(input_dir_10k_base):
                print(f"Warning: 10-K input directory not found: {input_dir_10k_base}")
                continue
            print(f"  Searching in: {input_dir_10k_base}")
            for filepath in glob.glob(os.path.join(input_dir_10k_base, "**", "*.json"), recursive=True):
                filename_only = os.path.basename(filepath)
                process_sec_file(filepath, filename_only, sec_text_splitter, outfile_10k, "10k")
        chunks_by_category["10k"] = global_chunk_id_counter - start_chunks_for_10k
    print(f"Finished processing 10-K filings. Chunks created: {chunks_by_category['10k']}")

    # --- Process 10-Q Filings ---
    print("\n--- Processing 10-Q Filings ---")
    with open(OUTPUT_10Q_JSONL, "w", encoding="utf-8") as outfile_10q:
        start_chunks_for_10q = global_chunk_id_counter
        if INPUT_DIR_10Q and os.path.isdir(INPUT_DIR_10Q):
            print(f"  Searching in: {INPUT_DIR_10Q}")
            for filepath in glob.glob(os.path.join(INPUT_DIR_10Q, "**", "*.json"), recursive=True):
                filename_only = os.path.basename(filepath)
                process_sec_file(filepath, filename_only, sec_text_splitter, outfile_10q, "10q")
        elif INPUT_DIR_10Q: # If path is defined but not a directory
            print(f"Warning: 10-Q input directory not found or is not a directory: {INPUT_DIR_10Q}")
        else: # If INPUT_DIR_10Q is None or empty string
             print(f"Info: No 10-Q input directory specified. Skipping 10-Q processing.")
        chunks_by_category["10q"] = global_chunk_id_counter - start_chunks_for_10q
    print(f"Finished processing 10-Q filings. Chunks created: {chunks_by_category['10q']}")

    # --- Process Earnings Transcripts ---
    print("\n--- Processing Earnings Transcripts ---")
    with open(OUTPUT_EARNINGS_JSONL, "w", encoding="utf-8") as outfile_earnings:
        start_chunks_for_earnings = global_chunk_id_counter
        if INPUT_DIR_EARNINGS and os.path.isdir(INPUT_DIR_EARNINGS):
            print(f"  Searching in: {INPUT_DIR_EARNINGS}")
            for filepath in glob.glob(os.path.join(INPUT_DIR_EARNINGS, "**", "*.json"), recursive=True):
                filename_only = os.path.basename(filepath)
                process_earnings_file(filepath, filename_only, earnings_text_splitter, outfile_earnings)
        elif INPUT_DIR_EARNINGS:
            print(f"Warning: Earnings input directory not found or is not a directory: {INPUT_DIR_EARNINGS}")
        else:
            print(f"Info: No Earnings input directory specified. Skipping Earnings processing.")

        chunks_by_category["earnings"] = global_chunk_id_counter - start_chunks_for_earnings
    print(f"Finished processing Earnings Transcripts. Chunks created: {chunks_by_category['earnings']}")

    total_chunks_processed = global_chunk_id_counter - initial_global_chunk_id
    print(f"\n--- Processing Summary ---")
    print(f"  10-K chunks saved to {OUTPUT_10K_JSONL} ({chunks_by_category['10k']} chunks)")
    print(f"  10-Q chunks saved to {OUTPUT_10Q_JSONL} ({chunks_by_category['10q']} chunks)")
    print(f"  Earnings chunks saved to {OUTPUT_EARNINGS_JSONL} ({chunks_by_category['earnings']} chunks)")
    print(f"Total unique chunks created across all files: {total_chunks_processed}")
    print(f"Final global_chunk_id_counter value: {global_chunk_id_counter}")


if __name__ == "__main__":
    main()

--- Processing 10-K Filings ---
  Searching in: 10-K_sectioned
Processing SEC (10k) file: 10-K_sectioned\A\2018.json
Processing SEC (10k) file: 10-K_sectioned\A\2019.json
Processing SEC (10k) file: 10-K_sectioned\A\2020.json
Processing SEC (10k) file: 10-K_sectioned\AA\2017.json
Processing SEC (10k) file: 10-K_sectioned\AA\2018.json
Processing SEC (10k) file: 10-K_sectioned\AA\2019.json
Processing SEC (10k) file: 10-K_sectioned\AAAU\2018.json
Processing SEC (10k) file: 10-K_sectioned\AAAU\2019.json
Processing SEC (10k) file: 10-K_sectioned\AAAU\2020.json
Processing SEC (10k) file: 10-K_sectioned\AAGH\2017.json
Processing SEC (10k) file: 10-K_sectioned\AAGH\2018.json
Processing SEC (10k) file: 10-K_sectioned\AAGH\2019.json
Processing SEC (10k) file: 10-K_sectioned\AAGH\2020.json
Processing SEC (10k) file: 10-K_sectioned\AAL\2017.json
Processing SEC (10k) file: 10-K_sectioned\AAL\2018.json
Processing SEC (10k) file: 10-K_sectioned\AAL\2019.json
Processing SEC (10k) file: 10-K_sectioned\A

In [15]:
file = "chuncks/processed_earnings_chunks.jsonl"
with open(file, "r", encoding="utf-8") as f:
    for line in f:
        print(line.strip())
        break  # Just print the first line for brevity

{"text": "Good afternoon and welcome to the Agilent Technologies' First Quarter 2020 Earnings Conference Call. All lines have been placed on mute to prevent any background noise. After the speakers' remarks, there will be a question-and-answer session. And now I'd like to introduce you to the host for today's conference, Ankur Dhingra, Vice President of Investor Relations.", "metadata": {"source_type": "earnings_transcript", "original_file_name": "2020_Q1.json", "ticker": "A", "year": "2020", "quarter": "Q1", "date": "Feb 18, 2020, 4:30 p.m. ET", "company_name": "Agilent Technologies' First Quarter 2020", "exchange": "NYSE", "turn_id": "A_2020_Q1_turn001", "turn_speaker_full": "Operator", "turn_speaker_simple": "Operator", "turn_section": "Prepared Remarks", "chunk_sequence_in_turn": 0, "global_chunk_id": "earn_6612760"}}


In [17]:
#we ll curate the list so it onlt incldues the most important sp500 companies


import json
import os
import time
from helpers import get_sp500_tickers 

# --- Configuration ---
# Get the S&P 500 tickers from your helper function
IMPORTANT_TICKERS = get_sp500_tickers()
print(f"Loaded {len(IMPORTANT_TICKERS)} important tickers for curation.")

# Paths to your FULL .jsonl files (containing all original chunks)
FULL_INPUT_JSONL_FILES = {
    "10k": "chunks/processed_10k_chunks.jsonl",
    "10q": "chunks/processed_10q_chunks.jsonl",
    "earnings": "chunks/processed_earnings_chunks.jsonl"
}

# Output directory for curated .jsonl files
CURATED_OUTPUT_DIR = "curated_sp500_chunks" # New directory name for clarity
os.makedirs(CURATED_OUTPUT_DIR, exist_ok=True)

CURATED_OUTPUT_JSONL_FILES = {
    "10k": os.path.join(CURATED_OUTPUT_DIR, "processed_10k_sp500_curated.jsonl"),
    "10q": os.path.join(CURATED_OUTPUT_DIR, "processed_10q_sp500_curated.jsonl"),
    "earnings": os.path.join(CURATED_OUTPUT_DIR, "processed_earnings_sp500_curated.jsonl")
}

total_chunks_written_overall = 0
total_chunks_scanned_overall = 0

def main_curate_chunks():
    global total_chunks_written_overall, total_chunks_scanned_overall
    start_time_curation = time.time()

    print("--- Starting Curation Process ---")

    for category, input_path in FULL_INPUT_JSONL_FILES.items():
        output_path = CURATED_OUTPUT_JSONL_FILES[category]
        print(f"\nProcessing {input_path} -> {output_path}")
        
        if not os.path.exists(input_path):
            print(f"  Warning: Input file not found: {input_path}. Skipping.")
            continue

        category_chunks_written = 0
        category_chunks_scanned = 0
        # Open output file in write mode to overwrite if it exists
        with open(input_path, 'r', encoding='utf-8') as infile, \
             open(output_path, 'w', encoding='utf-8') as outfile:
            
            for line_num, line in enumerate(infile):
                total_chunks_scanned_overall += 1
                category_chunks_scanned +=1
                if category_chunks_scanned % 500000 == 0 and category_chunks_scanned > 0: # Progress update per file
                    print(f"  Scanned {category_chunks_scanned} lines from {input_path}...")

                if line.strip():
                    try:
                        chunk_data = json.loads(line)
                        # Check if the ticker is in our important list
                        # Ensure metadata and ticker keys exist
                        metadata = chunk_data.get('metadata', {})
                        ticker_in_chunk = metadata.get('ticker')

                        if ticker_in_chunk and ticker_in_chunk in IMPORTANT_TICKERS:
                            outfile.write(line) # Write the original line (which is already a JSON string)
                            category_chunks_written += 1
                            total_chunks_written_overall += 1
                    except json.JSONDecodeError:
                        print(f"    Warning: Could not decode JSON on line {line_num+1} in {input_path}. Skipping.")
                    # No need for KeyError here specifically if using .get() with defaults
        
        print(f"  Finished {input_path}. Scanned {category_chunks_scanned}. Wrote {category_chunks_written} curated chunks to {output_path}.")

    end_time_curation = time.time()
    print("\n--- Curation Complete ---")
    print(f"Total chunks scanned from original files: {total_chunks_scanned_overall}")
    print(f"Total curated chunks written to new files: {total_chunks_written_overall}")
    print(f"Curation process took: {end_time_curation - start_time_curation:.2f} seconds.")

if __name__ == "__main__":
    main_curate_chunks()

Loaded 503 important tickers for curation.
--- Starting Curation Process ---

Processing chunks/processed_10k_chunks.jsonl -> curated_sp500_chunks\processed_10k_sp500_curated.jsonl
  Scanned 500000 lines from chunks/processed_10k_chunks.jsonl...
  Scanned 1000000 lines from chunks/processed_10k_chunks.jsonl...
  Scanned 1500000 lines from chunks/processed_10k_chunks.jsonl...
  Scanned 2000000 lines from chunks/processed_10k_chunks.jsonl...
  Scanned 2500000 lines from chunks/processed_10k_chunks.jsonl...
  Scanned 3000000 lines from chunks/processed_10k_chunks.jsonl...
  Scanned 3500000 lines from chunks/processed_10k_chunks.jsonl...
  Scanned 4000000 lines from chunks/processed_10k_chunks.jsonl...
  Finished chunks/processed_10k_chunks.jsonl. Scanned 4007192. Wrote 457262 curated chunks to curated_sp500_chunks\processed_10k_sp500_curated.jsonl.

Processing chunks/processed_10q_chunks.jsonl -> curated_sp500_chunks\processed_10q_sp500_curated.jsonl
  Scanned 500000 lines from chunks/pro

In [10]:
# File: upsert_local_qdrant.py  –  revised 2025-06-03
"""
Changes vs. your original
––––––––––––––––––––––––––
1. Client timeout raised to 60 s.
2. Collection created **on-disk** (vectors + HNSW).
3. NO call to update_collection(indexing_threshold=0).
4. Flush() at the very end to guarantee WAL → disk.
5. Small style tweaks and extra logging.
"""

import json, os, time, uuid, glob
try:
    import numpy as np
    from qdrant_client import QdrantClient, models
    LIBRARIES_AVAILABLE_LOCAL = True
except ImportError as e:
    print(f"Required library not installed for local upsert: {e}")
    LIBRARIES_AVAILABLE_LOCAL = False


# ── Configuration ──────────────────────────────────────────────────────────────
DOWNLOADED_DATA_DIR          = r"colab_generated"       # where you saved the files
QDRANT_URL_LOCAL             = "http://localhost:6333"
COLLECTION_NAME_LOCAL        = "financial_sp500_local_final_v2"
QDRANT_UPSERT_BATCH_SIZE     = 512                      # 512–2000 is fine
VECTOR_SIZE                  = 768                      # bge-base-en-v1.5
SAVED_FILE_BASENAME          = "financial_sp500_curated"
CLIENT_TIMEOUT_S             = 60                       # ↑ from default 5 s
# ───────────────────────────────────────────────────────────────────────────────


def recreate_collection_on_disk(client: QdrantClient) -> None:
    """Creates/recreates the collection with on-disk storage."""
    print(f"Recreating collection '{COLLECTION_NAME_LOCAL}' in on-disk mode…")
    client.recreate_collection(
        collection_name=COLLECTION_NAME_LOCAL,
        vectors_config=models.VectorParams(
            size=VECTOR_SIZE,
            distance=models.Distance.COSINE,
            on_disk=True                    # vectors mmap’d
        ),
        hnsw_config=models.HnswConfigDiff(on_disk=True)  # graph mmap’d
    )
    print("  ✓ Collection ready.")


def main_local_upsert() -> None:
    if not LIBRARIES_AVAILABLE_LOCAL:
        print("Aborting: local libraries missing.")
        return

    # ── Connect ────────────────────────────────────────────────────────────────
    print(f"\nConnecting to Qdrant at {QDRANT_URL_LOCAL} …")
    cli = QdrantClient(url=QDRANT_URL_LOCAL, timeout=CLIENT_TIMEOUT_S)
    print("  ✓ Connected. Existing collections:",
          [c.name for c in cli.get_collections().collections])

    # ── Prepare collection ─────────────────────────────────────────────────────
    recreate_collection_on_disk(cli)

    # ── Locate embedding/payload parts ─────────────────────────────────────────
    emb_files = sorted(glob.glob(os.path.join(
        DOWNLOADED_DATA_DIR, f"{SAVED_FILE_BASENAME}_embeddings_part_*.npy")))
    pay_files = sorted(glob.glob(os.path.join(
        DOWNLOADED_DATA_DIR, f"{SAVED_FILE_BASENAME}_payloads_part_*.jsonl")))

    if not emb_files or not pay_files:
        print("Error: no embedding/payload files found.")
        return
    if len(emb_files) != len(pay_files):
        print("Error: embedding/payload part count mismatch "
              f"({len(emb_files)} vs {len(pay_files)}).")
        return
    print(f"Found {len(emb_files)} part(s) to upsert.")

    # ── Upsert loop ────────────────────────────────────────────────────────────
    total_points = 0
    buf: list[models.PointStruct] = []
    t0 = time.time()

    for part_idx, (emb_fp, pay_fp) in enumerate(zip(emb_files, pay_files), start=1):
        print(f"\nPart {part_idx}/{len(emb_files)}  →  "
              f"{os.path.basename(emb_fp)} & {os.path.basename(pay_fp)}")

        vectors = np.load(emb_fp).tolist()
        payloads = [json.loads(l) for l in open(pay_fp, encoding="utf-8") if l.strip()]

        if len(vectors) != len(payloads):
            print("  ⚠ Skipping – vector/payload length mismatch.")
            continue

        for vec, pld in zip(vectors, payloads):
            buf.append(models.PointStruct(id=str(uuid.uuid4()),
                                          vector=vec,
                                          payload=pld))
            if len(buf) >= QDRANT_UPSERT_BATCH_SIZE:
                cli.upsert(COLLECTION_NAME_LOCAL, buf, wait=True)
                total_points += len(buf)
                print(f"    Upserted {len(buf)} (total {total_points}).")
                buf = []

    # flush remainder
    if buf:
        cli.upsert(COLLECTION_NAME_LOCAL, buf, wait=True)
        total_points += len(buf)
        print(f"    Upserted final {len(buf)} (total {total_points}).")

    # ── Final flush to disk ────────────────────────────────────────────────────
    #cli.flush(COLLECTION_NAME_LOCAL, wait=True)
    dt = time.time() - t0
    print("\n— Upsert complete —")
    print(f"Inserted {total_points} points in {dt:.1f} s "
          f"(avg {dt/total_points:.4f} s/pt).")

    info = cli.get_collection(COLLECTION_NAME_LOCAL)
    print(f"Collection status: {info.status}, points: {info.points_count}")
    print("Optimizer will finish in background; status becomes GREEN when done.")


if __name__ == "__main__":
    main_local_upsert()



Connecting to Qdrant at http://localhost:6333 …
  ✓ Connected. Existing collections: ['financial_sp500_local_final_v2']
Recreating collection 'financial_sp500_local_final_v2' in on-disk mode…
  ✓ Collection ready.
Found 10 part(s) to upsert.

Part 1/10  →  financial_sp500_curated_embeddings_part_0000.npy & financial_sp500_curated_payloads_part_0000.jsonl


  client.recreate_collection(


    Upserted 512 (total 512).
    Upserted 512 (total 1024).
    Upserted 512 (total 1536).
    Upserted 512 (total 2048).
    Upserted 512 (total 2560).
    Upserted 512 (total 3072).
    Upserted 512 (total 3584).
    Upserted 512 (total 4096).
    Upserted 512 (total 4608).
    Upserted 512 (total 5120).
    Upserted 512 (total 5632).
    Upserted 512 (total 6144).
    Upserted 512 (total 6656).
    Upserted 512 (total 7168).
    Upserted 512 (total 7680).
    Upserted 512 (total 8192).
    Upserted 512 (total 8704).
    Upserted 512 (total 9216).
    Upserted 512 (total 9728).
    Upserted 512 (total 10240).
    Upserted 512 (total 10752).
    Upserted 512 (total 11264).
    Upserted 512 (total 11776).
    Upserted 512 (total 12288).
    Upserted 512 (total 12800).
    Upserted 512 (total 13312).
    Upserted 512 (total 13824).
    Upserted 512 (total 14336).
    Upserted 512 (total 14848).
    Upserted 512 (total 15360).
    Upserted 512 (total 15872).
    Upserted 512 (total 1638

In [12]:
from qdrant_client import QdrantClient
cli = QdrantClient("http://localhost:6333")
info = cli.get_collection("financial_sp500_local_final_v2")
print(info.status)          # will show .error if any


green


In [14]:
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient, models
import numpy as np, pprint, textwrap

MODEL_NAME = "BAAI/bge-base-en-v1.5"
COLLECTION  = "financial_sp500_local_final_v2"
TOP_K       = 5

# 1. Connect and load encoder
enc = SentenceTransformer(MODEL_NAME, device="cpu")   # or "cuda"
cli = QdrantClient(url="http://localhost:6333")

# 2. Ad-hoc query
question = "Who is the CEO of Apple?"
q_emb = enc.encode(question, normalize_embeddings=True).tolist()

hits = cli.search(
    collection_name=COLLECTION,
    query_vector=q_emb,
    limit=TOP_K,
    with_payload=True,   # so we get the chunk text back
)

for h in hits:
    print(f"\n· Score: {h.score:.3f}")
    snippet = textwrap.shorten(h.payload['chunk_text'], 200, placeholder=" …")
    pprint.pprint(snippet)


  hits = cli.search(



· Score: 0.749
('Thank you. Good afternoon, and thank you for joining us. Speaking first '
 "today is Apple's CEO, Tim Cook; and he'll be followed by CFO, Luca Maestri. "
 "After that, we'll open the call to questions from …")

· Score: 0.744
('Thank you. Good afternoon and thank you for joining us. Speaking first today '
 "is Apple's CEO, Tim Cook. And he'll be followed by CFO, Luca Maestri. After "
 "that, we'll open the call to questions from …")

· Score: 0.743
('Thank you. Good afternoon, and thank you for joining us. Speaking first '
 "today is Apple's CEO, Tim Cook, and he will be followed by CFO, Luca "
 "Maestri. After that, we'll open the call to questions …")

· Score: 0.741
('Thank you. Good afternoon and thank you for joining us. Speaking first today '
 "is Apple's CEO, Tim Cook, and he'll be followed by CFO, Luca Maestri. After "
 "that, we'll open the call to questions from …")

· Score: 0.738
('Thank you. Good afternoon and thank you for joining us. Speaking first toda