In [32]:
import json, re
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

In [33]:
# News dump to index in this run; a relative path, so it resolves against the kernel's working directory.
JSON_PATH = Path('../data/query/30092025.json')
# Location the FAISS index is saved to and later loaded from (used by build_vector_db and the query cells).
VECTOR_PATH = "30092025_vector_db"
# Echo the configuration so the saved notebook records which inputs were used.
print("JSON path:", JSON_PATH)
print("Vector DB path:", VECTOR_PATH)

JSON path: ..\data\query\30092025.json
Vector DB path: 30092025_vector_db


In [34]:
def clean_text(text: str) -> str:
    """
    Strip URLs from a string and normalise its whitespace.

    Falsy input (``None`` or ``""``) yields an empty string, so optional
    JSON fields can be passed straight through.

    Args:
        text: The input string to clean. ``None`` is tolerated.

    Returns:
        The text with every ``http://`` / ``https://`` URL removed, every run
        of whitespace (spaces, tabs, newlines, carriage returns) collapsed to
        a single space, and leading/trailing whitespace trimmed.
    """
    if not text:
        return ""
    # Require the "://" scheme separator: a bare "http\S+" would also delete
    # ordinary words that merely start with "http" (e.g. "httpd").
    # URL schemes are case-insensitive, hence IGNORECASE.
    without_urls = re.sub(r"https?://\S+", "", text, flags=re.IGNORECASE)
    # "\s+" already matches "\n" and "\r", so no separate newline pass is needed.
    return re.sub(r"\s+", " ", without_urls).strip()

# Smoke-test clean_text on a string containing extra spaces, newlines and a URL.
sample_text = "This is a test   text \nwith a link \nhttp://example.com"
cleaned = clean_text(sample_text)
print("Before:", sample_text)
print("After: ", cleaned)

# Test with None content
print("Testing with None:", clean_text(None))
print("Testing with empty string:", clean_text(""))

Before: This is a test   text 
with a link 
http://example.com
After:  This is a test text with a link
Testing with None: 
Testing with empty string: 


In [35]:
def load_news(path) -> list[str]:  # takes a generic path rather than requiring a Path object
    """
    Load a news JSON file and flatten it into one text document per item.

    The file is read as a JSON object mapping a category name to a list of
    item objects; each item is queried for the keys ``headline``, ``source``,
    ``content`` and ``timestamp``.

    Args:
        path: Location of the JSON file; passed directly to ``open``.

    Returns:
        One formatted string per news item, in file order. Fields that are
        missing or JSON ``null`` are rendered as empty strings.
    """
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    def _field(item, key):
        # dict.get's default only covers a missing key; a key that is present
        # with a JSON null would otherwise be formatted as the text "None".
        value = item.get(key)
        return "" if value is None else value

    docs = []
    for category, items in data.items():
        # A category whose value is JSON null simply contributes no items.
        for item in items or []:
            # clean_text already maps None to "", so content needs no extra guard.
            content = clean_text(item.get("content", ""))
            full_text = (
                f"Category: {category}\n"
                f"Headline: {_field(item, 'headline')}\n"
                f"Source: {_field(item, 'source')}\n"
                f"Content: {content}\n"
                f"Timestamp: {_field(item, 'timestamp')}"
            )
            docs.append(full_text)
    return docs

# NOTE(review): this loads the 16092025 dump, while JSON_PATH in the config cell points at 30092025 — confirm the hard-coded path is intentional.
docs = load_news(Path('../data/query/16092025.json'))  # fix the path so it is correct
print(f"‚úÖ Loaded {len(docs)} news items")
print("\n‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á‡∏Ç‡πà‡∏≤‡∏ß 1:")
# Show the first document, or a placeholder when the file produced none.
print(docs[0] if docs else "No news found")

‚úÖ Loaded 27 news items

‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á‡∏Ç‡πà‡∏≤‡∏ß 1:
Category: Nvidia
Headline: Nvidia suffers a major blow from China
Source: TheStreet
Content: The company keeps running into setbacks in this important market.
Timestamp: 2025-09-16


In [36]:
def build_vector_db(
    docs: list[str],
    vector_path=None,
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
    chunk_size: int = 500,
    chunk_overlap: int = 50,
):
    """
    Split documents into chunks, embed them and save a FAISS index to disk.

    Args:
        docs: Text documents to index; must contain at least one entry.
        vector_path: Where to save the index. Defaults to the module-level
            ``VECTOR_PATH`` when omitted.
        model_name: HuggingFace embedding model used to embed the chunks.
        chunk_size: Maximum chunk length handed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks.

    Returns:
        None. The index is written to ``vector_path`` as a side effect and
        progress is printed.

    Raises:
        ValueError: If ``docs`` is empty or splitting yields no chunks.
    """
    # Fail early and clearly: an index cannot be sized from zero embeddings,
    # and the library error raised in that case is opaque.
    if not docs:
        raise ValueError("build_vector_db() needs at least one document")

    # Resolve the fallback at call time so a later change to VECTOR_PATH is honoured.
    target_path = VECTOR_PATH if vector_path is None else vector_path

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, separators=["\n\n", "\n", ".", " "]
    )
    chunks = []
    for d in docs:
        chunks.extend(splitter.split_text(d))

    print(f"‚úÖ Total chunks created: {len(chunks)}")

    if not chunks:
        raise ValueError("No text chunks were produced from the given documents")

    # Use a free model from Hugging Face (no API key needed)
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    vectordb = FAISS.from_texts(chunks, embeddings)
    vectordb.save_local(target_path)
    print(f"‚úÖ Vector DB saved to: {target_path}")

# Dry run on the first three documents only; the full build in the next cell saves to the same VECTOR_PATH.
build_vector_db(docs[:3])

‚úÖ Total chunks created: 3
‚úÖ Vector DB saved to: 30092025_vector_db


In [37]:
# Reload from the configured JSON_PATH (rebinding the earlier sample `docs`) and build the index over every item.
docs = load_news(JSON_PATH)
build_vector_db(docs)

‚úÖ Total chunks created: 825
‚úÖ Vector DB saved to: 30092025_vector_db


In [38]:
# Same embedding model that build_vector_db used, so query vectors are comparable to the stored ones.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# SECURITY: allow_dangerous_deserialization=True permits unpickling the saved store;
# only load index directories written by this notebook or another trusted source.
db = FAISS.load_local(VECTOR_PATH, embeddings, allow_dangerous_deserialization=True)
res = db.similarity_search("Nvidia acquisition", k=1)
print("\nüîç ‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏Ñ‡πâ‡∏ô‡∏´‡∏≤:")
# Guard the index so an empty result list prints a placeholder instead of raising IndexError.
print(res[0].page_content if res else "No result found")


üîç ‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏Ñ‡πâ‡∏ô‡∏´‡∏≤:
. OpenAI, another company that will need to spend a lot to support its ambitions, took a different approach to raising money. Last week, the company struck a deal with Nvidia to deploy 10 GW of Nvidia systems over the next five years in exchange for an incremental $100 billion equity investment. OpenAI is reportedly discussing leasing—not buying—the chips from Nvidia, a novel arrangement that could cut its hardware costs by 10 to 15%, according to The Information


In [39]:
# Run a single nearest-neighbour lookup, keeping the query in a variable so it can be echoed next to the result.
query = "Nvidia acquisition"
res = db.similarity_search(query, k=1)

print("\nüîç Query:", query)
print("üîπ Result snippet:")
# Fall back to a placeholder when the search returns nothing.
print(res[0].page_content if res else "No result found")


üîç Query: Nvidia acquisition
üîπ Result snippet:
. OpenAI, another company that will need to spend a lot to support its ambitions, took a different approach to raising money. Last week, the company struck a deal with Nvidia to deploy 10 GW of Nvidia systems over the next five years in exchange for an incremental $100 billion equity investment. OpenAI is reportedly discussing leasing—not buying—the chips from Nvidia, a novel arrangement that could cut its hardware costs by 10 to 15%, according to The Information
