In [29]:
import json, re
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

In [30]:
# Single date stamp shared by the scraped-news file and the vector-store
# folder, so pointing the notebook at another scrape needs only one edit.
NEWS_DATE = "22092025"  # NOTE(review): looks like DDMMYYYY -- confirm against the scraper

JSON_PATH = Path("../scrape_news") / f"{NEWS_DATE}.json"
VECTOR_PATH = f"{NEWS_DATE}_vector_db"
print("JSON path:", JSON_PATH)
print("Vector DB path:", VECTOR_PATH)

JSON path: ..\scrape_news\22092025.json
Vector DB path: 22092025_vector_db


In [31]:
def clean_text(text):
    """Strip URLs from ``text`` and collapse its whitespace.

    Args:
        text: Raw string to clean. ``None`` and other falsy values are accepted.

    Returns:
        The cleaned string, or ``""`` when ``text`` is falsy.
    """
    # Handle None or empty text.
    if not text:
        return ""
    # Remove URLs. Requiring the "://" separator (or a leading "www.") keeps
    # ordinary words that merely start with "http" (e.g. "httpd") intact.
    text = re.sub(r"https?://\S+|\bwww\.\S+", "", text)
    # Collapse runs of whitespace into single spaces and trim the ends.
    text = re.sub(r"\s+", " ", text).strip()
    return text

sample_text = "This is a test   text with a link http://example.com"
cleaned = clean_text(sample_text)
print("Before:", sample_text)
print("After: ", cleaned)
# Pin the expected result so a regression fails loudly on Restart & Run All
# instead of depending on someone reading the printed output.
assert cleaned == "This is a test text with a link"

# Test with None / empty content.
print("Testing with None:", clean_text(None))
print("Testing with empty string:", clean_text(""))
assert clean_text(None) == ""
assert clean_text("") == ""

Before: This is a test   text with a link http://example.com
After:  This is a test text with a link
Testing with None: 
Testing with empty string: 


In [32]:
def load_news(path) -> list[str]:
    """Load scraped news from a JSON file and render each item as one text document.

    The file is read as a mapping of category name to a list of item dicts.
    The keys read from each item are ``headline``, ``source``, ``content``
    and ``timestamp``.

    Args:
        path: Location of the JSON file; anything ``open()`` accepts
            (a ``str`` or a ``Path``).

    Returns:
        One formatted multi-line string per news item.
    """
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    def field(item, key):
        # dict.get's default only covers a missing key; a key stored as JSON
        # null comes back as None and would be rendered as the text "None".
        value = item.get(key)
        return "" if value is None else value

    docs = []
    for category, items in data.items():
        # A category stored as null is treated as having no items.
        for item in items or []:
            content = clean_text(item.get("content", ""))
            full_text = (
                f"Category: {category}\n"
                f"Headline: {field(item, 'headline')}\n"
                f"Source: {field(item, 'source')}\n"
                f"Content: {content}\n"
                f"Timestamp: {field(item, 'timestamp')}"
            )
            docs.append(full_text)
    return docs

# Reuse the path from the configuration cell instead of repeating the literal.
docs = load_news(JSON_PATH)
print(f"‚úÖ Loaded {len(docs)} news items")
print("\n‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á‡∏Ç‡πà‡∏≤‡∏ß 1:")
print(docs[0] if docs else "No news found")

‚úÖ Loaded 30 news items

‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á‡∏Ç‡πà‡∏≤‡∏ß 1:
Category: Nvidia
Headline: If you're looking for a gaming laptop on a budget, this is it
Source: xda_developers
Content: Save $290 off retail for a limited time
Timestamp: 2025-09-21


In [33]:
def build_vector_db(docs: list[str], vector_path=None):
    """Chunk ``docs``, embed the chunks and save a FAISS index to disk.

    Args:
        docs: Text documents to index.
        vector_path: Folder to save the index into. Defaults to the
            module-level ``VECTOR_PATH``.

    Raises:
        ValueError: If ``docs`` is empty or produces no text chunks.
    """
    # Fail early with a clear message; an index cannot be built from nothing.
    if not docs:
        raise ValueError("build_vector_db() needs at least one document")
    if vector_path is None:
        vector_path = VECTOR_PATH

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=500, chunk_overlap=50, separators=["\n\n", "\n", ".", " "]
    )
    chunks = []
    for d in docs:
        chunks.extend(splitter.split_text(d))
    if not chunks:
        raise ValueError("no text chunks were produced from the given documents")

    print(f"‚úÖ Total chunks created: {len(chunks)}")

    # Use a free model from Hugging Face (no API key required).
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectordb = FAISS.from_texts(chunks, embeddings)
    vectordb.save_local(str(vector_path))
    print(f"‚úÖ Vector DB saved to: {vector_path}")

# Smoke-test the pipeline on the first three documents only.
# NOTE(review): this saves into VECTOR_PATH, the same folder a full build writes to.
build_vector_db(docs[:3])

‚úÖ Total chunks created: 9


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


‚úÖ Vector DB saved to: 22092025_vector_db


In [34]:
# Reload every news item from JSON_PATH and build the vector DB from the full set.
docs = load_news(JSON_PATH)
build_vector_db(docs)

‚úÖ Total chunks created: 379
‚úÖ Vector DB saved to: 22092025_vector_db
‚úÖ Vector DB saved to: 22092025_vector_db


In [35]:
# Reload the saved index with the same embedding model name used to build it.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# SECURITY: allow_dangerous_deserialization=True permits pickle-based loading,
# which can execute arbitrary code. Only load index folders that this notebook
# produced itself; never point VECTOR_PATH at an untrusted download.
db = FAISS.load_local(VECTOR_PATH, embeddings, allow_dangerous_deserialization=True)
res = db.similarity_search("Nvidia acquisition", k=1)
print("\nüîç ‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏Ñ‡πâ‡∏ô‡∏´‡∏≤:")
# Guard against an empty result list instead of indexing res[0] blindly.
print(res[0].page_content if res else "No result found")


üîç ‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á‡∏ú‡∏•‡∏Å‡∏≤‡∏£‡∏Ñ‡πâ‡∏ô‡∏´‡∏≤:
Category: Nvidia
Headline: If you're looking for a gaming laptop on a budget, this is it
Source: xda_developers
Content: Save $290 off retail for a limited time
Timestamp: 2025-09-21


In [36]:
# Run a single nearest-neighbour lookup against the loaded index.
query = "Nvidia acquisition"
res = db.similarity_search(query, k=1)

print("\nüîç Query:", query)
print("üîπ Result snippet:")
# Show the top hit, or a placeholder when the search returned nothing.
if res:
    print(res[0].page_content)
else:
    print("No result found")


üîç Query: Nvidia acquisition
üîπ Result snippet:
Category: Nvidia
Headline: If you're looking for a gaming laptop on a budget, this is it
Source: xda_developers
Content: Save $290 off retail for a limited time
Timestamp: 2025-09-21
