In [1]:
import json, re
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

In [2]:
# Single source of truth for the scrape being indexed, so the input file name
# and the output index directory cannot drift apart.
# NOTE(review): the stamp looks like a DDMMYYYY date -- confirm.
NEWS_DATE = "22092025"
JSON_PATH = Path('../scrape_news') / f"{NEWS_DATE}.json"
VECTOR_PATH = f"{NEWS_DATE}_vector_db"
print("JSON path:", JSON_PATH)
print("Vector DB path:", VECTOR_PATH)

JSON path: ..\scrape_news\22092025.json
Vector DB path: 22092025_vector_db


In [3]:
def clean_text(text: str) -> str:
    """
    Strip URLs from a string and collapse every whitespace run to one space.

    Only tokens carrying an explicit ``http://`` or ``https://`` scheme are
    treated as URLs, so ordinary words that merely start with "http"
    (for example "httpd") are preserved.

    Args:
        text: The input string to clean. ``None`` and ``""`` are accepted.

    Returns:
        The cleaned string without leading or trailing whitespace, or ``""``
        when the input is falsy.
    """
    if not text:
        return ""
    # Remove URLs; the scheme is required so plain words are left alone.
    text = re.sub(r"https?://\S+", "", text)
    # \s+ already matches \n, \r and \t, so no separate newline pass is needed.
    return re.sub(r"\s+", " ", text).strip()

sample_text = "This is a test   text \nwith a link \nhttp://example.com"
cleaned = clean_text(sample_text)
print("Before:", sample_text)
print("After: ", cleaned)
# Pin the expected result so a regression in clean_text fails this cell
# instead of silently printing something different.
assert cleaned == "This is a test text with a link"

# Test with None / empty content
print("Testing with None:", clean_text(None))
print("Testing with empty string:", clean_text(""))
assert clean_text(None) == ""
assert clean_text("") == ""

Before: This is a test   text 
with a link 
http://example.com
After:  This is a test text with a link
Testing with None: 
Testing with empty string: 


In [4]:
def load_news(path) -> list[str]:  # takes a plain path (str or Path), not only a Path object
    """
    Load scraped news from a JSON file and render each item as one text document.

    The JSON is read as an object mapping a category name to a list of item
    objects. The fields read from each item are ``headline``, ``source``,
    ``content`` and ``timestamp``; ``content`` is passed through ``clean_text``.

    Args:
        path: Location of the JSON file (anything ``open`` accepts).

    Returns:
        One formatted multi-line string per news item.
    """
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    def field(item, key):
        # A JSON null arrives as None; render it as empty text rather than
        # letting the f-string emit the literal word "None".
        value = item.get(key)
        return "" if value is None else value

    docs = []
    for category, items in data.items():
        # A category whose value is null (or an empty list) contributes nothing
        # instead of raising TypeError when iterated.
        for item in items or []:
            content = clean_text(item.get("content", ""))
            docs.append(
                f"Category: {category}\n"
                f"Headline: {field(item, 'headline')}\n"
                f"Source: {field(item, 'source')}\n"
                f"Content: {content}\n"
                f"Timestamp: {field(item, 'timestamp')}"
            )
    return docs

# Reuse the configured JSON_PATH rather than repeating the path literal here.
docs = load_news(JSON_PATH)
print(f"✅ Loaded {len(docs)} news items")
print("\nตัวอย่างข่าว 1:")
print(docs[0] if docs else "No news found")

✅ Loaded 30 news items

ตัวอย่างข่าว 1:
Category: Nvidia
Headline: If you're looking for a gaming laptop on a budget, this is it
Source: xda_developers
Content: Save $290 off retail for a limited time
Timestamp: 2025-09-21


In [5]:
def build_vector_db(
    docs: list[str],
    vector_path=None,
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
):
    """
    Split documents into chunks, embed them and save a FAISS index to disk.

    Args:
        docs: Text documents to index.
        vector_path: Directory the index is saved to. When ``None`` (the
            default) the module-level ``VECTOR_PATH`` is used.
        model_name: Hugging Face model name passed to ``HuggingFaceEmbeddings``.

    Raises:
        ValueError: If ``docs`` is empty, or if splitting produces no chunks.
    """
    # Fail fast with a clear message: an empty text list would otherwise
    # surface as an opaque error from inside the FAISS index builder.
    if not docs:
        raise ValueError("build_vector_db() needs at least one document")

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=500, chunk_overlap=50, separators=["\n\n", "\n", ".", " "]
    )
    chunks = [chunk for doc in docs for chunk in splitter.split_text(doc)]
    if not chunks:
        raise ValueError("build_vector_db(): the documents produced no text chunks")

    print(f"✅ Total chunks created: {len(chunks)}")

    # Uses a free Hugging Face model (no API key required).
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    vectordb = FAISS.from_texts(chunks, embeddings)
    # Resolve the default lazily so the current VECTOR_PATH is always honoured.
    target = VECTOR_PATH if vector_path is None else vector_path
    vectordb.save_local(target)
    print(f"✅ Vector DB saved to: {target}")

# Smoke test on the first three documents only. It saves to VECTOR_PATH, the
# same location the full build further down writes to.
build_vector_db(docs[:3])

✅ Total chunks created: 9


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


✅ Vector DB saved to: 22092025_vector_db


In [6]:
# Full build: reload every item from JSON_PATH and index all of them.
docs = load_news(JSON_PATH)
build_vector_db(docs)

✅ Total chunks created: 379
✅ Vector DB saved to: 22092025_vector_db


In [7]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# SECURITY: allow_dangerous_deserialization=True permits unpickling the stored
# index metadata. Only load an index this notebook produced itself; never point
# VECTOR_PATH at a directory from an untrusted source.
db = FAISS.load_local(VECTOR_PATH, embeddings, allow_dangerous_deserialization=True)
res = db.similarity_search("Nvidia acquisition", k=1)
print("\n🔍 ตัวอย่างผลการค้นหา:")
# Guard the lookup so an empty hit list prints a message instead of raising
# IndexError (same handling as the later query cell).
print(res[0].page_content if res else "No result found")


🔍 ตัวอย่างผลการค้นหา:
Category: Nvidia
Headline: If you're looking for a gaming laptop on a budget, this is it
Source: xda_developers
Content: Save $290 off retail for a limited time
Timestamp: 2025-09-21


In [8]:
# Run a single similarity query against the loaded index and show the top hit.
query = "Nvidia acquisition"
res = db.similarity_search(query, k=1)

print("\n🔍 Query:", query)
print("🔹 Result snippet:")
# An empty result list gets a placeholder message instead of an index error.
if res:
    print(res[0].page_content)
else:
    print("No result found")


🔍 Query: Nvidia acquisition
🔹 Result snippet:
Category: Nvidia
Headline: If you're looking for a gaming laptop on a budget, this is it
Source: xda_developers
Content: Save $290 off retail for a limited time
Timestamp: 2025-09-21
