In [2]:
import json, re
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

In [3]:
# Notebook-wide locations: the JSON file holding the raw news items, and the
# folder the FAISS index is saved to and later loaded from.
JSON_PATH = Path("mock_news.json")
VECTOR_PATH = "mock_news_vector_db"
print(f"JSON path: {JSON_PATH}")
print(f"Vector DB path: {VECTOR_PATH}")

JSON path: mock_news.json
Vector DB path: mock_news_vector_db


In [4]:
def clean_text(text: str) -> str:
    """Strip URLs from ``text`` and normalise its whitespace.

    Args:
        text: Raw text, e.g. the body of a news item.

    Returns:
        ``text`` with every ``http://`` / ``https://`` URL removed, every run
        of whitespace collapsed to a single space, and leading/trailing
        whitespace stripped.
    """
    # Require the "://" scheme separator so that ordinary words which merely
    # start with "http" (e.g. "httpd") are not deleted along with real URLs.
    text = re.sub(r"https?://\S+", "", text)
    # Collapse whitespace runs, including the gap left behind by a removed URL.
    return re.sub(r"\s+", " ", text).strip()

# Demonstrate clean_text on a string that contains extra spaces and a URL.
sample_text = "This is a test   text with a link http://example.com"
cleaned = clean_text(sample_text)
print(f"Before: {sample_text}")
print(f"After:  {cleaned}")

Before: This is a test   text with a link http://example.com
After:  This is a test text with a link


In [5]:
def _text_field(item: dict, key: str) -> str:
    """Return ``item[key]`` as a string, or "" when the key is missing or null."""
    value = item.get(key)
    # A key that is present but null in the JSON decodes to None, which would
    # bypass a plain ``.get(key, "")`` default.
    return "" if value is None else str(value)


def load_news(path: Path) -> list[str]:
    """Load news items from a JSON file and render each one as a text document.

    The decoded JSON is iterated as a mapping of category name to a list of
    item objects; each item may carry ``headline``, ``source``, ``content``
    and ``timestamp`` keys. Missing or null fields are rendered as empty text.

    Args:
        path: Location of the UTF-8 encoded JSON file.

    Returns:
        One multi-line string per news item, in file order, containing the
        category, headline, source, cleaned content and timestamp.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    docs = []
    for category, items in data.items():
        for item in items:
            # Only the free-text content is cleaned (URLs and extra
            # whitespace); the other fields are copied as-is.
            content = clean_text(_text_field(item, "content"))
            full_text = (
                f"Category: {category}\n"
                f"Headline: {_text_field(item, 'headline')}\n"
                f"Source: {_text_field(item, 'source')}\n"
                f"Content: {content}\n"
                f"Timestamp: {_text_field(item, 'timestamp')}"
            )
            docs.append(full_text)
    return docs

# Load every news item and preview the first rendered document, if any.
docs = load_news(JSON_PATH)
print(f"✅ Loaded {len(docs)} news items")
print("\nตัวอย่างข่าว 1:")
if docs:
    print(docs[0])
else:
    print("No news found")

✅ Loaded 3 news items

ตัวอย่างข่าว 1:
Category: Nvidia
Headline: Nvidia acquires AI startup
Source: biztoc
Content: Nvidia just spent over $900 million to hire Enfabrica CEO and license the AI startup's technology. This move strengthens Nvidia's data-center and AI infrastructure strategy.
Timestamp: 2025-09-18


In [6]:
def build_vector_db(
    docs: list[str],
    chunk_size: int = 500,
    chunk_overlap: int = 50,
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
    vector_path=None,
) -> None:
    """Chunk ``docs``, embed the chunks and save them as a FAISS index.

    Args:
        docs: Text documents to index.
        chunk_size: ``chunk_size`` passed to the text splitter.
        chunk_overlap: ``chunk_overlap`` passed to the text splitter.
        model_name: Hugging Face embedding model used to embed the chunks.
        vector_path: Folder the index is saved to; ``None`` (the default)
            uses the notebook-level ``VECTOR_PATH``.

    Raises:
        ValueError: If ``docs`` contains no non-blank text.
    """
    # Fail early with a clear message: without any text there is nothing to
    # embed, and the failure would otherwise surface deep inside the libraries.
    if not any(doc and doc.strip() for doc in docs):
        raise ValueError("build_vector_db needs at least one non-empty document")

    # Resolved at call time so the default follows the notebook configuration.
    if vector_path is None:
        vector_path = VECTOR_PATH

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " "],
    )
    chunks = [chunk for doc in docs for chunk in splitter.split_text(doc)]

    print(f"✅ Total chunks created: {len(chunks)}")

    # Use a free model from Hugging Face (no API key required).
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    vectordb = FAISS.from_texts(chunks, embeddings)
    vectordb.save_local(vector_path)
    print(f"✅ Vector DB saved to: {vector_path}")

# Trial run: build and save the index from at most the first three documents.
build_vector_db(docs=docs[:3])

✅ Total chunks created: 3


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


✅ Vector DB saved to: mock_news_vector_db


In [7]:
# Reload every news item and build the index from the complete set; it is
# saved to the same VECTOR_PATH as the earlier trial run.
docs = load_news(path=JSON_PATH)
build_vector_db(docs=docs)

✅ Total chunks created: 3
✅ Vector DB saved to: mock_news_vector_db


In [9]:
# Reload the saved index using the same embedding model name that was used to
# build it, then run a sample similarity search.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# NOTE(security): allow_dangerous_deserialization=True permits loading
# pickle-based data from disk. That is acceptable here only because the index
# at VECTOR_PATH was written by this notebook; never enable it for an index
# obtained from an untrusted source.
db = FAISS.load_local(VECTOR_PATH, embeddings, allow_dangerous_deserialization=True)
res = db.similarity_search("Nvidia acquisition", k=1)
print("\n🔍 ตัวอย่างผลการค้นหา:")
# Guard against an empty result list instead of raising IndexError, matching
# how the later query cell prints its result.
print(res[0].page_content if res else "No result found")


🔍 ตัวอย่างผลการค้นหา:
Category: Nvidia
Headline: Nvidia acquires AI startup
Source: biztoc
Content: Nvidia just spent over $900 million to hire Enfabrica CEO and license the AI startup's technology. This move strengthens Nvidia's data-center and AI infrastructure strategy.
Timestamp: 2025-09-18


In [10]:
# Run a sample query against the loaded index and show the best match.
query = "Nvidia acquisition"
res = db.similarity_search(query, k=1)

print("\n🔍 Query:", query)
print("🔹 Result snippet:")
if res:
    print(res[0].page_content)
else:
    print("No result found")


🔍 Query: Nvidia acquisition
🔹 Result snippet:
Category: Nvidia
Headline: Nvidia acquires AI startup
Source: biztoc
Content: Nvidia just spent over $900 million to hire Enfabrica CEO and license the AI startup's technology. This move strengthens Nvidia's data-center and AI infrastructure strategy.
Timestamp: 2025-09-18
