In [1]:
!python --version

Python 3.13.9


In [82]:
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import imageio
from dotenv import load_dotenv
load_dotenv()

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

In [7]:
VECTOR_DB_FOLDER = r"chunk_news\23092025_vector_db"
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

In [10]:
def load_vector(path: Path):
    """
    Loads an existing FAISS vector database from a local path.

    Args:
        path: The directory path where the vector database is stored.

    Returns:
        The loaded FAISS database object.
    """
    print(f"üîÑ Loading vector database from: {path}")
    embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
    db = FAISS.load_local(str(path), embeddings, allow_dangerous_deserialization=True)
    print("‚úÖ Vector DB loaded successfully.")
    return db


def get_retriever(db_path):
    """
    Constructs the path to the vector DB, loads it, and returns a retriever.
    This is the primary function to be imported by other scripts like worker.py.
    
    Returns:
        A LangChain retriever object.
    """
    db = load_vector(db_path)
    
    retriever = db.as_retriever(search_kwargs={"k": 3})
    return retriever

In [11]:
retriever = get_retriever(db_path=VECTOR_DB_FOLDER)

üîÑ Loading vector database from: chunk_news\23092025_vector_db
‚úÖ Vector DB loaded successfully.


In [25]:
query = "Intel news 07/10/2025"
results = retriever.invoke(query)

for result in results:
    print("--- Document ---")
    print(result.page_content[:200])
    print()

--- Document ---
. This announcement affects both desktops and laptop chips. The 11th to 14th-generation Intel CPUs are still relatively new ‚Äî the 14th-generation Raptor Lake Refresh CPUs first arrived in 2023, with n

--- Document ---
. This announcement affects both desktops and laptop chips. The 11th to 14th-generation Intel CPUs are still relatively new ‚Äî the 14th-generation Raptor Lake Refresh CPUs first arrived in 2023, with n

--- Document ---
Category: Intel
Headline: Arm is the future of desktop computing, and the writing is on the wall for x86
Source: XDA Developers
Content: Arm has been slowly picking up pace in the last two decades, bu



In [46]:
def cosine_similarity(a, b):
    a = a / np.linalg.norm(a)
    b = b / np.linalg.norm(b, axis=-1, keepdims=True)
    return np.dot(b, a)

def get_topk_embeddings(db, query_emb, k=4):
    q = np.array(query_emb, dtype=np.float32).reshape(1, -1)
    distances, indices = db.index.search(q, k)
    doc_embs = [db.index.reconstruct(int(i)) for i in indices[0]]
    docs = [db.docstore.search(db.index_to_docstore_id[int(i)]) for i in indices[0]]
    return docs, np.array(doc_embs), distances[0]

def fitness(query_embedding, db, k=4):
    docs, doc_embs, _ = get_topk_embeddings(db, query_embedding, k)
    if len(doc_embs) == 0:
        return -9999
    sims = cosine_similarity(query_embedding, doc_embs)
    sim_score = np.mean(sims)
    dd_sims = [cosine_similarity(doc_embs[i], doc_embs[j])
               for i in range(len(doc_embs))
               for j in range(i+1, len(doc_embs))]
    coherence = np.mean(dd_sims) if dd_sims else 0
    return 0.7 * sim_score + 0.3 * coherence

In [30]:
db = load_vector(VECTOR_DB_FOLDER)
embedding_fn = HuggingFaceEmbeddings(model_name=EMBED_MODEL)

üîÑ Loading vector database from: chunk_news\23092025_vector_db
‚úÖ Vector DB loaded successfully.


In [93]:
def pso_optimize(init_embedding, db, w, c1, c2, num_particles=20, iters=40):
    DIM = len(init_embedding)

    particles = np.random.randn(num_particles, DIM) * 0.05 + init_embedding
    velocities = np.zeros((num_particles, DIM))

    personal_best = particles.copy()
    personal_scores = np.array([fitness(p, db) for p in particles])

    global_best = personal_best[np.argmax(personal_scores)]
    global_best_score = np.max(personal_scores)

    history = []

    for t in range(iters):
        history.append(particles.copy())
        for i in range(num_particles):
            r1, r2 = np.random.rand(), np.random.rand()
            velocities[i] = (
                w * velocities[i] +
                c1 * r1 * (personal_best[i] - particles[i]) +
                c2 * r2 * (global_best - particles[i])
            )
            particles[i] += velocities[i]
            score = fitness(particles[i], db)
            if score > personal_scores[i]:
                personal_best[i] = particles[i]
                personal_scores[i] = score
                if score > global_best_score:
                    global_best = particles[i]
                    global_best_score = score
        print(f"Iter {t+1}/{iters}: score = {global_best_score:.4f}")

    history.append(particles.copy())
    return global_best, global_best_score, np.array(history)

def retrieve(query_emb, db, k=4):
    q = np.array(query_emb, dtype=np.float32).reshape(1, -1)
    dists, idxs = db.index.search(q, k)
    docs = [db.docstore.search(db.index_to_docstore_id[int(i)]) for i in idxs[0]]
    return docs

def make_gif_3d(history, filename="pso3d.gif"):
    T, N, D = history.shape
    flat = history.reshape(T*N, D)
    proj = PCA(n_components=3).fit_transform(flat)
    proj = proj.reshape(T, N, 3)

    images = []
    for t in range(T):
        fig = plt.figure(figsize=(6,6))
        ax = fig.add_subplot(111, projection="3d")
        ax.scatter(proj[t,:,0], proj[t,:,1], proj[t,:,2], s=40)
        ax.set_title(f"Iteration {t}")
        mn = proj.min(axis=(0,1))
        mx = proj.max(axis=(0,1))
        ax.set_xlim(mn[0], mx[0])
        ax.set_ylim(mn[1], mx[1])
        ax.set_zlim(mn[2], mx[2])
        plt.tight_layout()
        plt.savefig("frame3d.png")
        plt.close()
        images.append(imageio.imread("frame3d.png"))

    imageio.mimsave(filename, images, fps=4)

In [94]:
query = "Nvidia news 07/10/2025"
init_emb = embedding_fn.embed_query(query)
score = fitness(init_emb, db, k=4)
print(f"score: {score}")

docs = retrieve(init_emb, db, k=4)
for i, d in enumerate(docs):
    print(f"\n---- RESULT {i+1} ----")
    print(d.page_content)

score: 0.6414233449287092

---- RESULT 1 ----
Category: Nvidia
Headline: More questions than answers in Nvidia‚Äôs $100 billion OpenAI deal
Source: The Indian Express
Content: Nvidia‚Äôs move to invest up to $100 billion into OpenAI at the same time it plans to supply millions of its market-leading artificial intelligence chips to the ChatGPT creator has little precedent in the tech industry. Under the deal, Nvidia will be taking a f‚Ä¶
Timestamp: 2025-09-23

---- RESULT 2 ----
Category: Nvidia
Headline: MRVL Stock vs. NVIDIA
Source: Forbes
Content: NVIDIA presents superior revenue growth in key periods, enhanced profitability, and a comparatively lower valuation...
Timestamp: 2025-09-23

---- RESULT 3 ----
Category: Nvidia
Headline: "The next leap forward" - Nvidia is investing $100bn in OpenAI, and will start by deploying as much power for 10 nuclear reactors
Source: TechRadar

---- RESULT 4 ----
. ‚ÄúWe‚Äôre excited to deploy 10 gigawatts of compute with NVIDIA to push back the fron

In [95]:
query = "Nvidia news 07/10/2025"
init_emb = embedding_fn.embed_query(query)
best_emb, best_score, hist = pso_optimize(
    init_emb, db,
    w=0.8,
    c1=1.2,
    c2=1.0, 
    num_particles=384, 
    iters=50
)

make_gif_3d(hist, "pso.gif")
print("Optimized embedding fitness:", best_score)

Iter 1/50: score = 0.6546
Iter 2/50: score = 0.6599
Iter 3/50: score = 0.6625
Iter 4/50: score = 0.6659
Iter 5/50: score = 0.6722
Iter 6/50: score = 0.6738
Iter 7/50: score = 0.6768
Iter 8/50: score = 0.6797
Iter 9/50: score = 0.6838
Iter 10/50: score = 0.6866
Iter 11/50: score = 0.6872
Iter 12/50: score = 0.6892
Iter 13/50: score = 0.6922
Iter 14/50: score = 0.6944
Iter 15/50: score = 0.6963
Iter 16/50: score = 0.6977
Iter 17/50: score = 0.7006
Iter 18/50: score = 0.7191
Iter 19/50: score = 0.7207
Iter 20/50: score = 0.7256
Iter 21/50: score = 0.7267
Iter 22/50: score = 0.7277
Iter 23/50: score = 0.7285
Iter 24/50: score = 0.7298
Iter 25/50: score = 0.7303
Iter 26/50: score = 0.7315
Iter 27/50: score = 0.7325
Iter 28/50: score = 0.7335
Iter 29/50: score = 0.7340
Iter 30/50: score = 0.7357
Iter 31/50: score = 0.7370
Iter 32/50: score = 0.7380
Iter 33/50: score = 0.7387
Iter 34/50: score = 0.7402
Iter 35/50: score = 0.7413
Iter 36/50: score = 0.7418
Iter 37/50: score = 0.7427
Iter 38/50

  images.append(imageio.imread("frame3d.png"))


Optimized embedding fitness: 0.749609676145354


In [96]:
docs = retrieve(best_emb, db, k=4)
for i, d in enumerate(docs):
    print(f"\n---- RESULT {i+1} ----")
    print(d.page_content)


---- RESULT 1 ----
Category: Nvidia
Headline: More questions than answers in Nvidia‚Äôs $100 billion OpenAI deal
Source: The Indian Express
Content: Nvidia‚Äôs move to invest up to $100 billion into OpenAI at the same time it plans to supply millions of its market-leading artificial intelligence chips to the ChatGPT creator has little precedent in the tech industry. Under the deal, Nvidia will be taking a f‚Ä¶
Timestamp: 2025-09-23

---- RESULT 2 ----
Category: AMD
Headline: Nvidia promises its $100 billion OpenAI deal won't impact GPU supply ‚Äî 'we will continue to make every customer a top priority'
Source: Tom's Hardware UK
Content: Nvidia has released a statement to make it clear that, no matter what deals it does with companies to provide hardware or take an equity stake in their business, it will ensure all companies have equal access to next-generation GPU hardware.
Timestamp: 2025-09-23

---- RESULT 3 ----
Category: Nvidia
Headline: "The next leap forward" - Nvidia is investi

In [99]:
query = "Nvidia news 07/10/2025"
init_emb = embedding_fn.embed_query(query)
best_emb, best_score, hist = pso_optimize(
    init_emb, db,
    w=0.5,
    c1=0.4,
    c2=0.8, 
    num_particles=384, 
    iters=50
)

make_gif_3d(hist, "pso2.gif")
print("Optimized embedding fitness:", best_score)

Iter 1/50: score = 0.6490
Iter 2/50: score = 0.6614
Iter 3/50: score = 0.6636
Iter 4/50: score = 0.6692
Iter 5/50: score = 0.6728
Iter 6/50: score = 0.6781
Iter 7/50: score = 0.6815
Iter 8/50: score = 0.6849
Iter 9/50: score = 0.6871
Iter 10/50: score = 0.6881
Iter 11/50: score = 0.6895
Iter 12/50: score = 0.6912
Iter 13/50: score = 0.6933
Iter 14/50: score = 0.6947
Iter 15/50: score = 0.6960
Iter 16/50: score = 0.6966
Iter 17/50: score = 0.6972
Iter 18/50: score = 0.6976
Iter 19/50: score = 0.6977
Iter 20/50: score = 0.6980
Iter 21/50: score = 0.6982
Iter 22/50: score = 0.6986
Iter 23/50: score = 0.6989
Iter 24/50: score = 0.6989
Iter 25/50: score = 0.6990
Iter 26/50: score = 0.6990
Iter 27/50: score = 0.6991
Iter 28/50: score = 0.6991
Iter 29/50: score = 0.6992
Iter 30/50: score = 0.6992
Iter 31/50: score = 0.6992
Iter 32/50: score = 0.6992
Iter 33/50: score = 0.6993
Iter 34/50: score = 0.6993
Iter 35/50: score = 0.6993
Iter 36/50: score = 0.6994
Iter 37/50: score = 0.6994
Iter 38/50

  images.append(imageio.imread("frame3d.png"))


Optimized embedding fitness: 0.700344254755069


In [100]:
docs = retrieve(best_emb, db, k=4)
for i, d in enumerate(docs):
    print(f"\n---- RESULT {i+1} ----")
    print(d.page_content)


---- RESULT 1 ----
Category: Nvidia
Headline: More questions than answers in Nvidia‚Äôs $100 billion OpenAI deal
Source: The Indian Express
Content: Nvidia‚Äôs move to invest up to $100 billion into OpenAI at the same time it plans to supply millions of its market-leading artificial intelligence chips to the ChatGPT creator has little precedent in the tech industry. Under the deal, Nvidia will be taking a f‚Ä¶
Timestamp: 2025-09-23

---- RESULT 2 ----
Category: Nvidia
Headline: "The next leap forward" - Nvidia is investing $100bn in OpenAI, and will start by deploying as much power for 10 nuclear reactors
Source: TechRadar

---- RESULT 3 ----
Category: Nvidia
Headline: MRVL Stock vs. NVIDIA
Source: Forbes
Content: NVIDIA presents superior revenue growth in key periods, enhanced profitability, and a comparatively lower valuation...
Timestamp: 2025-09-23

---- RESULT 4 ----
Category: Nvidia
Headline: NVIDIA to invest $100 billion in OpenAI ‚Äî after Microsoft backed out of two data cent

In [103]:
query = "Nvidia news 07/10/2025"
init_emb = embedding_fn.embed_query(query)
best_emb, best_score, hist = pso_optimize(
    init_emb, db,
    w=0.8,
    c1=0.8,
    c2=0.8, 
    num_particles=500, 
    iters=50
)

make_gif_3d(hist, "pso3.gif")
print("Optimized embedding fitness:", best_score)

Iter 1/50: score = 0.6480
Iter 2/50: score = 0.6566
Iter 3/50: score = 0.6587
Iter 4/50: score = 0.6620
Iter 5/50: score = 0.6665
Iter 6/50: score = 0.6703
Iter 7/50: score = 0.6767
Iter 8/50: score = 0.6955
Iter 9/50: score = 0.7103
Iter 10/50: score = 0.7180
Iter 11/50: score = 0.7203
Iter 12/50: score = 0.7225
Iter 13/50: score = 0.7256
Iter 14/50: score = 0.7269
Iter 15/50: score = 0.7316
Iter 16/50: score = 0.7355
Iter 17/50: score = 0.7394
Iter 18/50: score = 0.7411
Iter 19/50: score = 0.7433
Iter 20/50: score = 0.7448
Iter 21/50: score = 0.7474
Iter 22/50: score = 0.7494
Iter 23/50: score = 0.7517
Iter 24/50: score = 0.7536
Iter 25/50: score = 0.7551
Iter 26/50: score = 0.7564
Iter 27/50: score = 0.7570
Iter 28/50: score = 0.7582
Iter 29/50: score = 0.7593
Iter 30/50: score = 0.7601
Iter 31/50: score = 0.7606
Iter 32/50: score = 0.7613
Iter 33/50: score = 0.7616
Iter 34/50: score = 0.7622
Iter 35/50: score = 0.7629
Iter 36/50: score = 0.7634
Iter 37/50: score = 0.7641
Iter 38/50

  images.append(imageio.imread("frame3d.png"))


Optimized embedding fitness: 0.7691645658636433


In [104]:
docs = retrieve(best_emb, db, k=4)
for i, d in enumerate(docs):
    print(f"\n---- RESULT {i+1} ----")
    print(d.page_content)


---- RESULT 1 ----
Category: Nvidia
Headline: More questions than answers in Nvidia‚Äôs $100 billion OpenAI deal
Source: The Indian Express
Content: Nvidia‚Äôs move to invest up to $100 billion into OpenAI at the same time it plans to supply millions of its market-leading artificial intelligence chips to the ChatGPT creator has little precedent in the tech industry. Under the deal, Nvidia will be taking a f‚Ä¶
Timestamp: 2025-09-23

---- RESULT 2 ----
Category: Nvidia
Headline: NVIDIA to invest $100 billion in OpenAI ‚Äî after Microsoft backed out of two data center deals to escape additional ChatGPT training support
Source: Windows Central

---- RESULT 3 ----
Category: Nvidia
Headline: "The next leap forward" - Nvidia is investing $100bn in OpenAI, and will start by deploying as much power for 10 nuclear reactors
Source: TechRadar

---- RESULT 4 ----
Category: Nvidia
Headline: Nvidia plans to splash OpenAI with cash, pouring out $100 billion for ChatGPT's creator and making last week