In [None]:

import faiss
import numpy as np
import ollama

# Model identifiers. EMBEDDING_MODEL is the one handed to ollama.embed below;
# LANGUAGE_MODEL is presumably for the generation step (not used in the cells
# visible here -- TODO confirm).
EMBEDDING_MODEL = 'hf.co/CompendiumLabs/bge-base-en-v1.5-gguf'
LANGUAGE_MODEL = 'hf.co/bartowski/Llama-3.2-1B-Instruct-GGUF'

# In-memory vector store: `chunks` holds the raw text and `index` holds the
# matching vectors, so row k of the index corresponds to chunks[k].
# NOTE(review): 768 is assumed to match the embedding model's output size -- confirm.
embedding_dim = 768
chunks = []
index = faiss.IndexFlatIP(embedding_dim)  # exact inner-product search


In [None]:
def normalize(vec):
    """Scale ``vec`` so that its Euclidean (Frobenius) norm is 1.

    The norm is taken over the whole array, so for a single row vector of
    shape ``(1, d)`` this is ordinary L2 normalisation of that row.

    Args:
        vec: Array-like of numbers.

    Returns:
        A new array with the same shape as ``vec``. If ``vec`` has zero norm
        it cannot be scaled to unit length, so an unmodified copy is returned
        instead of an array of NaNs.
    """
    norm = np.linalg.norm(vec)
    if norm == 0:
        # 0 / 0 would produce NaN (plus a RuntimeWarning); a NaN vector stored
        # in the index would yield meaningless similarity scores.
        return np.array(vec)
    return vec / norm

In [None]:
def add_chunk_to_database(chunk):
    """Embed ``chunk`` and store it in the module-level vector store.

    The text is embedded with ``EMBEDDING_MODEL`` through ``ollama.embed``,
    turned into a float32 row vector, passed through ``normalize`` and added
    to ``index``. The text itself is appended to ``chunks`` right after, so
    the vector's row in ``index`` and the text's position in ``chunks`` stay
    aligned.

    Args:
        chunk: Text to embed and store.
    """
    response = ollama.embed(model=EMBEDDING_MODEL, input=chunk)
    raw_vector = response['embeddings'][0]

    # One row of float32 values, shaped (1, d) before being handed to index.add.
    row = np.asarray(raw_vector, dtype='float32').reshape(1, -1)

    index.add(normalize(row))
    chunks.append(chunk)

In [None]:

# Read the corpus: every line of the file becomes one chunk (line endings kept).
with open('cat-facts.txt', 'r', encoding='utf-8') as file:
    dataset = file.readlines()
    print(f'Loaded {len(dataset)} entries')

# Embed and index each line, reporting 1-based progress as we go.
total = len(dataset)
for position, chunk in enumerate(dataset, start=1):
    add_chunk_to_database(chunk)
    print(f'Added chunk {position}/{total} to the database')

Loaded 150 entries
Added chunk 1/150 to the database
Added chunk 2/150 to the database
Added chunk 3/150 to the database
Added chunk 4/150 to the database
Added chunk 5/150 to the database
Added chunk 6/150 to the database
Added chunk 7/150 to the database
Added chunk 8/150 to the database
Added chunk 9/150 to the database
Added chunk 10/150 to the database
Added chunk 11/150 to the database
Added chunk 12/150 to the database
Added chunk 13/150 to the database
Added chunk 14/150 to the database
Added chunk 15/150 to the database
Added chunk 16/150 to the database
Added chunk 17/150 to the database
Added chunk 18/150 to the database
Added chunk 19/150 to the database
Added chunk 20/150 to the database
Added chunk 21/150 to the database
Added chunk 22/150 to the database
Added chunk 23/150 to the database
Added chunk 24/150 to the database
Added chunk 25/150 to the database
Added chunk 26/150 to the database
Added chunk 27/150 to the database
Added chunk 28/150 to the database
Added chun

In [None]:
def retrieve(query, top_n=3):
    """Return the stored chunks most similar to ``query``.

    The query is embedded with ``EMBEDDING_MODEL`` through ``ollama.embed``,
    converted to a float32 row vector, passed through ``normalize`` and
    searched against the module-level ``index``.

    Args:
        query: Text to search for.
        top_n: Maximum number of chunks to return (default 3).

    Returns:
        list: Entries of ``chunks`` in the order ``index.search`` reported
        them. Contains fewer than ``top_n`` items (possibly none) when the
        index holds fewer vectors than requested.
    """
    query_embedding = ollama.embed(model=EMBEDDING_MODEL, input=query)['embeddings'][0]
    query_embedding = np.array(query_embedding).astype('float32').reshape(1, -1)
    query_embedding = normalize(query_embedding)

    # Only the positions are needed; the similarity scores are discarded.
    _, indices = index.search(query_embedding, top_n)

    # FAISS pads missing results with -1. Without the lower bound, -1 would
    # pass an `i < len(chunks)` check and silently return chunks[-1] (the last
    # chunk) as if it were a real hit.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]

In [None]:

# Ask the user for a question and fetch the three closest chunks.
query = input("Enter your query: ")
results = retrieve(query, top_n=3)

# Header followed by one result per line, emitted with a single print call.
print("\nTop retrieved results:", *results, sep="\n")


Top retrieved results:
A form of AIDS exists in cats.

The world’s rarest coffee, Kopi Luwak, comes from Indonesia where a wildcat known as the luwak lives. The cat eats coffee berries and the coffee beans inside pass through the stomach. The beans are harvested from the cat’s dung heaps and then cleaned and roasted. Kopi Luwak sells for about $500 for a 450 g (1 lb) bag.

Perhaps the most famous comic cat is the Cheshire Cat in Lewis Carroll’s Alice in Wonderland. With the ability to disappear, this mysterious character embodies the magic and sorcery historically associated with cats.

