In [1]:
import ollama

# Load multiple datasets
datasets = []

# Load the first dataset
with open('cn.txt', 'r') as file:
    dataset = file.readlines()
    datasets.append(dataset)
    print(f'Loaded {len(dataset)} entries from cn.txt')

# Load the second dataset
with open('AI.txt', 'r') as file:
    dataset = file.readlines()
    datasets.append(dataset)
    print(f'Loaded {len(dataset)} entries from AI.txt')

# Load the third dataset (if any)
with open('cd.txt', 'r') as file:
    dataset = file.readlines()
    datasets.append(dataset)
    print(f'Loaded {len(dataset)} entries from cd.txt')

# Combine all datasets into one
combined_dataset = [item for sublist in datasets for item in sublist]
print(f'Total entries after combining datasets: {len(combined_dataset)}')

# Implement the retrieval system

EMBEDDING_MODEL = 'hf.co/CompendiumLabs/bge-base-en-v1.5-gguf'
LANGUAGE_MODEL = 'hf.co/bartowski/Llama-3.2-1B-Instruct-GGUF'

# Each element in the VECTOR_DB will be a tuple (chunk, embedding)
# The embedding is a list of floats, for example: [0.1, 0.04, -0.34, 0.21, ...]
VECTOR_DB = []

def add_chunk_to_database(chunk):
    embedding = ollama.embed(model=EMBEDDING_MODEL, input=chunk)['embeddings'][0]
    VECTOR_DB.append((chunk, embedding))

# Add chunks from the combined dataset to the vector database
for i, chunk in enumerate(combined_dataset):
    add_chunk_to_database(chunk)
    print(f'Added chunk {i + 1}/{len(combined_dataset)} to the database')

def cosine_similarity(a, b):
    dot_product = sum([x * y for x, y in zip(a, b)])
    norm_a = sum([x ** 2 for x in a]) ** 0.5
    norm_b = sum([x ** 2 for x in b]) ** 0.5
    return dot_product / (norm_a * norm_b)

def retrieve(query, top_n=3):
    query_embedding = ollama.embed(model=EMBEDDING_MODEL, input=query)['embeddings'][0]
    # Temporary list to store (chunk, similarity) pairs
    similarities = []
    for chunk, embedding in VECTOR_DB:
        similarity = cosine_similarity(query_embedding, embedding)
        similarities.append((chunk, similarity))
    # Sort by similarity in descending order, because higher similarity means more relevant chunks
    similarities.sort(key=lambda x: x[1], reverse=True)
    # Finally, return the top N most relevant chunks
    return similarities[:top_n]

# Chatbot
input_query = input('Ask me a question: ')
retrieved_knowledge = retrieve(input_query)

print('Retrieved knowledge:')
for chunk, similarity in retrieved_knowledge:
    print(f' - (similarity: {similarity:.2f}) {chunk}')

instruction_prompt = f'''You are a helpful chatbot.
Use only the following pieces of context to answer the question. Don't make up any new information:
{'\n'.join([f' - {chunk}' for chunk, similarity in retrieved_knowledge])}
'''

stream = ollama.chat(
    model=LANGUAGE_MODEL,
    messages=[
        {'role': 'system', 'content': instruction_prompt},
        {'role': 'user', 'content': input_query},
    ],
    stream=True,
)

# Print the response from the chatbot in real-time
print('Chatbot response:')
for chunk in stream:
    print(chunk['message']['content'], end='', flush=True)


Loaded 4295 entries from cn.txt
Loaded 89 entries from AI.txt
Loaded 4689 entries from cd.txt
Total entries after combining datasets: 9073
Added chunk 1/9073 to the database
Added chunk 2/9073 to the database
Added chunk 3/9073 to the database
Added chunk 4/9073 to the database
Added chunk 5/9073 to the database
Added chunk 6/9073 to the database
Added chunk 7/9073 to the database
Added chunk 8/9073 to the database
Added chunk 9/9073 to the database
Added chunk 10/9073 to the database
Added chunk 11/9073 to the database
Added chunk 12/9073 to the database
Added chunk 13/9073 to the database
Added chunk 14/9073 to the database
Added chunk 15/9073 to the database
Added chunk 16/9073 to the database
Added chunk 17/9073 to the database
Added chunk 18/9073 to the database
Added chunk 19/9073 to the database
Added chunk 20/9073 to the database
Added chunk 21/9073 to the database
Added chunk 22/9073 to the database
Added chunk 23/9073 to the database
Added chunk 24/9073 to the database
Added 

Ask me a question:  what is computer network 


Retrieved knowledge:
 - (similarity: 0.83) COMPUTER NETWORKS

 - (similarity: 0.77) USES OF COMPUTER NETWORKS

 - (similarity: 0.75) network have connections to multiple other computers in that network. It is an

Chatbot response:
A computer network is a system of interconnected computers, devices, and other electronic devices that communicate with each other and share resources. This can include hardware such as routers, switches, and servers, and software applications like operating systems and network protocols.

In simple terms, a computer network is a collection of devices connected together to enable communication, sharing, and collaboration between them.