In [1]:
%pip install ollama chromadb --quiet
from tqdm import tqdm

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m599.2/599.2 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m273.8/273.8 kB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import ollama as olm
import chromadb as cdb

In [3]:
# Import the Narnia books as documents

import os

def read_all_lines(dir_path:str, files:list) :
  books = [[] for x in files]
  for f_name, b in zip(tqdm(files), books) :
    full_path = os.path.join(dir_path, f_name)
    with open(full_path, 'r', encoding='unicode_escape') as f :
      i = 0
      while f.readable() and i < 10000:
        i += 1
        read_text = f.readline()
        if len(read_text) == 0 :
          break
        else :
          b.append({"text" : read_text, 'file' : f_name})
  return books

# print(*books[6][0:40])

In [4]:
# Optional addition of chapter and para metadata
import re

def add_chap_para_data(book_lines:list) :
  chapter_start_matcher = re.compile("[^a-zA-Z]*chapter", re.IGNORECASE)

  for b in tqdm(book_lines) :
    chapter_num = 0
    para_num = 0
    last_line_was_empty = True
    for line in b :
      if chapter_start_matcher.match(line['text']) is not None and last_line_was_empty :
        chapter_num += 1
        para_num = -1
      else :
        if last_line_was_empty :
          para_num += 1
      line['chapter'] = chapter_num
      line['paragraph'] = para_num
      last_line_was_empty = ( len(line['text'].lstrip()) == 0 )


In [5]:
def compile_into_snippets(book_lines:list, line_stride:int = 10, line_overlap:int = 2) :
  line_jump = line_stride - line_overlap
  snippets_per_book = [
    [
      {
        'text' : ' '.join( [
                  x['text'] for x in b[i_start:i_end]
                ] ),
        'file' : b[i_start]['file'],
        'chap-para' : [
          (b[i_start]['chapter'], b[i_start]['paragraph']),
          (b[i_end]['chapter'], b[i_end]['paragraph'])
        ],
      }
      for i_start in range(0,len(b),line_jump)
      for i_end in [min(i_start + line_stride, len(b)-1)]
    ]
    for b in book_lines
  ]
  documents = []
  for snips in snippets_per_book :
    documents.extend(snips)
  return documents

In [6]:
def create_embeddings_and_store(documents:list, collection, embed_model:str) :
  olm.pull(model=embed_model)

  print("pulled embedding model")

  olm_embeddings = [None] * len(documents)
  tagged_text = [None] * len(documents)
  for i, doc in enumerate(tqdm(documents)) :
    # Add tags of filename and chapter/para
    prefix = doc['file'] + ' '
    chap_para = doc['chap-para']
    # Same Chapter and Para
    if len(set(chap_para)) == 1 :
      prefix += 'Chapter %d, Para %d' % chap_para[0]
    # Same Chapter, diff para
    elif len({x[0] for x in chap_para}) == 1 :
      chap = chap_para[0][0]
      para1,para2 = chap_para[0][1], chap_para[1][1]
      prefix += 'Chapter %d, Para %d to Para %d' % (chap, para1, para2)
    # Diff Chapter, diff para
    else :
      (chap1, para1), (chap2, para2) = chap_para
      prefix += 'Chapter %d, Para %d to Chapter %d, Para %d' % (chap1, para1, chap2, para2)
    prefix += ' : '
    tagged_text[i] = prefix + doc['text']

    res = olm.embeddings(
      model=embed_model,
      prompt=tagged_text[i],
    )
    olm_embeddings[i] = res['embedding']

  print("created embeddings")

  prev_count = collection.count()

  for i, (emb, doc, text) in enumerate(zip(tqdm(olm_embeddings), documents, tagged_text)) :
    metadata = doc.copy()
    del metadata['text']
    metadata['chap-para'] = "%d,%d to %d,%d" % (metadata['chap-para'][0][0],
                                                metadata['chap-para'][0][1],
                                                metadata['chap-para'][1][0],
                                                metadata['chap-para'][1][1])
    collection.add(
      embeddings=[emb],
      documents=[text],
      metadatas=[metadata],
      ids= [str(prev_count + i)],
    )

  print("added to collection")

In [27]:
def answer_with_context(query:str, collection, answering_model:str, embed_model:str, top_k:int=20) :
  olm.pull(model=answering_model)
  print("pulled answering model")

  q_embed = olm.embeddings(
    model=embed_model,
    prompt=query
  )['embedding']

  print("querying knowledge base... ", end='')

  context = collection.query(
    query_embeddings=[q_embed],
    n_results = top_k
  )
  # return context
  context_ids = context['ids'][0]
  context_str = context['documents'][0]
  context_dis = context['distances'][0]
  # Using cosine sim
  relevant_context = [x for x, dis in zip(context_str,context_dis) if dis < 0.5]
  all_context = '\n\n'.join(relevant_context)

  print("query complete")
  print("Generating response... ", end='')

  answer = olm.generate(
    model = answering_model,
    prompt = f"Using the given context : \n\n {all_context} \n\n Answer the following question. Question {query} Answer :"
  )

  print("Answer generated")

  return query, answer['response'], relevant_context

In [8]:
import os
import urllib.request

gutenberg_links = [
    "https://gutenberg.ca/ebooks/lewiscs-magiciansnephew/lewiscs-magiciansnephew-00-t.txt",
    "https://gutenberg.ca/ebooks/lewiscs-thelionthewitchandthewardrobe/lewiscs-thelionthewitchandthewardrobe-00-t.txt",
    "https://gutenberg.ca/ebooks/lewiscs-thehorseandhisboy/lewiscs-thehorseandhisboy-00-t.txt",
    "https://gutenberg.ca/ebooks/lewiscs-princecaspian/lewiscs-princecaspian-00-t.txt",
    "https://gutenberg.ca/ebooks/lewiscs-voyageofthedawntreader/lewiscs-voyageofthedawntreader-00-t.txt",
    "https://gutenberg.ca/ebooks/lewiscs-silverchair/lewiscs-silverchair-00-t.txt",
    "https://gutenberg.ca/ebooks/lewiscs-lastbattle/lewiscs-lastbattle-00-t.txt",
]

narnia_path = 'narnia_series'
for f in tqdm(gutenberg_links) :
  if not os.path.exists(narnia_path) :
    os.mkdir(narnia_path)
  urllib.request.urlretrieve(f, os.path.join(narnia_path,f.split('/')[-1]))

100%|██████████| 7/7 [00:02<00:00,  2.46it/s]


In [9]:
client = cdb.PersistentClient(path="narnia_db")
narnia_db = {"name" : "narnia_knowledge_base"}
narnia_db["db"] = client.get_or_create_collection(name=narnia_db['name'],metadata={"hnsw:space": "cosine"})

files = os.listdir('narnia_series')

trimmed_files = [x for x in files if x.endswith(".txt")]

book_lines = read_all_lines(narnia_path, trimmed_files)
print(*[x['text'] for x in book_lines[6][0:40]])

100%|██████████| 7/7 [00:00<00:00, 21.89it/s]


 * A Project Gutenberg Canada Ebook *
 
 This ebook is made available at no cost and with very few
 restrictions. These restrictions apply only if (1) you make
 a change in the ebook (other than alteration for different
 display devices), or (2) you are making commercial use of
 the ebook. If either of these conditions applies, please
 check gutenberg.ca/links/licence.html before proceeding.
 
 This work is in the Canadian public domain, but may be
 under copyright in some countries. If you live outside Canada,
 check your country's copyright laws. IF THE BOOK IS UNDER
 COPYRIGHT IN YOUR COUNTRY, DO NOT DOWNLOAD
 OR REDISTRIBUTE THIS FILE.
 
 Title: The Lion, the Witch and the Wardrobe.
   A Story for Children.
 Author: Lewis, C. S. [Clive Staples] (1898-1963)
 Date of first publication: 1950
 Edition used as base for this ebook:
   New York: Macmillan, undated
   [twenty-first printing]
 Date first posted: 26 January 2014
 Date last updated: 26 January 2014
 Project Gutenberg Canada 




In [10]:
%pip install pprintpp --quiet
from pprint import pp

print()

add_chap_para_data(book_lines)
pp(book_lines[6][420])




100%|██████████| 7/7 [00:00<00:00, 371.52it/s]

{'text': '"But what have you done?" asked Lucy.\n',
 'file': 'lewiscs-thelionthewitchandthewardrobe-00-t.txt',
 'chapter': 2,
 'paragraph': 34}





In [11]:
snippets = compile_into_snippets(book_lines)
print(len(snippets))
pp(snippets[420])

4478
{'text': "chosen councillors was quite close.  Digory knew that he couldn't\n"
         ' possibly break in on so solemn a meeting, but there was no need to '
         'do\n'
         ' so.  At a word from Aslan, the He-Elephant, the Ravens, and all '
         'the\n'
         ' rest of them drew aside.  Digory slipped off the horse and found\n'
         ' himself face to face with Aslan.  And Aslan was bigger and more\n'
         ' beautiful and more brightly golden and more terrible than he had\n'
         ' thought.  He dared not look into the great eyes.\n'
         ' \n'
         ' "Please--Mr. Lion--Aslan--Sir?" said Digory, "could you--may '
         'I--please,\n'
         ' will you give me some magic fruit of this country to make Mother '
         'well?"\n',
 'file': 'lewiscs-magiciansnephew-00-t.txt',
 'chap-para': [(11, 31), (11, 32)]}


In [12]:
!curl -fsSL https://ollama.com/install.sh | sh

>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
############################################################################################# 100.0%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


In [13]:
!pip install colab-xterm #https://pypi.org/project/colab-xterm/
%load_ext colabxterm

Collecting colab-xterm
  Downloading colab_xterm-0.2.0-py3-none-any.whl.metadata (1.2 kB)
Downloading colab_xterm-0.2.0-py3-none-any.whl (115 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/115.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.6/115.6 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: colab-xterm
Successfully installed colab-xterm-0.2.0


In [14]:
!nohup ollama serve & disown
# !ollama serve

nohup: appending output to 'nohup.out'


In [15]:
answering_model = "gemma2"
embed_model = "mxbai-embed-large"

# We don't want to add again and again
if narnia_db['db'].count() == 0 :
  create_embeddings_and_store(snippets, narnia_db['db'], embed_model)

pulled embedding model


100%|██████████| 4478/4478 [06:30<00:00, 11.47it/s]


created embeddings


100%|██████████| 4478/4478 [00:51<00:00, 87.64it/s]

added to collection





In [28]:
responses = []

In [29]:
responses.append(answer_with_context("Who is Eustace?", narnia_db['db'], answering_model, embed_model))
print()
print("Question : %s \n Answer : %s" % (responses[-1][0], responses[-1][1]))

pulled answering model
querying knowledge base... query complete
Generating response... Answer generated

Question : Who is Eustace? 
 Answer : Eustace is a character in C.S. Lewis's fantasy series *The Chronicles of Narnia*. He is a  young boy who starts off as grumpy and unpleasant but undergoes a transformation during his adventures in Narnia. 


Let me know if you have any other questions about Eustace or The Chronicles of Narnia!


In [30]:
responses.append(answer_with_context("Who is Jadis?", narnia_db['db'], answering_model, embed_model))
print()
print("Question : %s \n Answer : %s" % (responses[-1][0], responses[-1][1]))

pulled answering model
querying knowledge base... query complete
Generating response... Answer generated

Question : Who is Jadis? 
 Answer : Based on the provided text, Jadis is a powerful and fearsome witch.  

Here's why:

* **She is referred to as "the Queen of Queens and the Terror of Charn."** This suggests she holds a high position and is known for her ruthlessness.
* **Her appearance is described as menacing:** She bares her teeth, her eyes shine like fire, and her hair streams out behind her like a comet's tail.
* **She treats her horse cruelly,** flogging it mercilessly. 
* **She possesses magical abilities** allowing her to jump clear of a crashing hansom cab and seemingly communicate with the horse telepathically.
* **Her voice is powerful enough to make a room quiver.**


All of these details paint a picture of Jadis as a formidable and potentially dangerous figure.  



In [34]:
# Direct questions require fewer RAG results
responses.append(answer_with_context("In what chapter, para and book was Susan called 'not a friend of Narnia'?", narnia_db['db'], answering_model, embed_model,top_k=10))
print()
print("Question : %s \n Answer : %s" % (responses[-1][0], responses[-1][1]))

pulled answering model
querying knowledge base... query complete
Generating response... Answer generated

Question : In what chapter, para and book was Susan called 'not a friend of Narnia'? 
 Answer : Susan was called "not a friend of Narnia" in **Chapter 12, Paragraph 51** of  **The Last Battle**. 


Let me know if you have any other questions about these C.S. Lewis excerpts! 😊 



In [33]:
responses.append(answer_with_context("Who is Ramandu's daughter?", narnia_db['db'], answering_model, embed_model))
print()
print("Question : %s \n Answer : %s" % (responses[-1][0], responses[-1][1]))

pulled answering model
querying knowledge base... query complete
Generating response... Answer generated

Question : Who is Ramandu's daughter? 
 Answer : The provided text states that Caspian married Ramandu's daughter and she became a great queen in Narnia.  


Let me know if you have any other questions from this excerpt! 

