1. Read the external text file and split it into chunks.
2. Initialize an embedding model.
3. Generate embeddings for each chunk.
4. Generate an embedding of the query (QE).
5. Generate a similarity score between QE and each of the chunk embeddings.
6. Extract top K chunks based on similarity score.
7. Frame a prompt with the query and the top-k chunks.
8. Prompt an LLM with the prompt framed in step-7.

In [20]:
from langchain_community.embeddings import OllamaEmbeddings
import math
import numpy as np
from langchain_community.llms import Ollama

## Utility functions

In [2]:
# Shared Ollama embedding client configured for the "llama3" model;
# embedding_model() below hands out this same instance.
model_for_embeding =OllamaEmbeddings(model="llama3")

def read_file(filename, encoding='utf-8'):
    """Return the entire contents of a text file as a single string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8.

    Returns:
        The decoded contents of the file.
    """
    # Forward the encoding explicitly: without it open() decodes with the
    # platform locale and the `encoding` argument is silently ignored.
    with open(filename, 'r', encoding=encoding) as f:
        return f.read()

def make_chunks(text, chunk_size):
    """Split ``text`` into consecutive slices of at most ``chunk_size`` characters.

    The slices do not overlap; the final slice holds whatever characters
    remain and may therefore be shorter than ``chunk_size``.
    """
    pieces = []
    for start in range(0, len(text), chunk_size):
        stop = start + chunk_size
        pieces.append(text[start:stop])
    return pieces

def embedding_model():
    """Return the module-level embedding client (``model_for_embeding``)."""
    return model_for_embeding

def make_embeddings(chunk, embed_model):
    """Embed a single piece of text.

    Delegates to ``embed_model.embed_query`` and returns its result unchanged.
    """
    return embed_model.embed_query(chunk)

def dot_product(vec1,vec2):
    """Return the sum of element-wise products of ``vec1`` and ``vec2``.

    Elements are paired with ``zip``, so any extra trailing elements of the
    longer input are ignored.
    """
    pairwise_products = [x * y for x, y in zip(vec1, vec2)]
    return sum(pairwise_products)


def magnitude(vec):
    """Return the Euclidean length of ``vec``: the square root of the sum of squares."""
    squared_total = sum([component ** 2 for component in vec])
    return math.sqrt(squared_total)

def cosine_similarity(vec1,vec2):
    """Return the cosine of the angle between ``vec1`` and ``vec2``.

    Computed as the dot product divided by the product of the two
    magnitudes. If either vector has zero magnitude the result is 0, which
    avoids dividing by zero.
    """
    numerator = dot_product(vec1, vec2)
    norm_a = magnitude(vec1)
    norm_b = magnitude(vec2)
    if norm_a != 0 and norm_b != 0:
        return numerator / (norm_a * norm_b)
    return 0  # at least one vector has no length
    


In [3]:
# Load the book text and cut it into consecutive 200-character chunks.
text_file = read_file("J. K. Rowling - Harry Potter 1 - Sorcerer's Stone.txt")
chunks =make_chunks(text_file,chunk_size=200)

In [4]:
# Fetch the shared embedding client; it is used for both the chunks and the query.
embed_model =embedding_model()

In [5]:
# Embed only the first 30 chunks (one make_embeddings call per chunk);
# chunks beyond index 29 are never embedded and so cannot be retrieved later.
embeddings =[]
for chunk in chunks[:30]:
    embedding= make_embeddings(chunk,embed_model)
    embeddings.append(embedding)

In [6]:
embeddings[0]

[-1.355804443359375,
 4.870009899139404,
 0.1783161759376526,
 -0.136270672082901,
 -1.0225539207458496,
 4.9153666496276855,
 -2.7575700283050537,
 1.3425228595733643,
 -4.943639278411865,
 1.2691378593444824,
 1.4069074392318726,
 -1.5717366933822632,
 2.2005906105041504,
 -6.1073431968688965,
 -6.39471960067749,
 0.2667825520038605,
 -1.8162380456924438,
 0.49577367305755615,
 -0.3893185257911682,
 3.3540995121002197,
 -3.121548891067505,
 0.46827518939971924,
 3.746399402618408,
 0.13194939494132996,
 -0.242906853556633,
 2.5499351024627686,
 1.5358314514160156,
 1.1397335529327393,
 2.947211503982544,
 1.9377692937850952,
 2.98879075050354,
 4.9104108810424805,
 6.695400714874268,
 -1.6693427562713623,
 -4.097033977508545,
 -2.69535493850708,
 -1.3394010066986084,
 -1.3497462272644043,
 2.826103925704956,
 2.7779598236083984,
 1.974961519241333,
 4.0821428298950195,
 -2.2803993225097656,
 3.0225372314453125,
 1.392267107963562,
 5.566384315490723,
 1.3896011114120483,
 -4.23642969

In [27]:
# Embed the question with the same model that embedded the chunks.
query = "who is Mr. Dursley"
q_embd = make_embeddings(query,embed_model)

In [28]:
q_embd

[-2.4158217906951904,
 0.9602771401405334,
 3.0418860912323,
 2.675150156021118,
 -0.9814565181732178,
 0.8400055170059204,
 -2.3929808139801025,
 -3.1503043174743652,
 -1.5658071041107178,
 0.28127220273017883,
 -0.5349699258804321,
 -0.5257326364517212,
 -0.6289327144622803,
 -2.7228832244873047,
 -2.5435574054718018,
 2.9310381412506104,
 1.4975405931472778,
 -0.9755702018737793,
 -0.1464834362268448,
 1.517569661140442,
 -4.849628925323486,
 -0.6696875095367432,
 -0.5857241153717041,
 -1.6475651264190674,
 -2.033785343170166,
 -2.513700485229492,
 3.9595813751220703,
 4.020542144775391,
 1.848860502243042,
 0.7849398255348206,
 -0.9370803833007812,
 4.77432918548584,
 3.6450161933898926,
 1.2521287202835083,
 0.3439480662345886,
 -1.9455868005752563,
 -3.595900297164917,
 -0.5162835717201233,
 3.005463123321533,
 4.266862869262695,
 -0.3976494073867798,
 0.8858478665351868,
 0.7962168455123901,
 -1.739383578300476,
 1.1733038425445557,
 1.185168743133545,
 -0.12197083979845047,
 2.

In [29]:
# Cosine similarity of each chunk embedding against the query embedding,
# in the same order as `embeddings` (so ratings[i] belongs to chunks[i]).
ratings =[cosine_similarity(i,q_embd)for i in embeddings]

In [30]:
ratings

[0.36701623759081736,
 0.26293073707720704,
 0.34596918497330953,
 0.31213526074405823,
 0.477563839645835,
 0.15891999984596175,
 0.3954459034627253,
 0.426708450244005,
 0.17013487541222994,
 0.46531791221663343,
 0.4843876103551897,
 0.19587356065516268,
 0.15693685032332375,
 0.3719775321268353,
 0.30274914227479,
 0.2450541461559779,
 0.18865597032347625,
 0.10482891131946324,
 0.4737053461771573,
 0.3180061216013255,
 0.30327144223296937,
 0.37281743939265555,
 0.2598576950757504,
 0.2350245856824475,
 0.3511558244458132,
 0.20395055967149356,
 0.29320931474110995,
 0.24611957867317588,
 0.15690182436283825,
 0.33331412035089775]

In [31]:
# Select the indices of the k highest-scoring chunks. argpartition only
# guarantees that the last k positions hold the k largest ratings; those k
# indices are not ordered by score.
k = 5
idx = np.argpartition(ratings, -k)[-k:]  # Indices not sorted

In [32]:
idx

array([ 7,  9, 18,  4, 10], dtype=int64)

In [33]:
# Build the LLM prompt from the question and the selected chunks.
# NOTE(review): the chunks are interpolated as a Python list (its repr), and the
# backslash line continuation keeps the next line's leading spaces inside the
# prompt text — confirm both are intended.
prompt = f"You are a smart agent. A question would be asked to you and relevant information would be provided.\
    Your task is to answer the question and use the information provided. Question - {query}. Relevant Information - {[chunks[index] for index in idx]}"

In [34]:

# Ollama LLM client for the "llama3" model; used below to answer the prompt.
llm_model = Ollama(model="llama3")



In [35]:
# Send the assembled prompt to the model and keep its response.
output =llm_model.invoke(prompt)

In [36]:
output

'Based on the provided information, I can tell you that Mr. Dursley is a character in the story who seems to be quite ordinary and perhaps even a bit dull. He appears to be a working-class man, possibly an office worker, as he mentions having a briefcase and going to work.\n\nFrom the passage, we can infer some of Mr. Dursley\'s personality traits. He seems to be quite annoyed or enraged when he sees something that bothers him, such as the group of people gathering outside his window (whom he calls "weirdos"). He also appears to be somewhat dismissive of his nephew, Harry Potter, referring to him as a "child like that" and not wanting Dudley, his own son, to mix with Harry.\n\nIt\'s worth noting that Mr. Dursley seems to have a rather mundane life, focused on work and domestic routine, with little excitement or adventure. He is also very concerned about keeping secrets from others, specifically about the Potters (Harry\'s parents), which suggests that there may be something significant