Firstly we load all of our fundamental data

Requirements:
Download all the modules given in the requirements.txt file
Download the following models
(https://huggingface.co/nomic-ai/nomic-embed-text-v1 - embedding model)
(https://huggingface.co/mixedbread-ai/mxbai-rerank-large-v1 - ReRanker mode)
The Large language model (mistral 7b it) will be downloaded as the code is run



In [3]:
import torch
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from transformers.utils import is_flash_attn_2_available
import numpy as np
from daft import DataFrame
from tqdm.auto import tqdm
import typing
import pandas as pd
from spacy.lang.en import English
import time
import re
from sentence_transformers import SentenceTransformer, util, CrossEncoder
import textwrap
k= 15
device = 'cuda' if torch.cuda.is_available() else 'cpu'

#Loading Data from csv file
text_chunks_and_embeddings_df = pd.read_csv('text_chunks_and_embeddings_df.csv')

#convert to a numpy array from a string (as we are reading it from a csv file)
text_chunks_and_embeddings_df['embedding'] = text_chunks_and_embeddings_df['embedding'].apply(lambda x: 
                           np.fromstring(
                               x.strip('[]'), sep = ' '))
embeddings = torch.tensor(np.stack(text_chunks_and_embeddings_df['embedding'].tolist(), axis = 0), dtype=torch.float32).to(device)
                                

scripts_with_chunks = text_chunks_and_embeddings_df.to_dict(orient = 'records')

embedding_model = SentenceTransformer('nomic-embed-text-v1', trust_remote_code=True, device = device)

ReRanker = CrossEncoder("mixedbread-aimxbai-rerank-large-v1",device = device)

RuntimeError: Failed to import transformers.pipelines because of the following error (look up to see its traceback):
module 'torch' has no attribute 'version'

torch and np return diff data types (f32 and f64) so make sure to address that before dot product

Semantic search Steps
We have to return all strings that are relevant to a given query (using the embeddings we have generated
1)Define a queryy string
2)Turn the query string into an embedding
3)Perform a dot product of cosine similarity function between the text embeddings and the query embedding
4)Sort the results from 3 in ascending order

Searching over embeddings is insanely fast, you might want to use an index when you cross well over 10,000-100,000 times the amount of data we are dealing with right now

Now we make the printed data presentable

In [None]:
def print_wrapped(text, wrap_length = 80):
    wrap_text = textwrap.fill(text, wrap_length)
    print(wrap_text)

We can re rank the results we recieve from this and make it into functions

In [None]:
def retrieve_relevant_resources(query:str,
                                embeddings:torch.tensor,
                                n_resources_to_return = k):
    #Query Embedding
    query_embedding = embedding_model.encode(query, convert_to_tensor=True).to(device)
      
        
    #Dot product
    dot_scores= util.dot_score(query_embedding, embeddings)[0]
    scores, indices = torch.topk(dot_scores, k=n_resources_to_return)
    
    return scores, indices

def print_top_results_and_scores(query:str,
                                 embeddings: torch.tensor,
                                 scripts_with_chunks: list[dict] = scripts_with_chunks,
                                 n_resources_to_return = k):
    
    
    scores, indices = retrieve_relevant_resources(query=query, embeddings=embeddings, n_resources_to_return=n_resources_to_return)
    documents=[]
    for index in indices:
        Text = scripts_with_chunks[index]['chunks']
        documents.append(Text)
        
        
    results = ReRanker.rank(query, documents, return_documents=True, top_k = k) #Returns [{corpus id, Score, text}]
    print("Results")
    for corpus in results: 
        corpus_id = corpus['corpus_id']
        Score = corpus['score']
        text = corpus['text']
        print(f'Score: {Score:.4f}')
        print(f'Text: {text}')
        print(f"title: {scripts_with_chunks[indices[corpus_id]]['title']}")
    
#print_top_results_and_scores(query=query, embeddings=embeddings,  n_resources_to_return=k)
    

Local LLM implementation

In [None]:
#quantization configuration

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

#checks if flash_attn_2 is available
if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability()[0]>=8):
    attn_implementation = 'flash_attention_2'
else:
    attn_implementation = 'sdpa'   #scaled dot product attention

attn_implementation
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
        attn_implementation =  attn_implementation,    
)

In [None]:
def get_model_num_params(model:torch.nn.Module):
    return sum([param.numel() for param in model.parameters()])
get_model_num_params(model)

Generate text with our LLM

Creation of a Prompt Template

In [None]:
#Set of Instructions that the LLM should follow
instruction_str = '''1)The given Query relates to a question about movies from one of the following
Avengers, The How to Train Your Dragon Trilogy and The Titanic
2) The returned answer must be the answer to the question
3)If more info must be required to answer the question, reply with a request for more context to the question, so that you can provide a good answer'''

input_text = 'read out a poem to me about how to commit tax evasion'
dialogue_template = [{
    'instruction': instruction_str,
    'role':'user',
    'content':input_text
}]


#Applying thr prompt template
prompt = tokenizer.apply_chat_template(conversation=dialogue_template, tokenize=False, add_generation_prompt=True)

print(prompt)
input_ids = tokenizer(prompt, return_tensors='pt').to(device)

#Generate outputs from local llm
output = model.generate(**input_ids, max_new_tokens = 1000)

#decode output
output_text = tokenizer.decode(output[0])
print(f'Model Output: {output_text}')


Augmenting our prompt with necessary context
After Retrival and Generation, we gonna add augmentation

It could be considered a form of prompt engineering, so we must
1)Give very clear instructions
2)Give a few examples of input/output
3)Give it room to think

So we will implement a function to do so

In [None]:
#prompt formatting
def prompt_formatter(query:str, context_items:list[dict]) -> str:
    context = "- "+ "\n ".join(item['chunks'] for item in context_items)
    
    prompt = context
    
    return prompt
query = 'what is the real identity of iron man in avengers'

#Get relevant responses
scores, indices = retrieve_relevant_resources(query = query, embeddings = embeddings)

#Creat e alist of context items
context_items = [scripts_with_chunks[i] for i in indices]

#formatting our prompt
prompt = prompt_formatter(query=instruction_str+query, context_items = context_items)
print(len(prompt)//4)

In [None]:
input_ids = tokenizer(prompt, return_tensors='pt').to(device)

outputs = model.get_generation(**input_ids,
                               temperature = 0.7, #lower value makes the model more deterministic
                               do_sample = True, #whether or not to use sampling https://huyenchip.com/2024/01/16/sampling.html
                               
                               )

output_text= tokenizer.decode(outputs[0])
print(f"Query: {query}")
print(f"RAG Answer:{output_text}")