<a href="https://colab.research.google.com/github/Sweta-Das/LangChain-HuggingFace-LLM/blob/SentenceTransformers/AnalysisOfTextSimilarityMatrices.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# %%capture
%pip -q install PyPDF2 pdfplumber langchain sentence-transformers transformers numba

In [None]:
%pip -q install llama-cpp-python

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFDirectoryLoader
from sentence_transformers import SentenceTransformer, util
from langchain.llms import LlamaCpp
from langchain import HuggingFaceHub
from PyPDF2 import PdfReader
from numba import jit, cuda
from pdfplumber import pdf
import numpy as np
import sys, random
import torch
import time
import os

**About Libraries**:<br>
- *RecursiveCharacterTextSplitter* : a function to split text into smaller chunks based on a specified character set & chunk size. Recursive splitting works by repeatedly splitting the text into smaller pieces until it reaches a desired size or encounters a separator character.
- *SentenceTransformer* : class used for embedding sentences into numerical vectors for various NLP tasks
- *LlamaCPP* : a LangChain's wrapper class that enables to use the Llama LLM within LangChain.
- *LLMChain* : a LangChain's class specifically designed to interact with LLMs
- *HuggingFaceHub* :  class that joins LangChain with Hugging Face
- *PyPDF2* : a library that works with PDF files in Python; *PdfReader* reads the PDF docs' content
- *pdfplumber* : a library for extracting text & data from PDF docs; *pdf* works with PDFs
- *numba* : a library in Python ecosystem used for high-performance numerical computing. It provides **JIT (Just In Time)** compiler *(@jit)* that translates Python functions into optimized machine code at runtime. It also support **cuda** like *(@cuda.jit)* to execute code on NVIDIA GPUs.



In [4]:
# Accessing through HuggingFace Access Token
os.environ['HUGGINGFACEHUB_API_TOKEN'] = 'HUGGINGFACEHUB_API_TOKEN'

In [18]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive/')
model = 'drive/MyDrive/LLM_Model/mistral-7b-instruct-v0.1.Q3_K_S.gguf'

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [6]:
def progressBar(count_value, total, suffix=''):
  # Designing progress bar (==---)
  bar_length = 100
  filled_up_length = int(round(bar_length * count_value / float(total)))
  percent = round(100.0 * count_value/float(total), 1)
  bar = '=' * filled_up_length + '-' * (bar_length - filled_up_length)
  sys.stdout.write('[%s] %s%s ...%s\r' %(bar, percent, '%', suffix))
  sys.stdout.flush()

### Reading the pdf file

In [8]:
def load_split_pdf(pdf_path):
  # Reading pdf in binary mode
  pdf_loader = PdfReader(open(pdf_path, "rb"))
  pdf_text = ""

  # Reading only 8 pages of pdf
  for page_num in range(len(pdf_loader.pages)): # min(8, len(pdf_loader.pages))
    # Loading page
    pdf_page = pdf_loader.pages[page_num]
    # Extracting text
    pdf_text += pdf_page.extract_text()

  progressBar(2, 7)
  return pdf_text

### Recursive Text Character Splitter

In [9]:
def split_text_using_RCTS(pdf_text):

  # Splitting text recursively
  text_splitter = RecursiveCharacterTextSplitter(
      chunk_size = 2048,
      chunk_overlap = 64
  )
  split_texts = text_splitter.split_text(pdf_text)

  # Separating texts at paragraphs
  paragraphs = []
  for text in split_texts:
    paragraphs.extend(text.split('\n'))

  progressBar(3, 7)
  return paragraphs

### Sentence Transformer

In [10]:
# Initializing sentence transformer
def Initialize_sentence_transformer():
  model_name = "sentence-transformers/all-MiniLM-L6-v2"
  embeddings = SentenceTransformer(model_name)

  progressBar(4, 7)
  return embeddings

In [11]:
# Encoding each paragraph
def encode_each_paragraph(paragraphs, embeddings):
  responses = []
  for paragraph in paragraphs:
    response = embeddings.encode([paragraph], convert_to_tensor=True)
    responses.append((paragraph, response))

  progressBar(5, 7)
  return responses

In [12]:
# Choosing most relevant sentence
def choose_most_relevant_sentence_using_CosineSimilarity(embeddings, responses, query):
  query_embedding = embeddings.encode([query], convert_to_tensor=True)
  best_response = None
  best_similarity = -1.0
  answers = []

  for paragraph, response in responses:
    # Finding cosine similarity between query embedding and response
    similarity = util.pytorch_cos_sim(query_embedding, response).item()

    if similarity >= 0.8:
      # count += 1
      answers.append(paragraph)

  answer = "\n".join(answers)

  progressBar(6, 7)
  return answer

### Querying the LLM

In [13]:
def get_query():
    query = input("Enter your question\n")
    progressBar(1, 7)
    return query

In [14]:
def query_the_llm(answer, llm_model, query):
    prompt_message = answer + "\n" + query

    final_response = llm_model.generate(prompts=[prompt_message])

    return final_response

## Cosine Similarity

In [20]:
# Loading the LLM Model
def main():
  start_time = time.time()
  pdf_path = "./HandbookOfTechnicalAnalysis.pdf"
  pdf_text = load_split_pdf(pdf_path)
  paragraphs = split_text_using_RCTS(pdf_text)
  embeddings = Initialize_sentence_transformer()
  responses = encode_each_paragraph(paragraphs=paragraphs, embeddings=embeddings)
  # print(responses)
  query = get_query()
  answer = choose_most_relevant_sentence_using_CosineSimilarity(embeddings=embeddings, responses=responses, query=query)

  llm = LlamaCpp(
      streaming = True,
      model_path = "/content/drive/MyDrive/LLM_Model/mistral-7b-instruct-v0.1.Q3_K_S.gguf",
      temperature = 0.75, # degree of randomness
      top_p = 1,
      verbose = True,
      n_ctx = 4096 # max no. of tokens to generate
  )

  final_response = query_the_llm(answer=answer, llm_model=llm, query=query)

  print ("The answer from model is\n", final_response)
  end_time = time.time()
  elapsed_time = end_time - start_time
  print(f"Execution time: {elapsed_time/60} minutes \n")

  progressBar(7, 7)

if __name__ == "__main__":
  main()

Enter your question
Explain about number of registered users in elearnmarket.com


llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from /content/drive/MyDrive/LLM_Model/mistral-7b-instruct-v0.1.Q3_K_S.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.1
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - k

The answer from model is
 generations=[[Generation(text='.\nNumber of registered users on Elearnmarket.com refers to the total number of individuals who have created an account with the website by providing their personal information, such as name, email address, and a password. These users are able to access the platform’s features and services, including purchasing and enrolling in online courses, interacting with instructors and peers, and managing their account settings. The number of registered users on Elearnmarket.com may fluctuate over time due to factors such as new user sign-ups, existing users discontinuing their accounts, and changes in the platform’s marketing efforts.')]] llm_output=None run=[RunInfo(run_id=UUID('1affb2c4-deb1-472b-9dd5-07369279f298'))]
Execution time: 3.106291969617208 minutes 


Parameters within LlamaCPP: <br>
- *streaming* : controls how the LLM processes and generates text. Set to True (by default).  Handles input text in a continuous stream, generating output word by word or in small chunks.
<br>

- *temperature* : controls the **level of randomness**; higher temperature (closer to 1) leads to more creative and diverse outputs, but potentially less coherence, while  lower temperature (closer to 0) results in more predictable and consistent outputs that are more likely to follow the established patterns in the training data
<br>

- *top-p* : **nucleus sampling**; influences which words the LLM is more likely to choose during generation, value of '1' means the LLM considers all possible words in its vocabulary, but prioritizes those with higher probabilities according to its internal model. Higher values (closer to the total vocabulary size) would restrict the selection to a smaller set of high-probability words, potentially leading to less creative but more focused outputs.
<br>

- *verbose* : controls whether the LLM prints additional information during its operation; <br>
Set to **True** leads to output messages about its progress, warnings or errors; helpful for debugging or behavior monitoring. <br>
Set tp **False** suppresses these messages for clean execution.
<br>

- *n_ctx* : specifies the max. no. of tokens (word or subword units) considered by LLM generating text

## Euclidean Distance

In [34]:
import numpy as np

def choose_most_relevant_sentence_using_EuclideanDistance(embeddings, responses, query):
  query_embedding = embeddings.encode([query], convert_to_tensor=True)
  best_response = None
  best_similarity = -1.0
  answers = []

  for paragraph, response in responses:
    # Finding Euclidean distance between query embedding and response
    euc_dist = np.linalg.norm(query_embedding - response)
    similarity = 1 / (1 + euc_dist)

    if similarity <= 0.3:
      # count += 1
      answers.append(paragraph)

  answer = "\n".join(answers)

  progressBar(6, 7)
  return answer

In [35]:
start_time = time.time()
pdf_path = "./HandbookOfTechnicalAnalysis.pdf"
pdf_text = load_split_pdf(pdf_path)
paragraphs = split_text_using_RCTS(pdf_text)
embeddings = Initialize_sentence_transformer()
responses = encode_each_paragraph(paragraphs=paragraphs, embeddings=embeddings)
# print(responses)
query = get_query()
answer = choose_most_relevant_sentence_using_EuclideanDistance(embeddings=embeddings, responses=responses, query=query)

llm = LlamaCpp(
    streaming = True,
    model_path = "/content/drive/MyDrive/LLM_Model/mistral-7b-instruct-v0.1.Q3_K_S.gguf",
    temperature = 0.75, # degree of randomness
    top_p = 1,
    verbose = False,
    n_ctx = 4096 # max no. of tokens to generate
)

final_response = query_the_llm(answer=answer, llm_model=llm, query=query)

print ("The answer from model is\n", final_response)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Execution time: {elapsed_time/60} minutes \n")

progressBar(7, 7)

Enter your question
Explain about registered users in elearnmarket.com
The answer from model is
 generations=[[Generation(text='.\nA registered user is someone who has created an account on ElearnMarket.com by providing their personal information, including a valid email address and password. Once registered, users can access various features and services available on the platform, such as creating and managing online courses, posting and searching for jobs, and participating in discussion forums.\n\nRegistered users also have the ability to earn points and rewards by completing various tasks on the site, such as taking quizzes or attending webinars. These points can be redeemed for prizes or used to purchase additional features and services.\n\nIn addition, registered users can also track their progress and performance through detailed analytics and reporting tools available on the platform. This allows them to better understand their strengths and areas for improvement, and make info

## Manhattan Distance

In [32]:
import numpy as np

def choose_most_relevant_sentence_using_ManhattanDistance(embeddings, responses, query):
  query_embedding = embeddings.encode([query], convert_to_tensor=True)
  best_response = None
  best_similarity = -1.0
  answers = []

  for paragraph, response in responses:
    # Finding Manhattan Distance between query embedding and response
    man_dist = np.linalg.norm(query_embedding - response, ord=1)
    similarity = 1 / (1 + man_dist)

    if similarity <= 0.3:
      # count += 1
      answers.append(paragraph)

  answer = "\n".join(answers)

  progressBar(6, 7)
  return answer

In [33]:
start_time = time.time()
pdf_path = "./HandbookOfTechnicalAnalysis.pdf"
pdf_text = load_split_pdf(pdf_path)
paragraphs = split_text_using_RCTS(pdf_text)
embeddings = Initialize_sentence_transformer()
responses = encode_each_paragraph(paragraphs=paragraphs, embeddings=embeddings)
# print(responses)
query = get_query()
answer = choose_most_relevant_sentence_using_ManhattanDistance(embeddings=embeddings, responses=responses, query=query)

llm = LlamaCpp(
    streaming = True,
    model_path = "/content/drive/MyDrive/LLM_Model/mistral-7b-instruct-v0.1.Q3_K_S.gguf",
    temperature = 0.75, # degree of randomness
    top_p = 1,
    verbose = False,
    n_ctx = 4096 # max no. of tokens to generate
)

final_response = query_the_llm(answer=answer, llm_model=llm, query=query)

print ("The answer from model is\n", final_response)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Execution time: {elapsed_time/60} minutes \n")

progressBar(7, 7)

Enter your question
Explain about registered users in elearnmarket.com
The answer from model is
 generations=[[Generation(text='\n\nWhat is meant by registered users in elearnmarket.com? Are these users who have created accounts on the website or are they those who have completed their courses and certifications?')]] llm_output=None run=[RunInfo(run_id=UUID('c1db927b-a70a-47e9-a9e4-662b10e5a8c0'))]
Execution time: 1.1008305033047994 minutes 



## Dot Product

In [28]:
import numpy as np

def choose_most_relevant_sentence_using_DotProduct(embeddings, responses, query):
  query_embedding = embeddings.encode([query], convert_to_tensor=True)

  best_response = None
  best_similarity = -1.0
  answers = []

  for paragraph, response in responses:
    # Reshaping query embedding to match the shape of response
    # query_embedding = query_embedding.view(response.shape)

    # Finding dot product between query embedding and response
    dot_product = torch.matmul(query_embedding, response.T) # np.dot(query_embedding, response)
    similarity = dot_product

    if similarity >= 0.8:
      # count += 1
      answers.append(paragraph)

  answer = "\n".join(answers)

  progressBar(6, 7)
  return answer

In [29]:
start_time = time.time()
pdf_path = "./HandbookOfTechnicalAnalysis.pdf"
pdf_text = load_split_pdf(pdf_path)
paragraphs = split_text_using_RCTS(pdf_text)
embeddings = Initialize_sentence_transformer()
responses = encode_each_paragraph(paragraphs=paragraphs, embeddings=embeddings)
# print(responses)
query = get_query()
answer = choose_most_relevant_sentence_using_DotProduct(embeddings=embeddings, responses=responses, query=query)

llm = LlamaCpp(
    streaming = True,
    model_path = "/content/drive/MyDrive/LLM_Model/mistral-7b-instruct-v0.1.Q3_K_S.gguf",
    temperature = 0.4, # degree of randomness
    top_p = 1,
    verbose = False,
    n_ctx = 4096 # max no. of tokens to generate
)

final_response = query_the_llm(answer=answer, llm_model=llm, query=query)

print ("The answer from model is\n", final_response)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Execution time: {elapsed_time/60} minutes \n")

progressBar(7, 7)

Enter your question
Explain about registered users in elearnmarket.com
The answer from model is
 generations=[[Generation(text="\n\nElearnmarket.com is an online learning platform that offers a wide range of courses and training programs to help individuals and organizations improve their skills and knowledge. Registered users on the platform are those who have created an account with their personal information, including their name, email address, and password. Once registered, users can access the platform's features, such as creating profiles, enrolling in courses, tracking progress, and receiving certificates of completion. Additionally, registered users can also interact with other users on the platform by participating in discussion forums and sharing course content. Overall, being a registered user on elearnmarket.com allows individuals to take advantage of the platform's many benefits and improve their learning experience.")]] llm_output=None run=[RunInfo(run_id=UUID('cf9e7564-

## Angular Similarity

In [None]:
import numpy as np

def choose_most_relevant_sentence_using_AngularSimilarity(embeddings, responses, query):
  query_embedding = embeddings.encode([query], convert_to_tensor=True)
  best_response = None
  best_similarity = -1.0
  answers = []

  for paragraph, response in responses:
    # Finding cosine similarity between query embedding and response
    dot_product = np.dot(query_embedding, response) / (np.linalg.norm(query_embedding) * np.linalg.norm(response))
    similarity = dot_product

    if similarity >= 0.8:
      # count += 1
      answers.append(paragraph)

  answer = "\n".join(answers)

  progressBar(6, 7)
  return answer

In [None]:
start_time = time.time()
pdf_path = "./HandbookOfTechnicalAnalysis.pdf"
pdf_text = load_split_pdf(pdf_path)
paragraphs = split_text_using_RCTS(pdf_text)
embeddings = Initialize_sentence_transformer()
responses = encode_each_paragraph(paragraphs=paragraphs, embeddings=embeddings)
# print(responses)
query = get_query()
answer = choose_most_relevant_sentence_using_AngularSimilarity(embeddings=embeddings, responses=responses, query=query)

llm = LlamaCpp(
    streaming = True,
    model_path = "/content/drive/MyDrive/LLM_Model/mistral-7b-instruct-v0.1.Q3_K_S.gguf",
    temperature = 0.4, # degree of randomness
    top_p = 1,
    verbose = False,
    n_ctx = 4096 # max no. of tokens to generate
)

final_response = query_the_llm(answer=answer, llm_model=llm, query=query)

print ("The answer from model is\n", final_response)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Execution time: {elapsed_time/60} minutes \n")

progressBar(7, 7)

### Referenced From: <br>
[**Handbook**](https://www.elearnmarkets.com/uploads/content_pdf/MxudjQevBU.pdf)<br>
[**Querying a PDF file using LLM models and Sentence transformer**](https://medium.com/@yashashm77/querying-a-pdf-file-using-llm-models-and-sentence-transformer-b3d4d0b40f7d)<br>