In [1]:
# Mount Google Drive at /content/drive (Colab only). The persisted Chroma
# database used later in this notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install chromadb huggingface_hub langchain
!pip install -U langchain-community
!pip install sentence-transformers
!pip install langchain requests
!pip install langdetect
!pip install beautifulsoup4
!pip install langchain together
!pip install scikit-learn

Collecting chromadb
  Downloading chromadb-0.5.18-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.4-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.32.0-py3-none-any.whl.metadata (6.6 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.7.0-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.20.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.28.1-py3

In [3]:
import os
# Set the USER_AGENT environment variable before the LangChain imports in the
# next cell run.
# NOTE(review): presumably this silences a "USER_AGENT not set" warning from a
# downstream library — confirm which one actually reads it.
os.environ['USER_AGENT'] = 'MyAppName/1.0'

In [4]:
# Import the necessary classes
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
import requests
from langchain.llms import Together
import re
from bs4 import BeautifulSoup
from langchain.prompts import PromptTemplate
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
import numpy as np
import pandas as pd

In [5]:
def create_or_load_chroma_db(persist_directory, embedding_model):
    """Open the Chroma vector store persisted at ``persist_directory``.

    The directory is created when it does not exist yet. ``Chroma(...)`` is
    called exactly once: the same constructor call both loads an existing store
    and initialises a new one, so retrying it with identical arguments after a
    failure (as this helper used to do) could not recover from anything.

    Args:
        persist_directory: Filesystem path of the Chroma persistence directory.
        embedding_model: Embedding object passed to Chroma as
            ``embedding_function``.

    Returns:
        The ``Chroma`` instance bound to ``persist_directory``.

    Raises:
        Exception: Whatever ``Chroma(...)`` raises is logged and re-raised
            unchanged.
    """
    # Record whether the store already existed *before* creating the directory,
    # so the progress messages below report what actually happened.
    is_new = not os.path.exists(persist_directory)
    if is_new:
        # exist_ok guards against the directory appearing between the check
        # above and this call.
        os.makedirs(persist_directory, exist_ok=True)
        print(f"Created directory for Chroma database at: {persist_directory}")
        print(f"Creating a new Chroma database at {persist_directory}")

    try:
        chroma_db = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)
    except Exception as e:
        action = "creating" if is_new else "loading"
        print(f"Error {action} Chroma database: {e}")
        raise  # Re-raise the exception after logging

    if is_new:
        print("New Chroma database created successfully.")
    else:
        print(f"Loaded existing Chroma database from {persist_directory}")

    return chroma_db

In [6]:
def create_retriever(chroma_db, search_type="similarity", threshold=0.55, k=4, lambda_mult=0.25):
    """Build a retriever over ``chroma_db`` with the requested search settings.

    Search parameters are forwarded through ``search_kwargs``, which is where
    LangChain's ``as_retriever`` reads them. Passing them as top-level keyword
    arguments (the previous behaviour) does not route them to the search call,
    so ``k``, ``threshold`` and ``lambda_mult`` had no effect.

    Only the keys relevant to the chosen ``search_type`` are sent, because
    unused keys are otherwise passed on to the underlying search method.

    Args:
        chroma_db: Vector store exposing ``as_retriever``.
        search_type: ``"similarity"``, ``"similarity_score_threshold"`` or
            ``"mmr"``.
        threshold: Minimum relevance score; used only with
            ``"similarity_score_threshold"``.
        k: Number of documents to return.
        lambda_mult: MMR diversity trade-off; used only with ``"mmr"``.

    Returns:
        The retriever returned by ``chroma_db.as_retriever``.
    """
    search_kwargs = {"k": k}
    if search_type == "similarity_score_threshold":
        search_kwargs["score_threshold"] = threshold
    elif search_type == "mmr":
        search_kwargs["lambda_mult"] = lambda_mult

    return chroma_db.as_retriever(search_type=search_type, search_kwargs=search_kwargs)

In [26]:
# Embedding model used both by the vector store and by the similarity analysis
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)

# Open the Chroma store persisted on Google Drive
chroma_db_path = "/content/drive/MyDrive/rag-model/ChromaDB_Lang_Chain"
chroma_db_drive = create_or_load_chroma_db(chroma_db_path, embedding_model)



Loaded existing Chroma database from /content/drive/MyDrive/rag-model/ChromaDB_Lang_Chain


In [8]:
def clean_and_format_answers(answers):
    """Strip HTML tags from retrieved documents and format them as plain text.

    Each answer contributes a ``Question: ...`` header followed by its cleaned
    documents; every piece is separated by a blank line.

    Args:
        answers: Iterable of dicts with a ``'question'`` string and a
            ``'relevant_documents'`` list of strings.

    Returns:
        The formatted text with leading/trailing whitespace removed; an empty
        string when ``answers`` is empty.
    """
    # [^>]* instead of .*? so a tag whose attributes wrap onto the next line is
    # still removed ('.' does not match newlines by default).
    tag_pattern = re.compile(r'<[^>]*>')

    # Collect the pieces and join once rather than growing a string with +=.
    parts = []
    for answer in answers:
        parts.append(f"Question: {answer['question']}\n\n")
        for doc in answer['relevant_documents']:
            cleaned_doc = tag_pattern.sub('', doc).strip()
            parts.append(f"{cleaned_doc}\n\n")
    return "".join(parts).strip()

In [9]:
def get_answers(questions, retriever):
    """Retrieve the relevant document texts for each question.

    Uses ``retriever.invoke`` when the retriever provides it and falls back to
    the older ``get_relevant_documents`` otherwise. LangChain has deprecated
    ``get_relevant_documents`` in favour of ``invoke``, and this notebook
    installs LangChain unpinned, so relying on the old name alone is fragile.

    Args:
        questions: Iterable of query strings.
        retriever: Object exposing ``invoke(query)`` or
            ``get_relevant_documents(query)``; each returned item must have a
            ``page_content`` attribute.

    Returns:
        A list with one dict per question, in input order:
        ``{"question": <query>, "relevant_documents": [<page_content>, ...]}``.
    """
    fetch = getattr(retriever, "invoke", None)
    if fetch is None:
        fetch = retriever.get_relevant_documents

    return [
        {
            "question": question,
            "relevant_documents": [result.page_content for result in fetch(question)],
        }
        for question in questions
    ]

In [10]:
def get_embeddings(texts, embedding_model):
    """Embed ``texts`` via ``embedding_model.embed_documents`` and return the result as-is."""
    vectors = embedding_model.embed_documents(texts)
    return vectors

In [11]:
def calculate_euclidean_distance(query_embedding, doc_embeddings):
    """Return the Euclidean (L2) distance from the query to each document row.

    Both arguments may be lists or NumPy arrays; ``doc_embeddings`` is treated
    as a 2-D collection with one embedding per row.
    """
    # The query is broadcast against every row; the norm is taken per row.
    offsets = np.array(doc_embeddings) - np.array(query_embedding)
    return np.linalg.norm(offsets, axis=1)

In [12]:
def calculate_manhattan_distance(query_embedding, doc_embeddings):
    """Return the Manhattan (L1) distance from the query to each document row.

    Both arguments may be lists or NumPy arrays; ``doc_embeddings`` is treated
    as a 2-D collection with one embedding per row.
    """
    query_vec = np.array(query_embedding)
    doc_matrix = np.array(doc_embeddings)

    # Sum of absolute coordinate differences, one total per document row.
    abs_offsets = np.abs(doc_matrix - query_vec)
    return np.sum(abs_offsets, axis=1)

In [13]:
def calculate_dot_product(query_embedding, doc_embeddings):
    """Return ``np.dot(doc_embeddings, query_embedding)``.

    With a 2-D ``doc_embeddings`` and a 1-D query this yields one dot product
    per document row.
    """
    projections = np.dot(doc_embeddings, query_embedding)
    return projections

In [27]:
def calculate_cosine_similarity(query_embedding, doc_embeddings):
    """Return the cosine similarity between the query and each document embedding."""
    # The query is wrapped in a list to form a one-row batch; the single
    # result row holds one score per document.
    score_matrix = cosine_similarity([query_embedding], doc_embeddings)
    return score_matrix[0]

In [24]:
def calculate_document_similarities(doc_embeddings, metric='cosine'):
    """Compute the pairwise document-to-document matrix for ``metric``.

    The whole matrix is computed with a single vectorised call instead of one
    library call per pair of documents. Values may differ from a per-pair
    computation by floating-point rounding only.

    Args:
        doc_embeddings: Sequence (or 2-D array) with one embedding per document.
        metric: ``'cosine'`` (similarity), ``'euclidean'`` (distance),
            ``'manhattan'`` (distance) or ``'dot'`` (inner product).

    Returns:
        A float NumPy array of shape ``(num_docs, num_docs)`` whose entry
        ``[i][j]`` compares document ``i`` with document ``j``. An empty input
        yields a ``(0, 0)`` array.

    Raises:
        ValueError: If ``metric`` is not one of the supported names. This is
            checked up front, so it is raised even for empty input.
    """
    if metric not in ('cosine', 'euclidean', 'manhattan', 'dot'):
        raise ValueError("Unknown metric: choose from 'cosine', 'euclidean', 'manhattan', or 'dot'.")

    num_docs = len(doc_embeddings)
    if num_docs == 0:
        # Return the empty matrix directly rather than handing an empty array
        # to the pairwise helpers.
        return np.zeros((0, 0))

    vectors = np.asarray(doc_embeddings, dtype=float)

    # With a single argument the sklearn helpers compare the rows of `vectors`
    # against each other.
    if metric == 'cosine':
        similarity_matrix = cosine_similarity(vectors)
    elif metric == 'euclidean':
        similarity_matrix = euclidean_distances(vectors)
    elif metric == 'manhattan':
        similarity_matrix = manhattan_distances(vectors)
    else:  # 'dot'
        similarity_matrix = vectors @ vectors.T

    return np.asarray(similarity_matrix, dtype=float)

In [29]:
# Main cell: load the vector store, retrieve documents for each question, and
# report query-to-document and document-to-document similarity metrics.
if __name__ == "__main__":
    # Path for Chroma database
    chroma_db_path = "/content/drive/MyDrive/rag-model/ChromaDB_Lang_Chain"

    # Create or load Chroma database
    chroma_db = create_or_load_chroma_db(chroma_db_path, embedding_model)

    # Create a retriever
    retriever = create_retriever(chroma_db=chroma_db)

    # Define the question
    questions = [
        "In what ways does the Convention aim to extend the protection accorded to refugees beyond previous agreements?"
    ]

    # Retrieve relevant answers and documents
    answers = get_answers(questions, retriever)

    for answer in answers:
        print(f"Question: {answer['question']}\n")

        # A retriever can legitimately return nothing (for example with a score
        # threshold). The metric helpers below need at least one document
        # embedding, so report and move on instead of crashing.
        if not answer['relevant_documents']:
            print("No relevant documents retrieved; skipping similarity metrics.\n")
            continue

        # Generate question embedding
        question_embedding = np.array(embedding_model.embed_query(answer['question']))

        # Generate document embeddings
        doc_embeddings = np.array(get_embeddings(answer['relevant_documents'], embedding_model))

        # Calculate similarities and distances for each document to question
        cosine_similarities = calculate_cosine_similarity(question_embedding, doc_embeddings)
        euclidean_dists = calculate_euclidean_distance(question_embedding, doc_embeddings)
        manhattan_dists = calculate_manhattan_distance(question_embedding, doc_embeddings)
        dot_products = calculate_dot_product(question_embedding, doc_embeddings)

        # Display document-to-question similarity metrics
        for i, doc in enumerate(answer['relevant_documents']):
            print(f"Document {i+1}:\n"
                  f"Cosine Similarity: {cosine_similarities[i]:.4f}\n"
                  f"Euclidean Distance: {euclidean_dists[i]:.4f}\n"
                  f"Manhattan Distance: {manhattan_dists[i]:.4f}\n"
                  f"Dot Product: {dot_products[i]:.4f}\n")

        # Calculate document-to-document similarities
        document_similarity_matrix = calculate_document_similarities(doc_embeddings, metric='cosine')

        # Convert the similarity matrix to a DataFrame with document labels
        doc_labels = [f"Document {i+1}" for i in range(len(doc_embeddings))]
        similarity_df = pd.DataFrame(document_similarity_matrix, index=doc_labels, columns=doc_labels)

        # Display the labeled document similarity matrix
        print("\nDocument-to-Document Similarity Matrix (Cosine):")
        print(similarity_df)

Loaded existing Chroma database from /content/drive/MyDrive/rag-model/ChromaDB_Lang_Chain
Question: In what ways does the Convention aim to extend the protection accorded to refugees beyond previous agreements?

Document 1:
Cosine Similarity: 0.7738
Euclidean Distance: 0.6726
Manhattan Distance: 14.7353
Dot Product: 0.7738

Document 2:
Cosine Similarity: 0.7722
Euclidean Distance: 0.6750
Manhattan Distance: 14.5602
Dot Product: 0.7722

Document 3:
Cosine Similarity: 0.7554
Euclidean Distance: 0.6995
Manhattan Distance: 15.3002
Dot Product: 0.7554

Document 4:
Cosine Similarity: 0.7520
Euclidean Distance: 0.7042
Manhattan Distance: 15.2918
Dot Product: 0.7520


Document-to-Document Similarity Matrix (Cosine):
            Document 1  Document 2  Document 3  Document 4
Document 1    1.000000    0.762123    0.844348    0.774417
Document 2    0.762123    1.000000    0.832039    0.760273
Document 3    0.844348    0.832039    1.000000    0.801208
Document 4    0.774417    0.760273    0.801208