In [1]:
# Mount Google Drive into the Colab runtime at /content/drive; the Chroma store
# and the service-account file used later are read from under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install chromadb huggingface_hub langchain
!pip install -U langchain-community
!pip install sentence-transformers
!pip install langchain requests
!pip install langdetect
!pip install beautifulsoup4
!pip install langchain together
!pip install scikit-learn



In [3]:
import os

# Export USER_AGENT into this process's environment.
# NOTE(review): the consumer is not visible in this notebook — presumably one of
# the HTTP/loader libraries imported below; confirm.
os.environ.update({'USER_AGENT': 'MyAppName/1.0'})

In [4]:
# Import the necessary classes
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
import requests
from langchain.llms import Together
import re
from bs4 import BeautifulSoup
from langchain.prompts import PromptTemplate
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

In [5]:
# Helper function to load or create a Chroma database
def create_or_load_chroma_db(persist_directory, embedding_model):
    """Open the Chroma store persisted at ``persist_directory``, creating it if absent.

    Args:
        persist_directory: Directory that holds (or is about to hold) the store.
        embedding_model: Object passed to ``Chroma`` as ``embedding_function``.

    Returns:
        The ``Chroma`` instance bound to ``persist_directory``.

    Raises:
        Exception: Whatever ``Chroma`` raises; it is printed and then re-raised.
    """
    # Remember whether the path pre-existed so the log lines below are truthful.
    is_new_store = not os.path.exists(persist_directory)
    if is_new_store:
        # exist_ok closes the gap between the existence check and the creation.
        os.makedirs(persist_directory, exist_ok=True)
        print(f"Created directory for Chroma database at: {persist_directory}")

    try:
        chroma_db = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)
    except Exception as e:
        # Retrying with identical arguments cannot succeed, so report once and fail.
        action = "creating" if is_new_store else "loading"
        print(f"Error {action} Chroma database: {e}")
        raise

    if is_new_store:
        print("New Chroma database created successfully.")
    else:
        print(f"Loaded existing Chroma database from {persist_directory}")
    return chroma_db

In [6]:
def create_retriever(chroma_db, search_type="similarity", threshold=0.55, k=4, lambda_mult=0.25):
    """Build a retriever over ``chroma_db`` with the requested search settings.

    Search options are forwarded through ``search_kwargs`` rather than as
    top-level keyword arguments, and only the options relevant to the chosen
    ``search_type`` are included.

    Args:
        chroma_db: Vector store exposing ``as_retriever``.
        search_type: ``"similarity"``, ``"mmr"`` or ``"similarity_score_threshold"``.
        threshold: Minimum relevance score; used only with
            ``"similarity_score_threshold"``.
        k: Number of documents to return.
        lambda_mult: Diversity factor; used only with ``"mmr"``.

    Returns:
        Whatever ``chroma_db.as_retriever`` returns.
    """
    search_kwargs = {"k": k}
    if search_type == "similarity_score_threshold":
        search_kwargs["score_threshold"] = threshold
    elif search_type == "mmr":
        search_kwargs["lambda_mult"] = lambda_mult

    return chroma_db.as_retriever(search_type=search_type, search_kwargs=search_kwargs)

In [7]:
# Hugging Face embedding model shared by the vector store and by the
# similarity calculations further down.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)

# Location of the persisted Chroma store on the mounted Drive.
chroma_db_path = "/content/drive/MyDrive/rag-model/ChromaDB_Lang_Chain"
chroma_db_drive = create_or_load_chroma_db(chroma_db_path, embedding_model)

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  chroma_db = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)


Loaded existing Chroma database from /content/drive/MyDrive/rag-model/ChromaDB_Lang_Chain


In [8]:
# Clean HTML from retrieved answers (documents)
def clean_answers(docs):
    """Return ``docs`` with every ``<...>`` span removed and outer whitespace trimmed.

    The pattern is non-greedy and is not compiled with DOTALL, so a tag that
    contains a newline is left in place.
    """
    tag_pattern = re.compile('<.*?>')
    return [tag_pattern.sub('', text).strip() for text in docs]

In [9]:
# Function to retrieve documents based on questions
def get_answers(questions, retriever):
    """Retrieve the relevant document texts for each question.

    Args:
        questions: Iterable of question strings.
        retriever: Object exposing ``invoke(question)`` or, failing that,
            ``get_relevant_documents(question)``. Each returned item must have
            a ``page_content`` attribute.

    Returns:
        A list with one dict per question, in input order, holding the keys
        ``"question"`` and ``"relevant_documents"`` (list of page contents).
    """
    # Prefer ``invoke``: ``get_relevant_documents`` emits a deprecation warning on
    # current LangChain. Fall back so retrievers without ``invoke`` keep working.
    fetch = getattr(retriever, "invoke", None)
    if fetch is None:
        fetch = retriever.get_relevant_documents

    return [
        {
            "question": question,
            "relevant_documents": [result.page_content for result in fetch(question)],
        }
        for question in questions
    ]

In [10]:
# Function to generate embeddings for a list of texts
def get_embeddings(texts, embedding_model):
    """Embed every entry of ``texts`` by delegating to ``embedding_model.embed_documents``."""
    vectors = embedding_model.embed_documents(texts)
    return vectors

In [11]:
# Function to calculate cosine similarity for embeddings
def calculate_cosine_similarity(query_embedding, doc_embeddings):
    """Return the cosine similarity of ``query_embedding`` to each row of ``doc_embeddings``.

    The query is wrapped in a list to form a single-row batch, so row 0 of the
    pairwise result holds its similarity to every document.
    """
    return cosine_similarity([query_embedding], doc_embeddings)[0]

In [12]:
# Never keep the Together API key in the notebook: read it from the
# TOGETHER_API_KEY environment variable, or prompt for it without echoing.
# NOTE(review): the key that used to be hard-coded on this line has been exposed
# in the notebook and should be revoked and rotated.
import getpass

together_api_key = os.environ.get("TOGETHER_API_KEY") or getpass.getpass("Together API key: ")
llm = Together(model="meta-llama/Llama-2-13b-chat-hf", together_api_key=together_api_key, temperature=0)

  llm = Together(model="meta-llama/Llama-2-13b-chat-hf", together_api_key="<REDACTED>",temperature=0)


In [13]:
def calculate_confidence(sorted_response_docs, weights, threshold=0.7):
    """Compute a weighted-average confidence score and its textual level.

    Documents and weights are paired positionally; surplus entries on either
    side are ignored. The weights actually paired with a document are
    renormalised to sum to 1, so supplying fewer documents than weights does
    not deflate the score.

    Args:
        sorted_response_docs: Sequence of ``(document, similarity)`` pairs.
        weights: Sequence of numeric weights, one per ranked position.
        threshold: Score at or above which the result is "High Confidence".

    Returns:
        ``(confidence_score, confidence_level)`` where the level is one of
        "High Confidence", "Medium Confidence" (0.5 <= score < threshold) or
        "Low Confidence". With no usable pairs, or weights summing to zero,
        ``(0.0, "Low Confidence")`` is returned.
    """
    import math

    # zip stops at the shorter input, so only these weights take part in the score.
    pairs = list(zip(sorted_response_docs, weights))
    used_weight = sum(weight for _, weight in pairs)
    if not pairs or used_weight == 0:
        return 0.0, "Low Confidence"

    # Tolerant comparison: e.g. 0.4 + 0.3 + 0.2 + 0.1 is not exactly 1.0 in floats.
    if math.isclose(used_weight, 1.0):
        scale = 1.0
    else:
        print(f"Normalizing weights. Current sum: {used_weight}")
        scale = used_weight

    # Weighted average of the similarities.
    confidence_score = sum(sim * weight for (_, sim), weight in pairs) / scale

    # Determine confidence level based on the threshold
    if confidence_score >= threshold:
        confidence_level = "High Confidence"
    elif 0.5 <= confidence_score < threshold:
        confidence_level = "Medium Confidence"
    else:
        confidence_level = "Low Confidence"

    return confidence_score, confidence_level

In [14]:
# Positional weights for 4 relevant documents. calculate_confidence pairs them
# in order with the documents sorted by descending similarity to the AI
# response, so the most similar document carries the largest weight.
weights = [0.4, 0.3, 0.2, 0.1]

In [15]:
import firebase_admin
from firebase_admin import credentials, firestore

# Service-account credentials for the Firebase project (JSON file on the mounted Drive).
cred = credentials.Certificate("/content/drive/MyDrive/rag-model/rag.json")

# initialize_app() raises ValueError when the default app already exists, which
# breaks this cell on re-run. Initialise only when no default app is registered.
try:
    firebase_admin.get_app()
except ValueError:
    firebase_admin.initialize_app(cred)

# Firestore client used by store_in_firebase.
db = firestore.client()

In [17]:
import firebase_admin
from firebase_admin import credentials, firestore

# NOTE(review): this cell repeats the Firebase setup of the preceding cell;
# consider deleting one of the two.
# Service-account credentials for the Firebase project (JSON file on the mounted Drive).
cred = credentials.Certificate("/content/drive/MyDrive/rag-model/rag.json")  # Path to the .json file

# firestore.client() needs an initialised default app. Register one with the
# credentials above when none exists, so this cell also works on its own.
try:
    firebase_admin.get_app()
except ValueError:
    firebase_admin.initialize_app(cred)

# Initialize Firestore
db = firestore.client()

In [18]:
# Function to store high-confidence Q&A in Firestore
def store_in_firebase(question, ai_response, confidence_score):
    """Persist a question/answer pair to Firestore when its confidence is at least 0.7.

    Args:
        question: The question text.
        ai_response: The generated answer text.
        confidence_score: Score compared against the 0.7 cut-off.

    Lower-scoring pairs are reported on stdout and not written. Accepted pairs
    go to the "qa_dataa" collection together with a server-side timestamp.
    """
    # Guard clause: anything that is not >= 0.7 is reported and skipped.
    if not (confidence_score >= 0.7):
        print("Confidence score too low, not stored.")
        return

    target_collection = db.collection("qa_dataa")
    record = {
        "question": question,
        "ai_response": ai_response,
        "confidence_score": confidence_score,
        "timestamp": firestore.SERVER_TIMESTAMP,  # filled in by the Firestore server
    }
    target_collection.add(record)
    print("Stored in Firebase: Question and AI response.")


In [19]:
# Entry point: for each question, retrieve documents, rank them against the
# question, ask the LLM for an answer grounded in them, score that answer against
# the documents, and store high-confidence answers in Firestore.
if __name__ == "__main__":
    # Path for Chroma database
    chroma_db_path = "/content/drive/MyDrive/rag-model/ChromaDB_Lang_Chain"

    # Create or load Chroma database
    chroma_db = create_or_load_chroma_db(chroma_db_path, embedding_model)

    # Create a retriever
    retriever = create_retriever(chroma_db=chroma_db)

    # Define the question
    questions = [
        "What is the primary purpose of the Inter-American Convention on International Traffic in Minors?"
    ]

    # The prompt template does not depend on the question, so it is built once
    # here instead of on every loop iteration. The text is kept verbatim,
    # including its leading indentation, because it is sent to the model as-is.
    template = """
        This is a conversation between a human and an AI assistant familiar with human rights.
        {history}
        Human: I will provide a text from the retrieved documents and the question asked. Please formulate a coherent response based on the information provided.
        Be sure to highlight all important aspects of human rights mentioned in the text.
        If specific articles or laws related to human rights are mentioned in the text, please refer to them explicitly.
        In addition, be neutral in any response and make your primary reference the retrieved documents that I will send you.
        If sources are available, please refer to the documents appropriately.
        Text from the retrieved documents and the question asked:
        {input}
        AI:"""

    # Initialize PromptTemplate
    prompt = PromptTemplate(
        input_variables=["history", "input"],
        template=template,
    )

    # Retrieve relevant answers and documents
    answers = get_answers(questions, retriever)

    for answer in answers:
        # Clean only the retrieved answers (documents), not the question
        cleaned_docs = clean_answers(answer['relevant_documents'])

        print(f"\nQuestion: {answer['question']}\n")

        # Without documents there is nothing to embed, rank or ground the answer
        # on, and the similarity computation would fail on an empty matrix.
        if not cleaned_docs:
            print("No relevant documents retrieved; skipping this question.")
            continue

        # Generate question embedding
        question_embedding = np.array(embedding_model.embed_query(answer['question']))

        # Generate document embeddings (after cleaning)
        doc_embeddings = np.array(get_embeddings(cleaned_docs, embedding_model))

        # Cosine similarity of each cleaned document with respect to the question
        cosine_similarities = calculate_cosine_similarity(question_embedding, doc_embeddings)

        # Sort documents based on cosine similarity (in descending order)
        sorted_docs = sorted(zip(cleaned_docs, cosine_similarities), key=lambda x: x[1], reverse=True)

        for i, (doc, sim) in enumerate(sorted_docs):
            print(f"Document {i+1}:\nCosine Similarity: {sim:.4f}\n")

        # Define history and input text (combining the cleaned documents and question)
        history = ""
        input_text = f"Question: {answer['question']}\nDocuments:\n" + "\n".join([doc for doc, _ in sorted_docs])

        # Generate the complete prompt
        formatted_prompt = prompt.format(history=history, input=input_text)

        # Get the response from the Together model. invoke() replaces calling the
        # LLM object directly, which emits a deprecation warning.
        response = llm.invoke(formatted_prompt)

        # Print the response
        print("AI Response:", response)

        # Calculate similarity between the AI response and the retrieved documents
        # Generate embedding for AI response
        response_embedding = np.array(embedding_model.embed_query(response))

        # Reuse the shared helper so both similarity computations stay consistent
        response_doc_similarities = calculate_cosine_similarity(response_embedding, doc_embeddings)

        # Sort documents based on cosine similarity to AI response
        sorted_response_docs = sorted(zip(cleaned_docs, response_doc_similarities), key=lambda x: x[1], reverse=True)

        print("\nSimilarity between AI Response and Relevant Documents:\n")
        for i, (doc, sim) in enumerate(sorted_response_docs):
            print(f"Document {i+1}:\nCosine Similarity to AI Response: {sim:.4f}\n")

        # Now calculate the confidence score using the sorted documents and the weights
        confidence_score, confidence_level = calculate_confidence(sorted_response_docs, weights, threshold=0.7)

        # Print the confidence score and level
        print(f"Confidence Score: {confidence_score:.4f} ({confidence_level})")

        # Store in Firebase if confidence score is high
        store_in_firebase(answer['question'], response, confidence_score)

Loaded existing Chroma database from /content/drive/MyDrive/rag-model/ChromaDB_Lang_Chain


  results = retriever.get_relevant_documents(question)



Question: What is the primary purpose of the Inter-American Convention on International Traffic in Minors?

Document 1:
Cosine Similarity: 0.7984

Document 2:
Cosine Similarity: 0.7390

Document 3:
Cosine Similarity: 0.7250

Document 4:
Cosine Similarity: 0.7212



  response = llm(formatted_prompt)


AI Response:  Thank you for providing the text of the Inter-American Convention on International Traffic in Minors. Based on the information provided, the primary purpose of this Convention is to prevent and punish the international traffic in minors, as well as to regulate its civil and penal aspects. The Convention aims to protect the fundamental rights of minors and their best interests.

To achieve these objectives, the States Parties to the Convention undertake to:

* Ensure the protection of minors in consideration of their best interests (Article 1(a));
* Institute a system of mutual legal assistance among the States Parties (Article 8(a));
* Establish mechanisms for the exchange of information about domestic statutes, case law, administrative practices, statistics, and modalities regarding international traffic in minors (Article 8(b)); and
* Order such measures as may be necessary to remove any obstacles that might affect the enforcement of the Convention in their States (Arti