In [2]:
# !pip install requests langchain-community beautifulsoup4 langchain chromadb sentence-transformers together

In [None]:
!pip install chromadb

In [3]:
# !pip install --upgrade langchain

In [4]:
# pip install firebase-admin

In [1]:
import os
import json
import requests
import chromadb
from chromadb import Client
import pickle
# from google.colab import drive
import numpy as np
# import gensim.downloader
# from langchain.embeddings import HuggingFaceEmbeddings
# from sklearn.metrics.pairwise import cosine_similarity
from langchain.vectorstores import Chroma
# from scipy.spatial.distance import cosine
from together import Together
from chromadb.config import Settings
# from transformers import GPT2Tokenizer
# import firebase_admin
# from firebase_admin import credentials, firestore

In [9]:
# drive.mount('/content/drive')

In [3]:
persist_directory='chromadb'
os.makedirs(persist_directory, exist_ok=True) 

In [4]:
client = Client(Settings(persist_directory=persist_directory))

# **Load the files from the github repo and add the data from the files into chromadb collection**

In [5]:
def load_data_from_github_json(github_repo_url, collection_type, client):
    # Dictionary of files based on collection type
    json_files = {
        "HF": ["rag_with_HF_1.json", "rag_with_HF_2.json", "rag_with_HF_3.json"],
        "w2v": ["rag_with_w2v_1.json", "rag_with_w2v_2.json", "rag_with_w2v_3.json"]
    }

    # Ensure the collection type is valid
    if collection_type not in json_files:
        print("Invalid collection type. Please choose either 'HF' or 'w2v'.")
        return

    # Loop through each file and load its data
    for filename in json_files[collection_type]:
        file_url = f"{github_repo_url}/raw/main/{filename}"  # Adjust branch name if necessary
        response = requests.get(file_url)

        if response.status_code == 200:
            data = response.json()

            # Extract the data arrays
            documents = data.get('documents', [])
            embeddings = data.get('embeddings', [])
            metadatas = data.get('metadatas', [])
            ids = data.get('ids', [])

            collection_name = f"rag_with_{collection_type}"
            collection = client.get_or_create_collection(name=collection_name)

            for doc, embedding, metadata, doc_id in zip(documents, embeddings, metadatas, ids):
                collection.add(
                    documents=[doc],
                    embeddings=[embedding],  # embeddings should be a list of lists
                    metadatas=[metadata],
                    ids=[doc_id]
                )

In [6]:
load_data_from_github_json("https://github.com/FarahSaleh121/chroma_collections", "HF",client)

: 

In [9]:
load_data_from_github_json("https://github.com/FarahSaleh121/chroma_collections", "w2v",client)

# **Query**

Embedding model for embedding the query

In [10]:
# 1. Hugging Face Embedding
hf_embedding_function = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

  hf_embedding_function = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [11]:
# 2. Word2Vec Embedding
with open('/content/drive/My Drive/w2v_RAG.pkl', 'rb') as file:
    word2vec_model = pickle.load(file)

In [12]:
# 3. TF-IDF Vectorizer
with open('/content/drive/My Drive/tfidf_vectorizer_RAG.pkl', 'rb') as file:
    tfidf_vectorizer = pickle.load(file)

functions to query the collections

In [13]:
def calculate_cosine_similarity(query_embedding, document_embeddings):
    query_embedding = np.array(query_embedding)  # Convert to NumPy array

    # Ensure that the query and document embeddings are in the correct shape
    query_embedding = query_embedding.reshape(1, -1)  # Reshape to 2D array for cosine similarity
    return cosine_similarity(query_embedding, document_embeddings).flatten()  # Flatten to get a 1D array of similaritiess

In [15]:
def query_and_compare_collections(query_text):
    # Get the query embedding from the Hugging Face embedding function
    query_embedding_hf = hf_embedding_function.embed_query(query_text)
    tokens = query_text.split()

    # Calculate Word2Vec embedding by averaging token embeddings
    token_embeddings = [word2vec_model[token] for token in tokens if token in word2vec_model.key_to_index]
    if token_embeddings:  # Check if there are valid tokens
        query_embedding_w2v = np.mean(token_embeddings, axis=0)
    else:
        print("No valid tokens found in Word2Vec model.")
        return

    # Query the Hugging Face collection
    hf_collection_name = "rag_with_HF"
    hf_collection = client.get_collection(name=hf_collection_name)
    hf_results = hf_collection.query(
        query_embeddings=[query_embedding_hf],
        n_results=5,  # Adjust number of results as needed
        include=["embeddings", 'documents']
    )

    # Query the Word2Vec collection
    w2v_collection_name = "rag_with_w2v"
    w2v_collection = client.get_collection(name=w2v_collection_name)
    w2v_results = w2v_collection.query(
        query_embeddings=[query_embedding_w2v],
        n_results=5,  # Adjust number of results as needed
        include=["embeddings", 'documents']
    )

    # Calculate cosine similarity for Hugging Face results
    hf_embeddings = hf_results['embeddings'][0]  # Get the first (and only) list of embeddings
    hf_similarities = calculate_cosine_similarity(query_embedding_hf, hf_embeddings)

    # Calculate cosine similarity for Word2Vec results
    w2v_embeddings = w2v_results['embeddings'][0]  # Get the first (and only) list of embeddings
    w2v_similarities = calculate_cosine_similarity(query_embedding_w2v, w2v_embeddings)

    # Print results for Hugging Face
    print("Hugging Face Query Results:")
    for i, (document, similarity) in enumerate(zip(hf_results['documents'][0], hf_similarities)):  # Accessing the first list of documents
        print(f"Result {i + 1}: Cosine Similarity: {similarity}")

    # Print results for Word2Vec
    print("\nWord2Vec Query Results:")
    for i, (document, similarity) in enumerate(zip(w2v_results['documents'][0], w2v_similarities)):  # Accessing the first list of documents
        print(f"Result {i + 1}:, Cosine Similarity: {similarity}")

In [16]:
query_and_compare_collections("human rights")

Hugging Face Query Results:
Result 1: Cosine Similarity: 0.7330039622868058
Result 2: Cosine Similarity: 0.7264168216579585
Result 3: Cosine Similarity: 0.7212332011653644
Result 4: Cosine Similarity: 0.7190080547294045
Result 5: Cosine Similarity: 0.7146716006066723

Word2Vec Query Results:
Result 1:, Cosine Similarity: 0.5968154895767868
Result 2:, Cosine Similarity: 0.5783686264709111
Result 3:, Cosine Similarity: 0.5665472088086554
Result 4:, Cosine Similarity: 0.5683426622006983
Result 5:, Cosine Similarity: 0.5556012270068937


# **Retrieval**

similarity score with a threshold of 0.55.

Diversity of results returned by Maximal Marginal Relevance (MMR) is set to 0.25.

In [17]:
def initial_query(collection, query_embedding, k=10):
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=k,
        include=["documents", "embeddings"]
    )
    # Return list of results
    return results['documents'][0], results['embeddings'][0]

In [18]:
def filter_by_similarity(documents, embeddings, query_embedding, threshold=0.7):

    document_embeddings = np.array(embeddings)  # Convert embeddings list to NumPy array
    if document_embeddings.ndim == 3:
        # Reshape to (10, 768)
        document_embeddings = document_embeddings.reshape(document_embeddings.shape[1], -1)

    # Calculate cosine similarities
    similarities = calculate_cosine_similarity(query_embedding, document_embeddings)

    # Filter documents based on threshold
    filtered_documents = []
    for doc, similarity, embedding in zip(documents, similarities, document_embeddings):
        if similarity >= threshold:
            filtered_documents.append((doc, similarity, embedding))  # Include embedding in the tuple

    return filtered_documents

In [19]:
# Apply MMR for diversity
def apply_mmr(filtered_results, query_embedding, k, lambda_mult):
    selected_results = []

    # Convert query_embedding into a 1D array
    query_embedding = np.array(query_embedding).flatten() #shape(768,)

    for _ in range(k):
        if not filtered_results:  # Check if there are any results left to select
            break

        # Calculate the scores for each candidate in the filtered results
        candidate_scores = []
        for candidate in filtered_results:
            candidate_document = candidate[0]
            candidate_embedding = candidate[2]

            # Ensure the embedding is a NumPy array and flatten it
            candidate_embedding = np.array(candidate_embedding).flatten()
            # print(candidate_embedding.shape) #(768)(ddebugging)
            # Compute the mmr score based on the similarity to the query and the selected results
            score = lambda_mult * (1 - cosine(query_embedding, candidate_embedding)) - \
                    (1 - lambda_mult) * min(
                        [cosine(candidate_embedding, sel[2].flatten()) for sel in selected_results] or [1]
                    )
            candidate_scores.append((candidate, score))

        # Select the candidate with the highest score
        selected_candidate, _ = max(candidate_scores, key=lambda x: x[1])
        selected_results.append(selected_candidate)

        # Remove the selected candidate from the filtered results
        filtered_results.remove(selected_candidate)

    return selected_results

In [20]:
# Parameters
k = 4
lambda_mult = 0.25

In [200]:
query_text = "According to the ASEAN Human Rights Declaration, what is the primary responsibility of ASEAN Member States in promoting and protecting human rights?"
query_embedding = hf_embedding_function.embed_query(query_text)

In [22]:
query_embedding = np.array(query_embedding)

if query_embedding.ndim > 1:
    query_embedding = query_embedding.flatten()

In [None]:
# print(query_embedding.shape)

In [23]:
# Execute the query, filter, and apply MMR
collection_name = "rag_with_HF"
# collection = client.get_collection(name=collection_name)
# initial_results_documents, initial_results_embeddings = initial_query(collection, query_embedding, k=10)
# filtered_results = filter_by_similarity(initial_results_documents, initial_results_embeddings, query_embedding)

In [213]:
def retrieve_final_results(client, collection_name, query_embedding, k, lambda_mult, similarity_threshold=0.7):
    # Retrieve the collection
    collection = client.get_collection(name=collection_name)

    # Execute the initial query
    initial_results_documents, initial_results_embeddings = initial_query(collection, query_embedding, k)
    # Filter the results based on similarity
    filtered_results = filter_by_similarity(initial_results_documents, initial_results_embeddings, query_embedding, similarity_threshold)
    # Apply MMR for diversity
    final_results = apply_mmr(filtered_results, query_embedding, k, lambda_mult)

    return final_results

In [None]:
# # Print all filtered results
# print("Filtered documents before MMR:")
# for doc, similarity,embed in filtered_results:
#     print(f"Document Similarity: {similarity}")

Filtered documents before MMR:
Document Similarity: 0.7330039622868058
Document Similarity: 0.7264168216579585
Document Similarity: 0.7212332011653644
Document Similarity: 0.7190080547294045
Document Similarity: 0.7146716006066723
Document Similarity: 0.7094282959060765
Document Similarity: 0.7045538977612839


In [214]:
final_results = retrieve_final_results(client,collection_name, query_embedding, k, lambda_mult)

In [26]:
print("Filtered documents after MMR:")
for i, (result, similarity, embed) in enumerate(final_results, 1):
    # Check if result is a dictionary before accessing 'document'
    if isinstance(result, dict) and 'document' in result:
        print(f"Similarity Score: {similarity:.2f}\n")
    else:
        # Handle the case where result is not a dictionary or doesn't have 'document' key
        print(f"Similarity Score: {similarity:.2f}\n")

Filtered documents after MMR:
Similarity Score: 0.73

Similarity Score: 0.72

Similarity Score: 0.73

Similarity Score: 0.72



# **Prompts**


In [None]:
#tg_client = Together(api_key='b49d2dd93bb8afc46de91bb8c6d7d5827bedd471d414df36dd2cbcb7bb0aa786')

In [37]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



In [97]:
# Define the structured summarization prompt
summarization_prompt = """
Summarize the following document based on the question in a structured format:
Question: "{question}"
Document: "{document}"
"""

In [30]:
# Define a template for handling unknown information
unknown_info_template = """
Response to the Query:

Thank you for your question! However, it appears that the information you've provided falls outside the scope of my training and knowledge base. Here are some suggestions on how you can approach this situation:

1. Contextual Clarification:
   - Please provide additional context or clarify your question. This might help me assist you better.

2. General Guidance:
   - While I may not have specific information on that topic, I can offer general advice or direct you to reliable sources.

3. Further Exploration:
   - Consider exploring reputable websites, academic journals, or subject-matter experts who specialize in the area you’re inquiring about.

4. Related Topics:
   - If you're interested, I can provide information or summaries related to similar topics that are within my knowledge base.

If you have other questions or topics you'd like to discuss, feel free to ask!
"""


In [107]:
# Define the response template for formatting the aggregated output
response_template = """
1. Introduction:
   {introduction}

2. Key Principles:
   {key_principles}

3. Rights Affirmed:
   {rights_affirmed}

4. Conclusion:
   {conclusion}
"""

In [161]:
def aggregate_summaries(summaries):
    num_summaries = len(summaries)

    if num_summaries == 0:
        return "No summaries available to aggregate."

    # Logical grouping of summaries into template sections
    introduction = " ".join(summaries[:1])  # First summary for Introduction
    key_principles = " ".join(summaries[1:num_summaries // 2])  # First half of remaining for Key Principles
    rights_affirmed = " ".join(summaries[num_summaries // 2:-1])  # Second half of remaining for Rights Affirmed
    conclusion = " ".join(summaries[-1:])  # Last summary for Conclusion

    # Format the aggregated sections with the response template
    formatted_response = response_template.format(
        introduction=introduction,
        key_principles=key_principles,
        rights_affirmed=rights_affirmed,
        conclusion=conclusion
    )

    return formatted_response

In [130]:
# Function to summarize the document using the Together client
def summarize_with_together_api(tg_client, question, document, max_tokens=4097):
    # Prepare prompt from the summarization template
    prompt = summarization_prompt.format(question=question, document=document)

    # Calculate the token limit for the document after accounting for the prompt overhead
    prompt_tokens = tokenizer.encode(prompt, return_tensors="pt")
    max_document_tokens = max_tokens - len(prompt_tokens[0]) - 1  # Reserve room for response tokens

    # Truncate document if necessary
    document_tokens = tokenizer.encode(document, return_tensors="pt")
    if len(document_tokens[0]) > max_document_tokens:
        truncated_document = tokenizer.decode(document_tokens[0][:max_document_tokens])
        prompt = summarization_prompt.format(question=question, document=truncated_document)

    try:
        # API call with the prompt
        response = tg_client.chat.completions.create(
            model="Gryphe/MythoMax-L2-13b-Lite",
            messages=[{"role": "user", "content": prompt}]
        )

        # Return the summary if response is valid
        if response and response.choices:
            return response.choices[0].message.content.strip()
        else:
            return "No summary available from the model."

    except Exception as e:
        return f"Error while calling the Together API: {str(e)}"

In [195]:
def summarize_documents(tg_client, question, documents, max_tokens=4097):
    if not documents:
        # If documents are empty
        return None

    summaries = []
    for document in documents:
        summary = summarize_with_together_api(tg_client, question, document, max_tokens=4097)

        # Append a formatted summary or placeholder if no summary was generated
        summaries.append(summary if summary else "No summary available for this document.")

    return summaries

In [207]:
def main(client, tg_client, collection_name, query_text, k, lambda_mult):
    # Prepare the query embedding
    query_embedding = hf_embedding_function.embed_query(query_text)
    query_embedding = np.array(query_embedding).flatten()  # Ensure 1D array

    # Retrieve results
    final_results = retrieve_final_results(client, collection_name, query_embedding, k, lambda_mult)
    # print(final_results)
    summaries_ret = summarize_documents(tg_client, query_text, [doc[0] for doc in final_results], max_tokens=4097)
    # print(summaries_ret)
    # Print each summary directly or use the unknown template as fallback
    if isinstance(summaries_ret, list) and all (summaries_ret) :
         aggregated_response = aggregate_summaries(summaries_ret)
         print(aggregated_response)

    else:
        print(unknown_info_template.strip())

In [168]:
collection_name = "rag_with_HF"
k = 5  # Number of results to retrieve
lambda_mult = 0.25
max_tokens=4097

In [208]:
main(client, tg_client, collection_name, query_text, k, lambda_mult)

Response to the Query:

Thank you for your question! However, it appears that the information you've provided falls outside the scope of my training and knowledge base. Here are some suggestions on how you can approach this situation:

1. Contextual Clarification:
   - Please provide additional context or clarify your question. This might help me assist you better.

2. General Guidance:
   - While I may not have specific information on that topic, I can offer general advice or direct you to reliable sources.

3. Further Exploration:
   - Consider exploring reputable websites, academic journals, or subject-matter experts who specialize in the area you’re inquiring about.

4. Related Topics:
   - If you're interested, I can provide information or summaries related to similar topics that are within my knowledge base.

If you have other questions or topics you'd like to discuss, feel free to ask!


# **QA caching system**

Set up Firestore

In [112]:
# Initialize Firebase Admin SDK
cred = credentials.Certificate("/content/drive/MyDrive/firebaseCred.json")
firebase_admin.initialize_app(cred)

<firebase_admin.App at 0x790ce9b52380>

In [113]:
db = firestore.client()

In [114]:
# Define Firestore collection name
fb_collection_name = 'QACache'

Function to Store Queries and Answers

In [123]:
def store_query_answer(query, answer):
    # Store the query and answer in Firestore
    db.collection('QACache').document(query).set({
        'answer': answer
    })

Function to Retrieve Answer

In [156]:
# Function to check and store query results in Firestore
def check_store_in_firestore(query_text, answer):
    doc_ref = db.collection("QACache").document(query_text)

    # Check if the query already exists
    doc = doc_ref.get()
    if doc.exists:
        # print(doc.to_dict())
        return doc.to_dict()["answer"]  # Return the stored response

    # If not, store the new query and response
    doc_ref.set({"answer": answer})
    return answer

In [172]:
# Function to handle queries with summarization and aggregation
def handle_query(query_text, tg_client, client, collection_name, query_embedding, k=10, lambda_mult=0.25):
    # Check if query exists in Firestore
    cached_response = check_store_in_firestore(query_text, None)
    if cached_response:
        res = aggregate_summaries(cached_response)
        return res  # Return the cached response if found

    # Proceed to retrieve results if not cached
    final_results = retrieve_final_results(client, collection_name, query_embedding, k, lambda_mult)

    if final_results:
        # Summarize documents
        summaries = summarize_documents(tg_client, query_text,  [doc[0] for doc in final_results])

        # Aggregate summaries into the structured response template
        response = aggregate_summaries(summaries)

        # Store the new query and response in Firestore
        check_store_in_firestore(query_text, response)

        print(response)

    else:
        print(unknown_info_template.strip())

In [215]:
handle_query(query_text, tg_client,client,collection_name,query_embedding)

Response to the Query:

Thank you for your question! However, it appears that the information you've provided falls outside the scope of my training and knowledge base. Here are some suggestions on how you can approach this situation:

1. Contextual Clarification:
   - Please provide additional context or clarify your question. This might help me assist you better.

2. General Guidance:
   - While I may not have specific information on that topic, I can offer general advice or direct you to reliable sources.

3. Further Exploration:
   - Consider exploring reputable websites, academic journals, or subject-matter experts who specialize in the area you’re inquiring about.

4. Related Topics:
   - If you're interested, I can provide information or summaries related to similar topics that are within my knowledge base.

If you have other questions or topics you'd like to discuss, feel free to ask!


In [176]:
# Function to retrieve and display data from Firestore
def display_data_from_firestore(collection_name):
    # Retrieve the collection
    collection_ref = db.collection(collection_name)
    docs = collection_ref.stream()  # Retrieve all documents in the collection

    # Display each document
    for doc in docs:
        doc_data = doc.to_dict()  # Convert document data to a dictionary
        query_text = doc.id  # Get the document ID (query text)
        answer = doc_data.get("answer", "No answer found")  # Retrieve the answer field

        # Display the data
        print(f"Query: {query_text}\nAnswer: {answer}\n{'-'*40}")

In [216]:
display_data_from_firestore("QACache")

Query: According to the ASEAN Human Rights Declaration, what is the primary responsibility of ASEAN Member States in promoting and protecting human rights?
Answer: None
----------------------------------------
Query: Freedom
Answer: None
----------------------------------------
Query: Freedom of Religion and Belief
Answer: None
----------------------------------------
Query: hi
Answer: None
----------------------------------------
Query: human
Answer: None
----------------------------------------
Query: human rights
Answer: ['The Human Rights Library at the University of Minnesota provides access to over 65,000 human rights documents, including treaties, international instruments, and regional materials. It offers topic guides, fellowship information, and training opportunities. The library also features links to over 4,000 other human rights sites and search engines for efficient document search. It has received numerous awards and honors for its work in promoting human rights educati