In [1]:
from pprint import pprint
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from dotenv import load_dotenv
import os
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_qdrant import FastEmbedSparse
from qdrant_client.models import  SparseVector
from qdrant_client import QdrantClient, models
from langchain import PromptTemplate    
from langchain.schema import BaseMemory
from langchain.memory import ChatMessageHistory
import numpy as np
import pandas as pd
import time
import csv
import time
import os


In [2]:
# Load environment variables from .env file
load_dotenv()

# Access OpenAI API Key
openai_api_key = os.getenv("OPENAI_API_KEY")

# Debug: Check if the API key is being correctly loaded
if openai_api_key is None or openai_api_key == "":
    print("OpenAI API Key not found! Please make sure it is set correctly in the .env file.")
else:
    print(f"OpenAI API Key Loaded: {openai_api_key[:4]}...")  # Only print part of the key for security reasons


llmName = ChatOpenAI(model="gpt-4o")
Chat_llm = ChatOpenAI(model="gpt-4o", temperature=1)



dense_embedding_model = OpenAIEmbeddings(
                api_key=openai_api_key,
                model="text-embedding-3-large"
            )
sparse_embedding_model = FastEmbedSparse(model_name="Qdrant/bm25")

OpenAI API Key Loaded: sk-p...


Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [None]:
# Link to the Qdrant database & defines name for the collections
Clientpath = "VectorDB/CombinedData"
client = QdrantClient(path=Clientpath)
merged_collection_name = "merged vectors collection"

In [None]:
# Define the function for sparse and dense retrieval


def merged_dense_for_RRF(user_question, limitN):
    dense_query_vector =  dense_embedding_model.embed_query(user_question)
    merged_dense = client.query_points(
        merged_collection_name,
        query=dense_query_vector,
        using="text-embedding-3-large",
        limit=limitN,
        with_payload=True, 
    )

    return merged_dense

def merged_sparse_for_RRF(user_question, limitN):
    sparse_query_vector = sparse_embedding_model.embed_query(user_question)
    merged_sparse = client.query_points(
        collection_name=merged_collection_name,
        query=SparseVector(indices=sparse_query_vector.indices, values=sparse_query_vector.values),  
        using="bm25",
        with_payload=True,  
        limit=limitN  
    )
    
    return merged_sparse



In [None]:
# RRF calculation for ids
def reciprocal_rank_fusion(results1, results2, k=60):
    """Combine results using Reciprocal Rank Fusion (RRF)."""
    rrf_scores = {}

    # Process results
    for rank, scored_point in enumerate(results1.points, start=1):  
        doc_id = scored_point.id
        rrf_scores[doc_id] = rrf_scores.get(doc_id, 0) + 1 / (k + rank)

    for rank, scored_point in enumerate(results2.points, start=1):  
        doc_id = scored_point.id
        rrf_scores[doc_id] = rrf_scores.get(doc_id, 0) + 1 / (k + rank)

    # Sort by the combined RRF score
    sorted_results = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)

    return sorted_results



In [None]:
# From id back to list
def Fetch_chunks_back(fused_results):    
    fused_ids = [doc_id for doc_id, _ in fused_results]


    detailed_results = client.retrieve(
        collection_name=merged_collection_name,
        ids=fused_ids,
        with_payload=True, 
    )

    detailed_results_with_scores = []
    for result in detailed_results:

        doc_id = result.id
        score = next((s for d, s in fused_results if d == doc_id), 0)
        detailed_results_with_scores.append({"ID":result.id, "rrf_score": score, "payload": result.payload} )
    return detailed_results_with_scores


In [None]:
# Prompt template for rephrasing
updated_rephrase_question = PromptTemplate(template= """
You are tasked with rephrasing a user query for dense and sparse search on ISO and JRC documents. 
A good query should maximize precision and recall by focusing on relevant terms, avoiding conversational or reasoning context, and leveraging domain-specific language. 
Your goal is to create a concise, precise, and search-optimized query that retrieves relevant content effectively.



**User Query**:
{user_query}

**Instructions**:
1. Retain the meaning, context, and intent of the original query while simplifying it. Do not reduce it to a list of disconnected keywords.
2. Identify critical keywords or concepts in the user query (e.g., "storage," "ISO compliance," "capacity planning") and use them to construct a concise, structured query.
3. Write the rephrased query in the form of meaningful, concise phrases or sentences. Avoid generating disconnected terms or generic lists.
4. Expand the query with synonyms and related terms relevant to the domain (e.g., "storage systems," "capacity planning," "data protection") to improve hybrid search recall.
5. Tailor the query to support both dense (semantic) and sparse (keyword-based) retrieval by including specific phrases that enhance matching.
6. Avoid over-simplification or redundancy. Ensure the rephrased query is meaningful and aligned with the original user query.

**Example Rephrasing**:
User Query: "What are the best practices and ISO guidelines for my datacenter storage management in a new facility?"
Bad Output: "Datacenter storage, ISO compliance, best practices."
Good Output: "Best practices and ISO guidelines for datacenter storage management in new facilities."
If ISO or JRC are not mentioned do not mention it either

**Search Query**:                                 
Please provide a concise and precise search query based on the above instructions.                                   
                                   
         
                                   """)

# Function to rephrase the user query with template
def updated_rephase_query(user_query):
   new_quest = updated_rephrase_question.format(user_query=user_query)
   new_query = Chat_llm.invoke(new_quest)
    
   return new_query

In [None]:
# Remake the query for multiple iterations
def updated_query_remake(prompt, iterations):
    all_repharased_data = []
    
    for i in range(iterations):
        new_query = updated_rephase_query(prompt)
        all_repharased_data.append(new_query.content)
        
        
        
    return all_repharased_data


In [None]:
# Get retrieved data for multiple iterations at n_chunks for original query
def query_original_experiment(prompt, iterations, n_chunks):
    all_repharased_data = []
    
    for i in range(iterations):
        dense = merged_dense_for_RRF(prompt, n_chunks)
        sparse = merged_sparse_for_RRF(prompt, n_chunks)
        
        fused_results = reciprocal_rank_fusion(dense, sparse, k=60)
        nodes_rrf_back = Fetch_chunks_back(fused_results)
        
        data = (i+1, prompt, dense, sparse, nodes_rrf_back[:n_chunks])

        all_repharased_data.append(data)
        
        
        
    return all_repharased_data

In [None]:
# Get retrieved data for multiple iterations at n_chunks where query is rephrased each time
def query_remake_experiment(prompt, iterations, n_chunks):
    all_repharased_data = []
    
    for i in range(iterations):
        new_query = updated_rephase_query(prompt)
        dense = merged_dense_for_RRF(new_query.content, n_chunks)
        sparse = merged_sparse_for_RRF(new_query.content, n_chunks)
        
        fused_results = reciprocal_rank_fusion(dense, sparse, k=60)
        nodes_rrf_back = Fetch_chunks_back(fused_results)
        
        data = (i+1, new_query.content, dense, sparse, nodes_rrf_back[:n_chunks])
        all_repharased_data.append(data)
        
        
        
    return all_repharased_data


In [None]:
# Sort ids by combined, sparse and combined for each query

def ids_per_query(queries_its):

    dense_query_ids = {}
    sparse_query_ids = {}
    combined_query_ids = {}
    
    for i, query, dense, sparse, combined in queries_its:
        dense_ids = []  
        sparse_ids = []
        combined_ids = []
        
        for x in dense:
            for y in x[1]:  
                dense_ids.append(y.id)  
        dense_query_ids[query] = dense_ids  
        
        for x in sparse:
            for y in x[1]:  
                sparse_ids.append(y.id)
        sparse_query_ids[query] = sparse_ids
        
        for x in combined:
            combined_ids.append(x["ID"])
        combined_query_ids[query] = combined_ids
    
    return dense_query_ids, sparse_query_ids, combined_query_ids



In [None]:
# Matching ids to relevant ids
def matches_per_query(query_ids, correct_ids):


    matches_per_query = {}
    for query, ids in query_ids.items():
        matches = [id_ for id_ in ids if id_ in correct_ids]
        matches_per_query[query] = matches

    return matches_per_query



In [None]:
# Precision, recall, f1 calculation
def precision_recall_f1(ids, correct_ids):
    matchesquery = matches_per_query(ids, correct_ids)
    precision = {}
    recall = {}
    f1 = {}
    
    total_count = sum(len(value) for value in ids.values())/len(ids)
    
    for query, matches in matchesquery.items():
        true_positives = len(matches)
        print("true_positives",true_positives)
        false_positives = total_count - true_positives
        print("false_positives:",false_positives)
        print("correct ids len:",len(correct_ids))
        false_negatives = len(correct_ids) - true_positives
        print("false_negatives:",false_negatives)
        
        if true_positives == 0:
            precision[query] = 0
            recall[query] = 0
            f1[query] = 0
        
        else:
            precision[query] = true_positives / (true_positives + false_positives)
            print("precision:",precision[query])
            recall[query] = true_positives / (true_positives + false_negatives)
            print("recall:",recall[query])
            f1[query] = 2 * (precision[query] * recall[query]) / (precision[query] + recall[query])
            

    
    return precision, recall, f1

In [None]:
# Get top n amount of ids for each query
def k_ids_per_query(queryandid, k):
    k_ids = {}
    
    for query, id_list in queryandid.items():
        k_ids[query] = id_list[:k]
    
    return k_ids



In [None]:
# Calculate average precision, recall, f1
def precision_recall_f1_average(precision, recall, f1):
    avg_precision = sum(precision.values()) / len(precision)
    avg_recall = sum(recall.values()) / len(recall)
    avg_f1 = sum(f1.values()) / len(f1)
    
    return avg_precision, avg_recall, avg_f1

In [None]:
# Calculate precision, recall, f1 
def create_precision_recall_f1_at_n_for_all(ids, correct_ids, nlimit):
    k_ids = k_ids_per_query(ids, nlimit)
    precision, recall, f1 = precision_recall_f1(k_ids, correct_ids)
    print(precision, recall, f1)
    
    return precision, recall, f1, k_ids

In [None]:
# Combine all stats for query at n
def stat_for_n_all(ids, correct_ids, n_values):
    all_stats = {} 

    for n in n_values:
        all_stats[f"at{n}"] = create_precision_recall_f1_at_n_for_all(ids, correct_ids, n)

    return all_stats

    
    

In [None]:
# Create dataframe for recall, precision, f1 for each query and n combination
def return_df_recalltc_from_query(queries_its, correct_ids):
    n_values = [5, 10, 15, 20, 25, 30]
    ids_dense, ids_sparse, ids_combined = ids_per_query(queries_its)
    
    rows = []  # Store rows of data for the DataFrame
    
    for n in n_values:
        # Get top-n IDs for each method
        k_dense = k_ids_per_query(ids_dense, n)
        k_sparse = k_ids_per_query(ids_sparse, n)
        k_combined = k_ids_per_query(ids_combined, n)
        
        # Find matches for each method
        correct_ids_dense_at_n = matches_per_query(k_dense, correct_ids)
        print(n, correct_ids_dense_at_n)
        correct_ids_sparse_at_n = matches_per_query(k_sparse, correct_ids)
        correct_ids_combined_at_n = matches_per_query(k_combined, correct_ids)
        
        # Calculate precision, recall, and F1 for each method
        precision_dense, recall_dense, f1_dense = precision_recall_f1(k_dense, correct_ids)
        precision_sparse, recall_sparse, f1_sparse = precision_recall_f1(k_sparse, correct_ids)
        precision_combined, recall_combined, f1_combined = precision_recall_f1(k_combined, correct_ids)
        
        # Iterate over queries and append rows for each
        for query, dense_precision in precision_dense.items():
            rows.append({
                "query_id": query,
                "@n".format(n): f"n={n}",
                "precision_dense": dense_precision,
                "recall_dense": recall_dense.get(query, 0),
                "f1_dense": f1_dense.get(query, 0),
                "precision_sparse": precision_sparse.get(query, 0),
                "recall_sparse": recall_sparse.get(query, 0),
                "f1_sparse": f1_sparse.get(query, 0),
                "precision_combined": precision_combined.get(query, 0),
                "recall_combined": recall_combined.get(query, 0),
                "f1_combined": f1_combined.get(query, 0),
                # Include IDs used and correct IDs for each method
                "k_dense": k_dense.get(query, []),
                "correct_dense_ids": correct_ids_dense_at_n.get(query, []),
                "k_sparse": k_sparse.get(query, []),
                "correct_sparse_ids": correct_ids_sparse_at_n.get(query, []),
                "k_combined": k_combined.get(query, []),
                "correct_combined_ids": correct_ids_combined_at_n.get(query, []),
            })
    
    # Convert rows to a DataFrame
    df = pd.DataFrame(rows)
    return df


In [None]:

# List of can and should ids for f1, recall, precision calculation
ids_non_list2 = """
03484b73-af2f-4421-ad12-647933bb7b27
0dcad8a0-32fb-446d-b391-aba911b0c299
12636a3d-2a8f-4bfd-b6c9-61bc49367856
5341e7a4-9019-48ae-a2e4-cf5f41513258
6d558621-12df-40a2-8ab8-0536d1e7e831
76bb4ed2-cdc3-4bcc-a655-3a5096d9b567
7eb7238e-ec78-49ef-a3a1-d052faf63db4
8f206e3a-2233-4dcd-b8fd-3320b3b00535
a3d0d0e9-121f-4d8a-9bb8-74d2852a3ead
a592b230-f943-4d3d-a572-a9e9403c17fd
aa7fc6ef-3841-4609-aff0-3da4a3b4d342
b316e555-0a6f-4b6b-b63d-fbbda9e8a2ab
ba00fe1c-b83b-4e03-80b9-f0e4da3b5795
d186607e-d0db-4cab-9207-c04a5e977eb0
d8cba0f6-388a-449d-9451-5e90e8b31d28
ea51e76a-fd58-41ad-b284-322b4897055a
f0f11f5d-0599-4f89-9b6a-db182c3eef88
f77eb48b-6c60-4354-ba3f-a19c46745981
ff370be9-fd3f-43f8-a39f-af28c457f78b
1cab0008-a4c1-4a98-ab28-21fd9f9dda92
6fb75b07-f23d-4976-8d44-a9a4367e8156
608052d1-f563-4c95-8552-856d4a412c9c
67077be6-5a39-4b97-abd9-7c9e0e6f9e9b
8ee90780-61b2-4209-940f-943503d5ab4d
92063286-504f-4330-82ca-eccc8970c81d
b04c4f07-0a41-4ac7-b011-ff9002577342
b2867562-65f0-4ece-85bb-d41956243a66
c8830b8c-f2af-4970-8b23-29fd0bd2edcf
d9c2df6e-eaa4-4631-899e-d492a1f24245
da5889a5-e947-432a-a6f2-761fff918dbf
de2c7621-5139-4c1d-ba17-5b3d6c42422a
bb99c695-02ea-44aa-8673-63341ccc5c96
11ee0f48-baf6-4479-a40f-8867e3263a08
184dd353-cfd3-4f4f-b9cf-c11cdd77eeb4
2f195243-6e83-4df9-8a3c-50d749ce9903
7d35e6a3-3b9f-4b3a-a7a1-1beba469e29c
a71645ab-5c82-4cbd-bb5f-17c98bb9dfa6
add9239b-fe02-4ccd-bc9a-6213f39b5e2d
ba2fb31b-0f08-4c22-a523-53f8b0daeecf
d020d8fc-1c8b-4e45-97f4-b935ae2135cc

"""

# Convert the text into a list
correct_ids2 = ids_non_list2.strip().split("\n")

# Output the list
print(correct_ids2[:5])
print (len(correct_ids2))

['03484b73-af2f-4421-ad12-647933bb7b27', '0dcad8a0-32fb-446d-b391-aba911b0c299', '12636a3d-2a8f-4bfd-b6c9-61bc49367856', '5341e7a4-9019-48ae-a2e4-cf5f41513258', '6d558621-12df-40a2-8ab8-0536d1e7e831']
40


In [None]:
# List of should ids for f1, recall, precision calculation
ids_non_list = """
03484b73-af2f-4421-ad12-647933bb7b27
0dcad8a0-32fb-446d-b391-aba911b0c299
12636a3d-2a8f-4bfd-b6c9-61bc49367856
5341e7a4-9019-48ae-a2e4-cf5f41513258
6d558621-12df-40a2-8ab8-0536d1e7e831
76bb4ed2-cdc3-4bcc-a655-3a5096d9b567
7eb7238e-ec78-49ef-a3a1-d052faf63db4
8f206e3a-2233-4dcd-b8fd-3320b3b00535
a3d0d0e9-121f-4d8a-9bb8-74d2852a3ead
a592b230-f943-4d3d-a572-a9e9403c17fd
aa7fc6ef-3841-4609-aff0-3da4a3b4d342
b316e555-0a6f-4b6b-b63d-fbbda9e8a2ab
ba00fe1c-b83b-4e03-80b9-f0e4da3b5795
d186607e-d0db-4cab-9207-c04a5e977eb0
d8cba0f6-388a-449d-9451-5e90e8b31d28
ea51e76a-fd58-41ad-b284-322b4897055a
f0f11f5d-0599-4f89-9b6a-db182c3eef88
f77eb48b-6c60-4354-ba3f-a19c46745981
ff370be9-fd3f-43f8-a39f-af28c457f78b
1cab0008-a4c1-4a98-ab28-21fd9f9dda92
6fb75b07-f23d-4976-8d44-a9a4367e8156
"""

# Convert the text into a list
correct_ids = ids_non_list.strip().split("\n")

# Output the list
print(correct_ids[:5])
print (len(correct_ids))


['03484b73-af2f-4421-ad12-647933bb7b27', '0dcad8a0-32fb-446d-b391-aba911b0c299', '12636a3d-2a8f-4bfd-b6c9-61bc49367856', '5341e7a4-9019-48ae-a2e4-cf5f41513258', '6d558621-12df-40a2-8ab8-0536d1e7e831']
21


In [None]:
# Check all the ids
detailed_results = client.retrieve(
    collection_name=merged_collection_name,
    ids=correct_ids,
    with_payload=True,  
    )

print(len(detailed_results))
for x in detailed_results:
    print(x.id, x.payload.get("content", "No content found"))
    


21
03484b73-af2f-4421-ad12-647933bb7b27 A.9.1
Monitoring, measurement, analysis and evaluation for energy performance and the EnMS This clause involves implementation of the data collection plan (see 6.6) and evaluation of both energy performance improvement and effectiveness of the EnMS. Effectiveness of the EnMS can be demonstrated by improvement in energy performance and other intended outcomes. Energy performance improvement can be demonstrated by improvements in EnPI values over time, relative to the corresponding EnB. There can be situations where energy performance improvement is achieved from an activity that is not related to an SEU or key characteristic. In those instances, an EnPI and EnB can be established to demonstrate energy performance improvement. When conducting analysis, the limitations of the data (accuracy, precision, measurement uncertainty) and consistency of energy accounting should be taken into account before reaching final conclusions.
0dcad8a0-32fb-446d-b391

In [None]:
# Query for the experiment. If you want to save using the query as name, do not use '?'
query_test = "How can I monitor and report energy consumption trends over time"




In [None]:
# Run the experiment 

# Rephrased queries
# queries_its = query_remake_experiment(query_test, 5, 30)

# Original query only
queries_its = query_original_experiment(query_test, 1, 30)

In [1]:
# Make Df from results
should_be= return_df_recalltc_from_query(queries_its, correct_ids)

should_be


NameError: name 'return_df_recalltc_from_query' is not defined

In [None]:
print(should_be)

                                            query_id    @n  precision_dense  \
0  How can I monitor and report energy consumptio...   n=5         1.000000   
1  How can I monitor and report energy consumptio...  n=10         0.800000   
2  How can I monitor and report energy consumptio...  n=15         0.800000   
3  How can I monitor and report energy consumptio...  n=20         0.650000   
4  How can I monitor and report energy consumptio...  n=25         0.560000   
5  How can I monitor and report energy consumptio...  n=30         0.466667   

   recall_dense  f1_dense  precision_sparse  recall_sparse  f1_sparse  \
0      0.238095  0.384615              0.60       0.142857   0.230769   
1      0.380952  0.516129              0.40       0.190476   0.258065   
2      0.571429  0.666667              0.60       0.428571   0.500000   
3      0.619048  0.634146              0.50       0.476190   0.487805   
4      0.666667  0.608696              0.44       0.523810   0.478261   
5      0

In [None]:
can_should= return_df_recalltc_from_query(queries_its, correct_ids2)

can_should


5 {'How can I monitor and report energy consumption trends over time?': ['8f206e3a-2233-4dcd-b8fd-3320b3b00535', '7eb7238e-ec78-49ef-a3a1-d052faf63db4', '76bb4ed2-cdc3-4bcc-a655-3a5096d9b567', 'aa7fc6ef-3841-4609-aff0-3da4a3b4d342', '12636a3d-2a8f-4bfd-b6c9-61bc49367856']}
true_positives 5
false_positives: 0.0
correct ids len: 40
false_negatives: 35
precision: 1.0
recall: 0.125
true_positives 4
false_positives: 1.0
correct ids len: 40
false_negatives: 36
precision: 0.8
recall: 0.1
true_positives 4
false_positives: 1.0
correct ids len: 40
false_negatives: 36
precision: 0.8
recall: 0.1
10 {'How can I monitor and report energy consumption trends over time?': ['8f206e3a-2233-4dcd-b8fd-3320b3b00535', '7eb7238e-ec78-49ef-a3a1-d052faf63db4', '76bb4ed2-cdc3-4bcc-a655-3a5096d9b567', 'aa7fc6ef-3841-4609-aff0-3da4a3b4d342', '12636a3d-2a8f-4bfd-b6c9-61bc49367856', 'b316e555-0a6f-4b6b-b63d-fbbda9e8a2ab', '608052d1-f563-4c95-8552-856d4a412c9c', 'f0f11f5d-0599-4f89-9b6a-db182c3eef88', '5341e7a4-9019-

Unnamed: 0,query_id,@n,precision_dense,recall_dense,f1_dense,precision_sparse,recall_sparse,f1_sparse,precision_combined,recall_combined,f1_combined,k_dense,correct_dense_ids,k_sparse,correct_sparse_ids,k_combined,correct_combined_ids
0,How can I monitor and report energy consumptio...,n=5,1.0,0.125,0.222222,0.8,0.1,0.177778,0.8,0.1,0.177778,"[8f206e3a-2233-4dcd-b8fd-3320b3b00535, 7eb7238...","[8f206e3a-2233-4dcd-b8fd-3320b3b00535, 7eb7238...","[ea51e76a-fd58-41ad-b284-322b4897055a, fd9d500...","[ea51e76a-fd58-41ad-b284-322b4897055a, ff370be...","[7eb7238e-ec78-49ef-a3a1-d052faf63db4, fd9d500...","[7eb7238e-ec78-49ef-a3a1-d052faf63db4, 608052d..."
1,How can I monitor and report energy consumptio...,n=10,0.9,0.225,0.36,0.8,0.2,0.32,0.9,0.225,0.36,"[8f206e3a-2233-4dcd-b8fd-3320b3b00535, 7eb7238...","[8f206e3a-2233-4dcd-b8fd-3320b3b00535, 7eb7238...","[ea51e76a-fd58-41ad-b284-322b4897055a, fd9d500...","[ea51e76a-fd58-41ad-b284-322b4897055a, ff370be...","[7eb7238e-ec78-49ef-a3a1-d052faf63db4, fd9d500...","[7eb7238e-ec78-49ef-a3a1-d052faf63db4, 608052d..."
2,How can I monitor and report energy consumptio...,n=15,0.933333,0.35,0.509091,0.866667,0.325,0.472727,0.8,0.3,0.436364,"[8f206e3a-2233-4dcd-b8fd-3320b3b00535, 7eb7238...","[8f206e3a-2233-4dcd-b8fd-3320b3b00535, 7eb7238...","[ea51e76a-fd58-41ad-b284-322b4897055a, fd9d500...","[ea51e76a-fd58-41ad-b284-322b4897055a, ff370be...","[7eb7238e-ec78-49ef-a3a1-d052faf63db4, fd9d500...","[7eb7238e-ec78-49ef-a3a1-d052faf63db4, 608052d..."
3,How can I monitor and report energy consumptio...,n=20,0.8,0.4,0.533333,0.75,0.375,0.5,0.8,0.4,0.533333,"[8f206e3a-2233-4dcd-b8fd-3320b3b00535, 7eb7238...","[8f206e3a-2233-4dcd-b8fd-3320b3b00535, 7eb7238...","[ea51e76a-fd58-41ad-b284-322b4897055a, fd9d500...","[ea51e76a-fd58-41ad-b284-322b4897055a, ff370be...","[7eb7238e-ec78-49ef-a3a1-d052faf63db4, fd9d500...","[7eb7238e-ec78-49ef-a3a1-d052faf63db4, 608052d..."
4,How can I monitor and report energy consumptio...,n=25,0.76,0.475,0.584615,0.68,0.425,0.523077,0.84,0.525,0.646154,"[8f206e3a-2233-4dcd-b8fd-3320b3b00535, 7eb7238...","[8f206e3a-2233-4dcd-b8fd-3320b3b00535, 7eb7238...","[ea51e76a-fd58-41ad-b284-322b4897055a, fd9d500...","[ea51e76a-fd58-41ad-b284-322b4897055a, ff370be...","[7eb7238e-ec78-49ef-a3a1-d052faf63db4, fd9d500...","[7eb7238e-ec78-49ef-a3a1-d052faf63db4, 608052d..."
5,How can I monitor and report energy consumptio...,n=30,0.633333,0.475,0.542857,0.666667,0.5,0.571429,0.766667,0.575,0.657143,"[8f206e3a-2233-4dcd-b8fd-3320b3b00535, 7eb7238...","[8f206e3a-2233-4dcd-b8fd-3320b3b00535, 7eb7238...","[ea51e76a-fd58-41ad-b284-322b4897055a, fd9d500...","[ea51e76a-fd58-41ad-b284-322b4897055a, ff370be...","[7eb7238e-ec78-49ef-a3a1-d052faf63db4, fd9d500...","[7eb7238e-ec78-49ef-a3a1-d052faf63db4, 608052d..."


In [None]:
# Name for the csv files
should_be_name = f"NEW_ORIGINAL_o3_should_be_{query_test}.csv"


can_should_name = f"NEW_ORIGINAL_o3_can_be_and_should_be_{query_test}.csv"

In [None]:
# Save the csv files
should_be .to_csv(should_be_name)
can_should.to_csv(can_should_name)

In [None]:
# Next part is to check how long it takes to run rephrase queries

In [2]:
# Define queries for the experiment
queries_non_list = """

How can I monitor and report energy consumption trends over time?
What are the risks of retrofitting an older data center?
What are the first steps in designing a new energy-efficient data center?
How do I upgrade an existing data center for better energy efficiency
How can I reduce cooling costs without breaking compliance?
"""

# Convert the text into a list
queries_list = queries_non_list.strip().split("\n")

# Output the list
print(queries_list)
print (len(queries_list))

['How can I monitor and report energy consumption trends over time?', 'What are the risks of retrofitting an older data center?', 'What are the first steps in designing a new energy-efficient data center?', 'How do I upgrade an existing data center for better energy efficiency', 'How can I reduce cooling costs without breaking compliance?']
5


In [None]:
# Remake n amount of queries for the same query
def get_new_query_for_query(query, k):
    responses = []
    save_path = "remake_time.csv"  
    file_exists = os.path.isfile(save_path)

    # Open CSV file in append mode
    with open(save_path, mode="a", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        
        # Write header only if file does not exist
        if not file_exists:
            writer.writerow(["Query", "Response", "Context", "Time Taken"])  
        
        for i in range(k):
            start_time = time.time()
            new_query = updated_rephase_query(query)
            end_time = time.time()
            time_taken = end_time - start_time
            
            print("All_info")
            print(query, new_query, time_taken)
            
            # Save the response to CSV immediately
            writer.writerow([query, new_query, time_taken])
            
            responses.append((query, new_query, time_taken))
            
            time.sleep(5)  # Pause between iterations (Helps with API eventhough it should not be needed)

    return responses

In [None]:
# Iterate through the queries and get the rephrased queries
def list_of_query_remakes(queries, k):
    responses_all = {}
    
    
    for query in queries:
        response_new = get_new_query_for_query(query, k)
        if query not in responses_all:
            responses_all[query] = []  
        responses_all[query].extend(response_new)
        

    return responses_all

In [None]:
# Run the experiment
try:
    all_query_responses = list_of_query_remakes(queries_list, 5)
except Exception as e:
    print(f"Error occurred: {e}")

All_info
How can I monitor and report energy consumption trends over time? content='"Monitoring and reporting energy consumption trends over time; methods, metrics, analysis, and tools for tracking energy use patterns."' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 25, 'prompt_tokens': 386, 'total_tokens': 411, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_4691090a87', 'finish_reason': 'stop', 'logprobs': None} id='run-6a63141f-5f2f-4f9e-ad09-807ec43cf867-0' usage_metadata={'input_tokens': 386, 'output_tokens': 25, 'total_tokens': 411, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}} 1.9228088855743408
All_info
How can I monitor and report energy consumption trends

In [None]:
#put all data in DF (Though its also in the csv)
data = []
for query, responses in all_query_responses.items():
    for response, context, time_taken in responses:
        data.append((query, response, context, time_taken))
df = pd.DataFrame(data, columns=["Query", "Response", "Context", "Time Taken (s)"])

In [None]:
# Define new queries for testing retrieval/RRF time
list_queries = """

Monitoring and reporting energy consumption trends over time; methods, metrics, analysis, and tools for tracking energy use patterns.
Methods for monitoring and reporting energy consumption trends over time, including analysis and tracking tools.
Monitoring and reporting energy consumption trends over time: methods, tools, and data analysis techniques.
Methods to track and document energy consumption trends over time, including monitoring and reporting techniques.
Monitoring and reporting energy consumption trends over time; energy usage tracking, data analysis, trend identification, energy monitoring systems, consumption reporting methods.
'Risks associated with retrofitting older data centers, including potential impacts on infrastructure, energy efficiency, and operational reliability.
Risks of retrofitting older data centers: potential security vulnerabilities, energy efficiency challenges, infrastructure compatibility issues, and costs.
Risks and challenges of retrofitting aging data centers, including infrastructure updates, system compatibility, power and cooling efficiency, and data security.
Risks and challenges of retrofitting older data centers, including infrastructure, energy efficiency, and operational disruptions.
Risks and challenges of retrofitting older data centers, including infrastructure upgrades, operational continuity, and compatibility issues.
Initial steps in designing an energy-efficient data center, focusing on design strategies, energy-saving technologies, and sustainability practices.
Design initial steps for energy-efficient data center development
Designing a new energy-efficient data center initial steps: energy management, sustainable architecture, green technology integration, efficiency optimization.
Designing an energy-efficient data center: initial steps, energy optimization, green technology, sustainable facility planning, and power management.
Designing energy-efficient data centers: initial steps, energy optimization, sustainable practices, capacity planning, infrastructure design.
trategies for upgrading data centers to improve energy efficiency
Strategies for upgrading data center energy efficiency
Strategies to upgrade data center for improved energy efficiency, including enhancements, sustainable technology, energy-saving measures, and facility optimization.
Optimizing energy efficiency in data center upgrades.
Strategies and methods to upgrade data center energy efficiency improvements
Methods to reduce cooling costs while maintaining compliance with regulations
'Reducing cooling costs while maintaining regulatory compliance for energy efficiency.
Reducing cooling costs while maintaining compliance standards.
Reducing cooling costs while maintaining compliance.
educing cooling costs while maintaining compliance standards.


"""

# Convert the text into a list
list_queries = list_queries.strip().split("\n")

# Output the list
print(list_queries)
print (len(list_queries))

['Monitoring and reporting energy consumption trends over time; methods, metrics, analysis, and tools for tracking energy use patterns.', 'Methods for monitoring and reporting energy consumption trends over time, including analysis and tracking tools.', 'Monitoring and reporting energy consumption trends over time: methods, tools, and data analysis techniques.', 'Methods to track and document energy consumption trends over time, including monitoring and reporting techniques.', 'Monitoring and reporting energy consumption trends over time; energy usage tracking, data analysis, trend identification, energy monitoring systems, consumption reporting methods.', "'Risks associated with retrofitting older data centers, including potential impacts on infrastructure, energy efficiency, and operational reliability.", 'Risks of retrofitting older data centers: potential security vulnerabilities, energy efficiency challenges, infrastructure compatibility issues, and costs.', 'Risks and challenges 

In [None]:
# Time taken for retrieval after rephrasing
for x in list_queries:
    start_time = time.time()
    dense = merged_dense_for_RRF(x, 15)
    sparse = merged_sparse_for_RRF(x, 15)
        
    fused_results = reciprocal_rank_fusion(dense, sparse, k=60)
    nodes_rrf_back = Fetch_chunks_back(fused_results)
    
    end_time = time.time()
    time_taken = end_time - start_time
    print(time_taken)
    

0.6324546337127686
0.46624159812927246
0.3856334686279297
0.8570172786712646
0.4477729797363281
0.2763094902038574
0.570655107498169
1.0443291664123535
0.6629433631896973
0.5233328342437744
0.35721564292907715
0.2911980152130127
0.34453725814819336
0.3997797966003418
4.486051559448242
0.9787073135375977
0.6152215003967285
0.3515200614929199
0.869025468826294
0.47598767280578613
0.8658714294433594
0.41042613983154297
0.4432179927825928
0.9538888931274414
0.417144775390625
