In [1]:
import os
import weaviate
import weaviate.classes as wvc
import requests
import requests
import json
from tqdm import tqdm
from weaviate.classes.config import Configure, Property, DataType
import re


In [2]:
url = "https://llm.srv.webis.de/api/embeddings"

path_to_data = "/home/rag/Downloads/msmarco_v2.1_doc_segmented_00.json"

model_name = "all-minilm"
#model_name = "llama3:70b"

In [2]:
client = weaviate.connect_to_local(
    port=8080,
    grpc_port=50051)

# Population

In [4]:
def llm_generator(segment):
    prompt = f"I am going to give you a chunk. You will generate 5 possible questions for the given chunk. The chunk is: {segment}"
    url = "https://llm.srv.webis.de/api/generate"
    data = {
        "model": "llama3",
        "prompt": prompt,
        "stream": False
    }
    try:
        response = requests.post(url, json=data)
        if response.status_code == 200:
            return response.json()
        else:
            print(f"Error: {response.status_code} - {response.text}")
            return None
    except Exception as e:
        print(f"Error: {e}")
        return None

In [5]:
def question_extractor(llm_response):
    text = llm_response["response"]

    pattern = r'\d+\.\s(.+?)(?=\n\d+\.|\Z)'

    questions = re.findall(pattern, text)

    return questions
    

In [15]:
client.collections.create(
    "mini_data_segment_hyde",
    properties=[  # Define properties
        Property(name="docid", data_type=DataType.TEXT),
        Property(name="url", data_type=DataType.TEXT),
        Property(name="title", data_type=DataType.TEXT),
        Property(name="headings", data_type=DataType.TEXT),
        Property(name="h_question", data_type=DataType.TEXT),
        Property(name="segment", data_type=DataType.TEXT),
        Property(name="start_char", data_type=DataType.INT),
        Property(name="end_char", data_type=DataType.INT),
    ],
    vectorizer_config=[
        # Set a named vector
        Configure.NamedVectors.text2vec_transformers(  # Use the "text2vec-cohere" vectorizer
            name="question", source_properties=["h_question"]       # Set the source property(ies)
        ),
        # Set another named vector
        Configure.NamedVectors.text2vec_transformers(  # Use the "text2vec-openai" vectorizer
            name="segment", source_properties=["segment"]         # Set the source property(ies)
        )
    ],
)

<weaviate.collections.collection.Collection at 0x7aec4a6a2d40>

In [11]:
object_list = []   
with open(path_to_data, 'rb') as f:
    # Iterate over the objects
    for line_number, object in enumerate(f):
        json_obj = json.loads(object)

        """embedding_response = requests.post(url, json={"model": model_name,
                                                    "prompt": object["segment"],
                                                    }).json()"""
        


        #webis llama70b generates the related questions based on segment
        llm_response = llm_generator(json_obj["segment"])
        h_questions = question_extractor(llm_response)

        for q in h_questions:
            sample_object = {
                'docid': json_obj["docid"], 
                'url': json_obj["url"], 
                'title': json_obj["title"], 
                'headings': json_obj["headings"], 
                'segment': json_obj["segment"], 
                'start_char': json_obj["start_char"], 
                'end_char': json_obj["end_char"], 
                'h_question': q,
                }
        
            object_list.append(sample_object)
        if line_number == 200:
            break

In [13]:
print(len(object_list))
c=0
for object in object_list:
    c=c+1
    print(object)
    if(c==10):
        break


747
{'docid': 'msmarco_v2.1_doc_00_0#0_0', 'url': 'http://0-60.reviews/0-60-times/', 'title': '0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews', 'headings': '0-60 Times\n0-60 Times', 'segment': '0-60 Times - 0-60 | 0 to 60 Times & 1/4 Mile Times | Zero to 60 Car Reviews\n0-60 Times\nThere are many ways to measure the power a vehicle has – top speed, horsepower, foot-pounds of torque. Those are all important, but the most asked question is, “What’s the 0-60 time?” This is nothing more than a measure of how quickly a vehicle can reach the 60 mile per hour mark. It is a measure of acceleration of a vehicle. 0-60 times differ a great deal depending on the amount of power a motor puts out, of course. But anyone who spends any amount of time with car enthusiasts are sure to hear the ubiquitous term bantered around more often than most other metrics by which cars are measured in terms of power. The only other measure that comes close as far as how acceleration is c

In [16]:
collection=client.collections.get("mini_data_segment_hyde")
with collection.batch.dynamic() as batch:
        for d in object_list:
            properties={
            "docid":d["docid"],
            "url":d["url"],
            "title":d["title"],
            "headings":d["headings"],
            "segment":d["segment"],
            "start_char":d["start_char"],
            "end_char":d["end_char"],
            "h_question":d["h_question"]
            }
            batch.add_object(properties)
            

In [17]:
with open('output.json','w') as file:
    json.dump(object_list,file,indent=4)

# Evaluation

In [3]:
from weaviate.classes.query import MetadataQuery

mini_collection = client.collections.get("mini_data_segment_hyde")

In [54]:
def retrieve_segments(topic,num_segments, target_vector):
    return mini_collection.query.near_text(query=topic,
                                              limit=num_segments,
                                              target_vector=target_vector,
                                              return_metadata=MetadataQuery(certainty=True))
#a function that takes a list of retrieval objects with duplicates
#and returns distinct results with the top 5 certainty score(cosine)
def trim_segments(retrieval_result):
    already_retrieved_ids = []
    distinct_segments = []
    for o in retrieval_result.objects:
            if o.properties["docid"] not in already_retrieved_ids:
                already_retrieved_ids.append(o.properties["docid"])
                distinct_segments.append(o)
                #only get top 5 similar
                if len(distinct_segments)== 5:
                    break

    return distinct_segments

In [90]:
topics = '/home/rag/Desktop/project-rag-ss24/src/Data/topics.msmarco-v2-doc.dev2.txt'

eval_dic = {}

with open(topics, 'r') as file:
    for line in file:
        try:
            id_, topic = line.split('\t')
            id_ = id_.strip()
            topic = topic.strip()
            #25 because we will evaluate based on 5 most similar segments and 
            #every segment has 5 hypothetical questions it is indexed through
            segment_result = retrieve_segments(topic, 25, "segment")
            question_result = retrieve_segments(topic, 25, "question")
        
            #this will ensure only topics with related segments to be put in the eval dictionary
            if (segment_result.objects[0].metadata.certainty < 0.80) & (question_result.objects[0].metadata.certainty < 0.80):
                pass
            else:
            
                segment_distinct_segments = trim_segments(segment_result)
                question_distinct_segments = trim_segments(question_result)
                
                eval_dic[topic] = {"nonhyde" : segment_distinct_segments,
                                "hyde": question_distinct_segments}
        except:
            print(line)

        
        
        






In [91]:
eval_dic

{'what is function of registry': {'nonhyde': [Object(uuid=_WeaviateUUIDInt('d9ba5f46-6191-46eb-98a7-f15075e16b05'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=0.7431946396827698, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'headings': '\n', 'start_char': 3306, 'url': 'http://00000.extensionfile.net/', 'segment': "Open 00000 file article translations\nFor more general information about how to open 00000 files, file extension 00000 and registry you can read one of the following articles: Windows registry- Windows registry is included in modern Windows operating\nsystems to replace the older INI files which also contained system configuration. Let's concentrate on the structure and purpose of Windows System\nRegistry, review some possible attacks to it and ways to avoid them...\nFolders, files and paths- Files are the entries or information stored on your\ncomputer. These are represented by binary cod

## Similarity Metric Comparison

In [117]:
#this gives a comparison between retrieved certainty scores(cosine similarity)
def compare_certainties(eval_dic):
    total_top_c_diff = 0

    nonhyde_total_certainty = 0
    hyde_total_certainty = 0

    len_eval = len(eval_dic.keys())

    for topic in eval_dic.keys():
        nonhyde_segments = eval_dic[topic]["nonhyde"]
        hyde_segments = eval_dic[topic]["hyde"]
        
        nonhyde_segments_top_c = nonhyde_segments[0].metadata.certainty
        hyde_segments_top_c = hyde_segments[0].metadata.certainty

        top_c_diff = hyde_segments_top_c - nonhyde_segments_top_c 
        
        
        for i in range(4):
            nonhyde_total_certainty += nonhyde_segments[i].metadata.certainty
            hyde_total_certainty += hyde_segments[i].metadata.certainty

        total_top_c_diff += top_c_diff

    total_certainty_diff = hyde_total_certainty - nonhyde_total_certainty
    avg_certainty_diff = total_certainty_diff/len_eval
    avg_top_certainty_diff = total_top_c_diff/len_eval


    print("Difference between total certainties: ", (total_certainty_diff), " and the avg. certainty change per question: ", (avg_certainty_diff) )
    print("Difference between top certainties: ", (total_top_c_diff), " and the avg. difference in top certainties: ", (avg_top_certainty_diff))

    

In [118]:
compare_certainties(eval_dic)

Difference between total certainties:  -0.1685647964477539  and the avg. certainty change per question:  -0.018729421827528212
Difference between top certainties:  0.03347349166870117  and the avg. difference in top certainties:  0.003719276852077908


In [104]:
for topic in eval_dic.keys():

    nonhyde_segments = eval_dic[topic]["nonhyde"]
    hyde_segments = eval_dic[topic]["hyde"]



    print(topic)
    print()
    for segment_i in range(4):
        print("Non-hyde certainty: ", nonhyde_segments[segment_i].metadata.certainty," Hyde certainty: ", hyde_segments[segment_i].metadata.certainty )
        print()
    print("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")

what is function of registry

Non-hyde certainty:  0.7431946396827698  Hyde certainty:  0.8076457977294922

Non-hyde certainty:  0.6820136308670044  Hyde certainty:  0.7454531788825989

Non-hyde certainty:  0.6575680375099182  Hyde certainty:  0.6477391719818115

Non-hyde certainty:  0.640792727470398  Hyde certainty:  0.6451333165168762

xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
what is the fastest car in the?

Non-hyde certainty:  0.826582670211792  Hyde certainty:  0.7787390351295471

Non-hyde certainty:  0.784464955329895  Hyde certainty:  0.7160167694091797

Non-hyde certainty:  0.782470166683197  Hyde certainty:  0.7117319107055664

Non-hyde certainty:  0.7397179007530212  Hyde certainty:  0.6897644400596619

xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
definition of shelf life

Non-hyde certainty:  0.7076284885406494  Hyde certainty:  0.8197730779647827

Non

## Human evaluation

In [153]:
#method 1 is nonhyde method 2 is hyde kept anonymous for eval purposes
exp_text = ""
for topic in eval_dic.keys():

    exp_text += "The query was: " + topic + " and the returned segments are: \n"

    nonhyde_segments = eval_dic[topic]["nonhyde"]
    hyde_segments = eval_dic[topic]["hyde"]

    
    nonhyde_text = "For method 1: \n"
    hyde_text = "For method 2: \n"
    for i in range(4):
        nonhyde_text += "SEGMENT NO "+ str(i+1) + ": \n " + nonhyde_segments[i].properties["segment"] + "\n"
        hyde_text += "SEGMENT NO "+ str(i+1) + ": \n " + hyde_segments[i].properties["segment"]+ "\n"
        
    exp_text += nonhyde_text + "\n \n"
    exp_text += hyde_text + "\n xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx \n"

with open("eval_text.txt", "w") as text_file:
    text_file.write(exp_text)


The query was: what is function of registry and the returned segments are: 
For method 1: 
SEGMENT NO 1: 
 Open 00000 file article translations
For more general information about how to open 00000 files, file extension 00000 and registry you can read one of the following articles: Windows registry- Windows registry is included in modern Windows operating
systems to replace the older INI files which also contained system configuration. Let's concentrate on the structure and purpose of Windows System
Registry, review some possible attacks to it and ways to avoid them...
Folders, files and paths- Files are the entries or information stored on your
computer. These are represented by binary coding and written on the tracks on a disk. Files are often represented by distinct icons, a normal practice done by
Microsoft with their products including system files for their series of operating systems...
What are file extensions? - File extensions are unnoticed yet are very crucial parts of
the co

## LLM-as-a-judge

In [209]:
def llm_judge(segment1,segment2, query):
    prompt = "Which one of these segments is more related to this query: '" + query + "' DONT'T SUMMARIZE ANTHING JUST TELL ME WHICH YOU THINK IS MORE RELATED. 1/2? \n1. " + segment1 + "\n 2." + segment2
    url = "https://llm.srv.webis.de/api/generate"
    data = {
        "model": "llama3",
        "prompt": prompt,
        "stream": False
    }
    try:
        response = requests.post(url, json=data)
        if response.status_code == 200:
            return response.json()
        else:
            print(f"Error: {response.status_code} - {response.text}")
            return None
    except Exception as e:
        print(f"Error: {e}")
        return None

In [233]:
#method 1 is nonhyde method 2 is hyde kept anonymous for eval purposes
llm_text = ""
for topic in eval_dic.keys():
    nonhyde_segments = eval_dic[topic]["nonhyde"]
    hyde_segments = eval_dic[topic]["hyde"]


    for i in range(4):
        #only go to llm if there is a disagreement between the methods
        if nonhyde_segments[i].properties["docid"] != hyde_segments[i].properties["docid"]:
            nonhyde_s = nonhyde_segments[i].properties["segment"]
            hyde_s = hyde_segments[i].properties["segment"]

            llm_evaluation = llm_judge(nonhyde_s, hyde_s, topic)
            llm_text += "For segment: " + str(i+1) + "\n" 
            llm_text += llm_evaluation["response"] + "\n"



with open("llm_eval_text.txt", "w") as text_file:
   text_file.write(llm_text)


In [223]:
q = list(eval_dic.keys())[2]
s1 = eval_dic[q]["hyde"][0].properties["segment"]
s2 = eval_dic[q]["hyde"][1].properties["segment"]


llm_judge(s1, s2, q)["response"]

"I think segment 2 is more related to this query: 'definition of shelf life' because it explicitly answers the question about the shelf life of LA's Totally Awesome All Purpose Cleaner products."

In [220]:
q

'what is the fastest car in the?'

In [224]:
s1

'Let surface dry and then set it back up for the pet. Return the pet to the room. Are LA’s Totally Awesome All Purpose Cleaner ® containers recyclable? Please refer to the logo on the container. With a little bit of care much plastic can be recycled, and collection of plastics for recycling is increasing rapidly. The plastic industry has responded to this problem by developing a series of cryptic markers, commonly seen on the bottom of plastic containers. These markers do not mean the plastic can be recycled, these makers do not mean the container uses recycled plastic. You should place in your bin only those types of plastic listed by your local recycling agency! What is the shelf life of your products? All of our products are tested for a shelf life of a minimum of two years from the date of manufacture.'

In [226]:
s2

'The plastic industry has responded to this problem by developing a series of cryptic markers, commonly seen on the bottom of plastic containers. These markers do not mean the plastic can be recycled, these makers do not mean the container uses recycled plastic. You should place in your bin only those types of plastic listed by your local recycling agency! What is the shelf life of your products? All of our products are tested for a shelf life of a minimum of two years from the date of manufacture. However, we do recommend using the product within one year of opening it to guarantee freshness and efficacy. What is LA’s Totally Awesome All Purpose Cleaner ® made from? LA’s Totally Awesome All Purpose Cleaner ® is a blend of water, surfactant (surface-active) agents, color and scent. Surfactant agents lift grease and oil up off of the surface. LA’s Totally Awesome All Purpose Cleaner ® is a safe, naturally gentle soap, specially formulated for cleaning a variety of surfaces like floors, 