In [2]:
from datasets import load_dataset
import json
import requests
from bs4 import BeautifulSoup, Comment
from bs4 import Tag
from collections import deque
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
ds = load_dataset(
    "google-research-datasets/natural_questions", "dev", split="validation"
)

In [4]:
i = 4

sample1 = ds[i]["document"]["html"]
ques1 = ds[i]["question"]["text"]
token_list1 = ds[i]["document"]["tokens"]
short_answer = ds[i]["annotations"]
long_answer = ds[i]["long_answer_candidates"]

In [5]:
sample1

'<!DOCTYPE html>\n<HTML class="client-js ve-not-available" lang="en" dir="ltr"><HEAD>\n\n<TITLE>Longest word in English - Wikipedia</TITLE>\n\n\n<LINK rel="stylesheet" href="/w/load.php?debug=false&amp;lang=en&amp;modules=ext.cite.styles%7Cext.math.styles%7Cext.tmh.thumbnail.styles%7Cext.uls.interlanguage%7Cext.visualEditor.desktopArticleTarget.noscript%7Cext.wikimediaBadges%7Cmediawiki.legacy.commonPrint%2Cshared%7Cmediawiki.sectionAnchor%7Cmediawiki.skinning.interface%7Cskins.vector.styles%7Cwikibase.client.init&amp;only=styles&amp;skin=vector" />\n\n<STYLE>\n.referencetooltip{position:absolute;list-style:none;list-style-image:none;opacity:0;font-size:10px;margin:0;z-index:5;padding:0}.referencetooltip li{border:#080086 2px solid;max-width:260px;padding:10px 8px 13px 8px;margin:0px;background-color:#F7F7F7;-webkit-box-shadow:2px 4px 2px rgba(0,0,0,0.3);-moz-box-shadow:2px 4px 2px rgba(0,0,0,0.3);box-shadow:2px 4px 2px rgba(0,0,0,0.3)}.referencetooltip li+li{margin-left:7px;margin-top

In [6]:
long_answer.keys()

dict_keys(['start_token', 'end_token', 'start_byte', 'end_byte', 'top_level'])

In [7]:
for candidate in zip(
    long_answer["start_token"],
    long_answer["end_token"],
    long_answer["start_byte"],
    long_answer["end_byte"],
    long_answer["top_level"],
):
    start_byte, end_byte = candidate[2], candidate[3]
    start_token, end_token = candidate[0], candidate[1]
    if candidate[4] is True:
        print(start_byte, end_byte, start_token, end_token)

        print(sample1[start_byte:end_byte])
        print(token_list1["token"][start_token], token_list1["token"][end_token])

131573 132771 26 130
<P>The identity of the <B>longest word in English</B> depends upon the <A href="/wiki/Definition" title="Definition">definition</A> of what constitutes a <A href="/wiki/Word" title="Word">word</A> in the <A href="/wiki/English_language" title="English language">English language</A>, as well as how length should be compared. In addition to words derived naturally from <A href="/wiki/History_of_English" title="History of English">the language&#39;s roots</A> (without any known intentional invention), English allows <A href="/wiki/New_words" class="mw-redirect" title="New words">new words</A> to be formed by <A href="#Coinages">coinage</A> and <A href="#Constructions">construction</A>; <A href="#Place_names">place names</A> may be considered words; <A href="#Technical_terms">technical terms</A> may be arbitrarily long. Length may be understood in terms of <A href="/wiki/Orthography" title="Orthography">orthography</A> and number of written <A href="/wiki/Letter_(alpha

In [8]:
sample2 = """<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>More Complex Example</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            line-height: 1.6;
            margin: 20px;
        }
        .container {
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
            border: 1px solid #ccc;
        }
        h1, h2 {
            color: #333;
        }
        p {
            margin-bottom: 15px;
        }
        ul {
            list-style-type: disc;
            padding-left: 20px;
        }
        a {
            color: blue;
            text-decoration: none;
        }
        a:hover {
            text-decoration: underline;
        }
        #important {
            font-weight: bold;
            color: red;
        }
    </style>
</head>
<body>
    <div class="container">
        <h1>Welcome to My Website</h1>
        <p>This is a more detailed example with additional content and styling. We're exploring various HTML elements and CSS properties.</p>

        <h2>Section 1: Introduction</h2>
        <p>This section provides an overview of the topic. We'll discuss key concepts and provide context.</p>
        <ul>
            <li>Point 1: A key feature.</li>
            <li>Point 2: Another important aspect.</li>
            <li>Point 3: A supporting detail.</li>
        </ul>

        <h2>Section 2: Further Details</h2>
        <p>Here, we delve deeper into the subject matter. You can find more information and examples below.</p>
        <p id="important">This is an important paragraph with special styling.</p>
        <a href="https://www.example.com">Visit Example Website</a>

        <div>
          <h3>A Sub Section</h3>
          <p>Some more text here</p>
          <img src="dummy_image.jpg" alt="Dummy Image" width="300">
        </div>
    </div>
</body>
</html>"""

In [5]:
class BlockNode:
    def __init__(self, tag, attributes=None, content=None):
        self.tag = tag
        self.attributes = attributes or {}
        self.content = content.strip() if content else ""
        self.children = []
        self.block = ""
        self.is_leaf = False
        self.embedding = None

    def add_child(self, child):
        self.children.append(child)

    def to_dict(self):
        return {
            "tag": self.tag,
            "attributes": self.attributes,
            "content": self.content,
            "children": [child.to_dict() for child in self.children],
        }

In [6]:
def clean_html(html_content):
    # Parse the HTML content
    soup = BeautifulSoup(html_content, "html.parser")

    # Step 1: HTML Content Cleaning
    # Remove <script>, <style>, and comments
    for tag in soup(["script", "style"]):
        tag.decompose()
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Remove lengthy attributes (e.g., inline styles, large data attributes)
    def clean_attributes(tag):
        for attr in list(tag.attrs.keys()):
            if len(str(tag[attr])) > 50:  # Arbitrary threshold for "lengthy"
                del tag[attr]

    for tag in soup.find_all(True):  # All tags
        clean_attributes(tag)

    # Step 2: Lossless Structural Compression
    # Merge single-nested tags
    def merge_single_nested_tags(tag):
        while (
            len(tag.contents) == 1
            and isinstance(tag.contents[0], Tag)
            and tag.contents[0].name == tag.name
        ):
            child = tag.contents[0]
            tag.attrs.update(child.attrs)  # Merge attributes
            tag.contents = child.contents  # Replace contents
        for child in tag.find_all(recursive=False):
            merge_single_nested_tags(child)

    # Remove all \t and \n in the data
    for text in soup.find_all(string=True):
        text.replace_with(text.replace("\n", "").replace("\t", ""))
    for tag in soup.find_all(True):  # All tags
        merge_single_nested_tags(tag)

    # Remove empty tags
    for tag in soup.find_all():
        if not tag.contents or all(
            str(content).strip() == "" for content in tag.contents
        ):
            tag.decompose()

    return soup

In [7]:
def build_dom_tree(html_content):
    def build_tree(element):
        if not element.name:
            return None

        # Create a BlockNode for the current element
        node = BlockNode(
            tag=element.name,
            attributes=element.attrs,
            content=element.get_text(strip=True) if element.string else "",
        )
        # print(node.attributes, node.content)
        # Recursively process children
        for child in element.children:
            child_node = build_tree(child)
            if child_node:
                node.add_child(child_node)

        return node

    root = build_tree(html_content)
    return root

In [59]:
def build_block_tree(dom_tree: BlockNode, max_window: int):
    """
    Construct a Block Tree from a DOM Tree.

    Args:
        dom_tree (BlockNode): The root of the DOM tree.
        max_window (int): The maximum token limit for a block.

    Returns:
        BlockNode: The root of the constructed Block Tree.
    """

    def merge_children_content(node: BlockNode, max_window):
        """Merge content of children nodes."""
        grand_children = []
        child_block = []
        for child in node.children:
            print(child.tag, child.content)
            grand_children.extend(child.children)
            if child.content.strip():
                child_block.append(f"{child.content}")
        if not grand_children:
            return []
        # If the total content of children is less than the max window, add it to the block
        # print(" ".join(child_block), max_window)
        if len(" ".join(child_block)) < max_window:
            node.block += " ".join(child_block)
            node.children = grand_children
            return merge_children_content(
                node, max_window - len(node.block) - 1
            )
        return node.children

    def split_content(content, max_window):
        """Split content into chunks within the token limit."""
        tokens = content.split()
        return [
            " ".join(tokens[i : i + max_window])
            for i in range(0, len(tokens), max_window)
        ]
    def retag_dom_tree(node: BlockNode, tag_counts=None):
        if tag_counts is None:
            tag_counts = {"[document]": 0}
        tag_prefix = ''.join(filter(str.isalpha, node.tag)) if node.tag else ''
        print(tag_counts.keys())
        if tag_prefix not in tag_counts:
            tag_counts[tag_prefix] = 0

        tag_counts[tag_prefix] += 1
        node.tag = f"{tag_prefix}{tag_counts[tag_prefix]}"

        for child in node.children:
            retag_dom_tree(child, tag_counts)

    retag_dom_tree(dom_tree)
    def build_block_tree_bfs(root: BlockNode, max_window):
        queue = deque([root])
        while queue:
            node = queue.popleft()
            if not node.children:
                node.block = node.content if node.content.strip() else ""
                node.is_leaf = True
                continue
            node.block = node.content if node.content.strip() else ""
            if len(node.block) < max_window:
                merge_children_content(node, max_window - len(node.block) - 1)
                if not node.children:
                    node.is_leaf = True
                else:
                    queue.extend(node.children)
                    node.is_leaf = False
            else:
                queue.extend(node.children)
                node.is_leaf = False

        return root
    retag_dom_tree(dom_tree)
    new_tree = build_block_tree_bfs(dom_tree, max_window)
    return new_tree

In [9]:
def print_tree_blocks(node: BlockNode, indent: int = 0):
    """Print the block tree."""
    print("    " * indent + f"{node.block}")
    for child in node.children:
        print_tree_blocks(child, indent + 1)

In [10]:
def print_tree_contents(node: BlockNode, depth=0):
    """Print tree structure with tag and block contents."""
    if not node:
        return
    print("  " * depth + f"Tag: {node.tag}, Content: {node.content}, {node.attributes}")
    for child in node.children:
        # print(child.content)
        print_tree_contents(child, depth + 1)

In [60]:
content = clean_html(sample1)
# content.find_all()
dom_tree = build_dom_tree(content)
# print_tree_contents(dom_tree)
tree = build_block_tree(dom_tree, 3000)
print_tree_blocks(tree)

dict_keys(['[document]'])
dict_keys(['[document]', 'document'])
dict_keys(['[document]', 'document', 'html'])
dict_keys(['[document]', 'document', 'html', 'head'])
dict_keys(['[document]', 'document', 'html', 'head', 'title'])
dict_keys(['[document]', 'document', 'html', 'head', 'title', 'body'])
dict_keys(['[document]', 'document', 'html', 'head', 'title', 'body', 'div'])
dict_keys(['[document]', 'document', 'html', 'head', 'title', 'body', 'div'])
dict_keys(['[document]', 'document', 'html', 'head', 'title', 'body', 'div'])
dict_keys(['[document]', 'document', 'html', 'head', 'title', 'body', 'div', 'a'])
dict_keys(['[document]', 'document', 'html', 'head', 'title', 'body', 'div', 'a', 'h'])
dict_keys(['[document]', 'document', 'html', 'head', 'title', 'body', 'div', 'a', 'h'])
dict_keys(['[document]', 'document', 'html', 'head', 'title', 'body', 'div', 'a', 'h'])
dict_keys(['[document]', 'document', 'html', 'head', 'title', 'body', 'div', 'a', 'h'])
dict_keys(['[document]', 'documen

In [12]:
EMB_MODEL_ID = "text-embedding-nomic-embed-text-v1.5"


def fetch_embedding(text):
    """Fetch embedding for a given text from LMStudio."""
    try:
        url = "http://localhost:9999/v1/embeddings"
        data = {"model": EMB_MODEL_ID, "input": text}
        headers = {"Content-Type": "application/json"}
        response = requests.post(url, headers=headers, data=json.dumps(data))
        if response.status_code == 200:
            # print(response.json())
            return response.json()["data"]
        else:
            print(response.text)
    except requests.exceptions.RequestException as e:
        print(f"Error querying LMStudio: {e}")
        return None

In [61]:
def tree_contents(root):
    contents = []
    if not root:
        return contents
    if root.block.strip():
        contents.append(root.block)
    for child in root.children:
        child_content = tree_contents(child)
        child_content = [content for content in child_content if content.strip()]
        contents.extend(child_content)
    return contents

In [14]:
def print_tree_embeddings(node: BlockNode, depth=0):
    """Print tree structure with embeddings."""
    if not node:
        return
    # if node.content.strip():
    print(
        "  " * depth
        + f"Tag: {node.tag}, Text: {node.block}, Embeddings: {node.embedding}"
    )
    for child in node.children:
        print_tree_embeddings(child, depth + 1)

In [15]:
def embed_contents(root: BlockNode, contents):
    embd = fetch_embedding(contents)

    # print(len(embd))
    def tranverse_tree(root: BlockNode, idx):
        if not root:
            return
        if root.block.strip():
            root.embedding = embd[idx]["embedding"]
            # print(root.content, idx, root.embedding)
            idx += 1
        for child in root.children:
            idx = tranverse_tree(child, idx)
            # print(idx)
        return idx

    tranverse_tree(root, 0)

In [62]:
contents_to_embd = tree_contents(tree)

In [63]:
print(len(" ".join(contents_to_embd).split()), len(contents_to_embd))

2905 853


In [65]:
embed_contents(tree, contents_to_embd)

In [66]:
contents_to_embd

['NBA All-Star Game Most Valuable Player Award - WikipediaNBA All-Star Game Most Valuable Player Award Navigation menuFrom Wikipedia, the free encyclopedia This page was last edited on 19 March 2018, at 02:49. Privacy policy About Wikipedia Disclaimers Contact Wikipedia Developers Cookie statement Mobile view Enable previewsnavigation search https://en.wikipedia.org/w/index.php?title=NBA_All-Star_Game_Most_Valuable_Player_Award&oldid=831162386 Personal tools Navigation Interaction Tools Print/export Languages Creative Commons Attribution-ShareAlike License Terms of Use Privacy Policy Wikimedia Foundation, Inc. Privacy policy About Wikipedia Disclaimers Contact Wikipedia Developers Cookie statement Mobile view Enable previewsCategories Not logged in Talk Contributions Create account Log in Namespaces Views More Edit links',
 "Championship Individual awards HonorsChampionshiphide National Basketball Association awards and honorsv t ev t ev t eO'Brien Trophy All-Star Game MVP Bill Russell

In [19]:
ques_embd = fetch_embedding(ques1)[0]["embedding"]

In [20]:
def similarity_compute(node: BlockNode, ques_embd, total_sim, max_sim, min_sim):
    if not node:
        return
    if node.embedding is not None:
        cosine_sim = similarity([node.embedding], [ques_embd])[0][0]
        print(cosine_sim, node.block if cosine_sim > 0.2 else None)
        max_sim = max(max_sim, cosine_sim)
        min_sim = min(min_sim, cosine_sim)
        total_sim += cosine_sim

    for child in node.children:
        max_sim, min_sim, total_sim = similarity_compute(
            child, ques_embd, total_sim, max_sim, min_sim
        )
    return max_sim, min_sim, total_sim

In [67]:
max_sim, min_sim, total_sim = similarity_compute(tree, ques_embd, 0.0, 0.0, 1.0)
print(max_sim, min_sim, total_sim / len(contents_to_embd))

0.6435927502128173 NBA All-Star Game Most Valuable Player Award - WikipediaNBA All-Star Game Most Valuable Player Award Navigation menuFrom Wikipedia, the free encyclopedia This page was last edited on 19 March 2018, at 02:49. Privacy policy About Wikipedia Disclaimers Contact Wikipedia Developers Cookie statement Mobile view Enable previewsnavigation search https://en.wikipedia.org/w/index.php?title=NBA_All-Star_Game_Most_Valuable_Player_Award&oldid=831162386 Personal tools Navigation Interaction Tools Print/export Languages Creative Commons Attribution-ShareAlike License Terms of Use Privacy Policy Wikimedia Foundation, Inc. Privacy policy About Wikipedia Disclaimers Contact Wikipedia Developers Cookie statement Mobile view Enable previewsCategories Not logged in Talk Contributions Create account Log in Namespaces Views More Edit links
0.6720604017633176 Championship Individual awards HonorsChampionshiphide National Basketball Association awards and honorsv t ev t ev t eO'Brien Troph

In [68]:
ques1

'who has the most all star mvp awards'

In [22]:
def pruning_tree_on_similarity(node: BlockNode, ques_embd, threshold):
    if not node:
        return None
    if node.embedding is not None:
        cosine_sim = similarity([node.embedding], [ques_embd])[0][0]

        pruned_children = []

        for child in node.children:
            pruned_child = pruning_tree_on_similarity(child, ques_embd, threshold)
            if pruned_child is not None:
                pruned_children.extend(pruned_child)

        if cosine_sim < threshold:
            return pruned_children if pruned_children else None

        node.children = pruned_children
        return [node]
    return node.children

In [69]:
pruning_tree_on_similarity(tree, ques_embd, total_sim / len(contents_to_embd) * 1.1)

[<__main__.BlockNode at 0x188fb2b3020>]

In [70]:
print_tree_blocks(tree)

NBA All-Star Game Most Valuable Player Award - WikipediaNBA All-Star Game Most Valuable Player Award Navigation menuFrom Wikipedia, the free encyclopedia This page was last edited on 19 March 2018, at 02:49. Privacy policy About Wikipedia Disclaimers Contact Wikipedia Developers Cookie statement Mobile view Enable previewsnavigation search https://en.wikipedia.org/w/index.php?title=NBA_All-Star_Game_Most_Valuable_Player_Award&oldid=831162386 Personal tools Navigation Interaction Tools Print/export Languages Creative Commons Attribution-ShareAlike License Terms of Use Privacy Policy Wikimedia Foundation, Inc. Privacy policy About Wikipedia Disclaimers Contact Wikipedia Developers Cookie statement Mobile view Enable previewsCategories Not logged in Talk Contributions Create account Log in Namespaces Views More Edit links
    Championship Individual awards HonorsChampionshiphide National Basketball Association awards and honorsv t ev t ev t eO'Brien Trophy All-Star Game MVP Bill Russell F

In [82]:
def tree_blocks_to_string(node: BlockNode, indent: int = 0) -> str:
    """Convert the block tree to a string."""
    if not node:
        return ""
    result = "    " * indent + f"<{node.tag}>{node.block}\n"
    for child in node.children:
        result += tree_blocks_to_string(child, indent + 1)
    return result


# Convert the tree blocks to a string
tree_blocks_str = tree_blocks_to_string(tree)

In [104]:
tree_blocks_str

'<document1><title1>Longest word in English - Wikipedia<h1>Longest word in English <h16>Navigation menu<div6>From Wikipedia, the free encyclopedia <li139>This page was last edited on 5 January 2018, at 00:10. <li141>Privacy policy <li142>About Wikipedia <li143>Disclaimers <li144>Contact Wikipedia <li145>Developers <li146>Cookie statement <li147>Mobile view <li148>Enable previews<a3>navigation <a4>search <a336>https://en.wikipedia.org/w/index.php?title=Longest_word_in_English&oldid=818685520 <h17>Personal tools <h23>Navigation <h24>Interaction <h25>Tools <h26>Print/export <h27>Languages <a389>Creative Commons Attribution-ShareAlike License <a390>Terms of Use <a391>Privacy Policy <a392>Wikimedia Foundation, Inc. <a393>Privacy policy <a394>About Wikipedia <a395>Disclaimers <a396>Contact Wikipedia <a397>Developers <a398>Cookie statement <a399>Mobile view <a400>Enable previews<h3>Major dictionaries <h4>Creations of long words <h5>Coinages <h6>Agglutinative constructions <h7>Technical terms 

In [77]:
def get_node_paths_and_contents(node: BlockNode, path="") -> list:
    """Return a list containing each node path and node block content."""
    if not node:
        return []
    
    current_path = f"{path}<{node.tag}>"
    result = [{"path": current_path, "content": node.block}]
    
    for child in node.children:
        result.extend(get_node_paths_and_contents(child, current_path))
    
    return result

# Get the list of node paths and contents
node_paths_and_contents = get_node_paths_and_contents(tree)

node_paths_and_contents_zip = [[node_paths_and_contents[0]['path']+node_paths_and_contents[0]['content']]]
for path in node_paths_and_contents[1:]:
    if len(path["path"]) + len(path["content"]) < 3000 - len(''.join(node_paths_and_contents_zip[-1])):
        node_paths_and_contents_zip[-1].append(path["path"]+path["content"])
    else:
        node_paths_and_contents_zip.append([path["path"]+path["content"]])
node_paths_and_contents_zip

[['<document1>NBA All-Star Game Most Valuable Player Award - WikipediaNBA All-Star Game Most Valuable Player Award Navigation menuFrom Wikipedia, the free encyclopedia This page was last edited on 19 March 2018, at 02:49. Privacy policy About Wikipedia Disclaimers Contact Wikipedia Developers Cookie statement Mobile view Enable previewsnavigation search https://en.wikipedia.org/w/index.php?title=NBA_All-Star_Game_Most_Valuable_Player_Award&oldid=831162386 Personal tools Navigation Interaction Tools Print/export Languages Creative Commons Attribution-ShareAlike License Terms of Use Privacy Policy Wikimedia Foundation, Inc. Privacy policy About Wikipedia Disclaimers Contact Wikipedia Developers Cookie statement Mobile view Enable previewsCategories Not logged in Talk Contributions Create account Log in Namespaces Views More Edit links',
  "<document1><tbody1>Championship Individual awards HonorsChampionshiphide National Basketball Association awards and honorsv t ev t ev t eO'Brien Troph

In [75]:
def count_nodes(node: BlockNode) -> int:
    if not node:
        return 0
    count = 1  # Count the current node
    for child in node.children:
        count += count_nodes(child)
    return count


# Count the number of nodes in the latest tree
num_nodes = count_nodes(tree)
print(f"Number of nodes in the tree: {num_nodes}")

Number of nodes in the tree: 246


In [76]:
CHAT_MODEL_ID = "llama-3.2-1b-instruct"


def fetch_irrelevent(html_content, question, min_items):
    """Fetch a joke response from LMStudio."""
    try:
        url = "http://127.0.0.1:9999/v1/chat/completions"
        data = {
            "model": CHAT_MODEL_ID,
            "messages": [
                {
                    "role": "system",
                    "content": """
                    You are an expert system designed to evaluate and prune irrelevant or redundant HTML content. Your task is to process an HTML document and retain only the text blocks that are directly or indirectly relevant to a given question. 
                    Text is considered relevant if it:\n
                        1. Provides a direct answer to the question.\n
                        2. Supplies evidence or details that support answering the question.\n
                        3. Offers necessary context for understanding the answer.\n
                    Avoid retaining blocks that:\n
                        1. Are completely unrelated to the question.\n
                        2. Contain redundant or repetitive information.\n
                        3. Discuss general information not connected to the topic.\n
                    You should output only a path to the relevant blocks, preserving the original structure.
                    Input: **HTML**: {HTML tag and content}. **Question**: {Question}. **Output**: {List of text blocks path that relevant}
                    ### Example
                    **Output**: ["{<html1><title1>, Relevant Title}", "{<html1><body1><p3>, Relevant Information}"]
                    """,
                },
                {
                    "role": "user",
                    "content": f"**HTML**: “{html_content}”\n**Question**: **{question}**",
                },
            ],
            "response_format": {
                "type": "json_schema",
                "json_schema": {
                    "name": "TextBlocks",
                    "schema": {
                        "type": "object",
                        "properties": {
                            "paths": {
                                "type": "array",
                                "items": {
                                    "type": "object",
                                    "properties": {
                                        "path": {"type": "string"},
                                    },
                                    "required": [
                                        "path",
                                    ],
                                },
                                "minItems": min_items,
                            }
                        },
                        "required": ["paths"],
                    },
                },
            },
            "temperature": 0.7,
            "stream": False,
        }
        headers = {"Content-Type": "application/json"}
        response = requests.post(url, headers=headers, data=json.dumps(data))

        if response.status_code == 200:
            return response.json()
        else:
            print(f"Error: {response.status_code} - {response.text}")
    except requests.exceptions.RequestException as e:
        print(f"Error querying LMStudio: {e}")
        return None


In [None]:
final_path_list = []
for path in node_paths_and_contents_zip:
    min_items = min(len(path), 5)
    path_response = fetch_irrelevent(path, ques1, min_items)
    if path_response:
        print(path_response["choices"][0]["message"]["content"])
        final_path_list.extend(json.loads(path_response["choices"][0]["message"]["content"])["paths"])


In [54]:
for path in final_path_list:
    print(path)

{'path': '<html1><title1>NBA All-Star Game Most Valuable Player Award</title>'}
{'path': '<html1><body1><p2>NBA All-Star Game Most Valuable Player Awards</p2></body1>'}
{'path': '<html1><body1><p3>NBA All-Star Game Most Valuable Player Awards history</p3></body1>'}
{'path': "document1/tbody1/Championship Individual awards HonorsChampionshiphide National Basketball Association awards and honorsv t ev t ev t eO'Brien Trophy All-Star Game MVP Bill Russell Finals MVP Coach of the Year Defensive Player of the Year Executive of the Year J. Walter Kennedy Citizenship Community Assist Award Most Improved Player Most Valuable Player Rookie of the Year Sixth Man of the Year Sportsmanship Award Twyman–Stokes Teammate of the Year Lifetime Achievement Award All-NBA Team All-Rookie Team All-Defensive Team"}
{'path': 'document1/tbody1/body1/p3/All-Star Game MVP Bill Russell Finals MVP Coach of the Year Defensive Player of the Year Executive of the Year J. Walter Kennedy Citizenship Community Assist A

In [55]:
for path in final_path_list:
    embd = fetch_embedding(path["path"])
    if embd:
        simi_score = similarity([embd[0]["embedding"]], [ques_embd])[0][0]
        print(simi_score, path["path"] if simi_score > 0.2 else None)

0.7116832931250092 <html1><title1>NBA All-Star Game Most Valuable Player Award</title>
0.7413948189748758 <html1><body1><p2>NBA All-Star Game Most Valuable Player Awards</p2></body1>
0.7359045021138821 <html1><body1><p3>NBA All-Star Game Most Valuable Player Awards history</p3></body1>
0.6687989290344609 document1/tbody1/Championship Individual awards HonorsChampionshiphide National Basketball Association awards and honorsv t ev t ev t eO'Brien Trophy All-Star Game MVP Bill Russell Finals MVP Coach of the Year Defensive Player of the Year Executive of the Year J. Walter Kennedy Citizenship Community Assist Award Most Improved Player Most Valuable Player Rookie of the Year Sixth Man of the Year Sportsmanship Award Twyman–Stokes Teammate of the Year Lifetime Achievement Award All-NBA Team All-Rookie Team All-Defensive Team
0.714236423881688 document1/tbody1/body1/p3/All-Star Game MVP Bill Russell Finals MVP Coach of the Year Defensive Player of the Year Executive of the Year J. Walter Ke

In [56]:
short_answer

{'id': ['16561957862311810757',
  '9013765907572342552',
  '17009528298897439994',
  '1543919103567142329',
  '12851569555955771450'],
 'long_answer': [{'start_token': 449,
   'end_token': 759,
   'start_byte': 64455,
   'end_byte': 68310,
   'candidate_index': 40},
  {'start_token': 449,
   'end_token': 759,
   'start_byte': 64455,
   'end_byte': 68310,
   'candidate_index': 40},
  {'start_token': 449,
   'end_token': 759,
   'start_byte': 64455,
   'end_byte': 68310,
   'candidate_index': 40},
  {'start_token': 449,
   'end_token': 759,
   'start_byte': 64455,
   'end_byte': 68310,
   'candidate_index': 40},
  {'start_token': 3326,
   'end_token': 3606,
   'start_byte': 156131,
   'end_byte': 160678,
   'candidate_index': 121}],
 'short_answers': [{'start_token': [450, 453],
   'end_token': [452, 455],
   'start_byte': [64504, 64571],
   'end_byte': [64514, 64582],
   'text': ['Bob Pettit', 'Kobe Bryant']},
  {'start_token': [450, 453],
   'end_token': [452, 455],
   'start_byte': [6