# Requirements


In [None]:
# web-rag
!pip install -q mistralai
!pip install -q requests
!pip install -q beautifulsoup4
!pip install -q selenium
!pip install -q praw
!pip install -q python-docx
!pip install -q faiss-cpu -q

## retrieval pipeline
!pip install -q pathway
!pip install -q llama-index
!pip install -q llama-index-retrievers-pathway
!pip install -q llama-index-embeddings-huggingface
!pip install -q llama-index-embeddings-instructor

## evidence graph
!pip install -q llama-index-core
!pip install -q llama-index-postprocessor-flag-embedding-reranker
!pip install -q llama-index-graph-stores-neo4j
!pip install -q llama-parse
!pip install -q pyvis Ipython

# Imports



In [None]:
import pandas as pd
import numpy as np
from typing import List, Dict, Tuple, Any
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from mistralai import Mistral
import time
import requests
from bs4 import BeautifulSoup
import urllib.parse
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from docx import Document
from textblob import TextBlob
from datetime import datetime, timedelta
import faiss
import numpy as np
import json
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from typing import Dict, List, Tuple
import os
import nest_asyncio

# Initialization
  - Mistral Client : model - "mistral-large-latest"
  - Embedding Model : "BAAI/bge-small-en-v1.5"
  - MistralLLM() : Custom Wrapper Object for the LlamaIndex Framework

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# Shared embedding model for the notebook: it is registered on LlamaIndex's Settings,
# used as a transformation in the Pathway pipeline, and passed to the knowledge-graph index.
# model_kwargs = {"device": "cuda"}
embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
    # model_kwargs=model_kwargs
)

In [None]:
from getpass import getpass
# Read the Mistral API key interactively so the secret never has to be stored in the notebook.
# SECURITY: this line used to carry a literal key in a trailing comment; it is removed here and
# that key should be treated as leaked and rotated.
api_key = getpass("Enter the api key in the box beside : ")
client = Mistral(api_key=api_key)

In [None]:
## Mistral API Caller

def mistral_response(prompt: str) -> str:
    """Send ``prompt`` to Mistral as a single user message and return the reply.

    The raw reply is echoed to stdout first; the returned value is the same
    text with surrounding whitespace stripped. Uses the module-level ``client``.
    """
    user_message = {
        "role": "user",
        "content": prompt,
    }
    completion = client.chat.complete(
        model="mistral-large-latest",
        temperature=0.5,  # changed from 0.7 -> 0.5
        messages=[user_message],
    )
    reply = completion.choices[0].message.content
    print(reply)
    return reply.strip()

In [None]:
## MistralLLM: A custom LLM wrapper for integrating the Mistral model with the LlamaIndex framework.

from llama_index.core.llms import (
    CustomLLM,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.core.llms.callbacks import llm_completion_callback
from llama_index.core import Settings
import time

class MistralLLM(CustomLLM):
    """LlamaIndex ``CustomLLM`` adapter that forwards prompts to the Mistral chat API.

    All requests go through the module-level ``client`` and target the
    "mistral-large-latest" model; each prompt is sent as one user message.
    """

    # Values reported to LlamaIndex through ``metadata``.
    context_window: int = 3900
    num_output: int = 256
    model_name: str = "custom"

    @property
    def metadata(self) -> LLMMetadata:
        """Describe this LLM (context window, output size, model name) to LlamaIndex."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs) -> CompletionResponse:
        """Return the full completion for ``prompt`` in one blocking call."""
        messages = [{"role": "user", "content": prompt}]
        chat_response = client.chat.complete(
            model="mistral-large-latest",
            messages=messages
        )
        response_text = chat_response.choices[0].message.content
        # Pause after every call; presumably to stay under the API rate limit.
        time.sleep(1)
        return CompletionResponse(text=response_text)

    @llm_completion_callback()
    def stream_complete(self, prompt: str, **kwargs) -> CompletionResponseGen:
        """Yield the completion for ``prompt`` incrementally.

        Each yielded ``CompletionResponse`` carries the text accumulated so far
        in ``text`` and the newest fragment in ``delta``.
        """
        messages = [{"role": "user", "content": prompt}]
        # The SDK's streaming entry point is ``chat.stream``; it yields event
        # objects whose text fragment lives at ``data.choices[0].delta.content``.
        stream = client.chat.stream(
            model="mistral-large-latest",
            messages=messages
        )
        response = ""
        for event in stream:
            delta = event.data.choices[0].delta.content or ""
            if not delta:
                # Events without text would only repeat the previous state.
                continue
            response += delta
            yield CompletionResponse(text=response, delta=delta)


In [None]:
## Register the custom MistralLLM() wrapper and the BAAI/bge-small-en-v1.5 embedding model
## on LlamaIndex's Settings object so later LlamaIndex calls pick them up.
Settings.llm = MistralLLM()
Settings.embed_model = embed_model

# Chain of Thought

In [None]:
def cot_tasks(method, prompt):
  """Ask the LLM to break ``prompt`` into a short list of subtasks.

  The same few-shot prompt is sent once and then ``max_steps`` more times;
  the shortest non-empty list of reply lines seen across all attempts is kept.

  Args:
      method: Unused; kept so existing callers (which pass ``None``) still work.
      prompt: The main task to decompose.

  Returns:
      list[str]: lower-cased, stripped, non-blank lines of the chosen reply.
      Empty only if every attempt produced an empty reply.
  """
  # Few-shot examples, sent verbatim ahead of the task description.
  system_prompt = """
  System Prompts:
  Q:= Compare the privacy policy of 2 companies Company A and Company B.
  A:= 1) Fetch the privacy policy of Company A.
      2) Fetch the privacy policy of Company B.
      3) Compare the privacy policy of Company A and Company B.
  Q:= Give the Protection of Information from the privacy policy of Company A.
  A:= 1) Fetch the privacy policy of Company A.
      2) Give the Protection of Information from the privacy policy of Company A.
  Q:= List down the Information Required to Detect Violations in the privacy policy of Company A.
  A:= 1) Fetch the privacy policy of Company A.
      2) List down the Information Required to Detect Violations in the privacy policy of Company A.
  Just give the points and do not include question prompt and A:= tag.
  """
  initial_prompt = f"""Main Task: {prompt}\nBreak this task into a list of smaller subtasks. Give only the names of the smaller subtasks and not any description of that subtask. Just give the subtasks in as least number of points as possible. Give the answer following the pattern given in System Prompts.\n"""

  thought_process = system_prompt + "\n" + initial_prompt

  max_steps = 5
  wait_time = 1

  def parse(reply):
    # One subtask per non-blank line, normalised to lower case.
    return [line.strip().lower() for line in reply.strip().split("\n") if line.strip()]

  subtasks = parse(mistral_response(thought_process))

  for _ in range(max_steps):
    try:
      reply = mistral_response(thought_process)
    except Exception:
      time.sleep(wait_time)  # Wait for wait_time seconds before retrying once
      reply = mistral_response(thought_process)
    candidate = parse(reply)
    # Prefer the shortest decomposition, but never let an empty reply replace
    # a usable list (an empty list would otherwise always count as "shortest").
    if candidate and (not subtasks or len(candidate) < len(subtasks)):
      subtasks = candidate

  return subtasks

# Search Query Retrieval

In [None]:
# NOTE(review): these credentials are hard-coded in source, so anyone with the notebook can
# use them. Consider rotating them and loading the values from the environment instead.
google_search_api = "AIzaSyBYJYj-NVuHvdnNfOsI0xTlSdpu-1bL9uM"
cx = "462d7268f48434ffa"
## Google Search
def google_search(query, k=1):
    """Return up to ``k`` hits for ``query`` from the Google Custom Search API.

    Args:
        query: Search string.
        k: Maximum number of hits to return.

    Returns:
        list[dict]: one ``{"title", "link"}`` dict per hit; fewer than ``k``
        (possibly none) when the API returns fewer items.

    Raises:
        ConnectionError: on a non-200 response or when the request itself fails.
    """
    try:
        response = requests.get("https://www.googleapis.com/customsearch/v1", params={
            "q": query,
            "key": google_search_api,
            "cx": cx
        })

        if response.status_code != 200:
            raise ConnectionError(f"Google Search API failed with status code: {response.status_code}")

        items = response.json().get("items") or []
        # Slice instead of indexing 0..k-1 so a short result page cannot raise IndexError.
        return [
            {"title": item.get("title"), "link": item.get("link")}
            for item in items[:max(k, 0)]
        ]

    except requests.exceptions.RequestException as e:
        raise ConnectionError(f"Google Search API request failed: {e}") from e

In [None]:
bing_search_api = ''

## Bing Search
def bing_search(query, k=1):
    """Return up to ``k`` hits for ``query`` from the Bing Web Search API.

    Args:
        query: Search string.
        k: Maximum number of hits to return.

    Returns:
        list[dict]: one ``{"title", "link"}`` dict per hit; fewer than ``k``
        (possibly none) when the API returns fewer web pages.

    Raises:
        ConnectionError: on a non-200 response or when the request itself fails.
    """
    try:
        headers = {"Ocp-Apim-Subscription-Key": bing_search_api}
        params = {"q": query, "textDecorations": True, "textFormat": "HTML"}

        response = requests.get("https://api.bing.microsoft.com/v7.0/search", headers=headers, params=params)

        if response.status_code != 200:
            raise ConnectionError(f"Bing Search API failed with status code: {response.status_code}")

        web_pages = response.json().get("webPages") or {}
        hits = web_pages.get("value") or []
        # Slice instead of indexing 0..k-1 so a short result page cannot raise IndexError.
        result_summary = [
            {"title": hit.get("name"), "link": hit.get("url")}
            for hit in hits[:max(k, 0)]
        ]

    except requests.exceptions.RequestException as e:
        raise ConnectionError(f"Bing Search API request failed: {e}") from e

    return result_summary

In [None]:
## Retrieve text from webpages

def get_full_text(url, word_limit=1000):
    """Download ``url`` and return the text of its ``<p>`` elements.

    Paragraph texts are joined with single spaces. When the result has more
    than ``word_limit`` whitespace-separated words, only the first
    ``word_limit`` words (re-joined with single spaces) are returned;
    otherwise the joined text is returned unchanged.

    Raises:
        ConnectionError: on a non-200 status or when the request fails.
    """
    try:
        reply = requests.get(url)
        if reply.status_code != 200:
            raise ConnectionError(f"Failed to retrieve the page content with status code: {reply.status_code}")

        parsed = BeautifulSoup(reply.content, 'html.parser')
        joined = ' '.join(tag.get_text() for tag in parsed.find_all('p'))
        tokens = joined.split()
        if len(tokens) <= word_limit:
            return joined
        return ' '.join(tokens[:word_limit])

    except requests.exceptions.RequestException as e:
        raise ConnectionError(f"Error retrieving full text from page: {e}")

In [None]:
## Wikipedia Web Crawler

def scrape_wiki(query,k=1,word_limit=1000):
    """Fetch introductory text for ``query`` from English Wikipedia.

    The page ``https://en.wikipedia.org/wiki/<query>`` is tried first and its
    first five ``<p>`` elements are used. If that request fails, the first
    Google hit for "<query> Wikipedia" is scraped instead.

    Args:
        query: Topic to look up.
        k: Unused; kept for a uniform signature with the other search backends.
        word_limit: Maximum number of words of content to return.

    Returns:
        list[dict]: a single ``{"title", "link", "content"}`` dict.

    Raises:
        ConnectionError: when the fallback search fails or returns no hit.
    """
    base_url = "https://en.wikipedia.org/wiki/"
    page_url = f"{base_url}{urllib.parse.quote(query)}"

    try:
        response = requests.get(page_url)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        paragraphs = soup.find_all('p', limit=5)
        content = ' '.join([p.get_text().strip() for p in paragraphs if p.get_text().strip()])
        words = content.split()
        limited_content = ' '.join(words[:word_limit]) if len(words) > word_limit else content

        return [{
            "title": query,
            "link": page_url,
            "content": limited_content
        }]

    except requests.exceptions.RequestException:
        # force wiki search with gsearch API
        fallback_query = query + " Wikipedia"
        result_wiki = google_search(fallback_query)
        if not result_wiki:
            # Indexing an empty hit list would raise IndexError, which search() does not catch.
            raise ConnectionError(f"No Wikipedia result found for query: {query}")
        link = result_wiki[0]['link']
        # Honour the caller's word_limit on the fallback path as well.
        text = get_full_text(link, word_limit)
        return [{
            "title": fallback_query,
            "link": link,
            "content": text
        }]

In [None]:
## Stack Exchange Web Crawler

stackexchange_api_url = 'https://api.stackexchange.com/2.3'
def scrape_stackexchange(query,k=1):
    """Collect top-voted questions tagged "privacy" from a fixed list of sites.

    NOTE(review): ``query`` and ``k`` are accepted but not used; the request
    always sends an empty ``q``. Confirm whether that is intentional.
    NOTE(review): the entries of ``sites`` are sent as the API's ``site``
    parameter; verify they are valid Stack Exchange site identifiers.

    Returns:
        list[dict]: ``{"site", "title", "link", "score"}`` per question found.
        Sites whose request or parsing fails are skipped after printing
        "StackEx error".
    """
    sites = ['privacy policies', 'law', 'legal laws','policies']
    questions_data = []

    for site in sites:
        # Build API query
        params = {
              'site': site,
              'tagged': 'privacy',
              'q': '',
              'sort': 'votes',
              'order': 'desc'
          }

        try:
            response = requests.get(f"{stackexchange_api_url}/search/advanced", params=params)
            if response.status_code == 200:
                  for question in response.json()['items']:
                      questions_data.append({
                          'site': site,
                          'title': question['title'],
                          'link': question['link'],
                          'score': question['score']
                      })
        except Exception:
            # Best-effort per site; ``Exception`` (not a bare except) keeps
            # KeyboardInterrupt and SystemExit propagating.
            print("StackEx error")

    return questions_data

In [None]:
## CFPB Web Crawler

CFPB_api_url = "https://www.consumerfinance.gov/data-research/consumer-complaints/search/api/v1/"
def scrape_CFPB(query,k=1):
    """Fetch consumer complaints from the CFPB complaint-search API.

    NOTE(review): ``query`` and ``k`` are accepted but not used; the request
    is always for company "pathway" and issue "policies" up to today's date.

    Returns:
        list[dict]: ``{"title": <issue>, "content": <state>}`` per complaint;
        empty when the API answers with a non-200 status.

    Raises:
        ConnectionError: when the request itself fails, matching the other
            search backends so that ``search()`` can catch it.
    """
    params = {
                'company': 'pathway',
                'issue': 'policies',
                'date_received_max': datetime.now().strftime('%Y-%m-%d')
            }

    try:
        response = requests.get(CFPB_api_url, params=params)
        if response.status_code != 200:
            return []
        hits = response.json()['hits']['hits']
    except requests.exceptions.RequestException as e:
        # requests' exceptions are not builtin ConnectionError; wrap them.
        raise ConnectionError(f"CFPB API request failed: {e}") from e

    complaints_data = []
    for complaint in hits:
        complaint_data = complaint['_source']
        complaints_data.append({
            'title': complaint_data.get('issue', ''),
            'content': complaint_data.get('state', ''),
        })

    return complaints_data

In [None]:
## Reddit Web Crawler

def scrape_reddit(query, k=1):
    """Search a fixed set of subreddits for ``query`` through the SocialGrep RapidAPI.

    Args:
        query: Search text, combined with each subreddit as "/r/<name>,<query>".
        k: Unused; kept for a uniform signature with the other backends.

    Returns:
        list: the decoded JSON payload of every request that succeeded.
        Failed requests are reported on stdout and skipped.
    """
    relevant_subreddits = ['privacy','PrivacyGuides', 'legal']
    url = "https://socialgrep.p.rapidapi.com/search/posts"

    # NOTE(review): hard-coded RapidAPI key; consider rotating it and loading it
    # from the environment. The headers do not vary per subreddit, so build them once.
    headers = {
      "x-rapidapi-key": "e578b1448dmshe1c37e86db71948p125563jsn1f53582a5fb1",
      "x-rapidapi-host": "socialgrep.p.rapidapi.com"
    }

    posts_data = []
    for subreddit_name in relevant_subreddits:
        querystring = {"query": f"/r/{subreddit_name},{query}"}
        try:
            # The request is inside the try so that network failures are
            # handled like decoding failures instead of aborting the loop.
            response = requests.get(url, headers=headers, params=querystring)
            posts_data.append(response.json())
        except Exception as e:
            print("Reddit API failed with error:", e)

    return posts_data


In [None]:
def search(query, k=1):
    """Run ``query`` through every search backend, in a fixed order.

    Backends are called as ``backend(query, k)`` in the order Google, Bing,
    Wikipedia, CFPB. A backend that raises ``ConnectionError`` has the error
    printed and contributes an empty dict instead of a hit list.

    Returns:
        list: one entry per backend, in call order.
    """
    collected = []
    for backend in (google_search, bing_search, scrape_wiki, scrape_CFPB):
        try:
            collected.append(backend(query, k))
            print(f"Using {backend.__name__}")
        except ConnectionError as failure:
            print(failure)
            collected.append({})
    return collected

In [None]:
## Web content retrieval through different web crawlers

def web_rag_retrieval(cot_list):
    """Fetch web content for every chain-of-thought subtask, grouped by source.

    NOTE: a later cell of this notebook defines ``web_rag_retrieval`` again;
    when all cells are run in order, that later definition replaces this one.

    Args:
        cot_list: Subtask strings. The second entry (or the first, when there
            is only one) drives the Stack Exchange lookup; every entry is sent
            through ``search()``.

    Returns:
        dict: source name -> list of two-item lists. Google/Bing entries are
        ``[link, text]``; the other sources are ``[title, text]``. An item
        that fails to process is recorded as ``['', '']``.
    """
    import re  # local import: ``re`` is not among the notebook's top-level imports

    methods = ["Google", "Bing", "Wikipedia", "ConsumerFinance"]
    link_keyed = ("Google", "Bing")
    result_dict = {'Google':[],'Bing':[],'Wikipedia':[],'StackExchange':[],'ConsumerFinance':[]}
    if not cot_list:
        return result_dict

    # Site banner text stripped from scraped Stack Exchange pages (compiled once).
    boilerplate = re.compile(r"stack exchange network consists of 183 q&a communities including stack overflow, the largest, most trusted online community for developers to learn, share their knowledge, and build their careers. now available on stack overflow for teams! ai features where you work: search, ide, and chat.\s+ask questions, find answers and collaborate at work with stack overflow for teams.\s+explore teams teams q&a for work connect and share knowledge within a single location that is structured and easy to search.")

    query = cot_list[1] if len(cot_list) > 1 else cot_list[0]
    for r in scrape_stackexchange(query,3):
        try:
            if 'link' in r:
                text = get_full_text(r['link']).lower().replace('\t','').replace('\n','')
                text = boilerplate.sub('', text)
                result_dict['StackExchange'].append([r['title'],str(text)])
        except Exception:
            print("Error")
            result_dict["StackExchange"].append(['',''])

    for query in cot_list:
        # search() returns one entry per backend, in the same order as ``methods``.
        for source, hits in zip(methods, search(query, 5)):
            for r in hits:
                try:
                    if 'link' in r:
                        text = get_full_text(r['link']).lower().replace('\t','').replace('\n','')
                    else:
                        text = r['content']
                    if source in link_keyed:
                        print(source)
                        result_dict[source].append([r['link'],str(text)])
                    else:
                        result_dict[source].append([r['title'],str(text)])
                except Exception:
                    print("Error")
                    result_dict[source].append(['',''])

    return result_dict

In [None]:
# aggregator, context relevancy and embeddings

import faiss
import numpy as np
import json
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from typing import Dict, List, Tuple

def encode_texts(texts: List[str], model_name: str = 'all-MiniLM-L6-v2') -> np.ndarray:
    """Embed ``texts`` with a sentence-transformers model.

    A ``SentenceTransformer(model_name)`` is constructed on every call; the
    encoded tensor is moved to the CPU and returned as a NumPy array.
    """
    encoder = SentenceTransformer(model_name)
    as_tensor = encoder.encode(texts, convert_to_tensor=True)
    host_array = as_tensor.cpu().numpy()
    return host_array

def filter_by_sentiment(texts: List[str], sentiment_threshold: float = 0.1) -> List[int]:
    """Return the positions of texts classified as positive.

    Each text is cut to its first 512 characters and run through the
    'distilbert-base-uncased-finetuned-sst-2-english' sentiment pipeline.
    A position is kept when the top label is 'POSITIVE' and its score is
    strictly greater than ``sentiment_threshold``.
    """
    classifier = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

    kept = []
    for position, text in enumerate(texts):
        verdict = classifier(text[:512])[0]
        if verdict['label'] != 'POSITIVE':
            continue
        if verdict['score'] > sentiment_threshold:
            kept.append(position)

    return kept

def build_faiss_index(embeddings: np.ndarray) -> faiss.IndexFlatIP:
    """Create an inner-product FAISS index holding ``embeddings``.

    ``faiss.normalize_L2`` is called on ``embeddings`` for its effect on the
    array itself (its return value is not used), so the caller's array is
    modified before the rows are added to the index.
    """
    flat_index = faiss.IndexFlatIP(embeddings.shape[1])
    faiss.normalize_L2(embeddings)
    flat_index.add(embeddings)
    return flat_index

def prepare_search_data(results_dict: Dict[str, List[List[str]]], sentiment_threshold: float) -> Tuple[List[str], List[str], List[str]]:
    """Flatten ``results_dict`` and keep only entries that pass the sentiment filter.

    Every ``[url, content]`` pair under each source becomes one row. The rows
    whose content ``filter_by_sentiment`` accepts are returned as three
    parallel lists: texts, URLs and source names.
    """
    rows = []
    for source, pairs in results_dict.items():
        for url, content in pairs:
            rows.append((content, url, source))

    # Indices (into ``rows``) of the entries that pass the sentiment filter.
    keep = filter_by_sentiment([row[0] for row in rows], sentiment_threshold)

    filtered_texts = [rows[i][0] for i in keep]
    filtered_urls = [rows[i][1] for i in keep]
    filtered_sources = [rows[i][2] for i in keep]

    return filtered_texts, filtered_urls, filtered_sources

def search_similar(query: str, index: faiss.IndexFlatIP, model_name: str, top_k: int) -> Tuple[np.ndarray, np.ndarray]:
    """Embed ``query``, L2-normalise it and look up its ``top_k`` nearest entries in ``index``.

    Returns whatever ``index.search`` returns for the single query vector.
    """
    probe = encode_texts([query], model_name)
    faiss.normalize_L2(probe)
    hits = index.search(probe, top_k)
    return hits

def format_results(scores: np.ndarray, indices: np.ndarray, texts: List[str], urls: List[str], sources: List[str]) -> List[Dict]:
    """Turn raw search output into a list of result dictionaries.

    Args:
        scores: 2-D scores; only row 0 is read.
        indices: 2-D positions into ``texts``/``urls``/``sources``; only row 0 is read.
        texts: Candidate contents.
        urls: URL (or title) for each candidate.
        sources: Source name for each candidate.

    Returns:
        list[dict]: ``{"source", "url", "content", "similarity_score"}`` per
        hit, in the order given. Negative positions are skipped.
    """
    results = []
    for score, idx in zip(scores[0], indices[0]):
        position = int(idx)
        if position < 0:
            # FAISS pads with -1 when it has fewer hits than requested; used as
            # a Python index that would silently alias the last candidate.
            continue
        results.append({
            'source': sources[position],
            'url': urls[position],
            'content': texts[position],
            'similarity_score': float(score)
        })
    return results

def aggregate_and_rank_results_faiss(
    query: str,
    results_dict: Dict[str, List[List[str]]],
    top_k: int = 5,
    sentiment_threshold: float = 0.1,
    model_name: str = 'all-MiniLM-L6-v2',
    output_file: str = 'search_results.json'
) -> List[Dict]:
    """Rank scraped texts against ``query`` and save the ranking as JSON.

    Pipeline: sentiment-filter the scraped entries, embed the survivors,
    index them with FAISS, search with the embedded query, format the hits
    and write them to ``output_file``.

    Args:
        query: Search query.
        results_dict: Mapping of source name to lists of [url, content] pairs.
        top_k: Upper bound on the number of results (capped at the number of
            texts that survive filtering).
        sentiment_threshold: Threshold forwarded to the sentiment filter.
        model_name: Sentence-transformer model used for embeddings.
        output_file: Path of the JSON file to write.

    Returns:
        The formatted results. When nothing survives filtering, a message is
        printed, nothing is written, and ``[]`` is returned.
    """
    texts, urls, sources = prepare_search_data(results_dict, sentiment_threshold)

    if len(texts) == 0:
        print("No texts passed the sentiment threshold!")
        return []

    index = build_faiss_index(encode_texts(texts, model_name))

    hit_count = min(top_k, len(texts))
    scores, indices = search_similar(query, index, model_name, hit_count)

    ranked = format_results(scores, indices, texts, urls, sources)

    with open(output_file, 'w', encoding='utf-8') as sink:
        json.dump(ranked, sink, ensure_ascii=False, indent=2)

    return ranked

In [None]:
def web_rag_retrieval(cot_list):
    """Fetch web content for every chain-of-thought subtask, with debug output.

    NOTE: an earlier cell of this notebook defines a function with the same
    name; this later definition is the one left bound after all cells run.

    Args:
        cot_list: Subtask strings. The second entry (or the first, when there
            is only one) drives the Stack Exchange lookup; every entry is sent
            through ``search()``.

    Returns:
        dict: source name -> list of two-item lists. Google/Bing entries are
        ``[link, text]``; the other sources are ``[title, text]``. An item
        that fails to process is recorded as ``['', '']``.
    """
    import re  # local import: ``re`` is not among the notebook's top-level imports

    methods = ["Google", "Bing", "Wikipedia", "ConsumerFinance"]
    link_keyed = ("Google", "Bing")
    result_dict = {'Google':[],'Bing':[],'Wikipedia':[],'StackExchange':[],'ConsumerFinance':[]}
    if not cot_list:
        return result_dict

    # Site banner text stripped from scraped Stack Exchange pages (compiled once).
    boilerplate = re.compile(r"stack exchange network consists of 183 q&a communities including stack overflow, the largest, most trusted online community for developers to learn, share their knowledge, and build their careers. now available on stack overflow for teams! ai features where you work: search, ide, and chat.\s+ask questions, find answers and collaborate at work with stack overflow for teams.\s+explore teams teams q&a for work connect and share knowledge within a single location that is structured and easy to search.")

    query = cot_list[1] if len(cot_list) > 1 else cot_list[0]
    for r in scrape_stackexchange(query,3):
        try:
            print("Search Result:", r)
            if 'link' in r:
                text = get_full_text(r['link']).lower().replace('\t','').replace('\n','')
                text = boilerplate.sub('', text)
                print("Text Result:",text)
                result_dict['StackExchange'].append([r['title'],str(text)])
        except Exception:
            print("Error")
            result_dict["StackExchange"].append(['',''])

    for query in cot_list:
        # search() returns one entry per backend, in the same order as ``methods``.
        for source, hits in zip(methods, search(query, 5)):
            for r in hits:
                try:
                    print("Search Result:", r)
                    if 'link' in r:
                        text = get_full_text(r['link']).lower().replace('\t','').replace('\n','')
                    else:
                        text = r['content']
                    # Previously the non-Google/Bing branch printed only the label.
                    print("Text Result:", text)
                    if source in link_keyed:
                        print(source)
                        result_dict[source].append([r['link'],str(text)])
                    else:
                        result_dict[source].append([r['title'],str(text)])
                except Exception:
                    print("Error")
                    result_dict[source].append(['',''])

    return result_dict

# Retrieval Pipeline - Using Pathway

In [None]:
import pathway as pw

# the static data has to be present under the ./data directory
# Sources handed to the Pathway vector store server: a single filesystem connector over
# ./data, read as binary in "streaming" mode with file metadata attached.
data_sources = []
data_sources.append(
    pw.io.fs.read(
        "./data",
        format="binary",
        mode="streaming",
        with_metadata=True,
    )
)

In [None]:
from pathway.xpacks.llm.vector_store import VectorStoreServer
from llama_index.core.node_parser import TokenTextSplitter

# Transformations applied to the source documents: split into 400-token chunks with a
# 100-token overlap, then embed each chunk with the shared embedding model.
transformations_example = [
    TokenTextSplitter(
        chunk_size=400,
        chunk_overlap=100,
        separator=" ",
    ),
    embed_model,
]

## Data Preprocessing Pipeline
processing_pipeline = VectorStoreServer.from_llamaindex_components(
    *data_sources,
    transformations=transformations_example,
)

## Setting up Pathway VectorStore Server
# Served on localhost:8754 with threaded=True and caching off; the retriever created
# afterwards connects to this same host and port.
PATHWAY_HOST = "127.0.0.1"
PATHWAY_PORT = 8754
processing_pipeline.run_server(
    host=PATHWAY_HOST, port=PATHWAY_PORT, with_cache=False, threaded=True
)

In [None]:
from llama_index.retrievers.pathway import PathwayRetriever

# Initialize the Pathway retriever, pointed at the host/port of the Pathway VectorStore
# server started above; rag_query() reads this module-level object.
retriever = PathwayRetriever(host=PATHWAY_HOST, port=PATHWAY_PORT,)

In [None]:
def rag_query(query_input):
    """Look up ``query_input`` in the Pathway vector store.

    Args:
        query_input: The question, as a string.

    Returns:
        str: the ``node.text`` of every retrieved result, concatenated in
        retrieval order with no separator.
    """
    retrieved = retriever.retrieve(str_or_query_bundle=query_input)
    return "".join(hit.node.text for hit in retrieved)

In [None]:
# Testing: smoke-test the Pathway-backed retriever with a sample question.
rag_query("Does dallascollege store cookies?")

In [None]:
def get_and_dump_retrieved_text(query_input,filename="retrieved.txt"):
    """Retrieve context for ``query_input`` and write it to ``/content/temp/<filename>``.

    Args:
        query_input: The question passed to ``rag_query``.
        filename: Name of the file created inside ``/content/temp``.

    Returns:
        None on success. On failure the caught exception object is returned
        rather than raised, so callers that ignore the return value keep
        running; check the return value to detect errors.
    """
    import os

    try:
        retrieved_data = rag_query(query_input)
        # The dump directory is also the input of the evidence-graph step;
        # create it on first use instead of failing.
        os.makedirs("/content/temp", exist_ok=True)
        with open(f"/content/temp/{filename}", 'w', encoding="utf-8") as f:
            f.write(retrieved_data)
    except Exception as e:
        return e

In [None]:
# Testing: dump the retrieved context for a sample question to
# /content/temp/retrieved.txt (the default filename).
get_and_dump_retrieved_text("What information does Yahoo store?")


# Knowledge Graph

In [None]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core import KnowledgeGraphIndex
from llama_index.core.graph_stores import SimpleGraphStore
from llama_index.core.storage.storage_context import StorageContext

In [None]:
## Builds the evidence graph from the files dumped into the given directory (default: /content/temp/)
def generate_evidence_graph(filepath="/content/temp/"):
    """Build a ``KnowledgeGraphIndex`` from every document under ``filepath``.

    Documents are loaded with ``SimpleDirectoryReader``; the index is created
    over a fresh ``SimpleGraphStore`` with at most 3 triplets per chunk,
    using the shared ``embed_model`` and with embeddings included. The
    storage context is persisted before the index is returned.
    """
    docs = SimpleDirectoryReader(filepath).load_data()
    context = StorageContext.from_defaults(graph_store=SimpleGraphStore())
    kg_index = KnowledgeGraphIndex.from_documents(
        documents=docs,
        max_triplets_per_chunk=3,
        storage_context=context,
        embed_model=embed_model,
        include_embeddings=True
    )
    context.persist()

    return kg_index


In [None]:
## Visualizes the graph network
# NOTE(review): `index` is not assigned at notebook level before this cell (the first
# notebook-level assignment is in a later "Testing" cell), so running the cells
# top-to-bottom raises NameError at `index.get_networkx_graph()`.
from pyvis.network import Network
from IPython.display import display
import IPython
g = index.get_networkx_graph()
net = Network(notebook=True,cdn_resources="in_line",directed=True)
net.from_nx(g)
# The graph is written twice (graph.html and Knowledge_graph.html, both relative paths);
# the HTML is then loaded from /content — presumably the working directory on Colab, confirm.
net.show("graph.html")
net.save_graph("Knowledge_graph.html")
IPython.display.HTML(filename="/content/Knowledge_graph.html")

In [None]:
## Retrieves all relevant nodes for the user query and generates the context used for final response generation
def retrieve_nodes(query):
    """Query the module-level knowledge-graph ``index`` and return its response.

    The query engine is built with text included, "tree_summarize" response
    mode, "hybrid" embedding mode and a similarity top-k of 5.
    """
    engine_options = dict(
        include_text=True,
        response_mode="tree_summarize",
        embedding_mode="hybrid",
        similarity_top_k=5,
    )
    return index.as_query_engine(**engine_options).query(query)

In [None]:
# Sample query against the current module-level knowledge-graph index.
response = retrieve_nodes("What information does Yahoo Store?")

In [None]:
# Show the query-engine answer obtained in the previous cell.
print(response)

# Combining RAG+Evidence Graph

In [None]:
## Combining RAG + Evidence Graph Generator
def generate_context(query):
    """Dump retrieved text for ``query``, rebuild the evidence graph and query it.

    Returns:
        The query-engine response on success. On failure the caught exception
        object is returned rather than raised.
    """
    # retrieve_nodes() reads the module-level ``index``; without this
    # declaration the freshly built graph would only be bound to a local
    # name and the query would run against a stale (or undefined) index.
    global index
    try:
        get_and_dump_retrieved_text(query)
        time.sleep(0.5)
        index = generate_evidence_graph()
        return retrieve_nodes(query)
    except Exception as e:
        return e

In [None]:
#Testing
# End-to-end check: dump retrieved text, build the graph into the notebook-level `index`
# (the name retrieve_nodes() reads), then render it with pyvis.
get_and_dump_retrieved_text("What information does Yahoo Store?")
index = generate_evidence_graph()
g = index.get_networkx_graph()
net = Network(notebook=True,cdn_resources="in_line",directed=True)
net.from_nx(g)
net.show("graph.html")
net.save_graph("Knowledge_graph.html")
# Loaded from /content — presumably the working directory on Colab, confirm.
IPython.display.HTML(filename="/content/Knowledge_graph.html")

In [None]:
# Query the graph built in the previous cell and show the answer.
response = retrieve_nodes("What information does Yahoo Store?")
print(response)

In [None]:
# Second example: same dump -> graph -> render sequence for a different question.
# `query` and `index` are notebook-level names reused by the next cell.
query = "What are the online privacy policy guidelines at Bank of America?"
get_and_dump_retrieved_text(query)
index = generate_evidence_graph()
g = index.get_networkx_graph()
net = Network(notebook=True,cdn_resources="in_line",directed=True)
net.from_nx(g)
net.show("graph.html")
net.save_graph("Knowledge_graph.html")
# Loaded from /content — presumably the working directory on Colab, confirm.
IPython.display.HTML(filename="/content/Knowledge_graph.html")

In [None]:
# Answer the notebook-level `query` from the graph built in the previous cell.
response = retrieve_nodes(query)
print(response)

# Final Response Generation

In [None]:
import threading
import time

In [None]:
## Main Function
def llm_query_2(query):
    start = time.time()
    get_and_dump_retrieved_text(query)
    cot_list = cot_tasks(None, query)
    result_dict = web_rag_retrieval(cot_list)
    results = aggregate_and_rank_results_faiss(
        query=query,
        results_dict=result_dict,
        top_k=5,
        sentiment_threshold=0.1
    )

    with open('/content/search_results.json', 'r') as file:
        entry = json.load(file)

    content_list_web_rag_retrieval = [
        entry[i]['content'].strip().lower() for i in range(len(entry))
    ]
    content = ''.join(f"Content: {entry}\n" for entry in content_list_web_rag_retrieval)

    with open("/content/temp/search.txt", "w") as f:
        f.write(content)

    index = generate_evidence_graph()
    g = index.get_networkx_graph()
    net = Network(notebook=True,cdn_resources="in_line",directed=True)
    net.from_nx(g)
    net.show("graph.html")
    net.save_graph("Knowledge_graph.html")
    IPython.display.HTML(filename="/content/Knowledge_graph.html")

    time.sleep(1)
    context = retrieve_nodes(query)

    prompt = f"""
    Context information is below.
    ---------------------
    {context}
    ---------------------
    Given the context information and not prior knowledge, answer the query.
    Query: {query}
    Answer:
    """
    final_result = mistral_response(prompt)

    end = time.time()
    time_taken = end - start

    return final_result, time_taken

In [None]:
# Run the full pipeline on a sample question; keeps the answer and the elapsed seconds.
final_result, time_taken = llm_query_2("What information does Yahoo store?")

In [None]:
!pip install codecov
!pip install guardrails

In [None]:
from guardrails.hub import LogicCheck
from guardrails import Guard

guard = Guard.use(
    LogicCheck()
)

# Validate the LLM response using Guardrails
# NOTE(review): this validates the notebook-level `response` (the last query-engine
# answer), not `final_result` from llm_query_2 — confirm which one is intended.
llm_response = str(response)  # was referenced below without ever being defined
validation_result = guard.validate(llm_response)

# The validation outcome exposes `validation_passed` and `error`.
if validation_result.validation_passed:
    print("LLM response is valid:", llm_response)
else:
    print("LLM response failed validation:", llm_response)
    print("Reason:", validation_result.error)

In [None]:
# Show the final answer and how long llm_query_2 took (seconds).
print(final_result)
print(f"Time Taken:{time_taken}")