In [1]:
# pip install langchain faiss-cpu

# for macOS users, you might need to install faiss with the following command
# pip install faiss-cpu --no-binary :all:

Your markdown or comments here

In [2]:
import os
import time
import numpy as np
from sklearn.decomposition import PCA
import yt_dlp
import faiss
import openai

from langchain_community.document_loaders import PyPDFLoader, YoutubeLoader, UnstructuredPowerPointLoader 
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import GitLoader
from langchain.embeddings.base import Embeddings
from langchain.docstore import InMemoryDocstore
from langchain.vectorstores.faiss import FAISS

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import WebDriverException, TimeoutException, StaleElementReferenceException

from webdriver_manager.chrome import ChromeDriverManager

from urllib.parse import urljoin, urlparse

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [3]:
from dotenv import load_dotenv

load_dotenv()
# --- Step 2: Load environment variables from .env file
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

In [4]:
# WEB CRAWLER
print("Web Crawler 1 started ....")

def crawl_internal_links(start_url, max_pages=50, max_depth=1):
    """
    Crawl internal URLs from a site using Selenium, with support for JavaScript-heavy pages.

    Args:
        start_url (str): The URL to start crawling from.
        max_pages (int): Max number of pages to crawl.
        max_depth (int): Max depth to crawl (0 = just root).

    Returns:
        list: A list of internal URLs that were successfully visited.
    """
    # Selenium Headless Browser Setup
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    visited = set()
    domain = urlparse(start_url).netloc
    to_visit = [(start_url, 0)]

    while to_visit and len(visited) < max_pages:
        url, depth = to_visit.pop(0)
        if url in visited or depth > max_depth:
            continue

        try:
            driver.get(url)
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "a")))
            time.sleep(1)  # 🔁 Wait for JS to load

            print(f"Visited ({len(visited)+1}/{max_pages}), Depth {depth}): {url}")
            visited.add(url)

            # If max depth reached, skip link extraction
            if depth == max_depth:
                continue

            # Extract and queue internal links
            links = driver.find_elements(By.TAG_NAME, "a")
            for link in links:
                try:
                    href = link.get_attribute("href")
                    if not href or href.startswith(("mailto:", "tel:", "javascript:")):
                        continue

                    parsed = urlparse(href)
                    if parsed.netloc == domain or parsed.netloc == "":
                        full_url = urljoin(url, href).split("#")[0]
                        if full_url not in visited and all(full_url != q[0] for q in to_visit):
                            to_visit.append((full_url, depth + 1))
                except StaleElementReferenceException:
                    continue

        except (WebDriverException, TimeoutException):
            print(f"⚠️ Skipping (Error): {url}")
            visited.add(url)
            continue

    driver.quit()

    # Print summary
    print(f"\n Total unique internal URLs visited: {len(visited)}")

    if len(visited) < max_pages:
        print("⚠️ Number of crawled URLs is less than max_pages. Possible reasons:")
        print("- Site may not have enough unique pages within the allowed depth.")
        print("- Some links might be hidden behind JavaScript interactions.")
        print("- Some links could be blocked, inaccessible, or slow-loading.")
        print("- Your max_depth may be too shallow to discover deeper links, try changing depth.")
    
    return list(visited)

#### CONFIGURABLE SETTINGS 
start_url = "https://fidesinnova.io/"
max_pages = 100   # Maximum number of pages to visit
max_depth = 2    # 0 = only root, 1 = root + links from root

####  Run the Crawler
web_docs_list1 = crawl_internal_links(start_url, max_pages, max_depth)

def change_web_to_meta_data(doc):
    """
    Change the metadata of the web documents to include the content type.
    """
    doc.metadata["type"] = "Web"
    # remove \n from the content
    doc.page_content = doc.page_content.replace("\n", " ")
    # remove multiple spaces
    doc.page_content = ' '.join(doc.page_content.split())
    return doc

#  Load the web documents
web_docs1 = []
for idx in web_docs_list1:
    a = WebBaseLoader(idx)
    try:
        temp_docs = a.load()
        temp_docs = list(map(change_web_to_meta_data, temp_docs))

        web_docs1.extend(temp_docs)
        print(idx + " is loaded.")
    except:
        print(f"{idx} is not loaded.")

Web Crawler 1 started ....
Visited (1/100), Depth 0): https://fidesinnova.io/
Visited (2/100), Depth 1): https://fidesinnova.io/devices/
Visited (3/100), Depth 1): https://fidesinnova.io/courses/
Visited (4/100), Depth 1): https://fidesinnova.io/Contacts
Visited (5/100), Depth 1): https://fidesinnova.io/About
Visited (6/100), Depth 1): https://fidesinnova.io/Contacts/
Visited (7/100), Depth 1): https://fidesinnova.io/launch-your-iot-device-in-under-a-week-with-fidesinnova-platform-the-fast-track-to-market/
Visited (8/100), Depth 1): https://fidesinnova.io/category/fidesinnova/
Visited (9/100), Depth 1): https://fidesinnova.io/unlocking-the-power-of-digital-twins-in-smart-building-management/
Visited (10/100), Depth 1): https://fidesinnova.io/data-monetization-in-the-fidesinnova-ecosystem-unlocking-new-revenue-streams/
Visited (11/100), Depth 1): https://fidesinnova.io/blog-standard/
Visited (12/100), Depth 2): https://fidesinnova.io/devices-ecardv2/
Visited (13/100), Depth 2): https://

In [5]:
web_docs1

[Document(metadata={'source': 'https://fidesinnova.io/mqtt-mqtts/', 'title': 'MQTT and MQTTS protocols - FidesInnova', 'language': 'en-US', 'type': 'Web'}, page_content='MQTT and MQTTS protocols - FidesInnova Skip to content Skip to footer FidesInnovaA platform for decentralized trusted IoT systems Products Network ZKP ExplorerMobile AppIoT Serverzk-IoT ZKP SDKSample IoT Device Ecosystem Sensors (zkSensor)Transportation (MotionCertified)Oil&Gas (EnergyWiseNetwork)Machine Learning (TrustLearn)Smart Home (TrustSense)Partnership Resources WiKiGithubArticlesAcademyzk-IoT Paper About Us Contact UsAbout Us FidesInnovaA platform for decentralized trusted IoT systems FidesInnovaA platform for decentralized trusted IoT systems Close Products Network ZKP ExplorerMobile AppIoT Serverzk-IoT ZKP SDKSample IoT Device Ecosystem Sensors (zkSensor)Transportation (MotionCertified)Oil&Gas (EnergyWiseNetwork)Machine Learning (TrustLearn)Smart Home (TrustSense)Partnership Resources WiKiGithubArticlesAcadem

In [6]:

print("Web Crawler 2 started ....")

# Headless setup
options = Options()
options.add_argument("--headless=new")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--window-size=1920,1080")
options.add_argument("--log-level=3")
options.add_argument("--disable-blink-features=AutomationControlled")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

base_url = "https://www.fidesinnova.io/"
visited = set()
web_docs_list2 = set()

def is_valid_internal(url):
    parsed = urlparse(url)
    return parsed.scheme in ["http", "https"] and parsed.netloc.endswith("fidesinnova.io")

def crawl(url):
    if url in visited:
        return
    visited.add(url)

    try:
        driver.get(url)
        time.sleep(2)  # Let dynamic content load

        links = driver.find_elements("tag name", "a")
        for link in links:
            href = link.get_attribute("href")
            if href:
                full_url = urljoin(url, href.split("#")[0])  # remove fragments
                if is_valid_internal(full_url) and full_url not in web_docs_list2:
                    web_docs_list2.add(full_url)
                    crawl(full_url)

    except Exception as e:
        print(f"Error visiting {url}: {e}")

# Start
crawl(base_url)

driver.quit()

# Output
print("\n=== Discovered Internal URLs ===")
for url in sorted(web_docs_list2):
    print(url)
print(f"\nTotal internal URLs web_docs_list2: {len(web_docs_list2)}")


#  Load the web documents
web_docs2 = []
for idx in web_docs_list2:
    a = WebBaseLoader(idx)
    try:
        temp_docs = a.load()
        temp_docs = list(map(change_web_to_meta_data, temp_docs))

        web_docs2.extend(temp_docs)
        print(idx + " is loaded.")
    except:
        print(f"{idx} is not loaded.")


Web Crawler 2 started ....
Error visiting https://fidesinnova.io/: Message: stale element reference: stale element not found
  (Session info: chrome=137.0.7151.57); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#stale-element-reference-exception
Stacktrace:
0   chromedriver                        0x00000001045fa654 cxxbridge1$str$ptr + 2723108
1   chromedriver                        0x00000001045f28c8 cxxbridge1$str$ptr + 2690968
2   chromedriver                        0x0000000104146714 cxxbridge1$string$len + 90428
3   chromedriver                        0x000000010415714c cxxbridge1$string$len + 158580
4   chromedriver                        0x0000000104156228 cxxbridge1$string$len + 154704
5   chromedriver                        0x000000010414ccdc cxxbridge1$string$len + 116484
6   chromedriver                        0x000000010414cde8 cxxbridge1$string$len + 116752
7   chromedriver                        0x000

In [7]:
web_docs2

[Document(metadata={'source': 'https://explorer.fidesinnova.io/', 'title': 'Fides Innova Explorer', 'description': 'Explore the power of FidesInnova’s blockchain platform, designed to ensure secure, transparent, and verifiable computations for IoT devices. With zk-IoT technology, trust and reliability are at the core of every interaction—whether managing smart devices or ensuring the accuracy of data across industries like healthcare, transportation, and energy.', 'language': 'en', 'type': 'Web'}, page_content='Fides Innova ExplorerYou need to enable JavaScript to run this app.'),
 Document(metadata={'source': 'https://fidesinnova.io/', 'title': 'FidesInnova-Verified Computations for Reliable Results - FidesInnova', 'language': 'en-US', 'type': 'Web'}, page_content='FidesInnova-Verified Computations for Reliable Results - FidesInnova Skip to content Skip to footer Products Network ZKP ExplorerMobile AppIoT Serverzk-IoT ZKP SDKSample IoT Device Ecosystem Sensors (zkSensor)Transportation

In [8]:
print("GitHub Crawler started ....")

# loading GitHub Repos
github_repos = [
    "https://github.com/TheArchitect2000/Fides-Innova-WiKi",
    "https://github.com/TheArchitect2000/Blockchain-based-IoT-Server",
    "https://github.com/TheArchitect2000/zk-IoT",
    "https://github.com/TheArchitect2000/Smart-Contract-Protocol",
#    "https://github.com/TheArchitect2000/zkiot-riscv-qemu-c", 
#    "https://github.com/TheArchitect2000/ZKP-Blockchain-Explorer",
#    "https://github.com/TheArchitect2000/evm-server",
#    "https://github.com/TheArchitect2000/New-IoT-Device-Integration",
 #   "https://github.com/TheArchitect2000/zkiot-riscv-qemu-rust"
]

def change_GitHub_to_meta_data(doc):
    """
    Change the metadata of the web documents to include the content type.
    """
    doc.metadata["type"] = "GitHub"
    # remove \n from the content
    doc.page_content = doc.page_content.replace("\n", " ")
    # remove multiple spaces
    doc.page_content = ' '.join(doc.page_content.split())
    return doc

github_docs = []
for url in github_repos:
    print(f"📥 Loading repository: {url}")
    repo_name = url.split("/")[-1]
    local_path = f"./cloned_repos/{repo_name}"

    loader = GitLoader(
        repo_path=local_path,
        clone_url=url,
        branch="main",
        file_filter=lambda f: f.endswith((
            ".py", ".md", ".c", ".cpp", ".rs", ".json", ".html",
            ".js", ".ts", ".css", ".java", ".txt", ".yml", ".yaml", ".sh"
        ))
    )
    
    temp_docs = loader.load()
    temp_docs = list(map(change_GitHub_to_meta_data, temp_docs))
    github_docs.extend(temp_docs)
    
    print(f"✅ Loaded {len(github_docs)} documents from {repo_name}")

GitHub Crawler started ....
📥 Loading repository: https://github.com/TheArchitect2000/Fides-Innova-WiKi
✅ Loaded 41 documents from Fides-Innova-WiKi
📥 Loading repository: https://github.com/TheArchitect2000/Blockchain-based-IoT-Server
✅ Loaded 976 documents from Blockchain-based-IoT-Server
📥 Loading repository: https://github.com/TheArchitect2000/zk-IoT
✅ Loaded 1112 documents from zk-IoT
📥 Loading repository: https://github.com/TheArchitect2000/Smart-Contract-Protocol
✅ Loaded 1114 documents from Smart-Contract-Protocol


In [9]:
github_docs

[Document(metadata={'source': 'README.md', 'file_path': 'README.md', 'file_name': 'README.md', 'file_type': '.md', 'type': 'GitHub'}, page_content='<p align="center"> <a href="https://fidesinnova.io/" target="blank"><img src="g-c-web-back.png" /></a> </p> # FidesInnova Wiki <a href="https://www.npmjs.com/~nestjscore" target="_blank"><img src="https://img.shields.io/npm/v/@nestjs/core.svg" alt="NPM Version" /></a> <a href="https://www.npmjs.com/~nestjscore" target="_blank"><img src="https://img.shields.io/npm/l/@nestjs/core.svg" alt="Package License" /></a> <a href="https://www.npmjs.com/~nestjscore" target="_blank"><img src="https://img.shields.io/npm/dm/@nestjs/common.svg" alt="NPM Downloads" /></a> <a href="https://circleci.com/gh/nestjs/nest" target="_blank"><img src="https://img.shields.io/circleci/build/github/nestjs/nest/master" alt="CircleCI" /></a> <a href="https://coveralls.io/github/nestjs/nest?branch=master" target="_blank"><img src="https://coveralls.io/repos/github/nestjs/

In [10]:
print("YouTube Crawler started ....")

# Fides Innova YouTube Channel ID
CHANNEL_ID = "UCrrqGYx98H1dPdZsNb1i9-g"
CHANNEL_URL = f"https://www.youtube.com/channel/{CHANNEL_ID}"

def fetch_video_urls(channel_url: str):
    ydl_opts = {
        'ignoreerrors': True,
        'quiet': True,
        'extract_flat': True,  # Only metadata, not downloading
        'force_generic_extractor': False,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        result = ydl.extract_info(channel_url, download=False)
        video_urls = []
        video_ids = []

        if 'entries' in result:
            for entry in result['entries']:
                if entry and 'id' in entry:
                    video_url = f"https://www.youtube.com/watch?v={entry['id']}"
                    video_urls.append(video_url)
                    video_ids.append(entry['id'])
        return video_ids

# Fetch and display video URLs
video_ids = fetch_video_urls(CHANNEL_URL)
print(f"✅ Found {len(video_ids)} videos on channel.\n")

def change_YouTube_to_meta_data(doc):
    doc.metadata['type']='YouTube'
    return doc

youtube_docs = []
second_try_idx = []
third_try_idx = []
for idx in video_ids:
    a = YoutubeLoader(idx)
    try:
        temp_docs = a.load()
        temp_docs = list(map(change_YouTube_to_meta_data, temp_docs))
        youtube_docs.extend(temp_docs)
        print(idx + " is loaded.")
    except:
        print(f"{idx} is not loaded.")
        second_try_idx.append(str(idx))
        print("\n unloaded list ")
        print(second_try_idx)
        print("\n")

print("\n")
print("second try for unloded videos.")

for idx in second_try_idx:
    print("waiting 1 second ...")
    time.sleep(1)
    a = YoutubeLoader(idx)
    try:
        temp_docs = a.load()
        temp_docs = list(map(change_YouTube_to_meta_data, temp_docs))
        youtube_docs.extend(temp_docs)
        print(idx + " is loaded in the second try.")
    except:
        print("\n")
        print(f"{idx} is not loaded in the second try.")
        third_try_idx.append(str(idx))
        print("\n unloaded list for the second time ")
        print(second_try_idx)
        print("\n")

print("=== ALL youtube videos ===)")
print(len(youtube_docs))
print("=== Loaded youtube videos ===)")
print(len(youtube_docs)-len(third_try_idx))

YouTube Crawler started ....
✅ Found 29 videos on channel.

kgYxyxeDNl4 is loaded.
VZMBE2NLSC4 is loaded.
FtELZXp3qmQ is not loaded.

 unloaded list 
['FtELZXp3qmQ']


loHb12HmJas is not loaded.

 unloaded list 
['FtELZXp3qmQ', 'loHb12HmJas']


YpfFHI3Ivmo is not loaded.

 unloaded list 
['FtELZXp3qmQ', 'loHb12HmJas', 'YpfFHI3Ivmo']


sPWmDRItCNk is not loaded.

 unloaded list 
['FtELZXp3qmQ', 'loHb12HmJas', 'YpfFHI3Ivmo', 'sPWmDRItCNk']


1DAkWOP_uY4 is not loaded.

 unloaded list 
['FtELZXp3qmQ', 'loHb12HmJas', 'YpfFHI3Ivmo', 'sPWmDRItCNk', '1DAkWOP_uY4']


2R_bDZ0sasM is not loaded.

 unloaded list 
['FtELZXp3qmQ', 'loHb12HmJas', 'YpfFHI3Ivmo', 'sPWmDRItCNk', '1DAkWOP_uY4', '2R_bDZ0sasM']


3GVfyu4uzhs is not loaded.

 unloaded list 
['FtELZXp3qmQ', 'loHb12HmJas', 'YpfFHI3Ivmo', 'sPWmDRItCNk', '1DAkWOP_uY4', '2R_bDZ0sasM', '3GVfyu4uzhs']


c8dQtxByOcY is loaded.
RcaMNALojIo is not loaded.

 unloaded list 
['FtELZXp3qmQ', 'loHb12HmJas', 'YpfFHI3Ivmo', 'sPWmDRItCNk', '1DAkWOP_uY4', '2

In [11]:
youtube_docs

[Document(metadata={'source': 'kgYxyxeDNl4', 'type': 'YouTube'}, page_content="hello and welcome in this video we will guide you through the process of integrating ZK iot with zimmons iot 2050 that consist of four steps step one setting up the iot server and device first creating an account on an iot server such as the motion certified panel next connect to the iot 2050 device yeah to prepare it for the following steps before proceeding install git and clone the required Repository now that we have the repository let's install iot 2050 using the setup script follow the onscreen instructions to complete the installation e a step two generating a commitment before generating a commitment we need a we need to install the required dependencies run the following command to install the necessary libraries now that the dependencies are installed and the environment is set up we need to configure the device edit the device config.js file and fill out the iot developer name fi with your company

In [12]:
# --- Step 6: Load PDFs ---
print("PDF Crawler started ....")

pdf_docs = []
pdf_files = [
    "PDF/zkIoT.pdf",
    "PDF/Consensus Algorithms.pdf",
    "PDF/Data Monetization.pdf",
    "pdf/Decentralized Delegated Proof.pdf",
    "pdf/Digital Twins.pdf",
    "pdf/Fides service contracts.pdf",
    "pdf/fides_innova_gitbook_placeholder.pdf",
    "pdf/IEEEpaper_RAMS2017.pdf",
    "pdf/IoT Startups.pdf",
    "pdf/MIoTN.pdf",
    "pdf/MQTT and MQTTS protocols.pdf",
    "pdf/p86-chatzopoulos.pdf",
    "pdf/Service Contract .pdf",
    "pdf/Service Market.pdf",
    "pdf/What’s Web 3.0.pdf"
]

for path in pdf_files:
    try:
        loader = PyPDFLoader(path)
        pdf_docs.extend(loader.load())
        print(len(pdf_docs))
    except Exception as e:
        print(f"Error loading PDF {path}: {e}")

def change_pdf_doc(doc):
    doc.metadata['type']='PDF'
    return doc

pdf_docs = list(map(change_pdf_doc, pdf_docs))

print("=== ALL PDF docs ===)")
print(len(pdf_docs))

PDF Crawler started ....
7
11
17
21
27
32
33
40
45
51
55
58
66
73
77
=== ALL PDF docs ===)
77


In [13]:
pdf_docs

[Document(metadata={'producer': 'pdfTeX-1.40.20; modified using iText® Core 7.2.4 (AGPL version) ©2000-2022 iText Group NV', 'creator': 'Appligent StampPDF Batch 7.0.3', 'creationdate': '2024-08-12T13:42:43-04:00', 'meeting starting date': '27 May 2024', 'moddate': '2024-08-18T06:27:04-04:00', 'ieee article id': '10634342', 'trapped': 'False', 'ieee issue id': '10634334', 'subject': '2024 IEEE International Conference on Blockchain and Cryptocurrency (ICBC);2024; ; ;10.1109/ICBC59979.2024.10634342', 'ieee publication id': '10634319', 'title': 'zk-IoT: Securing the Internet of Things with Zero-Knowledge Proofs on Blockchain Platforms', 'meeting ending date': '31 May 2024', 'appligent': 'StampPDF Batch 7.0.3 Linux 64 bit Feb 17 2022 Library APDFL18.0.3d', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.20 (TeX Live 2019/W32TeX) kpathsea version 6.3.1', 'source': 'PDF/zkIoT.pdf', 'total_pages': 7, 'page': 0, 'page_label': '1', 'type': 'PDF'}, page_content='zk-IoT: Securing

In [14]:
# --- Step 6: Load PDFs ---
print("PPTX Crawler started ....")

pptx_docs = []
pptx_files = [
    "PPTX/FidesinnovaDeck-v11.pptx"
]

for path in pptx_files:
    try:
        loader = UnstructuredPowerPointLoader(path)
        pptx_docs.extend(loader.load())
        print(f"✅ Loaded: {path}")
    except Exception as e:
        print(f"❌ Error loading PPTX {path}: {e}")

# Add metadata
def change_pptx_doc(doc):
    doc.metadata['type'] = 'PPTX'
    return doc

pptx_docs = list(map(change_pptx_doc, pptx_docs))

print("=== ALL PPTX docs ===)")
print(len(pptx_docs))

PPTX Crawler started ....
✅ Loaded: PPTX/FidesinnovaDeck-v11.pptx
=== ALL PPTX docs ===)
1


In [15]:
pptx_docs

[Document(metadata={'source': 'PPTX/FidesinnovaDeck-v11.pptx', 'type': 'PPTX'}, page_content="FIDES INNOVA\n\nVerifiable Computing Ecosystem\n\nW W W . F I D E S I N N O V A . I O\n\n\n\nFIDES INNOVA LABS\n\nThe Future is Verifiable\n\nA pioneer in enabling secure computation across IoT and ML \n\nRevolutionizing trust by introducing Agent-based Verifiable Computing \n\nleveraging zero-knowledge proof (ZKP), DePIN and Agentic AI. \n\n\n\nFOUNDERS\n\nMel McCann\n\nCo-founder\n\nReza Ramezan, Ph.D.\n\nCo-Founder\n\nFormer Research Lead at Cardano Foundation\n\nWith over 20 years of executive experience at leading firms like AT&T, Huawei, Telus, and Cardano Foundation, Reza is a proven innovator and entrepreneur. He holds a Ph.D. from the University of British Columbia and has numerous patents and publications in blockchain, zero- knowledge proofs, and IoT.\n\nHamed Shah-Mansouri, Ph.D.\n\nCo-founder\n\nWith over 10 years of research and development experience, Hamed is a C-level executiv

In [16]:
# --- Step 7: Split & Vectorize ---
print("Splitter and db started ....")
# all_docs =  youtube_docs + pdf_docs + pptx_docs + web_docs1 + web_docs2 + github_docs

splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 100)
print("Splitting web_docs1 ...")
split_web_docs1 = splitter.split_documents(web_docs1)
print("Splitting web_docs2 ...")
split_web_docs2 = splitter.split_documents(web_docs2)
print("Splitting github_docs ...")
split_github_docs = splitter.split_documents(github_docs)
print("Splitting youtube_docs ...")
split_youtube_docs = splitter.split_documents(youtube_docs)
print("Splitting pdf_docs ...")
split_pdf_docs = splitter.split_documents(pdf_docs)
print("Splitting pptx_docs ...")
split_pptx_docs = splitter.split_documents(pptx_docs)


# --- Step 8: Store in Vector DB ---
# Combine all split documents
all_split_docs = (
    split_youtube_docs +
    split_pdf_docs +
    split_pptx_docs +
    split_web_docs1 +
    split_web_docs2 +
    split_github_docs
)
print("Splitting done.")


Splitter and db started ....
Splitting web_docs1 ...
Splitting web_docs2 ...
Splitting github_docs ...
Splitting youtube_docs ...
Splitting pdf_docs ...
Splitting pptx_docs ...
Splitting done.


In [17]:
# ---------------------- PCA Wrapper ----------------------
class PCAWrapper(Embeddings):
    def __init__(self, base_embedder, n_components=256):
        self.base_embedder = base_embedder
        self.pca = PCA(n_components=n_components)
        self.fitted = False

    def fit(self, vectors):
        self.pca.fit(vectors)
        self.fitted = True

    def embed_documents(self, texts):
        raw_vectors = self.base_embedder.embed_documents(texts)
        if not self.fitted:
            raise ValueError("PCAWrapper must be fitted before calling embed_documents.")
        reduced_vectors = self.pca.transform(raw_vectors)
        return reduced_vectors.tolist()

    def embed_query(self, text):
        raw_vector = self.base_embedder.embed_query(text)
        if self.fitted:
            return self.pca.transform([raw_vector])[0].tolist()
        else:
            return raw_vector  # Return original until PCA is trained


In [18]:
# ---------------------- Batching Utility ----------------------
def batch(iterable, batch_size):
    for i in range(0, len(iterable), batch_size):
        yield iterable[i:i + batch_size]

In [19]:
# ---------------------- Document Preparation ----------------------
texts = [d.page_content for d in all_split_docs]
metadatas = [d.metadata for d in all_split_docs]

In [20]:
# ---------------------- Step 1: Embed All for PCA ----------------------
base_embedder = OpenAIEmbeddings(model="text-embedding-3-large")
all_raw_vectors = []
for chunk in batch(texts, 200):  # Larger batch for training PCA
    vectors = base_embedder.embed_documents(chunk)
    all_raw_vectors.extend(vectors)

In [21]:
# ---------------------- Step 2: Fit PCA ----------------------
pca_embedder = PCAWrapper(base_embedder, n_components=256)
pca_embedder.fit(np.array(all_raw_vectors))

  Q, _ = normalizer(A @ Q)
  Q, _ = normalizer(A @ Q)
  Q, _ = normalizer(A @ Q)
  Q, _ = normalizer(A.T @ Q)
  Q, _ = normalizer(A.T @ Q)
  Q, _ = normalizer(A.T @ Q)
  Q, _ = qr_normalizer(A @ Q)
  Q, _ = qr_normalizer(A @ Q)
  Q, _ = qr_normalizer(A @ Q)
  B = Q.T @ M
  B = Q.T @ M
  B = Q.T @ M
  U = Q @ Uhat
  U = Q @ Uhat
  U = Q @ Uhat


In [22]:

# Transform all vectors
reduced_vectors = pca_embedder.pca.transform(np.array(all_raw_vectors)).astype("float32")

# ---------------------- Step 3: Build IVF FAISS Index ----------------------
# Set dimensions and cluster count
embedding_dim = 256
n_clusters = min(64, len(reduced_vectors))  # Adjust based on dataset size

# Step 1: Create quantizer and IVF index
quantizer = faiss.IndexFlatL2(embedding_dim)
index = faiss.IndexIVFFlat(quantizer, embedding_dim, n_clusters)

# Step 2: Train the index
index.train(reduced_vectors)

# Step 3: Add vectors in chunks (streaming possible)
for i in range(0, len(reduced_vectors), 200):
    index.add(reduced_vectors[i:i + 200])

  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T


In [23]:
import os
import pickle
import faiss
from langchain_community.vectorstores import FAISS as LC_FAISS

# Define save path
save_path = "./faiss_fides_crawled_data"
os.makedirs(save_path, exist_ok=True)

# Save FAISS index
index_path = os.path.join(save_path, "index.faiss")
faiss.write_index(index, index_path)
print(f"✅ FAISS index saved at: {index_path}")

from langchain_community.vectorstores.faiss import InMemoryDocstore

# Assuming `all_split_docs` exists
docstore = InMemoryDocstore({str(i): all_split_docs[i] for i in range(len(all_split_docs))})
index_to_docstore_id = list(map(str, range(len(all_split_docs))))

# Save docstore and index-to-docstore-id
docstore_path = os.path.join(save_path, "docstore.pkl")
with open(docstore_path, "wb") as f:
    pickle.dump((docstore, index_to_docstore_id), f)
print(f"✅ Docstore and index mappings saved at: {docstore_path}")

# Optional: Save the PCA model
pca_model_path = os.path.join(save_path, "pca_model.pkl")
with open(pca_model_path, "wb") as f:
    pickle.dump(pca_embedder.pca, f)
print(f"✅ PCA model saved at: {pca_model_path}")

✅ FAISS index saved at: ./faiss_fides_crawled_data/index.faiss
✅ Docstore and index mappings saved at: ./faiss_fides_crawled_data/docstore.pkl
✅ PCA model saved at: ./faiss_fides_crawled_data/pca_model.pkl


In [24]:
import os
import pickle
import faiss
from langchain_community.vectorstores import FAISS as LC_FAISS

load_path = "./faiss_fides_crawled_data"
index = faiss.read_index(os.path.join(load_path, "index.faiss"))

with open(os.path.join(load_path, "docstore.pkl"), "rb") as f:
    docstore, index_to_docstore_id = pickle.load(f)

faiss_db = LC_FAISS(
    pca_embedder,
    index,
    docstore,
    index_to_docstore_id
)

In [27]:
query = "What is zk-IoT?"
query_vector = pca_embedder.embed_query(query)

results = faiss_db.similarity_search_by_vector(query_vector, k=3)
for result in results:
    print(f"Metadata: {result.metadata}, Content: {result.page_content[:100]}...")

Metadata: {'source': 'https://fidesinnova.io/', 'title': 'FidesInnova-Verified Computations for Reliable Results - FidesInnova', 'language': 'en-US', 'type': 'Web'}, Content: Twins in Smart Building ManagementRead more FIdesInnovaOctober 9, 20240CommentsData Monetization in ...
Metadata: {'source': 'https://fidesinnova.io/', 'title': 'FidesInnova-Verified Computations for Reliable Results - FidesInnova', 'language': 'en-US', 'type': 'Web'}, Content: Twins in Smart Building ManagementRead more FIdesInnovaOctober 9, 20240CommentsData Monetization in ...
Metadata: {'source': 'tech-stack/zkp-enabled-javascript-execution.md', 'file_path': 'tech-stack/zkp-enabled-javascript-execution.md', 'file_name': 'zkp-enabled-javascript-execution.md', 'file_type': '.md', 'type': 'GitHub'}, Content: equipped for generating and verifying ZKPs represents a significant advancement in the realm of dece...


  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
