In [1]:

!python3.10 -m venv venv
!source venv/bin/activate
!python3.10 -m pip install git+https://github.com/openai/whisper.git -q
!python3.10 -m pip install yt-dlp langchain faiss-cpu whisper scikit-learn regex selenium webdriver_manager GitPython -q

In [2]:
!export AIRFLOW_HOME=~/airflow
AIRFLOW_VERSION="3.0.2"
# !PYTHON_VERSION="$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')"
PYTHON_VERSION="3.10"
CONSTRAINT_URL=f"https://raw.githubusercontent.com/apache/airflow/constraints-{AIRFLOW_VERSION}/constraints-{PYTHON_VERSION}.txt"
# For example this would install 3.0.0 with python 3.9: https://raw.githubusercontent.com/apache/airflow/constraints-3.0.2/constraints-3.9.txt
arg = f"apache-airflow=={AIRFLOW_VERSION} --constraint {CONSTRAINT_URL}"
print(arg)
!pip install {arg} -q

apache-airflow==3.0.2 --constraint https://raw.githubusercontent.com/apache/airflow/constraints-3.0.2/constraints-3.10.txt


In [119]:
import os
from dotenv import load_dotenv
# --- Step 2: Load environment variables from .env file
load_dotenv()
# load_dotenv() has already copied the .env entries into os.environ, so there is
# nothing to re-assign; just fail early with a clear message when the key is missing
# (re-assigning os.getenv(...) raised an opaque TypeError in that case).
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError("OPENAI_API_KEY is not set; add it to .env or export it before starting Jupyter.")

# DLAI_LOCAL_URL is formatted with a `port` field; fall back to a localhost template
# when the variable is not defined instead of failing on None.format(...).
airflow_ui = os.environ.get('DLAI_LOCAL_URL', 'http://localhost:{port}').format(port=8080)
airflow_ui #username:airflow password:airflow (if asked)

'http://localhost:8080'

In [None]:
%%writefile dags/youtube_crawler_dag.py

#@dag(schedule_interval=None, start_date="2023-10-01", catchup=False, tags=["youtube_crawler"])
@dag
def youtube_crawler_dag():
    """
    A simple DAG to crawl YouTube videos, transcribe them, and store the results.
    """
    from airflow.sdk import dag, task, chain
    import yt_dlp

    @task 
    def fetch_video():
        ydl_opts = {
            'ignoreerrors': True,
            'quiet': True,
            'extract_flat': True,
            'force_generic_extractor': False,
        }
        # Channel Details
        CHANNEL_ID = "UCrrqGYx98H1dPdZsNb1i9-g"
        CHANNEL_URL = f"https://www.youtube.com/channel/{CHANNEL_ID}"
        print("🎬 YouTube Crawler started ...")
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            result = ydl.extract_info(CHANNEL_URL, download=False)
            video_urls, video_ids = [], []

            if 'entries' in result:
                for entry in result['entries']:
                    if entry and 'id' in entry:
                        video_ids.append(entry['id'])
                        video_urls.append(f"https://www.youtube.com/watch?v={entry['id']}")
                        print("url", entry['id'], "appended.")

        print(f"✅ Found {len(video_ids)} videos on channel: {CHANNEL_URL}")
        return video_ids, video_urls

    @task 
    def transcribe_youtube_videos(video_ids, video_urls):
        import whisper
        from langchain_core.documents import Document
        import os
        from pathlib import Path
        print("🎤 Transcribing YouTube videos ...")
        os.makedirs("YouTube", exist_ok=True)
        model = whisper.load_model("tiny", device="cpu")
        youtube_docs = []

        for video_url, video_id in zip(video_urls, video_ids):
            print(f"📥 Processing: {video_id}")
            output_file = f"YouTube/{video_id}.mp3"

            # Download .mp3 if not already downloaded
            if not Path(output_file).exists():
                ret = os.system(f'yt-dlp -x --audio-format mp3 -o "{output_file}" {video_url}')
                if ret != 0:
                    print(f"❌ Failed to download: {video_url}")
                    continue

            try:
                result = model.transcribe(output_file)
                print(f"📝 Transcribed: {video_id}")
                doc = Document(
                    page_content=result["text"],
                    metadata={
                        "source": video_url,
                        "video_id": video_id,
                        "type": "YouTube"
                    }
                )
                youtube_docs.append(doc)
            except Exception as e:
                print(f"⚠️ Transcription failed for {video_id}: {e}")

            print(f"\n✅ Completed. Total documents created: {len(youtube_docs)}")
            print(youtube_docs[:2])

        return youtube_docs

        _fetch_video = fetch_video()
        _transcribe_youtube_videos = transcribe_youtube_videos(video_urls=_fetch_video[0], video_ids=_fetch_video[1])

        chain(
            _fetch_video,
            _transcribe_youtube_videos
        )

# Register the DAGs
from airflow.sdk import register_dag
register_dag(youtube_crawler_dag, "youtube_crawler_dag")

youtube_crawler_dag()


Overwriting dags/youtube_crawler_dag.py


In [None]:
import time
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from urllib.parse import urlparse, urljoin
from selenium.common.exceptions import WebDriverException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException
from langchain_community.document_loaders import WebBaseLoader

def crawl_internal_links(start_url, max_pages=10, max_depth=1):
    """
    Crawl internal URLs from a site using Selenium, with support for JavaScript-heavy pages.

    The crawl is breadth-first from ``start_url``. Every attempted page, whether it
    loads or fails, counts against ``max_pages``.

    Args:
        start_url (str): The URL to start crawling from.
        max_pages (int): Max number of pages to attempt.
        max_depth (int): Max depth to crawl (0 = just root).

    Returns:
        list: Internal URLs that loaded successfully, in the order they were visited.
    """
    from collections import deque

    # Selenium Headless Browser Setup
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    domain = urlparse(start_url).netloc
    visited = set()             # every URL attempted (loaded or failed)
    loaded = []                 # URLs that loaded, in visit order: the return value
    queued = {start_url}        # every URL ever enqueued, for O(1) de-duplication
    to_visit = deque([(start_url, 0)])

    # try/finally guarantees the browser process is closed even when an
    # unexpected exception (or a keyboard interrupt) escapes the loop.
    try:
        while to_visit and len(visited) < max_pages:
            url, depth = to_visit.popleft()
            if url in visited or depth > max_depth:
                continue
            try:
                driver.get(url)
                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "a")))
                time.sleep(1)  # 🔁 Wait for JS to load

                print(f"Visited ({len(visited)+1}/{max_pages}), Depth {depth}): {url}")
                visited.add(url)
                loaded.append(url)

                # If max depth reached, skip link extraction
                if depth == max_depth:
                    continue

                # Extract and queue internal links
                for link in driver.find_elements(By.TAG_NAME, "a"):
                    try:
                        href = link.get_attribute("href")
                        if not href or href.startswith(("mailto:", "tel:", "javascript:")):
                            continue

                        # Resolve first, then validate: this also rejects non-web
                        # schemes (data:, ftp:, ...) that carry no netloc.
                        full_url = urljoin(url, href).split("#")[0]
                        parsed = urlparse(full_url)
                        if parsed.scheme not in ("http", "https") or parsed.netloc != domain:
                            continue
                        if full_url not in queued:
                            queued.add(full_url)
                            to_visit.append((full_url, depth + 1))
                    except StaleElementReferenceException:
                        # The element was replaced in the DOM while iterating; skip it.
                        continue

            except (WebDriverException, TimeoutException):
                print(f"⚠️ Skipping (Error): {url}")
                # Remember the failure so the URL is not retried and still
                # counts against the page budget.
                visited.add(url)
                continue
    finally:
        driver.quit()

    # Print summary
    print(f"\n Total unique internal URLs visited: {len(loaded)}")

    if len(loaded) < max_pages:
        print("⚠️ Number of crawled URLs is less than max_pages. Possible reasons:")
        print("- Site may not have enough unique pages within the allowed depth.")
        print("- Some links might be hidden behind JavaScript interactions.")
        print("- Some links could be blocked, inaccessible, or slow-loading.")
        print("- Your max_depth may be too shallow to discover deeper links, try changing depth.")

    return loaded

def change_web_to_meta_data(doc):
    """
    Tag a crawled web document and flatten its text onto a single line.

    Sets ``metadata["type"]`` to ``"Web"`` and rewrites ``page_content`` so that
    every run of whitespace (newlines included) becomes one space, with no
    leading or trailing whitespace. The document is modified in place and
    the same object is returned.
    """
    doc.metadata.update(type="Web")
    # str.split() without a separator breaks on any whitespace run, so newlines
    # need no separate replacement step.
    words = doc.page_content.split()
    doc.page_content = " ".join(words)
    return doc


# WEB CRAWLER
print("Web Crawler started ....")

#### CONFIGURABLE SETTINGS
start_url = "https://fidesinnova.io/"
max_pages = 3   # Maximum number of pages to visit
max_depth = 3     # 0 = only root, 1 = root + links from root

####  Run the Crawler
web_docs_list1 = crawl_internal_links(start_url, max_pages, max_depth)

#  Load the web documents
web_docs1 = []
for page_url in web_docs_list1:
    loader = WebBaseLoader(page_url)
    try:
        page_docs = loader.load()
        page_docs = list(map(change_web_to_meta_data, page_docs))

        web_docs1.extend(page_docs)
        print(page_url + " is loaded.")
    except Exception as e:
        # Best effort per page, but catch Exception only (a bare `except:` would
        # also swallow KeyboardInterrupt) and report why the page was skipped.
        print(f"{page_url} is not loaded. Exception: {e}")

print(f"Total web documents loaded: {len(web_docs1)}")
print(web_docs1[:2])  # Print first 2 documents for verification

Web Crawler started ....
Visited (1/3), Depth 0): https://fidesinnova.io/
Visited (2/3), Depth 1): https://fidesinnova.io/devices/
Visited (3/3), Depth 1): https://fidesinnova.io/courses/

 Total unique internal URLs visited: 3
https://fidesinnova.io/ is loaded.
https://fidesinnova.io/courses/ is loaded.
https://fidesinnova.io/devices/ is loaded.
Total web documents loaded: 3
[Document(metadata={'source': 'https://fidesinnova.io/', 'title': 'FidesInnova-Verified Computations for Reliable Results - FidesInnova', 'language': 'en-US', 'type': 'Web'}, page_content='FidesInnova-Verified Computations for Reliable Results - FidesInnova Skip to content Skip to footer Products Network ZKP ExplorerMobile AppIoT Serverzk-IoT ZKP SDKSample IoT Device Ecosystem Sensors (zkSensor)Transportation (MotionCertified)Oil&Gas (EnergyWiseNetwork)Machine Learning (TrustLearn)Smart Home (TrustSense)Partnership Resources WiKiGithubArticlesAcademyzk-IoT Paper About Us Contact UsAbout Us X-twitter Youtube Github

In [None]:

# print("Web Crawler 2 started ....")

# # Headless setup
# options = Options()
# options.add_argument("--headless=new")
# options.add_argument("--disable-gpu")
# options.add_argument("--no-sandbox")
# options.add_argument("--disable-dev-shm-usage")
# options.add_argument("--window-size=1920,1080")
# options.add_argument("--log-level=3")
# options.add_argument("--disable-blink-features=AutomationControlled")

# driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# base_url = "https://www.fidesinnova.io/"
# visited = set()
# web_docs_list2 = set()

# def is_valid_internal(url):
#     parsed = urlparse(url)
#     return parsed.scheme in ["http", "https"] and parsed.netloc.endswith("fidesinnova.io")

# def crawl(url):
#     if url in visited:
#         return
#     visited.add(url)

#     try:
#         driver.get(url)
#         time.sleep(2)  # Let dynamic content load

#         links = driver.find_elements("tag name", "a")
#         for link in links:
#             href = link.get_attribute("href")
#             if href:
#                 full_url = urljoin(url, href.split("#")[0])  # remove fragments
#                 if is_valid_internal(full_url) and full_url not in web_docs_list2:
#                     web_docs_list2.add(full_url)
#                     crawl(full_url)

#     except Exception as e:
#         print(f"Error visiting {url}: {e}")

# # Start
# crawl(base_url)

# driver.quit()

# # Output
# print("\n=== Discovered Internal URLs ===")
# for url in sorted(web_docs_list2):
#     print(url)
# print(f"\nTotal internal URLs web_docs_list2: {len(web_docs_list2)}")


# #  Load the web documents
# web_docs2 = []
# for idx in web_docs_list2:
#     a = WebBaseLoader(idx)
#     try:
#         temp_docs = a.load()
#         temp_docs = list(map(change_web_to_meta_data, temp_docs))

#         web_docs2.extend(temp_docs)
#         print(idx + " is loaded.")
#     except:
#         print(f"{idx} is not loaded.")


In [None]:
from langchain_community.document_loaders import GitLoader

def change_GitHub_to_meta_data(doc):
    """
    Tag a document loaded from a Git repository and flatten its text onto one line.

    Sets ``metadata["type"]`` to ``"GitHub"`` and rewrites ``page_content`` so that
    every run of whitespace (newlines included) becomes one space, with no
    leading or trailing whitespace. The document is modified in place and
    the same object is returned.
    """
    doc.metadata.update(type="GitHub")
    # str.split() without a separator breaks on any whitespace run, so newlines
    # need no separate replacement step.
    words = doc.page_content.split()
    doc.page_content = " ".join(words)
    return doc

print("GitHub Crawler started ....")

# Repositories to ingest. The list must be defined before the loop that reads it,
# otherwise the cell only works when `github_repos` is left over from an earlier run.
github_repos = [
    "https://github.com/TheArchitect2000/Fides-Innova-WiKi",
    # "https://github.com/TheArchitect2000/Blockchain-based-IoT-Server",
    # "https://github.com/TheArchitect2000/zk-IoT",
    # "https://github.com/TheArchitect2000/Smart-Contract-Protocol",

#    "https://github.com/TheArchitect2000/zkiot-riscv-qemu-c",
#    "https://github.com/TheArchitect2000/ZKP-Blockchain-Explorer",
#    "https://github.com/TheArchitect2000/evm-server",
#    "https://github.com/TheArchitect2000/New-IoT-Device-Integration",
#   "https://github.com/TheArchitect2000/zkiot-riscv-qemu-rust"
]

github_docs = []
for url in github_repos:
    print(f"📥 Loading repository: {url}")
    # rstrip("/") keeps a trailing slash from producing an empty repository name.
    repo_name = url.rstrip("/").split("/")[-1]
    local_path = f"./cloned_repos/{repo_name}"

    # NOTE(review): branch is fixed to "main"; repositories whose default branch
    # has a different name would need it changed here.
    loader = GitLoader(
        repo_path=local_path,
        clone_url=url,
        branch="main",
        # Only Markdown files are ingested. Other candidate extensions:
        # ".py", ".md", ".c", ".cpp", ".rs", ".json", ".html",
        # ".js", ".ts", ".css", ".java", ".txt", ".yml", ".yaml", ".sh"
        file_filter=lambda f: f.endswith((".md",)),
    )

    temp_docs = loader.load()
    temp_docs = list(map(change_GitHub_to_meta_data, temp_docs))
    github_docs.extend(temp_docs)

    # Report this repository's own count, not the running total.
    print(f"✅ Loaded {len(temp_docs)} documents from {repo_name}")

print(f"Total GitHub documents loaded: {len(github_docs)}")
print(github_docs[:2])  # Print first 2 documents for verification

GitHub Crawler started ....
📥 Loading repository: https://github.com/TheArchitect2000/Fides-Innova-WiKi
✅ Loaded 41 documents from Fides-Innova-WiKi
Total GitHub documents loaded: 41
[Document(metadata={'source': 'README.md', 'file_path': 'README.md', 'file_name': 'README.md', 'file_type': '.md', 'type': 'GitHub'}, page_content='<p align="center"> <a href="https://fidesinnova.io/" target="blank"><img src="g-c-web-back.png" /></a> </p> # FidesInnova Wiki <a href="https://www.npmjs.com/~nestjscore" target="_blank"><img src="https://img.shields.io/npm/v/@nestjs/core.svg" alt="NPM Version" /></a> <a href="https://www.npmjs.com/~nestjscore" target="_blank"><img src="https://img.shields.io/npm/l/@nestjs/core.svg" alt="Package License" /></a> <a href="https://www.npmjs.com/~nestjscore" target="_blank"><img src="https://img.shields.io/npm/dm/@nestjs/common.svg" alt="NPM Downloads" /></a> <a href="https://circleci.com/gh/nestjs/nest" target="_blank"><img src="https://img.shields.io/circleci/bui

In [None]:
# print("YouTube Crawler started ....")

# # Fides Innova YouTube Channel ID
# CHANNEL_ID = "UCrrqGYx98H1dPdZsNb1i9-g"
# CHANNEL_URL = f"https://www.youtube.com/channel/{CHANNEL_ID}"
# # https://www.youtube.com/channel/UCrrqGYx98H1dPdZsNb1i9-g

# def fetch_video_urls(channel_url: str):
#     ydl_opts = {
#         'ignoreerrors': True,
#         'quiet': True,
#         'extract_flat': True,  # Only metadata, not downloading
#         'force_generic_extractor': False,
#     }

#     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
#         result = ydl.extract_info(channel_url, download=False)
#         video_urls = []
#         video_ids = []

#         if 'entries' in result:
#             for entry in result['entries']:
#                 if entry and 'id' in entry:
#                     video_url = f"https://www.youtube.com/watch?v={entry['id']}"
#                     video_urls.append(video_url)
#                     video_ids.append(entry['id'])
#         return video_ids, video_urls

# # Fetch and display video URLs
# video_ids, video_urls = fetch_video_urls(CHANNEL_URL)
# print(f"✅ Found {len(video_ids)} videos on channel.\n")

# youtube_docs = []
# #######################
# import whisper
# from langchain_core.documents import Document

# for video_url, video_id in zip(video_urls, video_ids):
#     output_file = f"YouTube/{video_id}.mp3"
#     # Download audio using yt-dlp
#     os.system(f"yt-dlp -x --audio-format mp3 -o {output_file} {video_url}")

#     model = whisper.load_model("base")
#     result = model.transcribe(output_file)

#     print(result["text"])
#     # Create a class of documents objects import from LangChain
#     doc = Document(
#         page_content=result["text"],
#         metadata={
#             "source": video_url,
#             "video_id": video_id,
#             "type": "YouTube"
#         }
#     )
#     youtube_docs.append(doc)
######################

# def change_YouTube_to_meta_data(doc):
#     doc.metadata['type']='YouTube'
#     return doc

# for idx in video_ids:
#     print("loading video: " + idx)
#     a = YoutubeLoader(idx)
#     try:
#         temp_docs = a.load()
#         temp_docs = list(map(change_YouTube_to_meta_data, temp_docs))
#         youtube_docs.extend(temp_docs)
#         print(idx + " is loaded.")
#     except Exception as e:
#         print(f"{idx} is not loaded. Exception: ", e)
#         second_try_idx.append(str(idx))
#         print("\n unloaded list", second_try_idx, "\n")

# second_try_idx = []
# third_try_idx = []
# print("\n The second try for unloaded videos.")

# for idx in second_try_idx:
#     print("waiting 1 second ...")
#     time.sleep(1)
#     a = YoutubeLoader(idx)
#     try:
#         temp_docs = a.load()
#         temp_docs = list(map(change_YouTube_to_meta_data, temp_docs))
#         youtube_docs.extend(temp_docs)
#         print(idx, " is loaded in the second try.")
#     except:
#         print("\n")
#         print(idx, " is not loaded in the second try.")
#         third_try_idx.append(str(idx))
#         print("\n unloaded list for the second time ", third_try_idx, "\n")

In [None]:
from langchain_community.document_loaders import PyPDFLoader

def change_pdf_doc(doc):
    """Mark a loaded document as PDF-sourced (``metadata["type"] = "PDF"``) and return it."""
    doc.metadata.update(type="PDF")
    return doc

# --- Step 6: Load PDFs ---
print("PDF Crawler started ....")

pdf_docs = []
# NOTE(review): the paths mix "PDF/" and "pdf/" prefixes. That only resolves to a
# single folder on a case-insensitive filesystem; confirm the real directory name
# and make the prefixes consistent before running on Linux.
pdf_files = [
    "PDF/zkIoT.pdf",
    "PDF/Consensus Algorithms.pdf",
    "PDF/Data Monetization.pdf",
    "pdf/Decentralized Delegated Proof.pdf",
    "pdf/Digital Twins.pdf",
    "pdf/Fides service contracts.pdf",
    "pdf/fides_innova_gitbook_placeholder.pdf",
    "pdf/IoT Startups.pdf",
    "pdf/MIoTN.pdf",
    "pdf/MQTT and MQTTS protocols.pdf",
    "pdf/Service Contract.pdf",
    "pdf/Service Market.pdf",
    "pdf/What’s Web 3.0.pdf"
]

loaded_pdf_count = 0
for path in pdf_files:
    try:
        loader = PyPDFLoader(path)
        pdf_docs.extend(loader.load())
        loaded_pdf_count += 1
        print(len(pdf_docs))  # running total of documents (one per PDF page)
    except Exception as e:
        print(f"Error loading PDF {path}: {e}")

pdf_docs = list(map(change_pdf_doc, pdf_docs))

# Report how many files actually loaded; the number of listed files alone would
# read as full success even when some loads failed.
print("Loaded PDF files:", loaded_pdf_count, "of", len(pdf_files))
print("Created docs:", len(pdf_docs))
print(pdf_docs[:2])  # Print first 2 documents for verification

PDF Crawler started ....
7
11
17
21
27
32
33
38
44
48
56
63
67
Loaded all PDF files: 13
Created docs: 67
[Document(metadata={'producer': 'pdfTeX-1.40.20; modified using iText® Core 7.2.4 (AGPL version) ©2000-2022 iText Group NV', 'creator': 'Appligent StampPDF Batch 7.0.3', 'creationdate': '2024-08-12T13:42:43-04:00', 'meeting starting date': '27 May 2024', 'moddate': '2024-08-18T06:27:04-04:00', 'ieee article id': '10634342', 'trapped': 'False', 'ieee issue id': '10634334', 'subject': '2024 IEEE International Conference on Blockchain and Cryptocurrency (ICBC);2024; ; ;10.1109/ICBC59979.2024.10634342', 'ieee publication id': '10634319', 'title': 'zk-IoT: Securing the Internet of Things with Zero-Knowledge Proofs on Blockchain Platforms', 'meeting ending date': '31 May 2024', 'appligent': 'StampPDF Batch 7.0.3 Linux 64 bit Feb 17 2022 Library APDFL18.0.3d', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.20 (TeX Live 2019/W32TeX) kpathsea version 6.3.1', 'source': 'PDF/z

In [None]:
from langchain_community.document_loaders import UnstructuredPowerPointLoader 

# Add metadata
def change_pptx_doc(doc):
    """Mark a loaded document as PowerPoint-sourced (``metadata["type"] = "PPTX"``) and return it."""
    doc.metadata.update(type="PPTX")
    return doc

# --- Step 6b: Load PowerPoint decks ---
print("PPTX Crawler started ....")

pptx_docs = []
pptx_files = [
    "PPTX/FidesinnovaDeck-v11.pptx"
]

loaded_pptx_count = 0
for path in pptx_files:
    try:
        loader = UnstructuredPowerPointLoader(path)
        pptx_docs.extend(loader.load())
        loaded_pptx_count += 1
        print(f"✅ Loaded: {path}")
    except Exception as e:
        print(f"❌ Error loading PPTX {path}: {e}")

pptx_docs = list(map(change_pptx_doc, pptx_docs))

# Report how many files actually loaded rather than how many were listed.
print("Loaded PPTX files:", loaded_pptx_count, "of", len(pptx_files))
print("Created docs:", len(pptx_docs))
print(pptx_docs[:2])  # Print first 2 documents for verification

PPTX Crawler started ....
✅ Loaded: PPTX/FidesinnovaDeck-v11.pptx
Loaded PPTX files: 1
Created docs: 1
[Document(metadata={'source': 'PPTX/FidesinnovaDeck-v11.pptx', 'type': 'PPTX'}, page_content="FIDES INNOVA\n\nVerifiable Computing Ecosystem\n\nW W W . F I D E S I N N O V A . I O\n\n\n\nFIDES INNOVA LABS\n\nThe Future is Verifiable\n\nA pioneer in enabling secure computation across IoT and ML \n\nRevolutionizing trust by introducing Agent-based Verifiable Computing \n\nleveraging zero-knowledge proof (ZKP), DePIN and Agentic AI. \n\n\n\nFOUNDERS\n\nMel McCann\n\nCo-founder\n\nReza Ramezan, Ph.D.\n\nCo-Founder\n\nFormer Research Lead at Cardano Foundation\n\nWith over 20 years of executive experience at leading firms like AT&T, Huawei, Telus, and Cardano Foundation, Reza is a proven innovator and entrepreneur. He holds a Ph.D. from the University of British Columbia and has numerous patents and publications in blockchain, zero- knowledge proofs, and IoT.\n\nHamed Shah-Mansouri, Ph.D.\

In [20]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# --- Step 7: Split documents into chunks ---
# Inputs are the document lists built by the loader cells above (web_docs1,
# github_docs, pdf_docs, pptx_docs); those cells must have run first.
print("Splitter and db started ....")
# all_docs =  youtube_docs + pdf_docs + pptx_docs + web_docs1 + github_docs

# One splitter is shared by all sources: chunks of at most 500 characters with a
# 100-character overlap between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 100)
print("Splitting web_docs1 ...")
split_web_docs1 = splitter.split_documents(web_docs1)
print("Splitting github_docs ...")
split_github_docs = splitter.split_documents(github_docs)
# YouTube transcripts are currently left out of the corpus (disabled below).
# print("Splitting youtube_docs ...")
# split_youtube_docs = splitter.split_documents(youtube_docs)
print("Splitting pdf_docs ...")
split_pdf_docs = splitter.split_documents(pdf_docs)
print("Splitting pptx_docs ...")
split_pptx_docs = splitter.split_documents(pptx_docs)


# --- Step 8: Combine the chunks ---
# Combine all split documents into the single list the embedding cell reads
# (`all_split_docs`); order here is PDF, PPTX, web, GitHub.
all_split_docs = (
#    split_youtube_docs +
    split_pdf_docs +
    split_pptx_docs +
    split_web_docs1 +
    split_github_docs
)
print("Splitting done.", len(all_split_docs), " documents created.")


Splitter and db started ....
Splitting web_docs1 ...
Splitting github_docs ...
Splitting pdf_docs ...
Splitting pptx_docs ...
Splitting done. 905  documents created.


In [110]:
from langchain_openai import OpenAIEmbeddings
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from langchain.vectorstores.faiss import FAISS
from langchain.docstore import InMemoryDocstore
import numpy as np
import pickle
import faiss
import os

# ---------------------- Step 1: Embed All ----------------------
base_embedder = OpenAIEmbeddings(model="text-embedding-3-small")
# Chunks that are empty or whitespace-only are skipped; each text stays paired
# with its document so later filtering keeps vectors and documents aligned.
texts_docs = [(doc.page_content, doc) for doc in all_split_docs if doc.page_content.strip()]
if not texts_docs:
    # zip(*[]) below would otherwise fail with an opaque unpacking error.
    raise ValueError("No non-empty documents to embed; run the loader and splitter cells first.")
texts, docs = zip(*texts_docs)
raw_vectors = np.array(base_embedder.embed_documents(list(texts)))

# ---------------------- Step 2: Filter Invalid Vectors ----------------------
# Drop embeddings containing NaN/inf and (near-)zero vectors, together with
# their documents.
filtered_vectors = []
filtered_docs = []
for vec, doc in zip(raw_vectors, docs):
    if np.all(np.isfinite(vec)) and np.linalg.norm(vec) > 1e-6:
        filtered_vectors.append(vec)
        filtered_docs.append(doc)

if not filtered_vectors:
    raise ValueError("Every embedding was rejected as non-finite or zero; nothing to index.")
raw_vectors = np.array(filtered_vectors)

# ---------------------- Step 3: Clip Outliers ----------------------
raw_vectors = np.clip(raw_vectors, -1000, 1000)
# ---------------------- Step 4: Normalize ----------------------
# Per-feature standardization; the fitted scaler is reused on queries later.
scaler = StandardScaler()
normalized_vectors = scaler.fit_transform(raw_vectors)

# ---------------------- Step 4.5: Drop Low-Variance Features ----------------------
# The boolean mask is kept so the same columns can be selected from query vectors.
variances = np.var(normalized_vectors, axis=0)
stable_mask = variances > 1e-6
normalized_vectors = normalized_vectors[:, stable_mask]
print(f"✅ Retained {np.sum(stable_mask)} stable features out of {len(stable_mask)}.")

# ---------------------- Step 5: PCA Fit ----------------------
# PCA cannot keep more components than min(n_samples, n_features); cap the
# 256-component target so a small corpus does not make the fit raise.
n_components = min(256, *normalized_vectors.shape)
if n_components < 256:
    print(f"⚠️ Only {n_components} PCA components can be fitted from data of shape {normalized_vectors.shape}.")
pca = PCA(n_components=n_components, svd_solver='full')
pca.fit(normalized_vectors)

# ---------------------- Step 6: Final Check Before PCA Transform ----------------------
# Rows rejected here took part in the PCA fit above but are left out of the
# transformed set (and therefore out of the index built from it).
clean_rows = []
clean_docs = []

for vec, doc in zip(normalized_vectors, filtered_docs):
    if np.all(np.isfinite(vec)) and np.linalg.norm(vec) < 100:  # tighter threshold
        clean_rows.append(vec)
        clean_docs.append(doc)

normalized_vectors = np.array(clean_rows)
filtered_docs = clean_docs

# Now safely transform
transformed_vectors = pca.transform(normalized_vectors)

print("PCA transformation completed successfully.")
print("Transformed vectors shape:", transformed_vectors.shape)
print("Any NaN in transformed vectors?", np.isnan(transformed_vectors).any())
print("Max component magnitude:", np.max(np.abs(transformed_vectors)))
print("PCA components shape:", pca.components_.shape)
print("Any NaN in PCA components?", np.isnan(pca.components_).any())
print("Max component magnitude:", np.max(np.abs(pca.components_)))

✅ Retained 1536 stable features out of 1536.
PCA transformation completed successfully.
Transformed vectors shape: (905, 256)
Any NaN in transformed vectors? False
Max component magnitude: 26.58131988923045
PCA components shape: (256, 1536)
Any NaN in PCA components? False
Max component magnitude: 0.1265460647665814


  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T


Create the FAISS database.

In [None]:

# ---------------------- Step 7: Create FAISS Index ----------------------
# Flat L2 index sized to the width of the PCA output; vectors are added as float32.
index = faiss.IndexFlatL2(transformed_vectors.shape[1])
index.add(transformed_vectors.astype("float32"))

# Row i of the index corresponds to filtered_docs[i]; the integer position is
# used both as the docstore key and as the index-to-docstore id.
index_to_docstore_id = {}
_doc_by_position = {}
for _position, _document in enumerate(filtered_docs):
    index_to_docstore_id[_position] = _position
    _doc_by_position[_position] = _document
docstore = InMemoryDocstore(_doc_by_position)  # <-- use aligned docs

# No embedding function is attached: queries are embedded and projected by hand
# and then searched by vector.
faiss_index = FAISS(
    index=index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id,
    embedding_function=None,
)

# ---------------------- Step 8: Save Everything ----------------------
save_dir = "fides_faiss_crawled_data"
os.makedirs(save_dir, exist_ok=True)

faiss_index.save_local(os.path.join(save_dir, "fides_faiss_pca_256"))

# The fitted PCA model and scaler are pickled next to the index.
for _filename, _fitted in (("fides_pca_256_model.pkl", pca), ("fides_scaler.pkl", scaler)):
    with open(os.path.join(save_dir, _filename), "wb") as _handle:
        pickle.dump(_fitted, _handle)

# The feature mask is stored in NumPy's own format.
with open(os.path.join(save_dir, "fides_feature_mask.npy"), "wb") as _handle:
    np.save(_handle, stable_mask)

print("✅ All components saved: FAISS index, PCA model, Scaler, and feature mask.")

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


✅ All components saved: FAISS index, PCA model, Scaler, and feature mask.


In [109]:
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np
import pickle
import faiss
import os

# ---------------------- Step 1: Load Components ----------------------
save_dir = "fides_faiss_crawled_data"

# Embedder for queries. It must be the same model the index was built with
# ("text-embedding-3-small"). Creating it here keeps this cell runnable on a fresh
# kernel instead of relying on `base_embedder` left behind by the build cell;
# preprocess_query reads it as a global.
base_embedder = OpenAIEmbeddings(model="text-embedding-3-small")

# Load FAISS index
# embeddings=None: queries are embedded and PCA-projected by preprocess_query and
# searched by vector, so the store never embeds text itself.
# SECURITY: allow_dangerous_deserialization=True makes load_local unpickle the
# docstore; only load index folders produced by this notebook.
faiss_index = FAISS.load_local(
    folder_path=os.path.join(save_dir, "fides_faiss_pca_256"),
    embeddings=None,
    index_name="index",  # default inside LangChain
    allow_dangerous_deserialization=True
)

# SECURITY: pickle.load executes arbitrary code from the file; the two pickles
# below must come from a trusted run of the save cell, never from external sources.
# Load PCA
with open(os.path.join(save_dir, "fides_pca_256_model.pkl"), "rb") as f:
    pca = pickle.load(f)

# Load scaler
with open(os.path.join(save_dir, "fides_scaler.pkl"), "rb") as f:
    scaler = pickle.load(f)

# Load feature mask
stable_mask = np.load(os.path.join(save_dir, "fides_feature_mask.npy"))

# ---------------------- Step 2: Embed and Preprocess Query ----------------------
def preprocess_query(text: str) -> np.ndarray:
    """
    Embed a query and push it through the same pipeline as the indexed vectors.

    Reads the module-level ``base_embedder``, ``scaler``, ``stable_mask`` and
    ``pca``. The embedding is clipped to [-1000, 1000], standardized with the
    scaler, reduced to the columns selected by the mask and projected with PCA.

    Args:
        text: The query string to embed.

    Returns:
        A float32 array with a single row holding the projected query.

    Raises:
        ValueError: If the embedding contains non-finite values or its norm is
            not greater than 1e-6.
    """
    raw = base_embedder.embed_query(text)

    # Reject NaN/inf and (near-)zero embeddings before any scaling is applied.
    usable = np.all(np.isfinite(raw)) and np.linalg.norm(raw) > 1e-6
    if not usable:
        raise ValueError("Invalid or zero vector produced by embedder.")

    clipped = np.clip(raw, -1000, 1000)
    # The scaler works on 2-D input, hence the one-row batch.
    scaled_row = scaler.transform([clipped])
    projected = pca.transform(scaled_row[:, stable_mask])
    return projected.astype("float32")

# ---------------------- Step 3: Perform Similarity Search ----------------------
def search(query: str, k=5):
    """
    Return the ``k`` indexed chunks closest to ``query``.

    The query is converted with ``preprocess_query`` and its single row is handed
    to the module-level ``faiss_index`` for a search by vector.

    Args:
        query: Natural-language query text.
        k: Number of results to request (default 5).

    Returns:
        Whatever ``faiss_index.similarity_search_by_vector`` returns for the query.
    """
    projected = preprocess_query(query)
    query_row = projected[0]
    return faiss_index.similarity_search_by_vector(query_row, k=k)

# ---------------------- Example Usage ----------------------
results = search("zero-knowledge proof for IoT security", k=3)
# Print each hit with a 1-based rank followed by the text of the matched chunk.
for rank, hit in zip(range(1, len(results) + 1), results):
    print(f"\nResult {rank}")
    print("Content:", hit.page_content)

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.



Result 1
Content: vol. 12, no. 5, 2023.
[6] M. Walshe, G. Epiphaniou, H. Al-Khateeb, M. Hammoudeh,
V . Katos, and A. Dehghantanha, “Non-interactive zero knowl-
edge proofs for the authentication of iot devices in reduced con-
nectivity environments,” Ad Hoc Networks, vol. 95, p. 101988,
2019.
[7] Z. Mahmood and J. Vacius, “Privacy-preserving block-chain
framework based on ring signatures (rss) and zero-knowledge
proofs (zkps),” in 2020 International Conference on Innovation

Result 2
Content: zk-IoT: Securing the Internet of Things with
Zero-Knowledge Proofs on Blockchain
Platforms
Gholamreza Ramezan
FidesInnova
g2n@ﬁdesinnova.io
Ehsan Meamari
FidesInnova
e2i@ﬁdesinnova.io
Abstract—This paper introduces the zk-IoT frame-
work, a novel approach to enhancing the security of
Internet of Things (IoT) ecosystems through the use of
Zero-Knowledge Proofs (ZKPs) on blockchain platforms.
Our framework ensures the integrity of ﬁrmware execu-

Result 3
Content: and vehicles. IoT technology enabl

  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
