In [None]:

!python3.10 -m venv venv
!source venv/bin/activate
!python3.10 -m pip install git+https://github.com/openai/whisper.git
!python3.10 -m pip install pypdf bs4 yt-dlp langchain faiss-cpu whisper scikit-learn regex selenium 
!python3.10 -m pip install python-pptx webdriver_manager GitPython unstructured langchain_openai langchain-community arxiv wikipedia

In [2]:
!export AIRFLOW_HOME=~/airflow
AIRFLOW_VERSION="3.0.2"
# !PYTHON_VERSION="$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')"
PYTHON_VERSION="3.10"
CONSTRAINT_URL=f"https://raw.githubusercontent.com/apache/airflow/constraints-{AIRFLOW_VERSION}/constraints-{PYTHON_VERSION}.txt"
# For example this would install 3.0.0 with python 3.9: https://raw.githubusercontent.com/apache/airflow/constraints-3.0.2/constraints-3.9.txt
arg = f"apache-airflow=={AIRFLOW_VERSION} --constraint {CONSTRAINT_URL}"
print(arg)
!pip install {arg} -q

apache-airflow==3.0.2 --constraint https://raw.githubusercontent.com/apache/airflow/constraints-3.0.2/constraints-3.10.txt


In [12]:
def setup_environment():
    """Load variables from a local ``.env`` file and make sure the OpenAI key is available.

    Calls ``load_dotenv()`` and then checks ``OPENAI_API_KEY`` in the process environment.

    Raises:
        RuntimeError: if ``OPENAI_API_KEY`` is missing or empty after loading ``.env``.
            (Previously this surfaced as an opaque ``TypeError`` from ``os.environ``.)
    """
    import os
    from dotenv import load_dotenv

    # Load environment variables from the .env file.
    load_dotenv()

    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        # os.environ only accepts strings; fail with an actionable message instead.
        raise RuntimeError(
            "OPENAI_API_KEY is not set. Add it to your .env file or export it before running."
        )

    os.environ["OPENAI_API_KEY"] = api_key
    print("Environment setup complete.")

# Load the .env file and populate OPENAI_API_KEY as soon as this cell runs.
setup_environment()

Environment setup complete.


In [4]:

%%writefile crawler_youtube.py

def fetch_video():
    """List the videos of the configured YouTube channel without downloading them.

    Returns:
        tuple[list[str], list[str]]: ``(video_ids, video_urls)`` in matching order. Both
        lists are empty when yt-dlp yields no result for the channel.
    """
    # Imported locally: the file has no top-level import of yt_dlp, so without this line
    # the name is undefined here.
    import yt_dlp

    ydl_opts = {
        'ignoreerrors': True,
        'quiet': True,
        'extract_flat': True,
        'force_generic_extractor': False,
    }
    # Channel Details
    CHANNEL_ID = "UCrrqGYx98H1dPdZsNb1i9-g"
    CHANNEL_URL = f"https://www.youtube.com/channel/{CHANNEL_ID}"
    print("🎬 YouTube Crawler started ...")

    video_urls, video_ids = [], []
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        result = ydl.extract_info(CHANNEL_URL, download=False)

        # With 'ignoreerrors' enabled the extractor may hand back None instead of raising,
        # and individual entries may be None as well — tolerate both.
        entries = (result or {}).get('entries') or []
        for entry in entries:
            if entry and 'id' in entry:
                video_ids.append(entry['id'])
                video_urls.append(f"https://www.youtube.com/watch?v={entry['id']}")
                print("url", entry['id'], "appended.")

    print(f"✅ Found {len(video_ids)} videos on channel: {CHANNEL_URL}")
    return video_ids, video_urls

def crawler_youtube(video_ids, video_urls):
    """Download the audio of each video, transcribe it with Whisper and wrap it as a Document.

    Audio is stored as ``YouTube/<video_id>.mp3``; an existing file is reused instead of
    being downloaded again. Videos whose download or transcription fails are skipped.

    Args:
        video_ids: Video identifiers, in the same order as ``video_urls``.
        video_urls: Watch URLs of the videos.

    Returns:
        list: One ``Document`` per successfully transcribed video, with ``source``,
        ``video_id`` and ``type`` ("YouTube") metadata.
    """
    import os
    import subprocess
    from pathlib import Path

    import whisper
    from langchain_core.documents import Document

    print("🎤 Transcribing YouTube videos ...")
    os.makedirs("YouTube", exist_ok=True)
    model = whisper.load_model("tiny", device="cpu")
    youtube_docs = []

    for video_url, video_id in zip(video_urls, video_ids):
        print(f"📥 Processing: {video_id}")
        output_file = f"YouTube/{video_id}.mp3"

        # Download .mp3 if not already downloaded
        if not Path(output_file).exists():
            # An argument list (no shell) keeps '?' and '&' in the URL from being
            # interpreted as glob/control characters.
            try:
                ret = subprocess.run(
                    ["yt-dlp", "-x", "--audio-format", "mp3", "-o", output_file, video_url],
                    check=False,
                ).returncode
            except OSError:
                # e.g. the yt-dlp executable is not on PATH; treat like a failed download.
                ret = 1
            if ret != 0:
                print(f"❌ Failed to download: {video_url}")
                continue

        try:
            result = model.transcribe(output_file)
            print(f"📝 Transcribed: {video_id}")
            doc = Document(
                page_content=result["text"],
                metadata={
                    "source": video_url,
                    "video_id": video_id,
                    "type": "YouTube"
                }
            )
            youtube_docs.append(doc)
        except Exception as e:
            print(f"⚠️ Transcription failed for {video_id}: {e}")

    # Summary is printed once, after every video has been processed.
    print(f"\n✅ Completed. Total documents created: {len(youtube_docs)}")
    print("Sample Doc:", youtube_docs[:2])

    return youtube_docs

# Crawl the channel once and unpack (video_ids, video_urls); calling fetch_video() twice
# would query YouTube twice and could pair IDs and URLs from two different listings.
youtube_docs = crawler_youtube(*fetch_video())


Writing crawler_youtube.py


In [None]:

%%writefile crawler_web.py

def crawl_internal_links(start_url, max_pages=10, max_depth=1):
    """
    Crawl internal URLs from a site using Selenium, with support for JavaScript-heavy pages.

    The crawl is breadth-first from ``start_url`` and only follows links whose host equals
    the host of ``start_url``. URL fragments (``#...``) are stripped before queueing.

    Args:
        start_url (str): The URL to start crawling from.
        max_pages (int): Max number of pages to crawl.
        max_depth (int): Max depth to crawl (0 = just root).

    Returns:
        list: The internal URLs that were attempted, in no particular order. URLs that
        raised a WebDriver error or timed out are included too (they count towards
        ``max_pages`` so they are not retried).
    """
    import time
    from collections import deque
    from urllib.parse import urlparse, urljoin

    from selenium import webdriver
    from selenium.common.exceptions import (
        StaleElementReferenceException,
        TimeoutException,
        WebDriverException,
    )
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait
    from webdriver_manager.chrome import ChromeDriverManager

    # Selenium Headless Browser Setup
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    visited = set()
    domain = urlparse(start_url).netloc
    to_visit = deque([(start_url, 0)])
    # Mirror of the URLs currently waiting in `to_visit`, for O(1) duplicate checks.
    queued = {start_url}

    try:
        while to_visit and len(visited) < max_pages:
            url, depth = to_visit.popleft()
            queued.discard(url)
            if url in visited or depth > max_depth:
                continue
            try:
                driver.get(url)
                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "a")))
                time.sleep(1)  # 🔁 Wait for JS to load

                print(f"Visited ({len(visited)+1}/{max_pages}), Depth {depth}): {url}")
                visited.add(url)

                # If max depth reached, skip link extraction
                if depth == max_depth:
                    continue

                # Extract and queue internal links
                for link in driver.find_elements(By.TAG_NAME, "a"):
                    try:
                        href = link.get_attribute("href")
                        if not href or href.startswith(("mailto:", "tel:", "javascript:")):
                            continue

                        parsed = urlparse(href)
                        if parsed.netloc == domain or parsed.netloc == "":
                            full_url = urljoin(url, href).split("#")[0]
                            if full_url not in visited and full_url not in queued:
                                to_visit.append((full_url, depth + 1))
                                queued.add(full_url)
                    except StaleElementReferenceException:
                        # The element vanished while the page re-rendered; ignore this link.
                        continue

            except (WebDriverException, TimeoutException):
                print(f"⚠️ Skipping (Error): {url}")
                # Remember the URL so it is not attempted again.
                visited.add(url)
                continue
    finally:
        # Always shut the browser down, even if an unexpected error aborts the crawl.
        driver.quit()

    # Print summary
    print(f"\n Total unique internal URLs visited: {len(visited)}")

    if len(visited) < max_pages:
        print("⚠️ Number of crawled URLs is less than max_pages. Possible reasons:")
        print("- Site may not have enough unique pages within the allowed depth.")
        print("- Some links might be hidden behind JavaScript interactions.")
        print("- Some links could be blocked, inaccessible, or slow-loading.")
        print("- Your max_depth may be too shallow to discover deeper links, try changing depth.")

    return list(visited)

def change_web_to_meta_data(doc):
    """
    Tag a crawled web document and normalise its whitespace.

    Sets ``metadata["type"]`` to ``"Web"`` and rewrites ``page_content`` so that every run
    of whitespace (newlines included) becomes a single space. The document is modified in
    place and returned.
    """
    doc.metadata["type"] = "Web"
    # split() without arguments breaks on any whitespace run, newlines included, so joining
    # the pieces with one space removes line breaks and repeated blanks in a single pass.
    words = doc.page_content.split()
    doc.page_content = " ".join(words)
    return doc

def crawler_web():
    """Crawl the configured site and load every discovered page as a document.

    Collects internal URLs with ``crawl_internal_links`` and loads each one through
    ``WebBaseLoader``, passing the results through ``change_web_to_meta_data``. A page that
    fails to load is reported and skipped.

    Returns:
        list: The loaded (and tagged) web documents.
    """
    from langchain_community.document_loaders import WebBaseLoader

    print("Web Crawler started ....")

    # Crawl settings
    start_url = "https://fidesinnova.io/"
    max_pages = 50  # upper bound on the number of pages to visit
    max_depth = 3   # 0 = only root, 1 = root + links from root

    # Discover the pages first, then load them one by one.
    page_urls = crawl_internal_links(start_url, max_pages, max_depth)

    web_docs = []
    for page_url in page_urls:
        loader = WebBaseLoader(page_url)
        print(f"Loading {page_url} ...")
        try:
            web_docs += [change_web_to_meta_data(loaded) for loaded in loader.load()]
            print(page_url + " is loaded.")
        except Exception as e:
            print(f"{page_url} is not loaded. Error: {e}")

    print(f"Total web documents loaded: {len(web_docs)}")
    print("Sample doc: ",web_docs[:2])  # first 2 documents, for a quick visual check
    return web_docs

# Run the web crawl when this code executes; the documents are kept in the
# module-level name `web_docs`.
web_docs = crawler_web()

Overwriting crawler_web.py


In [None]:
%%writefile crawler_github.py

def change_GitHub_to_meta_data(doc):
    """
    Tag a document loaded from a GitHub repository and normalise its whitespace.

    Sets ``metadata["type"]`` to ``"GitHub"`` and collapses every whitespace run in
    ``page_content`` (newlines included) into one space. The document is modified in place
    and returned.
    """
    doc.metadata.update({"type": "GitHub"})
    # Splitting on arbitrary whitespace and re-joining with single spaces removes both the
    # newlines and any repeated blanks.
    doc.page_content = " ".join(doc.page_content.split())
    return doc

def crawler_github():
    """Clone the configured GitHub repositories and load their Markdown files as documents.

    Each repository is cloned to ``./cloned_repos/<repo_name>`` (branch ``main``) and only
    files ending in ``.md`` are loaded. Every document is passed through
    ``change_GitHub_to_meta_data``. A repository that fails to load is reported and skipped,
    matching the best-effort behaviour of the other crawlers in this notebook.

    Returns:
        list: The loaded (and tagged) documents from all repositories that succeeded.
    """
    from langchain_community.document_loaders import GitLoader

    # loading GitHub Repos
    github_repos = [
        "https://github.com/TheArchitect2000/Fides-Innova-WiKi",
        "https://github.com/TheArchitect2000/Blockchain-based-IoT-Server",
        "https://github.com/TheArchitect2000/zk-IoT",
        "https://github.com/TheArchitect2000/Smart-Contract-Protocol",

        # Currently disabled:
        #    "https://github.com/TheArchitect2000/zkiot-riscv-qemu-c",
        #    "https://github.com/TheArchitect2000/ZKP-Blockchain-Explorer",
        #    "https://github.com/TheArchitect2000/evm-server",
        #    "https://github.com/TheArchitect2000/New-IoT-Device-Integration",
        #    "https://github.com/TheArchitect2000/zkiot-riscv-qemu-rust"
    ]

    # File extensions to load. Other candidates considered: ".py", ".c", ".cpp", ".rs",
    # ".json", ".html", ".js", ".ts", ".css", ".java", ".txt", ".yml", ".yaml", ".sh".
    extensions = (".md",)

    github_docs = []
    for url in github_repos:
        print(f"📥 Loading repository: {url}")
        # Tolerate a trailing slash or a ".git" suffix when deriving the folder name.
        repo_name = url.rstrip("/").split("/")[-1]
        if repo_name.endswith(".git"):
            repo_name = repo_name[:-len(".git")]
        local_path = f"./cloned_repos/{repo_name}"

        loader = GitLoader(
            repo_path=local_path,
            clone_url=url,
            branch="main",
            file_filter=lambda f: f.endswith(extensions)
        )

        try:
            repo_docs = [change_GitHub_to_meta_data(doc) for doc in loader.load()]
        except Exception as e:
            # One unreachable repo (or a missing "main" branch) should not abort the rest.
            print(f"❌ Error loading repository {url}: {e}")
            continue

        github_docs.extend(repo_docs)
        # Report the count for this repository, not the running total.
        print(f"✅ Loaded {len(repo_docs)} documents from {repo_name}")

    print(f"Total GitHub documents loaded: {len(github_docs)}")
    return github_docs

# Clone/load the repositories when this code executes; the documents are kept in the
# module-level name `github_docs`.
github_docs = crawler_github()

Writing crawler_github.py


In [None]:
%%writefile crawler_pdf.py

def change_pdf_doc(doc):
    """Mark a loaded document as coming from a PDF and return it (modified in place)."""
    doc.metadata.update(type='PDF')
    return doc

def crawler_pdf():
    """Load the configured PDF files and tag every resulting document as a PDF.

    Each file is read with ``PyPDFLoader``; a file that cannot be loaded is reported and
    skipped. All documents are passed through ``change_pdf_doc``.

    Returns:
        list: The documents from every PDF that loaded successfully.
    """
    from langchain_community.document_loaders import PyPDFLoader
    print("PDF Crawler started ....")

    pdf_docs = []
    # NOTE(review): the folder is spelled "PDF/" for the first three files and "pdf/" for
    # the rest. On a case-sensitive filesystem only one spelling can match — confirm the
    # real directory name and unify.
    pdf_files = [
        "PDF/zkIoT.pdf",
        "PDF/Consensus Algorithms.pdf",
        "PDF/Data Monetization.pdf",
        "pdf/Decentralized Delegated Proof.pdf",
        "pdf/Digital Twins.pdf",
        "pdf/Fides service contracts.pdf",
        "pdf/fides_innova_gitbook_placeholder.pdf",
        "pdf/IoT Startups.pdf",
        "pdf/MIoTN.pdf",
        "pdf/MQTT and MQTTS protocols.pdf",
        "pdf/Service Contract.pdf",
        "pdf/Service Market.pdf",
        "pdf/What’s Web 3.0.pdf"
    ]

    loaded_files = 0
    for path in pdf_files:
        try:
            loader = PyPDFLoader(path)
            file_docs = loader.load()
        except Exception as e:
            print(f"Error loading PDF {path}: {e}")
            continue
        pdf_docs.extend(file_docs)
        loaded_files += 1
        print(f"Loaded {path}: {len(file_docs)} docs (total {len(pdf_docs)})")

    pdf_docs = list(map(change_pdf_doc, pdf_docs))

    # Report how many files actually loaded, not just how many were listed.
    print(f"Loaded PDF files: {loaded_files}/{len(pdf_files)}")
    print("Created docs:", len(pdf_docs))
    return pdf_docs

# Load the PDFs when this code executes; the documents are kept in the module-level
# name `pdf_docs`.
pdf_docs = crawler_pdf()

Writing crawler_pdf.py


In [None]:
%%writefile crawler_pptx.py

# Add metadata
def change_pptx_doc(doc):
    """Mark a loaded document as coming from a PowerPoint file and return it (modified in place)."""
    doc.metadata.update(type='PPTX')
    return doc

def crawler_pptx():
    """Load the configured PowerPoint files and tag every resulting document as PPTX.

    Each file is read with ``UnstructuredPowerPointLoader``; a file that cannot be loaded
    is reported and skipped. All documents are passed through ``change_pptx_doc``.

    Returns:
        list: The documents from every presentation that loaded successfully.
    """
    from langchain_community.document_loaders import UnstructuredPowerPointLoader 
    print("PPTX Crawler started ....")

    pptx_docs = []
    pptx_files = [
        "PPTX/FidesinnovaDeck-v11.pptx"
    ]

    loaded_files = 0
    for path in pptx_files:
        print(f"📥 Loading PPTX: {path}")
        try:
            loader = UnstructuredPowerPointLoader(path)
            pptx_docs.extend(loader.load())
        except Exception as e:
            print(f"❌ Error loading PPTX {path}: {e}")
            continue
        loaded_files += 1
        print(f"✅ Loaded: {path}")

    pptx_docs = list(map(change_pptx_doc, pptx_docs))

    # Report how many files actually loaded, not just how many were listed.
    print(f"Loaded PPTX files: {loaded_files}/{len(pptx_files)}")
    print("Created docs:", len(pptx_docs))
    return pptx_docs

# Load the presentations when this code executes; the documents are kept in the
# module-level name `pptx_docs`.
pptx_docs = crawler_pptx()

Writing crawler_pptx.py


In [9]:

%%writefile combine_docs.py

def combine_docs(web_docs, github_docs, youtube_docs, pdf_docs, pptx_docs):
    """Split every document collection into chunks and merge them into one list.

    Each argument is a list of documents (``None`` or an empty list is treated as "no
    documents"). All collections are split with one ``RecursiveCharacterTextSplitter``
    (``chunk_size=500``, ``chunk_overlap=100``).

    Returns:
        list: The chunks, ordered YouTube, PDF, PPTX, web, GitHub.
    """
    from langchain_text_splitters import RecursiveCharacterTextSplitter

    print("Splitter started ....")
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

    # Collections in the order in which they are split (and announced).
    sources = (
        ("web_docs", web_docs),
        ("github_docs", github_docs),
        ("youtube_docs", youtube_docs),
        ("pdf_docs", pdf_docs),
        ("pptx_docs", pptx_docs),
    )
    chunks_by_source = {}
    for label, source_docs in sources:
        print(f"Splitting {label} ...")
        chunks_by_source[label] = splitter.split_documents(source_docs or [])

    # The merged list uses a different order than the splitting order above.
    merge_order = ("youtube_docs", "pdf_docs", "pptx_docs", "web_docs", "github_docs")
    all_split_docs = [
        chunk
        for label in merge_order
        for chunk in chunks_by_source[label]
    ]
    print("Splitting done.", len(all_split_docs), " documents created.")
    return all_split_docs

# Split and merge the documents produced by the crawlers.
# NOTE(review): web_docs, github_docs, youtube_docs, pdf_docs and pptx_docs are not defined
# in this cell; they must already exist in the running namespace. Confirm how
# combine_docs.py is meant to obtain them when it is executed on its own.
all_split_docs = combine_docs(web_docs, github_docs, youtube_docs, pdf_docs, pptx_docs)


Writing combine_docs.py


In [10]:
%%writefile create_faiss_index.py

def create_faiss_index(all_split_docs):
    """Embed the chunks, reduce them to 256 dimensions with PCA and persist a FAISS index.

    Pipeline: OpenAI embeddings -> drop non-finite / near-zero vectors -> clip ->
    standardise -> drop near-constant features -> PCA(256) -> ``IndexFlatL2``.

    Everything needed to preprocess a query the same way is written to
    ``fides_faiss_crawled_data/``: the FAISS index (``fides_faiss_pca_256``), the PCA model
    (``fides_pca_256_model.pkl``), the scaler (``fides_scaler.pkl``) and the feature mask
    (``fides_feature_mask.npy``).

    Args:
        all_split_docs: Documents exposing ``page_content``; blank ones are ignored.

    Returns:
        None

    Raises:
        ValueError: if no document has non-blank text.
    """
    from langchain_openai import OpenAIEmbeddings
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler
    # The vector store and docstore live in langchain_community (already a dependency of
    # this notebook); the old `langchain.vectorstores` / `langchain.docstore` paths are legacy.
    from langchain_community.vectorstores import FAISS
    from langchain_community.docstore.in_memory import InMemoryDocstore
    import numpy as np
    import pickle
    import faiss
    import os

    # ---------------------- Step 1: Embed All ----------------------
    base_embedder = OpenAIEmbeddings(model="text-embedding-3-small")
    texts_docs = [(doc.page_content, doc) for doc in all_split_docs if doc.page_content.strip()]
    if not texts_docs:
        # zip(*[]) below would otherwise fail with an unhelpful unpacking error.
        raise ValueError("create_faiss_index received no documents with non-empty text.")
    texts, docs = zip(*texts_docs)
    raw_vectors = np.array(base_embedder.embed_documents(list(texts)))

    # ---------------------- Step 2: Filter Invalid Vectors ----------------------
    filtered_vectors = []
    filtered_docs = []
    for vec, doc in zip(raw_vectors, docs):
        if np.all(np.isfinite(vec)) and np.linalg.norm(vec) > 1e-6:
            filtered_vectors.append(vec)
            filtered_docs.append(doc)

    raw_vectors = np.array(filtered_vectors)

    # ---------------------- Step 3: Clip Outliers ----------------------
    raw_vectors = np.clip(raw_vectors, -1000, 1000)
    # ---------------------- Step 4: Normalize ----------------------
    scaler = StandardScaler()
    normalized_vectors = scaler.fit_transform(raw_vectors)

    # ---------------------- Step 4.5: Drop Low-Variance Features ----------------------
    variances = np.var(normalized_vectors, axis=0)
    stable_mask = variances > 1e-6
    normalized_vectors = normalized_vectors[:, stable_mask]
    print(f"✅ Retained {np.sum(stable_mask)} stable features out of {len(stable_mask)}.")

    # ---------------------- Step 5: PCA Fit ----------------------
    pca = PCA(n_components=256, svd_solver='full')
    pca.fit(normalized_vectors)

    # ---------------------- Step 6: Final Check Before PCA Transform ----------------------
    # Rows are re-checked after standardisation; docs are filtered in lockstep so that row i
    # of the index still corresponds to filtered_docs[i]. (The PCA above was fitted on all
    # rows, including any that are dropped here.)
    clean_rows = []
    clean_docs = []

    for vec, doc in zip(normalized_vectors, filtered_docs):
        if np.all(np.isfinite(vec)) and np.linalg.norm(vec) < 100:  # tighter threshold
            clean_rows.append(vec)
            clean_docs.append(doc)

    normalized_vectors = np.array(clean_rows)
    filtered_docs = clean_docs

    # Now safely transform
    transformed_vectors = pca.transform(normalized_vectors)

    print("PCA transformation completed successfully.")
    print("Transformed vectors shape:", transformed_vectors.shape)
    print("Any NaN in transformed vectors?", np.isnan(transformed_vectors).any())
    print("Max component magnitude:", np.max(np.abs(transformed_vectors)))
    print("PCA components shape:", pca.components_.shape)
    print("Any NaN in PCA components?", np.isnan(pca.components_).any())
    print("Max component magnitude:", np.max(np.abs(pca.components_)))


    # ---------------------- Step 7: Create FAISS Index ----------------------
    index = faiss.IndexFlatL2(transformed_vectors.shape[1])
    index.add(transformed_vectors.astype("float32"))

    docstore = InMemoryDocstore(dict(enumerate(filtered_docs)))  # <-- use aligned docs
    index_to_docstore_id = {i: i for i in range(len(filtered_docs))}

    # No embedding function is attached: queries must be embedded and reduced by the
    # caller and searched by vector.
    faiss_index = FAISS(
        embedding_function=None,
        index=index,
        docstore=docstore,
        index_to_docstore_id=index_to_docstore_id
    )

    # ---------------------- Step 8: Save Everything ----------------------
    saved_dir = "fides_faiss_crawled_data"
    os.makedirs(saved_dir, exist_ok=True)

    faiss_index.save_local(os.path.join(saved_dir, "fides_faiss_pca_256"))

    with open(os.path.join(saved_dir, "fides_pca_256_model.pkl"), "wb") as f:
        pickle.dump(pca, f)

    # The scaler and mask previously used an undefined name (`save_dir`), which raised
    # NameError after the index and PCA model had already been written.
    with open(os.path.join(saved_dir, "fides_scaler.pkl"), "wb") as f:
        pickle.dump(scaler, f)

    with open(os.path.join(saved_dir, "fides_feature_mask.npy"), "wb") as f:
        np.save(f, stable_mask)

    print("✅ All components saved: FAISS index, PCA model, Scaler, and feature mask.")
    return

# Build and persist the index when this code executes.
# NOTE(review): all_split_docs is not defined in this cell; it must already exist in the
# running namespace. Confirm how create_faiss_index.py obtains it when run on its own.
create_faiss_index(all_split_docs)

Writing create_faiss_index.py


In [11]:
%%writefile search.py

import numpy as np  # needed at module level: used in annotations and by preprocess_query

# Location and embedding model must match what create_faiss_index() wrote / used.
_SAVE_DIR = "fides_faiss_crawled_data"
_EMBEDDING_MODEL = "text-embedding-3-small"

# Lazily populated cache of the loaded artefacts (see load_faiss_components).
_COMPONENTS = None


def load_faiss_components(force_reload=False):
    """Load (once) the artefacts needed to answer queries and return them.

    Previously the loaded objects were local variables that were thrown away on return, so
    ``preprocess_query`` and ``search`` could never see them.

    Args:
        force_reload: Re-read everything from disk even if a cached copy exists.

    Returns:
        dict: ``{"faiss_index", "pca", "scaler", "stable_mask", "embedder"}``.
    """
    global _COMPONENTS
    if _COMPONENTS is not None and not force_reload:
        return _COMPONENTS

    from langchain_community.vectorstores import FAISS
    from langchain_openai import OpenAIEmbeddings
    import pickle
    import os

    # ---------------------- Step 1: Load Components ----------------------
    save_dir = _SAVE_DIR

    # Load FAISS index (stored without an embedding function; search is done by vector)
    faiss_index = FAISS.load_local(
        folder_path=os.path.join(save_dir, "fides_faiss_pca_256"),
        embeddings=None,
        index_name="index",  # default inside LangChain
        allow_dangerous_deserialization=True
    )

    # SECURITY: pickle.load (and the FAISS docstore load above) can execute arbitrary code.
    # Only load files that were produced by this pipeline.
    with open(os.path.join(save_dir, "fides_pca_256_model.pkl"), "rb") as f:
        pca = pickle.load(f)

    # Load scaler
    with open(os.path.join(save_dir, "fides_scaler.pkl"), "rb") as f:
        scaler = pickle.load(f)

    # Load feature mask
    stable_mask = np.load(os.path.join(save_dir, "fides_feature_mask.npy"))

    _COMPONENTS = {
        "faiss_index": faiss_index,
        "pca": pca,
        "scaler": scaler,
        "stable_mask": stable_mask,
        "embedder": OpenAIEmbeddings(model=_EMBEDDING_MODEL),
    }
    return _COMPONENTS


# ---------------------- Step 2: Embed and Preprocess Query ----------------------
def preprocess_query(text: str) -> np.ndarray:
    """Embed ``text`` and apply the same clip/scale/mask/PCA steps used to build the index.

    Returns:
        np.ndarray: float32 array with one row (the reduced query vector).

    Raises:
        ValueError: if the embedder returns a non-finite or (near-)zero vector.
    """
    components = load_faiss_components()
    vector = np.asarray(components["embedder"].embed_query(text), dtype=float)

    # Ensure vector is safe
    if not np.all(np.isfinite(vector)) or np.linalg.norm(vector) <= 1e-6:
        raise ValueError("Invalid or zero vector produced by embedder.")

    # Clip + normalize + select features
    vector = np.clip(vector, -1000, 1000)
    vector = components["scaler"].transform([vector])
    vector = vector[:, components["stable_mask"]]
    vector = components["pca"].transform(vector)

    return vector.astype("float32")


# ---------------------- Step 3: Perform Similarity Search ----------------------
def search(query: str, k=3):
    """Return the ``k`` stored documents closest to ``query``."""
    # Load first: preprocessing depends on the scaler, mask and PCA model.
    components = load_faiss_components()
    query_vector = preprocess_query(query)
    return components["faiss_index"].similarity_search_by_vector(query_vector[0], k=k)


Writing search.py
