In [None]:
# Install the third-party packages this notebook relies on, using %pip so the
# installs target the running kernel's environment.
# The pyaudio install below is intentionally disabled.
#%pip install pyaudio -q
# Upgrade pip itself before installing anything else.
%pip install --upgrade pip -q
# Packages for the loaders, vector store, and blockchain client imported in the following cells.
%pip install python-dotenv langchain_openai langchain_community chromadb youtube-transcript-api pytube pypdf web3 -q
# NOTE(review): youtube-transcript-api and pypdf are already listed on the previous line; only bs4 is new here.
%pip install youtube-transcript-api bs4 pypdf -q
# NOTE(review): SpeechRecognition and opencv-python are not imported in the visible cells - confirm they are still needed.
%pip install SpeechRecognition -q 
%pip install opencv-python -q

In [None]:
from dotenv import load_dotenv
import os
from langchain_community.document_loaders import WebBaseLoader, YoutubeLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pytube import YouTube
from web3 import Web3

In [None]:
############################# Document instances
# One loader object is created per source; nothing is fetched until .load() is called.

## Website pages
## TODO: crawl the site to enumerate every page instead of listing them by hand.
web1, web2, web3 = (
    WebBaseLoader(page_url)
    for page_url in (
        "https://www.fidesinnova.io",
        "https://fidesinnova.io/devices/",
        "https://fidesinnova.io/Contacts/",
    )
)


## GitHub repositories (loaded as ordinary web pages)
git1, git2, git3 = (
    WebBaseLoader(repo_url)
    for repo_url in (
        "https://github.com/TheArchitect2000/iot-server",
        "https://github.com/TheArchitect2000/zkiot-riscv-qemu-c",
        "https://github.com/TheArchitect2000/Fides-Innova-WiKi",
    )
)


## PDF documents (path is relative to the notebook's working directory)
pdf1 = PyPDFLoader("zkIoT.pdf")

## YouTube videos, referenced by video id
video1, video2, video3 = (
    YoutubeLoader(video_id)
    for video_id in ("kgYxyxeDNl4", "VZMBE2NLSC4", "YpfFHI3Ivmo")
)


## Fides Blockchain
# HTTP RPC endpoint used to build the Web3 client.
rpc_url = "https://rpc1.fidesinnova.io"
w3 = Web3(Web3.HTTPProvider(rpc_url))

# Report connectivity. A failed check is only printed; the cell keeps running.
if not w3.is_connected():
    print("❌ Connection failed.")
else:
    print("✅ Connected to FidesInnova Blockchain!")
    print("Latest block:", w3.eth.block_number)

# Build a contract handle from the checksummed address and the ABI file,
# which is read from the notebook's working directory.
contract_address = Web3.to_checksum_address("0x4b08ea934e6bfb7c72a376c842c911e1dd2aa74f")
with open("NodeServiceDeviceManagement.abi") as abi_file:
    abi = abi_file.read()  # Smart contract ABI, passed on as the raw file text

contract = w3.eth.contract(address=contract_address, abi=abi)

# Example call, currently disabled (sensor_id is not defined in this cell):
#sensor_data = contract.functions.getSensorData(sensor_id).call()

############################# Loading
# Run every loader, tag each resulting document with the kind of source it
# came from, echo its metadata, and collect everything in `docs`.
loaders = [web1, web2, web3, git1, git2, git3, pdf1, video1, video2, video3]

# Loader class -> value stored under metadata['type']; checked in this order.
source_types = (
    (YoutubeLoader, 'YouTube'),
    (PyPDFLoader, 'PDF'),
    (WebBaseLoader, 'Web'),
)

docs = []
for loader in loaders:
    # The label depends only on the loader, so resolve it once per loader.
    source_type = next(
        (label for loader_cls, label in source_types if isinstance(loader, loader_cls)),
        'Other',
    )
    for doc in loader.load():
        doc.metadata['type'] = source_type
        print(doc.metadata)
        docs.append(doc)

In [None]:
############################# Split the documents
# Cut the loaded documents into chunks of at most 500 characters, with a
# 50-character overlap between neighbouring chunks.
splitter1 = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
totaldoc2 = splitter1.split_documents(docs)

############################# Creating Vector database
# Embed the chunks in `totaldoc2` with OpenAI and store them in a Chroma
# collection persisted on disk.
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

from dotenv import load_dotenv
import os

# Load environment variables from the .env file
load_dotenv()

# Get the API key
api_key1 = os.getenv("API_KEY1")

# Never print the key itself: cell output is saved inside the notebook file
# and would leak the secret to anyone the notebook is shared with.
if api_key1:
    print("✅ API_KEY1 loaded from environment.")
else:
    print("⚠️ API_KEY1 is not set; check your .env file.")

embedding1 = OpenAIEmbeddings(model="text-embedding-3-large", api_key=api_key1)

# In-memory alternative (nothing written to disk):
#mydb1 = Chroma.from_documents(totaldoc2, embedding=embedding1)
# Persisted on local storage:
mydb1 = Chroma(
    collection_name="example_collection",
    embedding_function=embedding1,
    persist_directory="chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

# NOTE(review): no ids are passed, so re-running this cell appears to append
# the same chunks again to the persisted collection - confirm, and clear
# "chroma_langchain_db" or pass stable ids if duplicates matter.
mydb1.add_documents(totaldoc2)

In [None]:
# with open("a.txt","w") as f1:
#     f1.write(web1.load()[0].page_content)