<a href="https://colab.research.google.com/github/Storm00212/JARVIS/blob/main/JARVIS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [42]:
!pip install -q --upgrade \
  langchain \
  langchain-community \
  langchain-text-splitters \
  langchain-huggingface \
  sentence-transformers \
  faiss-cpu \
  pypdf \
  python-docx \
  python-pptx \
  tqdm

In [43]:
from google.colab import drive

# Attach Google Drive at /content/drive; the project paths used by the rest
# of the notebook live underneath this mount point.
drive.mount(mountpoint='/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [44]:
import os

# Project layout on the mounted Drive: raw input files and the persisted
# FAISS index both hang off BASE_DIR.
BASE_DIR = "/content/drive/MyDrive/jarvis-ai"
DATA_DIR = BASE_DIR + "/data/raw"
VECTORSTORE_DIR = BASE_DIR + "/vectorstore/faiss_index"

# Only the output directory is created here; the input directory is merely
# checked and reported, never created.
os.makedirs(VECTORSTORE_DIR, exist_ok=True)
print("Data dir exists:", os.path.exists(DATA_DIR))


Data dir exists: True


In [45]:
from langchain_community.document_loaders import (
    PyPDFLoader,
    Docx2txtLoader,
    UnstructuredPowerPointLoader
)
from langchain_core.documents import Document
from tqdm import tqdm

In [46]:
def load_documents(folder_path):
    """Load every supported file found directly inside ``folder_path``.

    Files are matched on their lower-cased name suffix:
    ``.pdf`` -> ``PyPDFLoader``, ``.docx`` -> ``Docx2txtLoader`` and
    ``.pptx`` -> ``UnstructuredPowerPointLoader``. Entries with any other
    suffix are skipped. Sub-folders are not descended into.

    Loading is best-effort: if constructing a loader or calling ``load()``
    raises, the failure is printed and the file is skipped so that one bad
    file does not abort the whole run.

    Args:
        folder_path: Directory whose entries should be loaded.

    Returns:
        A flat list containing everything returned by each loader's
        ``load()`` call, in sorted file-name order.
    """
    loader_by_suffix = (
        (".pdf", PyPDFLoader),
        (".docx", Docx2txtLoader),
        (".pptx", UnstructuredPowerPointLoader),
    )
    documents = []

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # resulting document order (and anything built from it) reproducible.
    for file in tqdm(sorted(os.listdir(folder_path))):
        lowered = file.lower()
        loader_cls = next(
            (cls for suffix, cls in loader_by_suffix if lowered.endswith(suffix)),
            None,
        )
        if loader_cls is None:
            continue

        try:
            documents.extend(loader_cls(os.path.join(folder_path, file)).load())
        except Exception as e:
            # Deliberately broad: report the failure and keep going.
            print(f"Failed to load {file}: {e}")

    return documents


In [47]:
# Read everything under DATA_DIR into memory; the printed count is the number
# of items returned by the loaders, not the number of files.
docs = load_documents(folder_path=DATA_DIR)
print(f"Loaded {len(docs)} raw documents/pages")


 34%|███▍      | 28/82 [04:29<20:26, 22.72s/it]

Failed to load EEE 3207 ELECTRICAL MACHINES 2 (2).pptx: unstructured package not found, please install it with `pip install unstructured`


100%|██████████| 82/82 [09:08<00:00,  6.68s/it]

Loaded 11843 raw documents/pages





In [48]:
# chunking
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Break the loaded documents into overlapping pieces: up to 1000 characters
# each, with 200 characters shared between neighbouring pieces so context is
# not lost at the boundaries.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
chunks = text_splitter.split_documents(docs)

print(f"Created {len(chunks)} text chunks")

Created 26187 text chunks


In [51]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
# Sentence-embedding model used to vectorise every chunk; it is explicitly
# pinned to the CPU.
embeddings = HuggingFaceEmbeddings(
    model_kwargs=dict(device="cpu"),
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

In [53]:
# FAISS vector store
from langchain_community.vectorstores import FAISS

def _clean_chunk_text(text):
    """Return ``text`` as a stripped string that is safe to encode as UTF-8.

    Only characters that cannot be encoded as UTF-8 (lone surrogates) are
    dropped. All valid Unicode -- Greek letters, unit symbols, accented
    characters -- is preserved, so the indexed text matches the source
    material instead of being reduced to ASCII.
    """
    # errors="ignore" silently drops the un-encodable code points; the
    # round-trip leaves every valid character untouched.
    return str(text).encode("utf-8", "ignore").decode("utf-8").strip()


# Build parallel lists of cleaned texts and their metadata, skipping chunks
# that have no content or are empty after cleaning.
texts_to_embed = []
metadatas_to_embed = []

for chunk in chunks:
    if chunk.page_content is None:
        continue

    content = _clean_chunk_text(chunk.page_content)
    if content:
        texts_to_embed.append(content)
        metadatas_to_embed.append(chunk.metadata)

# Fail with a clear message rather than an obscure error from the index
# builder when there is nothing to embed.
if not texts_to_embed:
    raise ValueError(
        "No non-empty text chunks to embed; check that documents were loaded and split."
    )

# Embed every cleaned text and build the FAISS index, keeping each chunk's
# metadata alongside its vector.
vectorstore = FAISS.from_texts(
    texts=texts_to_embed,
    embedding=embeddings,
    metadatas=metadatas_to_embed
)
print(f"Created FAISS vectorstore with {len(texts_to_embed)} valid chunks.")

Created FAISS vectorstore with 26187 valid chunks.


In [54]:
# Persist the index under VECTORSTORE_DIR on Drive so it outlives this
# Colab session.
vectorstore.save_local(folder_path=VECTORSTORE_DIR)
print("FAISS vector store saved to Drive.")


FAISS vector store saved to Drive.
