<a href="https://colab.research.google.com/github/Storm00212/JARVIS/blob/main/colab_ingestion_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# JARVIS RAG Ingestion Notebook (Colab-ready)

**Purpose:** This notebook walks you through an end-to-end prototype ingestion pipeline that:
- Accepts PDF / DOCX / PPTX documents
- Extracts clean text (with optional OCR)
- Splits documents into semantic chunks
- Generates embeddings for chunks
- Stores chunks + embeddings in a local FAISS index (persisted to Google Drive)
- Exposes a simple `answer_question(query)` function that uses retrieval + prompt assembly (RAG)

**Notes & assumptions**
- Designed for Google Colab interactive use.
- Includes a sample path from this session: `/mnt/data/jarvis-ai.zip` which you can inspect or replace with your own uploads.
- Each code cell includes detailed comments to help you follow along.


In [None]:

# SECTION 1: Install required packages
# Run this cell in Google Colab to install dependencies. It may take 1-2 minutes.Additional dependencies to be added
!pip install --quiet pypdf python-docx python-pptx sentence-transformers chromadb langchain tiktoken PyMuPDF langchain_text_splitters faiss-cpu llama-cpp-python together llama-cpp-python==0.2.30 langchain langchain-community langchain-core
print('Dependencies installed (or already present).')


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.7/50.7 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m85.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


## SECTION 2: Upload files (use UI) or use sample path

You can upload files interactively using the cell below, or skip upload and use the sample file `'/mnt/data/jarvis-ai.zip'` if present.


In [None]:
# Mount Google Drive at /content/drive so the following cells can read and
# write files under it.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Define the project folders on Drive and make sure the raw-data folder exists.
import os

BASE_DIR = "/content/drive/MyDrive/jarvis-ai"
RAW_DATA_DIR = BASE_DIR + "/data/raw"

# exist_ok=True makes this cell safe to re-run.
os.makedirs(RAW_DATA_DIR, exist_ok=True)

for label, folder in (
    ("Base project folder:", BASE_DIR),
    ("Raw data folder:", RAW_DATA_DIR),
):
    print(label, folder)


Base project folder: /content/drive/MyDrive/jarvis-ai
Raw data folder: /content/drive/MyDrive/jarvis-ai/data/raw


In [None]:
# Upload files through the Colab picker, then relocate them into the raw-data folder on Drive.
from google.colab import files
import shutil  # shutil.move copes with cross-device moves (copy, then delete)

uploaded_files = files.upload()  # choose multiple files

# Iterating the mapping yields the uploaded filenames.
for filename in uploaded_files:
    src = f"/content/{filename}"
    dst = f"{RAW_DATA_DIR}/{filename}"
    print(f"Moving {src} → {dst}")
    shutil.move(src, dst)

print("\nUpload complete!", "Files in your study notes folder:", sep="\n\n")
print(os.listdir(RAW_DATA_DIR))

KeyboardInterrupt: 

# Reading the PDF, DOCX and PPTX files from Google Drive

In [None]:
# STEP 1 — Load PDFs from Google Drive

# NOTE(review): Drive is also mounted by the earlier mounting cell; this repeated
# call presumably exists so the cell can be run on its own — confirm it is still wanted.
from google.colab import drive
drive.mount('/content/drive')

import os
import fitz  # PyMuPDF for PDFs
import docx  # DOCX reader
from pptx import Presentation  # PPTX reader

# CHANGE THIS to your folder
# (currently the same location as RAW_DATA_DIR from the directory-setup cell)
DATA_FOLDER = "/content/drive/MyDrive/jarvis-ai/data/raw"

# Filled in by the extraction loop below: one entry per supported file.
documents = {}  # filename → extracted text


# function to read docx
def extract_docx(path):
    """Return the text of every paragraph in the DOCX file at `path`, joined by newlines."""
    paragraphs = docx.Document(path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)


# function to read pptx
def extract_pptx(path):
    """Return the text of every text-bearing shape in the PPTX file at `path`.

    Slides are visited in order, and within each slide every shape that exposes
    a `text` attribute contributes one entry; entries are joined by newlines.
    """
    deck = Presentation(path)
    return "\n".join(
        shape.text
        for slide in deck.slides
        for shape in slide.shapes
        if hasattr(shape, "text")  # shapes without a `text` attribute are skipped
    )


# function to read pdf
def extract_pdf(path):
    """Return the plain text of every page of the PDF at `path`, concatenated in page order.

    The document is opened as a context manager so the underlying file handle is
    released as soon as extraction finishes (the previous version never closed it,
    which leaks one open file per PDF when looping over a whole folder).
    """
    with fitz.open(path) as doc:
        # join() avoids rebuilding an ever-growing string page by page.
        return "".join(page.get_text("text") for page in doc)


# Walk the study notes folder and extract text from every supported file.
# Each entry: (lower-case suffix, label used in the progress message, extractor).
_EXTRACTORS = (
    (".pdf", "PDF", extract_pdf),
    (".docx", "DOCX", extract_docx),
    (".pptx", "PPTX", extract_pptx),
)

for filename in os.listdir(DATA_FOLDER):
    path = os.path.join(DATA_FOLDER, filename)
    lowered = filename.lower()  # suffix matching is case-insensitive

    for suffix, label, extractor in _EXTRACTORS:
        if lowered.endswith(suffix):
            print(f"Extracting {label}: {filename}")
            documents[filename] = extractor(path)
            break
    else:
        # No suffix matched.
        print(f"Skipping unsupported file: {filename}")


print("\n✔ Extraction complete!")
print(f"Total loaded documents: {len(documents)}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Extracting PDF: 1. Amplifiers with Negative Feedback (2).pdf
Extracting PDF: 3.1 Resources  (2).pdf
Extracting PDF: churchillbrown (2).pdf
Extracting PDF: EEE 3208 ELECTROMAGNETICS III lec1 notes (2).pdf
Extracting PDF: eee.eti.3104.cat.ii.make_up.ms (2).pdf
Extracting PDF: eee3102 [1-20] (2).pdf
Extracting PDF: EEE 2206_EET 2204_Electromagnetics I_Exam (2).pdf
Extracting PDF: eee3102 [21-33] (2).pdf
Extracting PDF: EEE_ETI3105_Assignment ONE (2).pdf
Extracting PDF: Design_of_Analog_Filters_Rolf_Schaumann (2).pdf
Extracting PDF: digielec (2).pdf
Extracting PDF: EEE 3208 ELECTROMAGNETICS IIILecture 2 3 and4 notes (3).pdf
Extracting PDF: eee3104eti3104 [1-68] (2).pdf
Extracting PDF: Electromagnetics (2).pdf
Extracting PDF: A textbook of Electrical Technology B. L. Thereja All Volumes ( PDFDrive (2).pdf
Extracting PDF: EEE_ETI 3101_SUP_EXAM_ANALOGUE ELECTRONICS 

# Next, we split the documents into chunks before generating embeddings and building the FAISS index

In [None]:
# CREATING THE SPLITTER

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Recursive splitter -- best for mixed text types.
# Separators are tried in order: paragraphs, lines, sentence/clause punctuation,
# single spaces, and finally the empty string as the absolute fallback.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " ", ""],
    chunk_overlap=200,
    chunk_size=1000,
)

all_chunks = {}  # filename → list of text chunks

for filename, text in documents.items():
    print(f"Chunking: {filename}")
    # `chunks` stays a module-level name (a later cell reads it); after the loop
    # it holds the chunks of the last document only.
    chunks = splitter.split_text(text)
    all_chunks[filename] = chunks

print("\n✔ Chunking complete!")

Chunking: 1. Amplifiers with Negative Feedback (2).pdf
Chunking: 3.1 Resources  (2).pdf
Chunking: churchillbrown (2).pdf
Chunking: EEE 3208 ELECTROMAGNETICS III lec1 notes (2).pdf
Chunking: eee.eti.3104.cat.ii.make_up.ms (2).pdf
Chunking: eee3102 [1-20] (2).pdf
Chunking: EEE 2206_EET 2204_Electromagnetics I_Exam (2).pdf
Chunking: eee3102 [21-33] (2).pdf
Chunking: EEE_ETI3105_Assignment ONE (2).pdf
Chunking: Design_of_Analog_Filters_Rolf_Schaumann (2).pdf
Chunking: digielec (2).pdf
Chunking: EEE 3208 ELECTROMAGNETICS IIILecture 2 3 and4 notes (3).pdf
Chunking: eee3104eti3104 [1-68] (2).pdf
Chunking: Electromagnetics (2).pdf
Chunking: A textbook of Electrical Technology B. L. Thereja All Volumes ( PDFDrive (2).pdf
Chunking: EEE_ETI 3101_SUP_EXAM_ANALOGUE ELECTRONICS 1 (2).pdf
Chunking: EEE 3208 ELECTROMAGNETICS IIILecture 2 3 and4 notes (1) (2).pdf
Chunking: EEE2205 Electromagnetics I (2).pdf
Chunking: 3.2 Past Papers   (2).pdf
Chunking: EEE 3207 ELECTRICAL MACHINES 2 (2).pptx
Chunking: 

# Embedding and FAISS

In [None]:
from sentence_transformers import SentenceTransformer

# FREE embedding model, shared by embed_text() below
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")


def embed_text(texts):
    """Encode `texts` with `embedding_model` and return the embeddings as a NumPy array."""
    vectors = embedding_model.encode(texts, convert_to_numpy=True)
    return vectors


In [None]:
# creating the faiss vector store
import faiss
import numpy as np

# Flatten the per-file chunk lists into two parallel lists:
# texts[i] is a chunk and meta[i] records which file it came from.
texts = []
meta = []
for filename, chunks in all_chunks.items():
    texts.extend(chunks)
    meta.extend({"source": filename} for _ in chunks)

# One embedding row per chunk
embeddings = embed_text(texts)

# Exact (brute-force) L2 index sized to the embedding width
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings)

print(f"FAISS index size: {index.ntotal}")


FAISS index size: 22293


In [None]:
import pickle
# Common path prefix for the three artifacts written below (vectors, metadata, index).
FAISS_PATH = "/content/drive/MyDrive/jarvis-ai/faiss_index"

# Embedding matrix produced by embed_text() in the previous cell.
np.save(FAISS_PATH + "_vectors.npy", embeddings)
# Per-chunk metadata list ({"source": filename}), in the same order as the vectors.
# NOTE: only ever unpickle this file from a location you trust.
with open(FAISS_PATH + "_metadata.pkl", "wb") as f:
    pickle.dump(meta, f)

# The L2 index built in the previous cell.
faiss.write_index(index, FAISS_PATH + "_index.faiss")

print("✔ Vectorstore saved!")

In [None]:
from llama_cpp import Llama
import os

# Download the model file if it doesn't exist.
# The file is stored under /content (the Colab VM's local disk, not Drive).
model_name = "llama-3.1-8b-instruct.Q4_K_M.gguf"
model_path = f"/content/{model_name}"

if not os.path.exists(model_path):
    print(f"Downloading {model_name}...")
    # {model_path} is interpolated by IPython into the shell command.
    !wget https://huggingface.co/lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF/resolve/main/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -O {model_path}
    print("Download complete.")
else:
    print(f"{model_name} already exists.")

# Load quantized Llama model (fastest offline option)
# NOTE(review): the install cell pins llama-cpp-python==0.2.30 — confirm that
# version can load this Llama 3.1 GGUF file.
model = Llama(
    model_path=model_path,
    n_gpu_layers=35,      # number of layers to offload to the GPU (requires a GPU runtime)
    n_ctx=4096,           # context window size in tokens
    f16_kv=True,
    logits_all=False,
    use_mlock=True        # ask the OS to keep the model in RAM — TODO confirm this helps on Colab
)

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import json
import os

# Load free embedding model
embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Build the full corpus from `all_chunks` (filename → list of chunk strings).
# Previously this cell embedded the module-level `chunks` left over from the
# chunking loop, which only holds the chunks of the LAST document processed,
# so the saved corpus silently dropped every other file.
# `metadata[i]` records the source file of `chunks[i]`; it is saved as
# metadata.json, the file the loading cell looks for, so retrieved chunks can
# be attributed to their source instead of showing up as "unknown".
chunks = []
metadata = []
for source_name, doc_chunks in all_chunks.items():
    chunks.extend(doc_chunks)
    metadata.extend({"source": source_name} for _ in doc_chunks)

print(f"Total Chunks to Embed: {len(chunks)}")

# Generate embeddings (one row per chunk, same order as `chunks`)
embeddings = embedder.encode(chunks, show_progress_bar=True, convert_to_numpy=True)

print("Embedding shape:", embeddings.shape)

# Save embeddings + chunks + metadata
output_folder = "/content/drive/MyDrive/jarvis-ai/embeddings"
os.makedirs(output_folder, exist_ok=True)

np.save(os.path.join(output_folder, "chunk_embeddings.npy"), embeddings)

# Written as UTF-8 to match the encoding the loading cell reads with.
with open(os.path.join(output_folder, "chunks.json"), "w", encoding="utf-8") as f:
    json.dump(chunks, f)

with open(os.path.join(output_folder, "metadata.json"), "w", encoding="utf-8") as f:
    json.dump(metadata, f)

print("Embeddings + chunk text saved to Google Drive.")

In [None]:
import os, json, numpy as np

# Reload the chunk texts, their embeddings and (if present) metadata from Google Drive.
DRIVE_BASE = "/content/drive/MyDrive/jarvis-ai/embeddings"
chunks_path, emb_path, meta_path = (
    os.path.join(DRIVE_BASE, name)
    for name in ("chunks.json", "chunk_embeddings.npy", "metadata.json")
)  # metadata.json is optional: only read when it exists

# Chunk texts: list of strings
with open(chunks_path, "r", encoding="utf-8") as f:
    chunks = json.load(f)

# Embedding matrix, shape (N, dim)
embeddings = np.load(emb_path)

# Optional metadata mapping (source filename, chunk index); stays None when absent
metadata = None
if os.path.exists(meta_path):
    with open(meta_path, "r", encoding="utf-8") as f:
        metadata = json.load(f)  # expected list/dict aligned with chunks

print(f"Loaded {len(chunks)} chunks, embeddings shape = {embeddings.shape}")
if metadata:
    print(f"Loaded metadata entries: {len(metadata)}")


In [None]:
import faiss

# Build a cosine-similarity index from the loaded embeddings.
# NOTE: this rebinds `index`, replacing the L2 index created in the earlier FAISS cell.

# Ensure embeddings are float32
emb = embeddings.astype('float32')
# Normalize to unit vectors for cosine similarity (modifies `emb` in place)
faiss.normalize_L2(emb)

d = emb.shape[1]  # embedding dim

# Create index (inner product) and add vectors
index = faiss.IndexFlatIP(d)   # inner product on normalized vectors = cosine similarity
index.add(emb)
print("FAISS index ntotal:", index.ntotal)

# Optionally save index to Drive for reuse
# (this is a different file from the "<FAISS_PATH>_index.faiss" written by the earlier save cell)
DRIVE_BASE2 = "/content/drive/MyDrive/jarvis-ai"
FAISS_INDEX_PATH = os.path.join(DRIVE_BASE2, "faiss_index.faiss")
faiss.write_index(index, FAISS_INDEX_PATH)
print("Saved FAISS index to:", FAISS_INDEX_PATH)


In [None]:
# To load later: read the saved cosine-similarity index back from Drive.
# Requires `faiss` and FAISS_INDEX_PATH from the previous cell.
index = faiss.read_index(FAISS_INDEX_PATH)


In [None]:
# Simple semantic search
from sentence_transformers import SentenceTransformer

# Reuse MiniLM for query embeddings: queries must be embedded with the same
# model that produced the chunk embeddings.
# NOTE(review): this loads another instance of the model already loaded as
# `embedding_model` and `embedder` in earlier cells.
query_embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def semantic_search(query, k=5):
    """
    Retrieve the `k` chunks most similar to `query` from the FAISS index.

    Args:
        query: question or search text.
        k: maximum number of results to return.

    Returns:
        A list of dicts, best match first, each with the keys:
          "score"    - similarity as a Python float (higher = more similar, cosine),
          "chunk_id" - position of the chunk in `chunks`, as a plain Python int,
          "text"     - the chunk text,
          "meta"     - the matching `metadata` entry, or None when unavailable.
    """
    q_emb = query_embedder.encode([query], convert_to_numpy=True).astype('float32')
    # Normalise the query in place so inner product equals cosine similarity.
    faiss.normalize_L2(q_emb)
    D, I = index.search(q_emb, k)  # D: similarities, I: indices
    results = []
    for score, idx in zip(D[0], I[0]):
        # FAISS returns numpy integers; convert so that chunk_id can be serialised
        # (e.g. with json.dumps), like the already-converted float score.
        idx = int(idx)
        if idx < 0:
            # Negative ids are skipped (placeholders when fewer than k hits exist).
            continue
        chunk_text = chunks[idx]
        meta = metadata[idx] if (metadata and idx < len(metadata)) else None
        results.append({"score": float(score), "chunk_id": idx, "text": chunk_text, "meta": meta})
    return results

# Quick test: show the three best matches with a one-line preview of each chunk
res = semantic_search("Explain Maxwell's equations", k=3)
for r in res:
    print("score:", r["score"], "meta:", r["meta"])
    preview = r["text"][:400].replace("\n", " ")  # first 400 chars, newlines flattened
    print(preview + "...\n")


In [None]:
# Assembling a RAG prompt.
# A naive char-based truncation strategy (safe fallback).
def assemble_prompt(question, top_k_results, max_context_chars=3000):
    """
    Build the RAG prompt: instructions, retrieved context, then the question.

    question: the user's question, inserted verbatim.
    top_k_results: output of semantic_search (dicts with "text", "meta",
        "chunk_id" and "score").
    max_context_chars: maximum total characters across all context chunks,
        counting each chunk's header line but not the blank line that
        separates chunks.
    """
    intro = "You are JARVIS, an expert teaching assistant for engineering. Use the context below to answer the question factually.\n\n"
    sections = []
    budget_left = max_context_chars
    for hit in top_k_results:
        body = hit["text"]
        meta = hit["meta"]
        source = meta.get('source') if meta else 'unknown'
        header = f"[Source: {source} | chunk_id: {hit['chunk_id']} | score: {hit['score']:.3f}]\n"
        # Characters still available for this chunk's text once its header is paid for.
        room = budget_left - len(header)
        if len(body) > room:
            if room <= 0:
                # Not even the header fits: stop adding context.
                break
            # Keep only the prefix of the chunk that fits the budget.
            body = body[:room]
        sections.append(header + body + "\n\n")
        budget_left -= len(header) + len(body)
    context = "".join(sections)
    return (
        intro
        + "Context:\n"
        + context
        + "\nQuestion: "
        + question
        + "\nAnswer concisely and cite relevant sources in brackets when possible."
    )

# Example: retrieve context, build a prompt, and print its first 1500 characters.
# Note that the retrieval query and the question placed in the prompt are two
# different strings here.
# `prompt` is reused by the LLM usage cell further down.
top = semantic_search("What is a bilinear transfer function?", k=5)
prompt = assemble_prompt("Explain bilinear transfer functions and give an example.", top, max_context_chars=3000)
print(prompt[:1500])


In [None]:
# Example using llama-cpp-python (adjust model_path to your GGUF file)
from llama_cpp import Llama
import os

# Path to your GGUF model file (put the model file into Drive and use the path).
# Defaults to `model_path` from the model download cell; the trailing comment
# shows an example of a Drive-hosted alternative.
MODEL_PATH = model_path #"/content/drive/MyDrive/JARVIS_MODELS/mistral-7b-instruct.gguf"  # change to your model file

def call_local_llm(prompt, max_tokens=512, temperature=0.1):
    """
    Run `prompt` through the local GGUF model at MODEL_PATH.

    Args:
        prompt: full prompt text to complete.
        max_tokens: maximum number of tokens to generate.
        temperature: sampling temperature.

    Returns:
        {"text": <generated text>, "raw": <full llama-cpp-python response>} on
        success, or {"error": "Model not found", "prompt": prompt} when
        MODEL_PATH does not exist, so the prompt can still be inspected.
    """
    if not os.path.exists(MODEL_PATH):
        # Fallback: return the prompt for inspection
        return {"error": "Model not found", "prompt": prompt}
    # Loading a multi-gigabyte model is by far the slowest step, so the Llama
    # object is created once and cached on the function; it is rebuilt only if
    # MODEL_PATH changes. (Previously the model was reloaded on every call.)
    cached = getattr(call_local_llm, "_llm_cache", None)
    if cached is None or cached[0] != MODEL_PATH:
        cached = (MODEL_PATH, Llama(model_path=MODEL_PATH, n_ctx=4096))
        call_local_llm._llm_cache = cached
    llm = cached[1]
    out = llm(prompt, max_tokens=max_tokens, temperature=temperature)
    # The llama-cpp-python response has 'choices' etc; we return text
    text = out['choices'][0]['text']
    return {"text": text, "raw": out}

# Usage: run the prompt assembled above and show the answer, or the prompt itself
# when no model file is available.
out = call_local_llm(prompt, max_tokens=400)
if 'text' not in out:
    print("No model available. Inspect prompt:\n", out['prompt'][:1200])
else:
    print(out['text'][:1500])


In [None]:
def answer_question(query, k=6, max_context_chars=3000, max_tokens=400, temperature=0.1):
    """
    Answer `query` with the full RAG pipeline: retrieve, assemble prompt, call the LLM.

    Returns a dict with the original "query", the assembled "prompt", the
    "retrieved" search results and the LLM "response" (as returned by
    call_local_llm).
    """
    retrieved = semantic_search(query, k=k)
    rag_prompt = assemble_prompt(query, retrieved, max_context_chars=max_context_chars)
    llm_result = call_local_llm(rag_prompt, max_tokens=max_tokens, temperature=temperature)
    return dict(
        query=query,
        prompt=rag_prompt,
        retrieved=retrieved,
        response=llm_result,
    )




In [None]:
# Example (inspect output): print the answer, or the prompt when the model did not run
res = answer_question("What is a 3phase transformer?", k=5)
if 'text' not in res['response']:
    print("Model didn't run; prompt to inspect:\n", res['prompt'][:1200])
else:
    print("LLM answer (truncated):\n", res['response']['text'][:1200])
# TODO: add fine-tuning