1. Install dependencies

In [None]:
!pip install sentence-transformers chromadb PyMuPDF pandas
!pip install transformers accelerate


Collecting chromadb
  Downloading chromadb-1.1.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting PyMuPDF
  Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.9 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.37.0-py3-none-any.whl.metadata (2.4 kB)
Collecting pypika>=0.48.9 (from chromadb)
  Downloading PyPika-0.48.9.tar.gz (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32

2. Imports

In [None]:
import fitz  # PyMuPDF
import pandas as pd
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

3. Set paths and configuration

In [None]:
# Directory passed to chromadb.PersistentClient as its storage path.
CHROMA_DIR = "./chroma_db"
# Name of the Chroma collection that is looked up (or created) in this notebook.
COLLECTION_NAME = "study_material"
# Model name handed to SentenceTransformer to build the embedder.
EMBED_MODEL = "all-MiniLM-L6-v2"

Initialize the Chroma client and the embedding model

In [None]:
# Open the on-disk Chroma store located at CHROMA_DIR.
client = chromadb.PersistentClient(path=CHROMA_DIR)

# Fetch the collection, creating it on first run. This replaces a bare
# `except:` around get_collection, which would also have swallowed unrelated
# failures (e.g. a corrupt store or a KeyboardInterrupt) and then tried to
# create the collection anyway.
collection = client.get_or_create_collection(COLLECTION_NAME)

# Sentence-embedding model used to vectorise text for the collection.
embedder = SentenceTransformer(EMBED_MODEL)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# # Load embedding model
# embedder = SentenceTransformer(EMBED_MODEL)
# LLM_MODEL = "facebook/opt-350m"  # lightweight
# tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
# model = AutoModelForCausalLM.from_pretrained(LLM_MODEL, device_map="auto")
# generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

# from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# LLM_MODEL = "tiiuae/falcon-7b-instruct"
# # Load model + tokenizer
# tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
# model = AutoModelForCausalLM.from_pretrained(LLM_MODEL, device_map="auto", torch_dtype=torch.float16)
# generator = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=200)

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Checkpoint name for the generation model; chosen by the author as a free,
# ungated model (no Hugging Face token required to download it).
LLM_MODEL = "google/flan-t5-base"

# Load the tokenizer and the sequence-to-sequence model for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
model = AutoModelForSeq2SeqLM.from_pretrained(LLM_MODEL)

# Wrap model + tokenizer in a text2text pipeline; `generator` is the object
# the run_llm helper calls later in the notebook.
generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

# Smoke test: run one short summarisation prompt to confirm the pipeline works.
prompt = "Summarize: Machine Learning is a subset of AI that enables systems to learn from data."
result = generator(prompt, max_new_tokens=100)

# The pipeline returns a list of dicts; show the text of the first result.
print(result[0]["generated_text"])



tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cuda:0


Understand how machine learning works.


4. File upload helper

In [None]:
from google.colab import files

def upload_and_extract():
    """Prompt for a Colab file upload and return the text of the uploaded files.

    Supported types (matched case-insensitively on the file extension):

    * ``.pdf`` - text of every page, extracted with PyMuPDF.
    * ``.csv`` - each row flattened to one space-separated line.
    * ``.txt`` - decoded as UTF-8.

    Files with any other extension are skipped with a printed notice.

    Returns:
        str: The text of all supported files in upload order, joined by
        newlines. An empty string when nothing usable was uploaded.

    Raises:
        UnicodeDecodeError: If a ``.txt`` file is not valid UTF-8.
    """
    import io  # local import: only this helper needs it

    # files.upload() returns a mapping of file name -> uploaded bytes.
    uploaded = files.upload()

    # Collect one entry per file so that several uploads accumulate instead of
    # the last CSV/TXT file overwriting everything read before it.
    parts = []
    for fname, data in uploaded.items():
        lowered = fname.lower()
        if lowered.endswith(".pdf"):
            # Parse the uploaded bytes directly rather than re-opening by name:
            # Colab may save a duplicate upload under a different local name.
            # The context manager closes the document when done.
            with fitz.open(stream=data, filetype="pdf") as doc:
                parts.append("".join(page.get_text() for page in doc))
        elif lowered.endswith(".csv"):
            df = pd.read_csv(io.BytesIO(data))
            rows = df.astype(str).apply(lambda r: " ".join(r.values), axis=1)
            parts.append("\n".join(rows))
        elif lowered.endswith(".txt"):
            parts.append(data.decode("utf-8"))
        else:
            print(f"Skipping unsupported file: {fname}")
    return "\n".join(parts)

5. Chunking

In [None]:
def chunk_text(text, chunk_size=700, overlap=50):
    """Split text into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one.

    Args:
        text (str): Text to split; it is tokenised with ``str.split()``.
        chunk_size (int): Maximum number of words per chunk. Must be >= 1.
        overlap (int): Number of words shared between neighbouring chunks.
            Must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        list[str]: Chunks in document order, words re-joined by single spaces.
        Empty list when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check ``overlap >= chunk_size`` would never advance the
            window and loop forever.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a chunk reaches the end of the text, stop: any further window
        # would contain only words already present in this chunk's tail.
        if start + chunk_size >= len(words):
            break
    return chunks

6. Ingest into ChromaDB

In [None]:
def ingest_text(text):
    """Chunk ``text``, embed the chunks and upsert them into the collection.

    Chunk ids are derived from a hash of the chunk content rather than from
    the chunk's position. Re-ingesting the same text is therefore idempotent,
    and ingesting a second document no longer overwrites the first document's
    ``chunk_0``, ``chunk_1``, ... entries.

    Args:
        text (str): Raw document text.

    Returns:
        None. Prints the number of chunks written.
    """
    import hashlib  # local import: only this helper needs it

    # Drop repeated chunks (order preserved): identical content would map to
    # the same id, and ids within one upsert call are expected to be unique.
    chunks = list(dict.fromkeys(chunk_text(text)))
    if not chunks:
        # Nothing to embed or store; avoid sending empty lists to the collection.
        print("Ingested 0 chunks.")
        return

    embeddings = embedder.encode(chunks).tolist()
    ids = [
        "chunk_" + hashlib.sha256(chunk.encode("utf-8")).hexdigest()[:16]
        for chunk in chunks
    ]
    collection.upsert(ids=ids, embeddings=embeddings, documents=chunks)
    print(f"Ingested {len(chunks)} chunks.")

In [None]:
# 7. Retrieval
def retrieve(query, top_k=5):
    """Return the stored chunks most similar to ``query``.

    The query is embedded with the same ``embedder`` that produced the stored
    document embeddings, instead of letting the collection embed
    ``query_texts`` with its own default embedding function. This keeps query
    and document vectors in the same space.

    Args:
        query (str): Search text.
        top_k (int): Maximum number of chunks to return.

    Returns:
        list[str]: Up to ``top_k`` document chunks, as ordered by the
        collection's query result. Empty list when the collection is empty.
    """
    available = collection.count()
    if available == 0:
        return []

    query_embedding = embedder.encode([query]).tolist()
    results = collection.query(
        query_embeddings=query_embedding,
        # Never ask for more neighbours than there are stored chunks.
        n_results=min(top_k, available),
    )
    # One query was sent, so the hits are in the first (only) result row.
    return results["documents"][0]

# 8. LLM helper
def run_llm(prompt, max_new_tokens=200):
    """Run ``prompt`` through the global ``generator`` pipeline.

    Sampling is disabled (``do_sample=False``).

    Args:
        prompt (str): Full prompt text.
        max_new_tokens (int): Passed through to the pipeline call.

    Returns:
        The ``"generated_text"`` field of the first pipeline result.
    """
    outputs = generator(prompt, max_new_tokens=max_new_tokens, do_sample=False)
    first_output = outputs[0]
    return first_output["generated_text"]

9. Study Guide Tools

In [None]:
def generate_summary(query, level="Beginner", top_k=5):
    """Summarise the stored material most relevant to ``query``.

    Args:
        query (str): Topic used to retrieve context chunks.
        level (str): Target difficulty level inserted into the prompt.
        top_k (int): Number of chunks to retrieve as context. Defaults to 5,
            the previously hard-coded value; lower it to shorten the prompt.

    Returns:
        The text produced by ``run_llm`` for the summary prompt.
    """
    docs = retrieve(query, top_k=top_k)
    context = "\n".join(docs)
    prompt = f"""Summarize the following educational text at {level} level.
Context:
{context}
Summary:"""
    return run_llm(prompt)

def generate_qa(query, num_qs=5, top_k=5):
    """Generate question-answer pairs from the material relevant to ``query``.

    Args:
        query (str): Topic used to retrieve context chunks.
        num_qs (int): Number of question-answer pairs requested in the prompt.
        top_k (int): Number of chunks to retrieve as context. Defaults to 5,
            the previously hard-coded value; lower it to shorten the prompt.

    Returns:
        The text produced by ``run_llm`` for the Q&A prompt.
    """
    docs = retrieve(query, top_k=top_k)
    context = "\n".join(docs)
    prompt = f"""Generate {num_qs} question-answer pairs based only on this context:
{context}
Q&A:"""
    return run_llm(prompt)

def generate_flashcards(query, num_cards=5, top_k=5):
    """Generate revision flashcards from the material relevant to ``query``.

    Args:
        query (str): Topic used to retrieve context chunks.
        num_cards (int): Number of flashcards requested in the prompt.
        top_k (int): Number of chunks to retrieve as context. Defaults to 5,
            the previously hard-coded value; lower it to shorten the prompt.

    Returns:
        The text produced by ``run_llm`` for the flashcard prompt.
    """
    docs = retrieve(query, top_k=top_k)
    context = "\n".join(docs)
    prompt = f"""Generate {num_cards} flashcards (Q and A) for revision from this context:
{context}
Flashcards:"""
    return run_llm(prompt)

10. Run Demo

In [None]:
print("📂 Upload your file (PDF, CSV, TXT)")
text = upload_and_extract()

# Stop here if nothing was extracted (e.g. an unsupported file type or an
# image-only PDF): there would be nothing to index or to summarise.
if not text.strip():
    raise ValueError("No text could be extracted from the uploaded file(s).")

print("✅ Ingesting into Chroma...")
ingest_text(text)

# Topic used as the retrieval query for the demo outputs printed below.
topic = "Machine Learning"

import textwrap

def pretty_print(text, width=100):
    """Return ``text`` re-wrapped so that no line exceeds ``width`` characters.

    Args:
        text (str): Text to wrap.
        width (int): Maximum line length.

    Returns:
        str: The wrapped lines joined by newlines.
    """
    wrapped = textwrap.fill(text, width=width)
    return wrapped

# Each entry pairs a heading with a zero-argument callable, so the heading is
# printed before the corresponding generation step runs.
demo_sections = [
    ("\n🔹 SUMMARY:", lambda: generate_summary(topic)),
    ("\n🔹 Q&A:", lambda: generate_qa(topic, num_qs=3)),
    ("\n🔹 FLASHCARDS:", lambda: generate_flashcards(topic, num_cards=3)),
]

for heading, produce in demo_sections:
    print(heading)
    print(pretty_print(produce()))


📂 Upload your file (PDF, CSV, TXT)


Saving BTech_HRM_2.pdf to BTech_HRM_2 (1).pdf
✅ Ingesting into Chroma...
Ingested 9 chunks.

🔹 SUMMARY:


/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:08<00:00, 9.79MiB/s]
Token indices sequence length is longer than the specified maximum sequence length for this model (4662 > 512). Running this sequence through the model will result in indexing errors


Understand the different types of HR audits. Understand the different types of HR audits. Understand
the different types of HR audits. Understand the different types of HR audits.

🔹 Q&A:
Question: What is the purpose of an HR audit? Answer: A. Compliance Audit This type of audit ensures
that the organization is complying with labor laws, regulations, and industry standards. B.
Strategic HR Audit This type of audit evaluates whether the organization is effectively contributing
to organizational goals and business strategy. D. Health Check Audit A more informal audit, this
type of audit involves evaluating the overall well-being and health of the HR function in terms of
morale, communication, and processes. D. Health Check Audit A more informal audit, this type of
audit involves evaluating the overall well-being and health of the HR function in terms of morale,
communication, and processes.

🔹 FLASHCARDS:
Human Resource Accounting (HRA) and HR Audit are important concepts in HR manageme