In [1]:
import os

# Directory that holds the PDFs to be indexed.
folder_path = "/content/pdfs_to_read"

# exist_ok=True makes this a no-op when the directory already exists,
# so the cell can be re-run without raising.
os.makedirs(folder_path, exist_ok=True)

# Printed unconditionally, i.e. also when the directory already existed.
print(f"Folder created at: {folder_path}")


Folder created at: /content/pdfs_to_read


In [2]:
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install chromadb sentence-transformers transformers PyPDF2

Collecting chromadb
  Downloading chromadb-1.3.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.38.0-py3-none-any.whl.metadata (2.4 kB)
Collecting pypika>=0.48.9 (from chromadb)
  Downloading PyPika-0.48.9.tar.gz (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31

In [None]:
import torch
import time
import chromadb
from chromadb.errors import InternalError
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain.text_splitter import RecursiveCharacterTextSplitter
import PyPDF2
import os
import re

In [None]:
# Hugging Face model id used for both the tokenizer and the causal LM.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# model_name = 'openlm-research/open_llama_3b_v2'
# Use the GPU when torch reports CUDA as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def extract_text_from_file(file_path):
    """Extract the text content of a ``.pdf`` or ``.txt`` file.

    Args:
        file_path: Path to the file (``str`` or ``os.PathLike``). The
            extension decides how the file is read and is matched
            case-insensitively, so ``REPORT.PDF`` is accepted.

    Returns:
        str: For PDFs, the text of every page concatenated in page order
        (a page with no extractable text contributes nothing). For text
        files, the whole file decoded as UTF-8.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the extension is neither ``.pdf`` nor ``.txt``.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File {file_path} does not exist.")

    # Lower-case once so ".PDF" / ".Txt" are handled like their lower-case forms.
    lowered_path = os.fspath(file_path).lower()

    if lowered_path.endswith('.pdf'):
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # extract_text() can yield nothing for a page; treat that as "".
            return "".join(page.extract_text() or "" for page in reader.pages)

    if lowered_path.endswith('.txt'):
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()

    raise ValueError("Unsupported file format. Use .pdf or .txt.")

In [None]:

def chunk_text(text, chunk_size=500, chunk_overlap=50):
    """Split text into chunks that end on a complete sentence.

    The text is split on blank lines first, then single newlines, then on the
    space that follows a sentence terminator (``.``, ``?`` or ``!``). Each
    resulting chunk is then cut back to its last sentence terminator.

    Args:
        text: The text to split.
        chunk_size: Target maximum chunk length, measured with ``len``.
        chunk_overlap: Number of characters the splitter may repeat between
            neighbouring chunks.

    Returns:
        list[str]: Non-empty, whitespace-stripped chunks. Text after the last
        sentence terminator of a chunk is dropped; a chunk that contains no
        terminator at all is kept whole rather than discarded.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # The sentence separator is a regex. The look-behind keeps the
        # terminator attached to the sentence it closes, so only the space
        # after it is treated as the split point.
        separators=["\n\n", "\n", r"(?<=[.?!]) "],
        # Without this flag the separators are escaped and matched as literal
        # text, so a regex separator would never match ordinary prose.
        is_separator_regex=True,
        add_start_index=True
    )
    # A terminator counts when followed by whitespace or the end of the chunk
    # (the splitter strips chunks, so a final sentence has no trailing space).
    sentence_end = re.compile(r'[.?!](?=\s|$)')

    adjusted_chunks = []
    for chunk in splitter.split_text(text):
        boundaries = [m.end() for m in sentence_end.finditer(chunk)]
        # Fall back to the full chunk only when it has no boundary at all.
        # Always including len(chunk) in a max() would make the trim a no-op.
        cut = boundaries[-1] if boundaries else len(chunk)
        adjusted_chunk = chunk[:cut].strip()
        if adjusted_chunk:
            adjusted_chunks.append(adjusted_chunk)
    return adjusted_chunks

In [None]:
# Sentence-embedding model used for both the document chunks and the query.
print("Loading embedding model")
embedder = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Tokenizer matching `model_name`; later cells use it to tokenize prompts
# and to count tokens.
print("Loading tokenizer")
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
print("Setting up ChromaDB")
chroma_client = chromadb.Client()
# get_or_create_collection returns the existing collection when this cell is
# re-run, so re-run safety no longer depends on catching a specific exception
# type and matching "already exists" in its message.
collection = chroma_client.get_or_create_collection(name="user_collection")
print(f"Collection ready: {collection.name}")

In [None]:

# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install pycryptodome

In [None]:
# Explicit %pip magic: a bare `pip install` only works while IPython's
# automagic is enabled.
%pip install pypdf


In [None]:
import os
# from PyPDF2 import PdfReader
from tqdm import tqdm
# NOTE(review): AES is not referenced below; presumably imported so the cell
# fails early when pycryptodome is missing — confirm before removing.
from Crypto.Cipher import AES

from pypdf import PdfReader

# Same directory the first cell creates. The relative "pdfs_to_read" only
# resolved when the kernel's working directory happened to be /content.
folder_path = "/content/pdfs_to_read"

# Parallel lists: entry i of each describes the same chunk.
all_chunks = []
all_chunk_ids = []
all_metadata = []

if not os.path.isdir(folder_path):
    raise NotADirectoryError(f"Path {folder_path} is not a valid directory")

print("Reading and splitting documents\n")

# os.listdir returns entries in arbitrary order; sorting keeps file_idx, and
# therefore every chunk id, stable from one run to the next.
for file_idx, file_name in enumerate(tqdm(sorted(os.listdir(folder_path)))):
    file_path = os.path.join(folder_path, file_name)
    # Case-insensitive so files such as "CONTRACT.PDF" are not skipped.
    if not file_name.lower().endswith(".pdf"):
        continue

    reader = PdfReader(file_path)
    for page_num, page in enumerate(reader.pages):
        text = page.extract_text()
        if not text:
            continue

        # Split page text into paragraphs (blank-line separated), dropping
        # paragraphs that are empty after stripping.
        paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
        for para_idx, para in enumerate(paragraphs):
            # Id is <file index>_<0-based page>_<0-based paragraph>; the
            # metadata below stores 1-based page and paragraph numbers.
            chunk_id = f"{file_idx}_{page_num}_{para_idx}"
            all_chunks.append(para)
            all_chunk_ids.append(chunk_id)
            all_metadata.append({
                "document_name": file_name,
                "page_number": page_num + 1,
                "paragraph_number": para_idx + 1
            })

print(f"\n Total chunks created {len(all_chunks)}")

# Fail with a clear message instead of handing empty lists to the vector store.
if not all_chunks:
    raise ValueError(f"No text could be extracted from any PDF in {folder_path}")

# --- Add chunks to Chroma ---
# upsert rather than add: re-running the cell updates entries whose ids
# already exist instead of trying to insert them a second time.
collection.upsert(
    ids=all_chunk_ids,
    embeddings=embedder.encode(all_chunks).tolist(),
    documents=all_chunks,
    metadatas=all_metadata
)
print("Embeddings added")


In [None]:
# Question used for retrieval and for both prompts built later on.
query = "Identify any risky clauses and explain why they may be problematic. Print out 3 risky clauses that you found from each document. Also print out the exact risky line"

# Tokenize the query as PyTorch tensors on `device` and print the result for
# inspection.
query_tokens = tokenizer(query, return_tensors="pt").to(device)
print(f"Query tokens (input_ids shape): {query_tokens['input_ids'].shape}")
print(query_tokens)

In [None]:
print("\nGenerating query embedding")
# encode() is given a one-element list; [0] picks the embedding of that
# single query.
query_embedding = embedder.encode([query])[0]
print(query_embedding)

In [None]:
print("\n Querying ChromaDB")
start_time = time.time()
chroma_results = collection.query(query_embeddings=[query_embedding.tolist()], n_results=2)
chroma_time = time.time() - start_time

# Results are nested per query; index 0 selects the hits for our single query.
retrieved_docs = chroma_results["documents"][0]
retrieved_meta = chroma_results["metadatas"][0]

# One record per hit: the chunk text plus where it came from.
contexts = [
    {
        "text": doc_text,
        "source": doc_meta["document_name"],
        "page": doc_meta["page_number"],
        "paragraph": doc_meta["paragraph_number"],
    }
    for doc_text, doc_meta in zip(retrieved_docs, retrieved_meta)
]

print(f"\n Retrieved {len(contexts)} relevant chunks (in {chroma_time:.2f}s):\n")
for c in contexts:
    # First 120 characters on a single line, as a quick preview.
    preview = c["text"][:120].replace("\n", " ")
    print(f"• {c['source']} (page {c['page']}, para {c['paragraph']}): {preview}...")

# Space-joined chunk texts, for RAG prompt creation
rag_context = " ".join(c["text"] for c in contexts)


In [None]:
# Dump the raw query result returned by the collection, for inspection.
print(chroma_results)

In [None]:
print("Loading TinyLlama model ")
# Half precision is only requested when running on CUDA; on CPU the model is
# loaded in float32, because float16 CPU inference is slow and not supported
# by every operator.
model_dtype = torch.float16 if device.type == "cuda" else torch.float32
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=model_dtype)
# Re-bind instead of ending the cell with a bare `model.to(device)` expression,
# which would print the whole module repr as cell output.
model = model.to(device)

In [None]:
def generate_response(prompt, max_length=150):
    """Sample a completion for ``prompt`` and return only the new text.

    Args:
        prompt: Prompt string, tokenized with the module-level ``tokenizer``.
        max_length: Upper bound on the number of newly generated tokens
            (forwarded to ``generate`` as ``max_new_tokens``).

    Returns:
        The decoded continuation with special tokens skipped; the prompt
        tokens are sliced off before decoding.
    """
    encoded = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_len = encoded.input_ids.shape[1]

    sampling_kwargs = dict(
        max_new_tokens=max_length,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )
    with torch.no_grad():
        generated = model.generate(**encoded, **sampling_kwargs)

    # Row 0 is the only sequence; everything before prompt_len is the prompt.
    completion_ids = generated[0][prompt_len:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)

In [None]:
print("\n Generating response")

# Token-aware trimming.
# The tokenizer loaded earlier in the notebook is reused; reloading it here
# was redundant.
max_new_tokens = 1024 #256
safety_margin = 32
# Take the smaller of the tokenizer's and the model's limits, so an unset
# (very large) tokenizer value cannot inflate the budget.
model_max_tokens = min(
    getattr(tokenizer, "model_max_length", 2048),
    getattr(model.config, "max_position_embeddings", 2048),
)

def count_tokens(text):
    """Return how many token ids ``tokenizer.encode`` produces for ``text``."""
    return len(tokenizer.encode(text, truncation=False))

# NOTE(review): the <human>/<assistant> markers are plain text. If the model
# expects a chat template, tokenizer.apply_chat_template may give better
# results — confirm against the model card.
def build_rag_prompt(context_text):
    """Return the RAG prompt for the current ``query`` around ``context_text``."""
    return f"""<human>
You are a contract intelligence assistant.

Context information from the contract:
{context_text}

Instructions:
- Use ONLY the context above (not any prior knowledge).
- If you don't know the answer, just say that you don't know. Don't try to make up an answer.


Question: {query}
</human>

<assistant>"""

# The instructions and the question occupy context-window space too, so they
# are subtracted from what is left for retrieved text.
prompt_overhead_tokens = count_tokens(build_rag_prompt(""))
input_token_budget = max(
    0, model_max_tokens - max_new_tokens - safety_margin - prompt_overhead_tokens
)

rag_chunks = []
used_tokens = 0
for c in contexts:
    t = c["text"]
    t_tokens = count_tokens(t)
    if used_tokens + t_tokens <= input_token_budget:
        rag_chunks.append(t)
        used_tokens += t_tokens
    else:
        # Chunk does not fit: keep a truncated piece if enough room is left,
        # then stop adding context.
        remaining = input_token_budget - used_tokens
        if remaining > 50:
            # Encode without special tokens so no BOS marker is decoded back
            # into the context text.
            piece_ids = tokenizer.encode(t, add_special_tokens=False)[:remaining]
            rag_chunks.append(tokenizer.decode(piece_ids, skip_special_tokens=True))
            used_tokens += len(piece_ids)
        break

rag_context = " ".join(rag_chunks)
print(f"Context tokens used {used_tokens} / {input_token_budget}")

# Build prompt using rag_context
rag_prompt = build_rag_prompt(rag_context)

no_rag_prompt = f"""<human>
{query}
</human>

<assistant>"""

#Generate
# Pass max_new_tokens explicitly so generation uses the same limit the token
# budget above was computed for (the function's default is 150).
rag_response = generate_response(rag_prompt, max_length=max_new_tokens)
no_rag_response = generate_response(no_rag_prompt, max_length=max_new_tokens)

# Output

print(" RAG vs No-RAG Comparison")
print("\n")
print(f"Query: {query}")

# "\ " was an invalid escape sequence; a newline was intended.
print(f"\n RAG Response \n{rag_response}")
print(f"\n No-RAG Response (without context):\n{no_rag_response}")

print("Sources used:")
for c in contexts[:len(rag_chunks)]:  # show only sources actually included
    print(f"- {c['source']} (page {c['page']}, paragraph {c['paragraph']})")

print(f"\n Retrieval Time: {chroma_time:.4f} seconds")



In [None]:
# # Generate responses
# print("\nGenerating responses...")
# rag_response = generate_response(rag_prompt)
# no_rag_response = generate_response(no_rag_prompt)

# print("\n--- RAG vs No-RAG Comparison ---")
# print(f"Query: {query}")
# print(f"\nRAG Response (with context):\n{rag_response}")
# print(f"\nNo-RAG Response (without context):\n{no_rag_response}")
# print(f"\nChromaDB Query Time: {chroma_time:.4f} seconds")

In [None]:


print("\n RAG Response")
print(rag_response)

print("\nSources:")
# Only the first len(rag_chunks) contexts were placed in the prompt; listing
# all retrieved contexts would cite text the model never saw.
for c in contexts[:len(rag_chunks)]:
    print(f"- {c['source']} (page {c['page']}, paragraph {c['paragraph']})")


In [None]:
# Cleanup: drop the demo collection; a collection that is already gone is
# reported instead of treated as an error.
try:
    chroma_client.delete_collection("user_collection")
    print("\nCleaned up ChromaDB collection.")
except Exception as cleanup_error:
    already_gone = "does not exist" in str(cleanup_error).lower()
    if not already_gone:
        # Anything other than "does not exist" is unexpected: propagate it.
        raise cleanup_error
    print("\nNo collection to clean up.")