<a href="https://colab.research.google.com/github/ShaunakSoni28/RAG_Systems/blob/main/RAG_Systems.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

# Creating project directory structure
import os
project_dir = '/content/drive/MyDrive/RAG_Project/'

# project_dir already ends with '/', so build sub-paths with os.path.join
# instead of f'{project_dir}/papers' (which yields a doubled slash).
# makedirs also creates project_dir itself as a parent directory.
for subdir in ('papers', 'data', 'results', 'evaluation'):
    os.makedirs(os.path.join(project_dir, subdir), exist_ok=True)

print(f"✅ Project directory: {project_dir}")

# The drive.mount(...) call in this cell may be commented out. Without a
# mount, the path above is just a local folder on the Colab VM, so only
# claim persistence when /content/drive is really a mount point.
if os.path.ismount('/content/drive'):
    print("✅ All work will be saved to Google Drive!")
    print("✅ Safe from disconnects!")
else:
    print("⚠️ Google Drive does not appear to be mounted at /content/drive; "
          "files will be written to local runtime storage and lost on disconnect.")

In [None]:
# Installing required libraries
!pip install -q transformers accelerate sentence-transformers faiss-cpu pypdf langchain huggingface_hub

# Downloading papers directly in Colab
!pip install arxiv


# Importing basic libraries
import os
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import arxiv

print("✅ Setup complete!")
print(f"CUDA available: {torch.cuda.is_available()}")

project_dir = '/content/drive/MyDrive/RAG_Project/'


In [None]:

import arxiv
import os

# Creating the directory if it doesn't exist
papers_dir = "/content/drive/MyDrive/RAG_Project/papers"
os.makedirs(papers_dir, exist_ok=True)

# Searching for NLP papers
search = arxiv.Search(
    query="cat:cs.CL",  # Computer Science - Computation and Language
    max_results=50,
    sort_by=arxiv.SortCriterion.SubmittedDate
)

# Search.results() is deprecated in the arxiv package; fetching through a
# Client is the supported API and behaves the same for this use.
client = arxiv.Client()

papers = []
for result in client.results(search):
    pdf_name = f"{result.get_short_id()}.pdf"
    pdf_path = os.path.join(papers_dir, pdf_name)

    # Skip files that are already on disk so re-running the cell is cheap.
    if not os.path.exists(pdf_path):
        try:
            # Downloading PDF
            result.download_pdf(dirpath=papers_dir, filename=pdf_name)
        except Exception as e:
            # One failed download should not abort the remaining papers.
            print(f"Failed to download {pdf_name}: {e}")
            continue

    # Record metadata only for papers whose PDF is actually available,
    # so the count printed below reflects real files.
    papers.append({
        'title': result.title,
        'pdf_url': result.pdf_url,
        'summary': result.summary,
        'authors': [author.name for author in result.authors]
    })

print(f"✅ Downloaded {len(papers)} papers!")

In [None]:
!pip install -q pyPDF2

from PyPDF2 import PdfReader
import pickle
from tqdm import tqdm

def extract_text_from_pdf(pdf_path):
  """Extract the text of every page of a PDF as one string.

  Pages are joined with newlines and the result is stripped. A page whose
  extraction fails (or yields no text) is skipped with a message instead
  of discarding the text of the whole document.

  Args:
    pdf_path: Path of the PDF file to read.

  Returns:
    The extracted text, or "" if the file cannot be opened or parsed.
  """
  try:
    reader = PdfReader(pdf_path)
    page_texts = []
    for page_number, page in enumerate(reader.pages, start=1):
      try:
        # `or ""` guards against an extractor returning None for a page.
        page_texts.append(page.extract_text() or "")
      except Exception as e:
        print(f"Error with {pdf_path} (page {page_number}): {e}")
    return "\n".join(page_texts).strip()
  except Exception as e:
    # Opening the file or reading its page list failed.
    print(f"Error with {pdf_path}: {e}")
    return ""

print("\n Processing 50 Downloaded Papers!")
all_papers=[]

# Main papers are every PDF in the papers folder not prefixed "distractor_".
# Sorted so the processing order (and the saved list) is reproducible.
papers_dir = os.path.join(project_dir, "papers")
paper_files = sorted(
    f for f in os.listdir(papers_dir)
    if f.endswith(".pdf") and not f.startswith("distractor_")
)

for pdf_file in tqdm(paper_files, desc="Processing PDFs"):
  pdf_path = os.path.join(papers_dir, pdf_file)
  text = extract_text_from_pdf(pdf_path)
  word_count = len(text.split())

  # Keep only papers that yielded a meaningful amount of text.
  if word_count > 100:
    all_papers.append({
        'filename' : pdf_file,  # key previously had a trailing space ('filename ')
        'text' : text,
        'word_count' : word_count,
        'is_distractor' : False
    })

# Report and save once, after the loop, rather than after every paper.
if all_papers:
  print(f"Succesfuly processed {len(all_papers)} papers!")
  print(f"Avergae words per paper: {sum(p['word_count'] for p in all_papers)//len(all_papers)}")

  # Saving the files in the drive
  main_papers_path = os.path.join(project_dir, 'data', 'main_papers.pkl')
  with open(main_papers_path, 'wb') as f:
    pickle.dump(all_papers, f)
  print(f"Saved in Google Drive: {main_papers_path}")
else:
  # Avoids a ZeroDivisionError in the average when nothing was extracted.
  print("No papers with more than 100 words were extracted; nothing was saved.")

In [None]:
print("Downloading 100 distractor papers from broader AI topics...")

# Distractor Papers

distractor_queries=[
    "cat:cs.AI", # Artificial Intelligence
    "cat:cs.LG", # Machine Learning
    "cat:cs.CV", # Computer Vision
]

distractor_count = 0

target_distractor = 100
downloads_ids = set() # ids already counted, so cross-listed papers are not counted twice

# Search.results() is deprecated in the arxiv package; use a Client instead.
client = arxiv.Client()

for query in distractor_queries:
  if distractor_count >= target_distractor:
    break

  print("Searching...")

  search = arxiv.Search(
      query = query ,
      max_results = 40 ,
      sort_by = arxiv.SortCriterion.SubmittedDate, # newest submissions first
  )

  for result in client.results(search):
    if distractor_count >= target_distractor:
      break

    paper_id = result.get_short_id() # getting the paper id

    # Membership test: the previous `paper_id == downloads_ids` compared a
    # string with a set and was therefore never true.
    if paper_id in downloads_ids:
      continue

    # A paper cross-listed in cs.CL may already be in the main set; using it
    # as a distractor as well would duplicate a main paper.
    if os.path.exists(f"/content/drive/MyDrive/RAG_Project/papers/{paper_id}.pdf"):
      continue

    try:
      filename = f"/content/drive/MyDrive/RAG_Project/papers/distractor_{paper_id}.pdf"

      # Already on disk from an earlier run: count it without re-downloading.
      if os.path.exists(filename):
        distractor_count += 1
        downloads_ids.add(paper_id)
        continue

      result.download_pdf(filename = filename)
      downloads_ids.add(paper_id)
      distractor_count +=1

      if distractor_count % 10 == 0 :
        print(f"Downloaded {distractor_count}/{target_distractor}")

    except Exception as e:
      print(f"Failed to download {paper_id}: {e}")
      continue

# Print the total once, after all queries have been processed.
print(f"Total paper downloaded {distractor_count}")

In [None]:
# 1. SETUP: Load the sentence-embedding model used for chunks and queries
embedding_model_name = 'all-MiniLM-L6-v2'
print("Loading embedding model...")
embedding_model = SentenceTransformer(embedding_model_name)
print("✅ Embedding model loaded!")

# 2. PREPARE: Chunk your documents
def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart. Chunking
    stops as soon as a chunk reaches the end of the text, so no trailing
    chunk is emitted that is entirely contained in the previous one.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk (must be > 0).
        overlap: Number of words shared by consecutive chunks
            (must satisfy 0 <= overlap < chunk_size).

    Returns:
        A list of chunk strings. Text with no words yields a single-element
        list holding the stripped (empty) text.

    Raises:
        ValueError: If chunk_size or overlap are out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A non-positive stride would either break range() or silently
        # produce no chunks at all.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    stride = chunk_size - overlap
    chunks = []

    for start in range(0, len(words), stride):
        end = start + chunk_size
        chunks.append(' '.join(words[start:end]))
        if end >= len(words):
            # This chunk already covers the tail; a further chunk would
            # only repeat words from the overlap region.
            break

    # If no chunks were created (text has no words), use the whole text
    if len(chunks) == 0:
        chunks = [text.strip()]

    return chunks

# Demo corpus: one longer passage standing in for a paper
sample_text = """
Attention mechanisms have become integral to sequence modeling tasks in natural language processing.
The Transformer architecture, introduced in the paper Attention is All You Need, relies entirely on self-attention mechanisms
to compute representations of input and output sequences without using recurrent or convolutional layers.
BERT uses bidirectional transformers for language understanding and has achieved state-of-the-art results on many NLP benchmarks.
The key innovation of transformers is the multi-head attention mechanism which allows the model to jointly attend to information
from different representation subspaces at different positions. This enables the model to capture long-range dependencies more
effectively than traditional RNNs or LSTMs. GPT models use a decoder-only transformer architecture and are trained using
a language modeling objective. These models have shown impressive performance on various downstream tasks through fine-tuning
or few-shot learning. Recent work has focused on making transformers more efficient through techniques like sparse attention,
linear attention mechanisms, and improved positional encodings. The scalability of transformers has enabled training of very
large language models with billions of parameters that demonstrate emergent capabilities on complex reasoning tasks.
"""

# Small chunks (50 words, 10 shared) so the short demo text yields several
chunks = chunk_text(sample_text, chunk_size=50, overlap=10)
print(f"✅ Created {len(chunks)} chunks")
# Preview the first chunk, and the second one when it exists
for sample_no, sample_chunk in enumerate(chunks[:2], start=1):
    print(f"Sample chunk {sample_no}: {sample_chunk[:100]}...")

# 3. INDEX: embed every chunk and store the vectors in FAISS
print("\nCreating embeddings...")
chunk_embeddings = embedding_model.encode(chunks)
print(f"✅ Embeddings shape: {chunk_embeddings.shape}")

# Flat L2 index sized to the embedding width
dimension = chunk_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(chunk_embeddings)
print(f"✅ FAISS index created with {index.ntotal} vectors")

# 4. RETRIEVE: Search function
def retrieve_relevant_chunks(query, top_k=3):
    """Retrieve the chunks closest to a query in the FAISS index.

    Uses the notebook-level `embedding_model`, `index` and `chunks`.

    Args:
        query: The question or search string to embed.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of dicts with keys 'chunk' (text), 'distance' (float, as
        reported by the index) and 'chunk_id' (int position in `chunks`),
        in the order returned by the index search. Empty if nothing can
        be retrieved.
    """
    # Never ask for more neighbours than the index holds or `chunks` can
    # resolve; the two can diverge if `chunks` is rebuilt without re-indexing.
    top_k = min(top_k, index.ntotal, len(chunks))
    if top_k <= 0:
        return []

    query_embedding = embedding_model.encode([query])
    distances, indices = index.search(query_embedding, top_k)

    results = []
    for idx, dist in zip(indices[0], distances[0]):
        idx = int(idx)
        # FAISS reports missing neighbours as -1; a negative value would
        # silently wrap around to chunks[-1], so skip anything out of range.
        if not 0 <= idx < len(chunks):
            continue
        results.append({
            'chunk': chunks[idx],
            'distance': float(dist),
            'chunk_id': idx
        })

    return results

# Smoke-test retrieval with a single query
test_query = "What are attention mechanisms?"
results = retrieve_relevant_chunks(test_query, top_k=3)

separator = '=' * 60
print(f"\n{separator}")
print(f"✅ RETRIEVAL TEST")
print(separator)
print(f"Query: {test_query}\n")
# One numbered entry per hit: distance first, then a 150-character preview
for rank, hit in enumerate(results, start=1):
    print(f"{rank}. Distance: {hit['distance']:.4f}")
    print(f"   Chunk: {hit['chunk'][:150]}...")
    print()

In [None]:
# 5. GENERATE: Flan-T5 is openly downloadable, so no authentication is needed
from transformers import pipeline

generator_model_name = "google/flan-t5-base"

print("Loading Flan-T5 model...")
generator = pipeline(
    task="text2text-generation",
    model=generator_model_name,
    device_map="auto",
)
print("✅ Flan-T5 loaded!")

def generate_answer(query, retrieved_chunks):
    """Answer a question with the notebook-level `generator` pipeline.

    Args:
        query: The question to answer.
        retrieved_chunks: Iterable of dicts that each carry the passage
            text under the 'chunk' key.

    Returns:
        The 'generated_text' of the first pipeline output.
    """
    # Passages are separated by a blank line inside the prompt
    passages = [hit['chunk'] for hit in retrieved_chunks]
    context = "\n\n".join(passages)

    # Plain instruction-style prompt for Flan-T5
    prompt = f"""Answer the question based on the context below.

Context:
{context}

Question: {query}

Answer:"""

    # Sampled decoding, so repeated calls may give different answers
    generation_options = {
        'max_length': 256,
        'temperature': 0.7,
        'do_sample': True,
    }
    outputs = generator(prompt, **generation_options)

    first_output = outputs[0]
    return first_output['generated_text']

# End-to-end check: retrieve context for one question, then generate
test_query = "What are attention mechanisms?"
print(f"\n🔍 Query: {test_query}")

# Retrieval step
retrieved = retrieve_relevant_chunks(query=test_query, top_k=3)
print(f"\n📚 Retrieved {len(retrieved)} chunks")

# Generation step
answer = generate_answer(query=test_query, retrieved_chunks=retrieved)
print(f"\n💡 Answer: {answer}")

In [None]:
# COMPLETE DEMO FUNCTION
def rag_qa_system(question):
    """Run retrieval and generation for one question, printing each stage.

    Args:
        question: The question to answer.

    Returns:
        The answer produced by `generate_answer`.
    """
    rule = "=" * 60

    print(rule)
    print(f"QUESTION: {question}")
    print(rule)

    # Step 1: fetch the three closest chunks and preview them
    print("\n🔍 RETRIEVING relevant documents...")
    retrieved_chunks = retrieve_relevant_chunks(question, top_k=3)

    for rank, hit in enumerate(retrieved_chunks, start=1):
        # Map the distance into (0, 1]; smaller distance -> higher score
        similarity = 1 / (1 + hit['distance'])
        preview = hit['chunk'][:100]
        print(f"\n  [{rank}] Similarity: {similarity:.3f}")
        print(f"      {preview}...")

    # Step 2: generate an answer from the retrieved context
    print("\n\n💭 GENERATING answer...")
    answer = generate_answer(question, retrieved_chunks)

    print("\n" + rule)
    print("ANSWER:")
    print(rule)
    print(answer)
    print(rule)

    return answer

# Questions used to exercise the full pipeline
demo_questions = [
    "What are attention mechanisms in transformers?",
    "How does BERT work?",
    "What is the difference between GPT and BERT?",
]

# Answer each question in turn, leaving blank lines between the reports
for demo_question in demo_questions:
    rag_qa_system(demo_question)
    print("\n\n")