In [14]:
!pip install PyPDF2 python-docx sentence-transformers
import os
import numpy as np
from typing import List, Dict
import json

# Install these first:
# pip install sentence-transformers PyPDF2 python-docx transformers torch
# Optional, only for the matching response_method: openai ('openai'), requests ('ollama')

from sentence_transformers import SentenceTransformer
import PyPDF2
from docx import Document

# ============================================================================
# FILE LOADING
# ============================================================================

class DocumentLoader:
    """Read plain text out of .txt, .pdf and .docx files."""

    @staticmethod
    def load_txt(file_path: str) -> str:
        """Return the full contents of *file_path*, decoded as UTF-8."""
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()

    @staticmethod
    def load_pdf(file_path: str) -> str:
        """Return the extracted text of every page, each followed by a newline."""
        with open(file_path, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            # NOTE(review): extract_text() may yield None/empty for pages
            # without a text layer in some PyPDF2 versions; `or ""` keeps the
            # concatenation from raising TypeError in that case.
            # The join runs inside the `with` so pages are read while the
            # file is still open.
            return "".join(
                (page.extract_text() or "") + "\n" for page in pdf_reader.pages
            )

    @staticmethod
    def load_docx(file_path: str) -> str:
        """Return the text of every paragraph, each followed by a newline."""
        doc = Document(file_path)
        return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)

    @staticmethod
    def load_document(file_path: str) -> str:
        """Load *file_path* with the loader matching its extension.

        The extension check is case-insensitive, so ``NOTES.PDF`` is handled
        like ``notes.pdf``.

        Raises:
            ValueError: if the extension is not .txt, .pdf or .docx.
        """
        lowered = file_path.lower()
        if lowered.endswith('.txt'):
            return DocumentLoader.load_txt(file_path)
        if lowered.endswith('.pdf'):
            return DocumentLoader.load_pdf(file_path)
        if lowered.endswith('.docx'):
            return DocumentLoader.load_docx(file_path)
        raise ValueError(f"Unsupported file type: {file_path}")

# ============================================================================
# TEXT CHUNKING
# ============================================================================

class TextChunker:
    """Split text into overlapping, word-based chunks."""

    @staticmethod
    def chunk_by_sentences(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
        """Split *text* into chunks of up to *chunk_size* words.

        Despite the name, the split is on whitespace-delimited words, not on
        sentence boundaries. Consecutive chunks share *overlap* words, and
        whitespace runs inside a chunk are collapsed to single spaces.

        Args:
            text: The text to split.
            chunk_size: Maximum number of words per chunk; must be positive.
            overlap: Words shared between neighbouring chunks; must satisfy
                ``0 <= overlap < chunk_size``.

        Returns:
            The chunks in document order; an empty list when *text* has no
            words.

        Raises:
            ValueError: if *chunk_size* or *overlap* is out of range.
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if not 0 <= overlap < chunk_size:
            # overlap >= chunk_size would make the window step zero or
            # negative; a negative overlap would skip words between chunks.
            raise ValueError(
                f"overlap must satisfy 0 <= overlap < chunk_size, "
                f"got overlap={overlap}, chunk_size={chunk_size}"
            )

        words = text.split()
        step = chunk_size - overlap
        chunks = []
        for start in range(0, len(words), step):
            chunks.append(' '.join(words[start:start + chunk_size]))
            # Once a window reaches the last word, any further window would
            # only repeat words already covered by this chunk.
            if start + chunk_size >= len(words):
                break
        return chunks

# ============================================================================
# EMBEDDING GENERATION
# ============================================================================

class EmbeddingGenerator:
    """Thin wrapper around a SentenceTransformer model for encoding text."""

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """Load the model named *model_name*, reporting progress on stdout."""
        print(f"Loading embedding model: {model_name}")
        self.model = SentenceTransformer(model_name)
        print("Model loaded!")

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Encode *texts* with the wrapped model and return its output.

        A progress bar is requested from the model, and the number of texts
        is printed before encoding starts.
        """
        chunk_count = len(texts)
        print(f"Generating embeddings for {chunk_count} chunks...")
        return self.model.encode(texts, show_progress_bar=True)

# ============================================================================
# VECTOR DATABASE
# ============================================================================

class SimpleVectorDB:
    """In-memory vector store with brute-force cosine-similarity search.

    Texts, embeddings and metadata live in three parallel lists: index ``i``
    in each list refers to the same document.
    """

    def __init__(self):
        self.embeddings = []  # one vector per stored text
        self.texts = []
        self.metadata = []    # one dict per stored text

    def add_documents(self, texts: List[str], embeddings: np.ndarray, metadata: List[Dict] = None):
        """Append documents with their embeddings and optional metadata.

        Args:
            texts: The document texts.
            embeddings: One embedding per text, in the same order.
            metadata: One dict per text. When omitted or empty, every text
                gets its own empty dict.

        Raises:
            ValueError: if *embeddings* or a non-empty *metadata* does not
                have exactly one entry per text. Nothing is stored then.
        """
        # Validate before mutating so a bad call cannot leave the parallel
        # lists out of step with each other.
        if len(embeddings) != len(texts):
            raise ValueError(
                f"Got {len(embeddings)} embeddings for {len(texts)} texts"
            )
        if metadata and len(metadata) != len(texts):
            raise ValueError(
                f"Got {len(metadata)} metadata entries for {len(texts)} texts"
            )

        self.texts.extend(texts)
        self.embeddings.extend(embeddings)
        if metadata:
            self.metadata.extend(metadata)
        else:
            # A fresh dict per entry: `[{}] * n` would store n references to
            # one shared dict, so editing one entry would edit them all.
            self.metadata.extend({} for _ in texts)
        print(f"Added {len(texts)} documents. Total: {len(self.texts)}")

    def search(self, query_embedding: np.ndarray, top_k: int = 3) -> List[Dict]:
        """Return the *top_k* stored documents most similar to the query.

        Each result is a dict with keys ``'text'``, ``'score'`` (cosine
        similarity as a float) and ``'metadata'``, ordered from most to least
        similar. Returns an empty list when the store is empty or *top_k* is
        not positive.
        """
        # len() rather than truthiness, so the check also works if
        # `embeddings` was replaced by an array.
        if len(self.embeddings) == 0 or top_k <= 0:
            return []

        scored = [
            (idx, self._cosine_similarity(query_embedding, doc_embedding))
            for idx, doc_embedding in enumerate(self.embeddings)
        ]
        scored.sort(key=lambda pair: pair[1], reverse=True)

        return [
            {
                'text': self.texts[idx],
                'score': float(score),
                'metadata': self.metadata[idx]
            }
            for idx, score in scored[:top_k]
        ]

    @staticmethod
    def _cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
        """Return the cosine similarity of two vectors.

        Returns 0.0 when either vector has zero norm, instead of dividing by
        zero and producing NaN, which would corrupt the ranking.
        """
        denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
        if denominator == 0:
            return 0.0
        return float(np.dot(vec1, vec2) / denominator)

    def save(self, filepath: str):
        """Write texts, embeddings and metadata to *filepath* as JSON."""
        data = {
            # asarray() lets plain-list embeddings be saved as well.
            'embeddings': [np.asarray(emb).tolist() for emb in self.embeddings],
            'texts': self.texts,
            'metadata': self.metadata
        }
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f)
        print(f"Database saved to {filepath}")

    def load(self, filepath: str):
        """Replace the current contents with the database stored at *filepath*.

        Expects the JSON layout written by ``save``: keys ``'embeddings'``,
        ``'texts'`` and ``'metadata'``.
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        self.embeddings = [np.array(emb) for emb in data['embeddings']]
        self.texts = data['texts']
        self.metadata = data['metadata']
        print(f"Database loaded from {filepath}")

# ============================================================================
# RESPONSE GENERATION
# ============================================================================

class ResponseGenerator:
    """Turn retrieved chunks into a response using one of several backends.

    Supported methods:
        'simple': list the retrieved chunks with their scores (no LLM).
        'openai': ask an OpenAI chat model to answer from the chunks.
        'ollama': ask an Ollama server on localhost:11434 to answer.
    """

    SUPPORTED_METHODS = ('simple', 'openai', 'ollama')

    def __init__(self, method: str = 'simple', request_timeout: float = 300.0):
        """Select the backend.

        Args:
            method: One of ``SUPPORTED_METHODS``.
            request_timeout: Seconds to wait for the Ollama HTTP call before
                giving up.

        Raises:
            ValueError: if *method* is not supported.
        """
        if method not in self.SUPPORTED_METHODS:
            raise ValueError(
                f"Unsupported response method: {method!r}. "
                f"Expected one of {self.SUPPORTED_METHODS}"
            )
        self.method = method
        self.request_timeout = request_timeout
        if method == 'openai':
            # Imported lazily so the package is only needed for this backend.
            import openai
            self.client = openai.OpenAI()

    def generate(self, query: str, retrieved_chunks: List[Dict]) -> str:
        """Build a response to *query* from *retrieved_chunks*.

        Each chunk is a dict with at least a ``'text'`` key; the 'simple'
        method also reads ``'score'``.

        Raises:
            ValueError: if ``self.method`` is not a supported method.
        """
        context = "\n\n".join(chunk['text'] for chunk in retrieved_chunks)

        if self.method == 'simple':
            return self._simple_response(context, retrieved_chunks)
        if self.method == 'openai':
            return self._openai_response(query, context)
        if self.method == 'ollama':
            return self._ollama_response(query, context)
        # Reached only if `method` was reassigned after construction; fail
        # loudly instead of returning None from a function that promises str.
        raise ValueError(f"Unsupported response method: {self.method!r}")

    @staticmethod
    def _build_prompt(query: str, context: str) -> str:
        """Return the question-answering prompt shared by the LLM backends."""
        return f"""Based on the following context, answer the question.

Context:
{context}

Question: {query}

Answer:"""

    def _simple_response(self, context: str, chunks: List[Dict]) -> str:
        """List each chunk's score and the first 300 characters of its text.

        *context* is unused; it is kept so the signature stays unchanged.
        """
        parts = ["Retrieved Information:\n\n"]
        for i, chunk in enumerate(chunks, 1):
            text = chunk['text']
            # Add the ellipsis only when the text really was cut short.
            preview = (text[:300] + "...") if len(text) > 300 else text
            parts.append(f"[Result {i}] (Similarity: {chunk['score']:.3f})\n")
            parts.append(f"{preview}\n\n")
        return "".join(parts)

    def _openai_response(self, query: str, context: str) -> str:
        """Ask the OpenAI chat model to answer *query* from *context*."""
        response = self.client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": self._build_prompt(query, context)}
            ],
            temperature=0.7,
            max_tokens=500
        )
        return response.choices[0].message.content

    def _ollama_response(self, query: str, context: str) -> str:
        """Ask the local Ollama server to answer *query* from *context*.

        Raises:
            requests.RequestException: on connection failure, timeout, or a
                non-success HTTP status.
        """
        import requests

        response = requests.post(
            'http://localhost:11434/api/generate',
            json={
                'model': 'llama2',
                'prompt': self._build_prompt(query, context),
                'stream': False
            },
            # Without a timeout an unresponsive server blocks forever.
            timeout=self.request_timeout
        )
        # Surface HTTP errors directly rather than as a KeyError below.
        response.raise_for_status()
        return response.json()['response']

# ============================================================================
# COMPLETE RAG SYSTEM
# ============================================================================

class RAGSystem:
    """Retrieval pipeline: load files, chunk, embed, store, search, respond."""

    def __init__(self, embedding_model: str = 'all-MiniLM-L6-v2', response_method: str = 'simple'):
        """Create the pipeline components.

        Args:
            embedding_model: Model name passed to ``EmbeddingGenerator``.
            response_method: Method name passed to ``ResponseGenerator``.
        """
        self.loader = DocumentLoader()
        self.chunker = TextChunker()
        self.embedder = EmbeddingGenerator(embedding_model)
        self.vector_db = SimpleVectorDB()
        self.generator = ResponseGenerator(response_method)

    def add_documents(self, file_paths: List[str], chunk_size: int = 500, overlap: int = 50):
        """Load, chunk and embed the given files, then store the chunks.

        Each stored chunk gets metadata ``{'source': <file path>,
        'chunk_id': <index of the chunk within its file>}``.

        Args:
            file_paths: Paths of the documents to ingest.
            chunk_size: Chunk size passed to the chunker.
            overlap: Chunk overlap passed to the chunker.
        """
        all_chunks = []
        all_metadata = []

        for file_path in file_paths:
            print(f"\nProcessing: {file_path}")
            text = self.loader.load_document(file_path)
            print(f"Loaded {len(text)} characters")

            chunks = self.chunker.chunk_by_sentences(
                text, chunk_size=chunk_size, overlap=overlap
            )
            print(f"Created {len(chunks)} chunks")

            all_chunks.extend(chunks)
            all_metadata.extend(
                {'source': file_path, 'chunk_id': i} for i in range(len(chunks))
            )

        if not all_chunks:
            # Nothing to embed (no files, or files without text); skip the
            # model call rather than encoding an empty batch.
            print("No text chunks were produced; nothing added.")
            return

        embeddings = self.embedder.generate_embeddings(all_chunks)
        self.vector_db.add_documents(all_chunks, embeddings, all_metadata)

    def query(self, question: str, top_k: int = 3) -> str:
        """Return a response to *question* built from the *top_k* best chunks.

        Returns the fixed message "No relevant information found." when the
        search yields no results.
        """
        print(f"\nQuery: {question}")
        # The embedder works on batches; take the single row for this query.
        query_embedding = self.embedder.generate_embeddings([question])[0]
        results = self.vector_db.search(query_embedding, top_k=top_k)

        if not results:
            return "No relevant information found."

        return self.generator.generate(question, results)

    def save_database(self, filepath: str = 'rag_database.json'):
        """Save the vector database to *filepath*."""
        self.vector_db.save(filepath)

    def load_database(self, filepath: str = 'rag_database.json'):
        """Load the vector database from *filepath*."""
        self.vector_db.load(filepath)

# ============================================================================
# USAGE
# ============================================================================

if __name__ == "__main__":
    # Example run: index one document, persist the index, ask one question.

    # Build the pipeline. The 'simple' response method lists the retrieved
    # chunks with their similarity scores and does not call an LLM.
    rag = RAGSystem(response_method='simple')

    # Documents to index; replace with your own .txt/.pdf/.docx paths.
    # NOTE(review): the /content/ prefix looks like a hosted-notebook path;
    # confirm it exists in your environment.
    file_paths = [
       "/content/m1a make sense of data mini lecture series notes.docx"
    ]
    # chunk_size is counted in whitespace-separated words, not characters.
    rag.add_documents(file_paths, chunk_size=500)

    # Persist texts, embeddings and metadata as JSON so they can be reloaded
    # later with rag.load_database().
    rag.save_database('my_rag_db.json')

    # Retrieve the 3 most similar chunks for the question and print the result.
    response = rag.query("what is set 1", top_k=3)
    print(response)

Loading embedding model: all-MiniLM-L6-v2
Model loaded!

Processing: /content/m1a make sense of data mini lecture series notes.docx
Loaded 30695 characters
Created 13 chunks
Generating embeddings for 13 chunks...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Added 13 documents. Total: 13
Database saved to my_rag_db.json

Query: what is set 1
Generating embeddings for 1 chunks...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved Information:

[Result 1] (Similarity: 0.162)
knowledge from a dataset with such data manipulation and visualization. I choose a dataset in real estate because it is relative small in size and its data elements have clear meanings to most of us. However, these concepts and tools apply to any kind of dataset for insights and knowledge extraction...

[Result 2] (Similarity: 0.089)
among those houses? Let us start with question 1a). Since it involves column Style and Value. Let us first take a closer look at them. I tend to freeze header so that when I scroll down I can always see the header. You can do so by going to View/Freeze Pane and click on Freeze Top Row. You can scrol...

[Result 3] (Similarity: 0.063)
built. If you go over the other styles, you will find out they are more recent style. So, I am going to focus on comparing cape and colonial. It will be great if we can show these two styles in the form of bar chart at the same time. My first attempt to do this to go filte