<a href="https://colab.research.google.com/github/Vasuman-RNQ/Assignment_rag_done/blob/main/RAG_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [75]:
#downloader.py
import os
import requests

def download_10k(cik, year, save_dir):
    """Fetch the EDGAR 10-K listing for a company and save the response.

    Queries the SEC ``browse-edgar`` endpoint for the Atom listing of 10-K
    filings of ``cik``. If the request succeeds and the response text
    contains ``year``, the raw response body is written to
    ``<save_dir>/<cik>_<year>.html``; otherwise a failure message is printed.

    Note that the saved file is the response of the listing query itself
    (requested with ``output=atom``), not a filing document.

    Args:
        cik: Company CIK identifier, interpolated into the query and filename.
        year: Year to look for in the response text (compared as ``str(year)``).
        save_dir: Directory to write into; created if it does not exist.

    Returns:
        None. Progress is reported via ``print``.
    """
    os.makedirs(save_dir, exist_ok=True)
    url = 'https://www.sec.gov/cgi-bin/browse-edgar'
    # Passing the query as ``params`` lets requests URL-encode the values
    # instead of splicing ``cik`` into the URL verbatim.
    params = {
        'action': 'getcompany',
        'CIK': cik,
        'type': '10-K',
        'dateb': '',
        'owner': 'exclude',
        'count': 10,
        'output': 'atom',
    }
    # NOTE(review): SEC fair-access guidance reportedly expects a User-Agent
    # that identifies the requester; confirm this generic value is accepted.
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        # A timeout keeps a stalled connection from blocking forever.
        r = requests.get(url, params=params, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f'Could not download {cik} {year}')
        print(f'Request failed: {exc}')
        return

    if r.ok and str(year) in r.text:
        out_path = os.path.join(save_dir, f'{cik}_{year}.html')
        with open(out_path, 'w', encoding='utf-8') as f:
            f.write(r.text)
        print(f'Downloaded {out_path}')
    else:
        print(f'Could not download {cik} {year}')


In [76]:
#parser.py
import pdfplumber
from bs4 import BeautifulSoup

def parse_filing(filepath):
    """Extract plain text from a PDF or HTML filing.

    The file type is chosen from the extension of ``filepath``, compared
    case-insensitively so that ``report.PDF`` or ``index.HTM`` are handled
    like their lowercase counterparts.

    Args:
        filepath: Path to a ``.pdf``, ``.html`` or ``.htm`` file.

    Returns:
        For PDFs, the text of every page with a newline appended after each
        page (pages with no extractable text contribute just the newline).
        For HTML, the document text with nodes separated by newlines.
        For any other extension, an empty string.
    """
    lowered = str(filepath).lower()

    if lowered.endswith('.pdf'):
        with pdfplumber.open(filepath) as pdf:
            # extract_text() may return None for a page; treat that as "".
            return "".join(
                (page.extract_text() or "") + "\n" for page in pdf.pages
            )

    if lowered.endswith(('.html', '.htm')):
        with open(filepath, encoding='utf-8') as f:
            soup = BeautifulSoup(f, 'html.parser')
        return soup.get_text(separator="\n")

    # Unsupported extension: keep the original "empty text" contract.
    return ""


In [77]:
#chunking_and_embeddings.py
import re
from sentence_transformers import SentenceTransformer

def chunk_text(text, size=500):
    """Group paragraphs of ``text`` into chunks of roughly ``size`` characters.

    Paragraphs are the pieces between runs of two or more newlines. They are
    appended to a pending buffer (each followed by a single newline) until
    adding the next paragraph would bring the buffer to ``size`` characters
    or more; the buffer is then emitted and restarted with that paragraph.
    A single paragraph longer than ``size`` is emitted as one oversized chunk.

    Returns:
        List of stripped, non-empty chunk strings.
    """
    pieces = []
    pending = ""
    for paragraph in re.split(r'\n{2,}', text):
        if len(pending) + len(paragraph) >= size:
            # Buffer is full: flush it before starting over with this paragraph.
            pieces.append(pending.strip())
            pending = ""
        pending += paragraph + "\n"
    if pending:
        pieces.append(pending.strip())
    # Drop chunks that stripped down to nothing.
    return list(filter(None, pieces))

def embed_chunks(chunks, model):
    """Return ``model.encode(chunks)``: the model's embeddings for the chunks."""
    vectors = model.encode(chunks)
    return vectors


In [78]:
import faiss

import numpy as np


def build_vector_index(embeddings, chunks):
    """Build a flat L2 Faiss index over ``embeddings``.

    Args:
        embeddings: Array-like of shape ``(num_chunks, embedding_dim)``.
            It is converted to a C-contiguous float32 array before being
            added, since that is the layout Faiss works with.
        chunks: The texts the embeddings were computed from. Not used here;
            kept so the signature matches the rest of the pipeline.

    Returns:
        A ``faiss.IndexFlatL2`` containing one vector per row of ``embeddings``.

    Raises:
        ValueError: If ``embeddings`` is not two-dimensional (for example an
            empty result with shape ``(0,)``).
    """
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be 2-D (num_chunks, embedding_dim); got shape {vectors.shape}"
        )
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index

def query_index(query, model, index, chunks, top_k=1):
    """Return the chunks nearest to ``query`` according to ``index``.

    Args:
        query: Query string; encoded with ``model.encode([query])``.
        model: Object exposing ``encode(list_of_str)``.
        index: Object exposing ``search(vectors, k)`` that returns
            ``(distances, ids)`` with one row per query vector.
        chunks: Sequence of chunk texts, positionally aligned with the
            vectors stored in ``index``.
        top_k: Number of neighbours to request.

    Returns:
        A list of ``{'text': chunk}`` dicts in the order the index returned
        them. Empty when the search yields nothing usable.
    """
    # Encode query to vector
    q_emb = model.encode([query])

    # Search index
    _dists, ids = index.search(q_emb, top_k)
    if ids.size == 0:
        print("No results returned from index search.")
        return []

    # Faiss fills unused result slots with -1 when fewer than top_k vectors
    # are available. A bare "i < len(chunks)" test would let -1 through and
    # silently return the *last* chunk, so the lower bound matters.
    valid_indices = [int(i) for i in ids[0] if 0 <= i < len(chunks)]
    if not valid_indices:
        print("No valid indices in search results.")
        return []

    # Return a list of dictionaries with the chunk text in the 'text' key
    return [{'text': chunks[i]} for i in valid_indices]

In [79]:
from google.colab import files
# Prompt for a file upload in Colab and keep the result in the notebook-level
# name `uploaded`. Later cells treat it as a mapping: they take its first key
# as the file name and wrap uploaded[file_name] in BytesIO.
uploaded = files.upload()

Saving Company Financial Information 2023.docx to Company Financial Information 2023 (5).docx


In [80]:
from sentence_transformers import SentenceTransformer
from docx import Document
from io import BytesIO

# Pick the first uploaded file and read it as a Word document from memory
file_name = list(uploaded)[0]
file_content = uploaded[file_name]

file_stream = BytesIO(file_content)
doc = Document(file_stream)

# Flatten the document: stripped, non-empty paragraphs joined by single spaces
full_text = " ".join(
    stripped
    for stripped in (para.text.strip() for para in doc.paragraphs)
    if stripped
)

# Simple chunk function already done, example:
def chunk_text(text, chunk_size=100):
    """Split ``text`` on whitespace and regroup it into fixed-size word chunks.

    Each chunk holds up to ``chunk_size`` consecutive words joined by single
    spaces; the final chunk may be shorter. Empty or whitespace-only text
    yields an empty list.
    """
    tokens = text.split()
    starts = range(0, len(tokens), chunk_size)
    return [" ".join(tokens[start:start + chunk_size]) for start in starts]

# Split the document text into 20-word chunks (the function's default is 100)
chunks = chunk_text(full_text, chunk_size=20)  # smaller chunks give finer-grained retrieval units

# Load the pretrained 'all-MiniLM-L6-v2' sentence embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode the chunk list; the result is kept in `embeddings`
embeddings = model.encode(chunks)

# Show the chunk texts and the shape of the embedding array for a quick sanity check
print("Chunks:", chunks)
print("Embeddings shape:", embeddings.shape)


Chunks: ['Company Financial Information 2023 Operating margin was 42.1% in 2023 for Microsoft. Google achieved an operating margin of 29.8% in', '2023. NVIDIA’s operating margin was 29.6% for the year 2023. Revenue and segment data are included in this report.']
Embeddings shape: (2, 384)


In [81]:
def decompose_query(query):
    """Break a query into simpler sub-queries using a naive delimiter split.

    The query is cut at the standalone lowercase words ``and`` / ``or`` and
    at every comma or semicolon. Each fragment is whitespace-trimmed and
    empty fragments are discarded. This is a demonstration heuristic, not a
    linguistic parse.

    Returns:
        List of non-empty sub-query strings, in their original order.
    """
    import re

    fragments = re.split(r'\band\b|\bor\b|,|;', query)
    sub_queries = []
    for fragment in fragments:
        trimmed = fragment.strip()
        if trimmed:
            sub_queries.append(trimmed)
    return sub_queries


def synthesize_results(results):
    """Merge several retrieval results into a single answer object.

    This is a mock synthesis step: it concatenates rather than summarizes.

    Args:
        results: Iterable-of-dicts (iterated more than once, so pass a list).
            Each dict must have a ``'text'`` key and may have ``'reasoning'``
            (str) and ``'sources'`` (iterable of hashable items).

    Returns:
        An object with three attributes:
        ``text``: all ``'text'`` values joined by single spaces;
        ``reasoning``: the non-empty ``'reasoning'`` values joined by single
        spaces (missing or blank entries are skipped so no double spaces
        appear);
        ``sources``: every distinct source, in first-seen order.

    Raises:
        KeyError: If a result has no ``'text'`` key.
    """
    class Answer:
        """Plain container for the synthesized answer fields."""

        def __init__(self, text, reasoning, sources):
            self.text = text
            self.reasoning = reasoning
            self.sources = sources

    combined_text = " ".join(res['text'] for res in results)

    reasonings = ((res.get('reasoning') or '').strip() for res in results)
    combined_reasoning = " ".join(r for r in reasonings if r)

    # dict.fromkeys de-duplicates while keeping insertion order; going through
    # a set would make the order of sources vary from run to run.
    combined_sources = list(dict.fromkeys(
        source for res in results for source in (res.get('sources') or [])
    ))

    return Answer(combined_text, combined_reasoning, combined_sources)


In [82]:
#from parser import parse_filing
#from chunking_and_embeddings import chunk_text, embed_chunks
#from sentence_transformers import SentenceTransformer
#from vector_store import build_vector_index, query_index
#from agent import decompose_query, synthesize_results
import numpy as np
from sentence_transformers import SentenceTransformer
from docx import Document
from io import BytesIO

# SentenceTransformer instances keyed by model name, so that repeated
# answer_query() calls reuse the loaded model instead of reloading it.
_EMBEDDING_MODELS = {}


def _get_embedding_model(name):
    """Return the SentenceTransformer for ``name``, loading it on first use."""
    if name not in _EMBEDDING_MODELS:
        _EMBEDDING_MODELS[name] = SentenceTransformer(name)
    return _EMBEDDING_MODELS[name]


def _chunk_words(text, chunk_size):
    """Split ``text`` on whitespace into chunks of at most ``chunk_size`` words."""
    words = text.split()
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]


def answer_query(query):
    """Answer ``query`` from the first file in the notebook-level ``uploaded``.

    Pipeline: read the uploaded bytes as a Word document, join its non-empty
    paragraphs, cut the text into 20-word chunks, embed them with
    ``all-MiniLM-L6-v2``, index the vectors, split the query into
    sub-queries, fetch the top-1 chunk for each, and merge the hits.

    Only one file is processed; handling several filings would need a loop
    over ``uploaded``.

    Args:
        query: Natural-language question.

    Returns:
        Dict with keys ``"query"``, ``"answer"``, ``"reasoning"``,
        ``"sub_queries"`` and ``"sources"``. When the document contains no
        text, the answer fields are empty rather than the call failing.
    """
    file_name = list(uploaded.keys())[0]
    file_content = uploaded[file_name]

    doc = Document(BytesIO(file_content))
    full_text = " ".join(
        stripped
        for stripped in (para.text.strip() for para in doc.paragraphs)
        if stripped
    )

    # Small chunks give finer-grained retrieval units.
    chunks = _chunk_words(full_text, chunk_size=20)

    decomposed = decompose_query(query)
    all_results = []
    # With no chunks there is nothing to index; skipping avoids building an
    # index from an empty embedding array.
    if chunks:
        model = _get_embedding_model('all-MiniLM-L6-v2')
        embeddings = np.asarray(model.encode(chunks), dtype=np.float32)
        index = build_vector_index(embeddings, chunks)
        for subq in decomposed:
            all_results.extend(query_index(subq, model, index, chunks, top_k=1))

    answer = synthesize_results(all_results)
    return {
        "query": query,
        "answer": answer.text,
        "reasoning": answer.reasoning,
        "sub_queries": decomposed,
        "sources": answer.sources
    }

# Demo entry point: run one sample question through answer_query() and print
# the resulting dict.
if __name__ == "__main__":
    query = "Which company had the highest operating margin in 2023?"
    print(answer_query(query))

{'query': 'Which company had the highest operating margin in 2023?', 'answer': 'Company Financial Information 2023 Operating margin was 42.1% in 2023 for Microsoft. Google achieved an operating margin of 29.8% in', 'reasoning': '', 'sub_queries': ['Which company had the highest operating margin in 2023?'], 'sources': []}
