In [None]:
!pip install sentence-transformers faiss-cpu camelot-py[cv] transformers accelerate datasets rouge-score




In [None]:
!pip install pymupdf faiss-cpu sentence-transformers camelot-py[cv] transformers accelerate datasets rouge-score


Collecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m41.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.3


In [None]:
import fitz, re, faiss, numpy as np, pandas as pd
from sentence_transformers import SentenceTransformer, CrossEncoder
from transformers import pipeline
import camelot
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu


  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


In [None]:
# -------------------------
# 1. PDF Text Extraction
# -------------------------
def extract_pdf_text(pdf_path):
    """Extract the plain text of every page of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        The page texts concatenated in page order, with every run of
        consecutive newlines collapsed to a single newline and leading and
        trailing whitespace stripped.
    """
    # The context manager closes the document even if a page fails to
    # extract; previously the handle was never closed.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string page by page.
        text = "".join(page.get_text("text") for page in doc)
    return re.sub(r'\n+', '\n', text).strip()

# Absolute path of the report under /content. The file name contains a
# typographic apostrophe (’, U+2019), not an ASCII ', so it has to match the
# name of the file on disk exactly.
pdf_path = "/content/Meta’s Q1 2024 Financial Report.pdf"
# Extract the whole document once; the chunking cell reads `text`.
text = extract_pdf_text(pdf_path)

In [None]:
# -------------------------
# 2. Chunking
# -------------------------
def chunk_text(text, chunk_size=100, overlap=20):
    """Split text into overlapping windows of whitespace-separated words.

    Args:
        text: The text to split; words are obtained with ``str.split()``.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared by two consecutive chunks. Each
            window starts ``chunk_size - overlap`` words after the previous
            one.

    Returns:
        A list of chunks, each the window's words joined by single spaces.
        Empty (or whitespace-only) text yields an empty list.

    Raises:
        ValueError: If ``chunk_size - overlap`` is not positive. The window
            would never advance, which previously caused an endless loop.
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError(
            "overlap must be smaller than chunk_size so the window can advance"
        )
    words = text.split()
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a window reaches the last word, any further window would only
        # repeat words already contained in this chunk, so stop here.
        if start + chunk_size >= len(words):
            break
    return chunks

# Chunk the extracted report text with chunk_text's default window settings.
chunks = chunk_text(text)

In [None]:

# -------------------------
# 3. Embeddings + FAISS
# -------------------------
# Sentence-embedding model; `retrieve` reuses it to embed queries.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
# Embed all chunks in one call (presumably one row per chunk -- the code
# below reads the vector width from shape[1]).
embeddings = embed_model.encode(chunks)

# Build a flat L2 FAISS index whose dimension matches the embedding width,
# then add every chunk embedding to it.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings))

def retrieve(query, top_k=5):
    query_emb = embed_model.encode([query])
    distances, indices = index.search(np.array(query_emb), top_k)
    return [(chunks[i], distances[0][j]) for j, i in enumerate(indices[0])]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# -------------------------
# 4. Table Extraction
# -------------------------
# Parse tables from every page of the PDF using Camelot's "stream" flavor.
tables = camelot.read_pdf(pdf_path, pages='all', flavor='stream')
# Keep each table's DataFrame (`.df`); `search_tables` scans this list.
dataframes = [t.df for t in tables]

def search_tables(keyword):
    """Find the rows of the extracted tables that mention a keyword.

    Every cell of every DataFrame in the module-level ``dataframes`` list is
    converted to ``str`` and tested for ``keyword`` as a case-insensitive
    substring.

    Args:
        keyword: Text to look for. It is matched literally, so characters
            such as ``$``, ``(`` or ``.`` have no special meaning.

    Returns:
        A list with one DataFrame per table that has at least one matching
        row, containing only the matching rows. Tables without a match are
        omitted, so the list may be empty.
    """
    results = []
    for df in dataframes:
        # regex=False: financial keywords routinely contain regex
        # metacharacters ("$ 36", "(loss)"); as patterns they either never
        # match or raise re.error.
        cell_hits = df.astype(str).apply(
            lambda col: col.str.contains(keyword, case=False, regex=False)
        )
        # A row matches when any of its cells contains the keyword.
        filtered = df[cell_hits.any(axis=1)]
        if not filtered.empty:
            results.append(filtered)
    return results


  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)


In [None]:
# -------------------------
# 5. Reranking with Cross-Encoder
# -------------------------
# Cross-encoder that `rerank` uses to score (query, chunk) pairs.
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def rerank(query, retrieved_chunks):
    """Order retrieved hits by cross-encoder relevance, best first.

    Args:
        query: The question the hits were retrieved for.
        retrieved_chunks: Hits whose first element is the chunk text (the
            tuples produced by ``retrieve``).

    Returns:
        The same hit objects, sorted by descending score from the
        module-level ``reranker``; hits with equal scores keep their
        incoming order.
    """
    scores = reranker.predict([(query, hit[0]) for hit in retrieved_chunks])
    scored_hits = list(zip(retrieved_chunks, scores))
    # list.sort is stable, so ties stay in retrieval order.
    scored_hits.sort(key=lambda pair: pair[1], reverse=True)
    return [hit for hit, _score in scored_hits]


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

In [None]:
# -------------------------
# 6. Query Optimization (Rewrite)
# -------------------------
# FLAN-T5 base text2text pipeline used by `rewrite_query`; with device=-1 the
# cell output reports that it runs on the CPU.
rewrite_model = pipeline("text2text-generation", model="google/flan-t5-base", device=-1)

def rewrite_query(query):
    """Ask the rewrite model for a clearer version of a question.

    Args:
        query: The user's original question.

    Returns:
        The text generated by the module-level ``rewrite_model``, or the
        original ``query`` when the model produces only whitespace.
    """
    prompt = f"Rewrite this question to be more clear and detailed:\n{query}"
    # Use max_new_tokens, not max_length: the pipeline already carries
    # max_new_tokens=256, which takes precedence over max_length (see the
    # warning in the cell output), so the 50-token cap was never applied.
    rewritten = rewrite_model(prompt, max_new_tokens=50)[0]['generated_text']
    # A blank rewrite would make retrieval search for nothing; fall back.
    return rewritten if rewritten.strip() else query




config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu


In [None]:
# -------------------------
# 7. Answer Generation
# -------------------------
# FLAN-T5 large text2text pipeline used by `answer_query`; with device=-1 the
# cell output reports that it runs on the CPU.
gen_model = pipeline("text2text-generation", model="google/flan-t5-large", device=-1)

def answer_query(query, use_reranker=True, top_k=5, context_k=3):
    """Answer a question from the indexed document.

    Pipeline: rewrite the query, retrieve ``top_k`` chunks, optionally
    rerank them, then generate an answer from the first ``context_k`` chunks.

    Args:
        query: The user's question. It is replaced by the output of
            ``rewrite_query`` before retrieval, reranking and prompting.
        use_reranker: When true, reorder the retrieved chunks with
            ``rerank`` before building the context.
        top_k: Number of chunks to request from ``retrieve``.
        context_k: Number of leading chunks placed in the prompt.

    Returns:
        A tuple ``(answer, retrieved)``: the generated text and the full
        list of retrieved (and possibly reranked) ``(chunk, distance)``
        tuples, not only the ones used as context.
    """
    query = rewrite_query(query)
    retrieved = retrieve(query, top_k)
    if use_reranker:
        retrieved = rerank(query, retrieved)
    context = " ".join(c[0] for c in retrieved[:context_k])
    prompt = f"Answer based on context:\n{context}\nQuestion: {query}"
    # Use max_new_tokens, not max_length: the pipeline already carries
    # max_new_tokens=256, which takes precedence over max_length (see the
    # warning in the cell output), so the intended 300-token limit was ignored.
    output = gen_model(prompt, max_new_tokens=300)[0]['generated_text']
    return output, retrieved


config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu


In [None]:
# -------------------------
# 8. Evaluation Metrics
# -------------------------
def evaluate_retrieval(query, ground_truth, k=3):
    """Check whether the ground-truth string appears in the top-k chunks.

    Args:
        query: Question passed to ``retrieve`` as is.
        ground_truth: Text that must occur, case-insensitively, inside at
            least one retrieved chunk.
        k: Number of chunks to retrieve.

    Returns:
        ``(precision, recall)``. Both carry the same hit indicator: 1.0 if
        any retrieved chunk contains ``ground_truth``, otherwise 0.0. This
        is a simplified demo metric, not a per-chunk precision.
    """
    for passage in retrieve(query, k):
        if ground_truth.lower() in passage[0].lower():
            # The first hit decides the outcome; later chunks are irrelevant.
            return 1.0, 1.0
    return 0.0, 0.0

def evaluate_generation(pred, reference):
    """Score a generated answer against a reference with ROUGE-L and BLEU.

    Args:
        pred: The generated answer text.
        reference: The reference text to compare against.

    Returns:
        A dict with keys ``"ROUGE-L"`` (F-measure from ``rouge_scorer`` with
        stemming enabled) and ``"BLEU"`` (smoothed sentence-level BLEU over
        whitespace-split tokens).
    """
    # Imported here so the block is self-contained; nltk is already a
    # dependency of this notebook through sentence_bleu.
    from nltk.translate.bleu_score import SmoothingFunction

    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rouge = scorer.score(reference, pred)['rougeL'].fmeasure
    # Short answers have no 2-/3-/4-gram overlap with the reference. Without
    # smoothing BLEU collapses to ~1e-231 and NLTK emits the warnings seen in
    # the cell output, so a smoothing function is applied.
    bleu = sentence_bleu(
        [reference.split()],
        pred.split(),
        smoothing_function=SmoothingFunction().method1,
    )
    return {"ROUGE-L": rouge, "BLEU": bleu}

In [None]:
# -------------------------
# 9. Sample Test Queries
# -------------------------
# Demo questions about the report; each is a plain question string.
queries = [
    "What was Meta’s revenue in Q1 2024?",
    "What were the key financial highlights for Meta in Q1 2024?",
    "What was Meta’s net income in Q1 2024 compared to Q1 2023?",
    "Summarize Meta’s operating expenses in Q1 2024."
]

# Run every question through answer_query (reranking on by default) and print
# the question with its generated answer. `chunks_used` receives the
# retrieved chunks but is not read inside this loop.
for q in queries:
    answer, chunks_used = answer_query(q)
    print(f"\nQ: {q}\nA: {answer}\n---")

# -------------------------
# 10. Ablation Study Example
# -------------------------
def compare_reranking(query):
    """Print the retrieved chunks for a query with and without reranking.

    ``answer_query`` is run twice, first with the reranker disabled and then
    enabled; the generated answers are discarded. For each run the first 80
    characters of every returned chunk are printed under a heading.

    Args:
        query: The question to run through the pipeline.
    """
    _answer_plain, hits_plain = answer_query(query, use_reranker=False)
    _answer_reranked, hits_reranked = answer_query(query, use_reranker=True)
    report = (
        ("\nWithout Reranking:\n", hits_plain),
        ("\nWith Reranking:\n", hits_reranked),
    )
    for heading, hits in report:
        previews = []
        for hit in hits:
            # hit[0] is the chunk text; show only its first 80 characters.
            previews.append(hit[0][:80])
        print(heading, previews)

# Ablation demo: print the chunk previews with and without reranking for one question.
compare_reranking("What was Meta’s revenue in Q1 2024?")

Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Q: What was Meta’s revenue in Q1 2024?
A: $ 36,455
---


Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Q: What were the key financial highlights for Meta in Q1 2024?
A: Revenue $ 36,455 $ 28,645 27 % Costs and expenses 22,637 21,418 6 % Income from operations $ 13,818 $ 7,227 91 % Operating margin 38 % 25 % Provision for income taxes $ 1,814 $ 1,598 14 % Effective tax rate 13 % 22 % Net income $ 12,369 $ 5,709 117 % Diluted earnings per share (EPS) $ 4.71 $ 2.20 114 %
---


Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Q: What was Meta’s net income in Q1 2024 compared to Q1 2023?
A: $ 12,369 $ 5,709
---


Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Q: Summarize Meta’s operating expenses in Q1 2024.
A: Depreciation and amortization
---


Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Without Reranking:
 ['Meta Reports First Quarter 2024 Results MENLO PARK, Calif. – April 24, 2024 – Me', 'believe that this methodology can provide useful supplemental information to hel', 'well." First Quarter 2024 Financial Highlights Three Months Ended March 31, % Ch', 'marketable securities were $58.12 billion as of March 31, 2024. Free cash flow w', 'following table presents our segment information of revenue and income (loss) fr']

With Reranking:
 ['Meta Reports First Quarter 2024 Results MENLO PARK, Calif. – April 24, 2024 – Me', 'believe that this methodology can provide useful supplemental information to hel', 'marketable securities were $58.12 billion as of March 31, 2024. Free cash flow w', 'well." First Quarter 2024 Financial Highlights Three Months Ended March 31, % Ch', 'following table presents our segment information of revenue and income (loss) fr']


In [None]:
# Evaluation: run the pipeline on (question, expected figure) pairs and print
# retrieval and generation scores.

# NOTE(review): this rebinds `queries`, which the sample-queries cell defined
# as a list of plain strings, to a list of (question, truth) tuples. Re-running
# that earlier cell after this one would feed tuples to answer_query.
queries = [
    ("What was Meta’s revenue in Q1 2024?", "36,455"),
    ("What was Meta’s net income in Q1 2024 compared to Q1 2023?", "12,369")
]

for q, truth in queries:
    answer, _ = answer_query(q)  # predicted answer; the returned chunks are discarded

    # NOTE(review): evaluate_retrieval searches with the raw question, while
    # answer_query retrieves with the rewritten question (and reranks), so
    # these scores may describe a different retrieval than the one behind
    # `answer`. Both returned values are the same 0/1 hit indicator.
    precision, recall = evaluate_retrieval(q, truth, k=3)
    print(f"\nQ: {q}")
    print("Answer:", answer)
    print("Retrieval -> Precision:", precision, "Recall:", recall)

    # The reference is a synthetic sentence built around the expected figure,
    # so ROUGE-L/BLEU measure overlap with this template rather than with a
    # human-written answer.
    metrics = evaluate_generation(answer, f"Meta's {truth} answer.")
    print("Generation ->", metrics)


Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Q: What was Meta’s revenue in Q1 2024?
Answer: $ 36,455
Retrieval -> Precision: 1.0 Recall: 1.0
Generation -> {'ROUGE-L': 0.5714285714285715, 'BLEU': 9.291879812217675e-232}


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `


Q: What was Meta’s net income in Q1 2024 compared to Q1 2023?
Answer: $ 12,369 $ 5,709
Retrieval -> Precision: 1.0 Recall: 1.0
Generation -> {'ROUGE-L': 0.4444444444444445, 'BLEU': 1.2882297539194154e-231}
