In [7]:
from sentence_transformers import SentenceTransformer, util
import torch
from pypdf import PdfReader
from pdf2image import convert_from_path
from PIL import Image
import matplotlib.pyplot as plt
from IPython.display import display
import pdfplumber
import os
import io
from collections import defaultdict
import json

In [18]:
# Loaded SentenceTransformer instances keyed by model name, so repeated calls to
# match_page reuse the weights instead of reloading them every time.
_MODEL_CACHE = {}


def _get_model(model_name):
    """Return the cached SentenceTransformer for `model_name`, loading it on first use."""
    if model_name not in _MODEL_CACHE:
        _MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _MODEL_CACHE[model_name]


def match_page(file_name, prompts, top_k=5, model_name='all-MiniLM-L6-v2', max_chars=500):
    """Rank the pages of a PDF by semantic similarity to one or more prompts.

    Each page's extracted text is flattened to a single line, truncated to
    `max_chars` characters, and embedded; the prompts are embedded with the
    same model and matched against the pages with `util.semantic_search`.

    Args:
        file_name: Path of the PDF file to read.
        prompts: A single query string or a list of query strings.
        top_k: Maximum number of pages to return per query.
        model_name: Name of the SentenceTransformer model to embed with.
        max_chars: Number of leading characters of each page that are embedded.

    Returns:
        The result of `util.semantic_search`: one list per query, each holding
        hit dicts whose 'corpus_id' is the 0-based page index. When the PDF has
        no pages, one empty list per query is returned instead.
    """
    model = _get_model(model_name)
    reader = PdfReader(file_name)

    # `or ''` keeps a page with no extractable text as an empty entry, so that
    # corpus positions stay aligned with page indices.
    corpus = [
        (page.extract_text() or '').replace('\n', ' ')[:max_chars]
        for page in reader.pages
    ]
    if not corpus:
        # Nothing to search against; skip encoding altogether.
        n_queries = 1 if isinstance(prompts, str) else len(prompts)
        return [[] for _ in range(n_queries)]

    corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
    query_embedding = model.encode(prompts, convert_to_tensor=True)
    return util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)

In [19]:
# Manually labeled ground truth: prompt -> 0-based index of the page that answers it.
pdf_name = "ir-Q4-2020-full-announcement.pdf"
pdf_folder = "./"
pdf_path = os.path.join(pdf_folder, pdf_name)
prompts = {
    "beauty&personal care": 2,
    "full year operational review": 4,
    "income statement": 12,
    "changes in equity": 13,
    "balance sheet": 14,
    "cash flow": 15,
    "return on invested capital": 10,
    "underlying sales growth in North America": 4,
    "finance and liquidity": 5,
    "non-gaap measures": 6,
}
n_samples = len(prompts)

# Send every prompt through a single match_page call so the PDF is parsed and
# embedded once instead of once per prompt. The result has one hit list per
# prompt, in the same order as the dict's keys.
hits_per_prompt = match_page(pdf_path, list(prompts))

# A prompt counts as correct when its labeled page is among the returned hits.
correct = sum(
    label in {hit['corpus_id'] for hit in hits}
    for label, hits in zip(prompts.values(), hits_per_prompt)
)

print(f"{pdf_path}'s accuracy is {correct/n_samples*100}%")

./ir-Q4-2020-full-announcement.pdf's accuracy is 80.0%


In [20]:
# Manually labeled ground truth: prompt -> 0-based index of the page that answers it.
pdf_name = "ir-q2-2019-full-announcement.pdf"
pdf_folder = "./"
pdf_path = os.path.join(pdf_folder, pdf_name)
prompts = {
    "beauty&personal care": 1,
    "condensed financial statements": 16,
    "income statement": 15,
    "geographical area": 17,
    "acquisitions and disposals": 19,
    "cash flow": 21,
    "foods and refreshment": 2,
    "competition investigation": 5,
    "non-gaap measures": 6,
    "impact on segment information": 23,
}
n_samples = len(prompts)

# Send every prompt through a single match_page call so the PDF is parsed and
# embedded once instead of once per prompt. The result has one hit list per
# prompt, in the same order as the dict's keys.
hits_per_prompt = match_page(pdf_path, list(prompts))

# A prompt counts as correct when its labeled page is among the returned hits.
correct = sum(
    label in {hit['corpus_id'] for hit in hits}
    for label, hits in zip(prompts.values(), hits_per_prompt)
)

print(f"{pdf_path}'s accuracy is {correct/n_samples*100}%")

./ir-q2-2019-full-announcement.pdf's accuracy is 80.0%


In [21]:
# Manually labeled ground truth: prompt -> 0-based index of the page that answers it.
pdf_name = "unilever-h1-2023-results-presentation.pdf"
pdf_folder = "./"
pdf_path = os.path.join(pdf_folder, pdf_name)
prompts = {
    "performance summary": 6,
    "impacted by three factors": 7,
    "china": 9,
    "underlying sales growth": 12,
    "beauty and wellbeing": 13,
    "personal care": 14,
    "home care": 15,
    "nutrition": 16,
    "africa regional growth": 18,
    "europe regional growth": 18,
}
n_samples = len(prompts)

# Send every prompt through a single match_page call so the PDF is parsed and
# embedded once instead of once per prompt. The result has one hit list per
# prompt, in the same order as the dict's keys.
hits_per_prompt = match_page(pdf_path, list(prompts))

# A prompt counts as correct when its labeled page is among the returned hits.
correct = sum(
    label in {hit['corpus_id'] for hit in hits}
    for label, hits in zip(prompts.values(), hits_per_prompt)
)

print(f"{pdf_path}'s accuracy is {correct/n_samples*100}%")

./unilever-h1-2023-results-presentation.pdf's accuracy is 100.0%


In [22]:
# Manually labeled ground truth: prompt -> 0-based index of the page that answers it.
pdf_name = "ir-Q2-2020-full-announcement.pdf"
pdf_folder = "./"
pdf_path = os.path.join(pdf_folder, pdf_name)
prompts = {
    "Covid-19": 2,
    "geographical area": 4,
    "aisa amet rub": 4,
    "changes in equity": 8,
    "taxation": 19,
    "ACQUISITIONS AND DISPOSALS": 21,
    "principal risk factors": 10,
    "fresh cash flow": 9,
    "competion investigations": 6,
    "non-gaap measures": 0,
}
n_samples = len(prompts)

# Send every prompt through a single match_page call so the PDF is parsed and
# embedded once instead of once per prompt. The result has one hit list per
# prompt, in the same order as the dict's keys.
hits_per_prompt = match_page(pdf_path, list(prompts))

# A prompt counts as correct when its labeled page is among the returned hits.
correct = sum(
    label in {hit['corpus_id'] for hit in hits}
    for label, hits in zip(prompts.values(), hits_per_prompt)
)

print(f"{pdf_path}'s accuracy is {correct/n_samples*100}%")

./ir-Q2-2020-full-announcement.pdf's accuracy is 100.0%
