In [2]:
from sentence_transformers import SentenceTransformer, util
import torch
from pypdf import PdfReader
from PIL import Image
import os
import io
from collections import defaultdict
import json

In [3]:
def preprocess_context(context: str, max_chars: int = 77):
    """Split text into non-empty lines, each truncated to ``max_chars`` characters.

    Args:
        context: Raw text whose lines are separated by newline characters.
        max_chars: Maximum number of characters kept per line. Defaults to 77,
            the value that used to be hard-coded here (presumably chosen to
            keep each line within the CLIP text-input limit -- TODO confirm).

    Returns:
        The truncated lines in their original order. Only lines that are
        exactly empty are dropped; whitespace-only lines are kept.
    """
    return [row[:max_chars] for row in context.split('\n') if row != '']

In [4]:
# Load the pretrained 'clip-ViT-B-32' checkpoint through sentence-transformers.
# This single `model` object is used further down to encode both page images
# and page text lines so that they can be compared by cosine similarity.
# NOTE: the first run downloads the weights (see the progress output of this cell).
model = SentenceTransformer('clip-ViT-B-32')
def get_similar_sentences(image_emb, corpus_emb, top_k=1):
    """Return the corpus entries most similar to an image embedding.

    Args:
        image_emb: Embedding of the query image.
        corpus_emb: Embeddings of the candidate sentences, one per sentence.
        top_k: Maximum number of matches to return. If the corpus holds fewer
            entries than ``top_k``, all of them are returned.

    Returns:
        The ``torch.topk`` result: a ``(values, indices)`` pair holding the
        cosine-similarity scores in descending order and the positions of the
        matching entries in ``corpus_emb``.
    """
    # Cosine similarity of the query against every corpus entry; row 0 is the
    # (single) query.
    cos_scores = util.cos_sim(image_emb, corpus_emb)[0]

    # torch.topk raises when k exceeds the number of scores, so clamp it.
    k = min(top_k, cos_scores.shape[0])
    return torch.topk(cos_scores, k=k)

Downloading .gitattributes: 100%|██████████| 690/690 [00:00<?, ?B/s] 
Downloading (…)LIPModel/config.json: 100%|██████████| 4.03k/4.03k [00:00<?, ?B/s]
Downloading 0_CLIPModel/merges.txt: 100%|██████████| 525k/525k [00:00<00:00, 9.59MB/s]
Downloading (…)rocessor_config.json: 100%|██████████| 316/316 [00:00<?, ?B/s] 
Downloading pytorch_model.bin: 100%|██████████| 605M/605M [00:59<00:00, 10.2MB/s] 
Downloading (…)cial_tokens_map.json: 100%|██████████| 389/389 [00:00<?, ?B/s] 
Downloading (…)okenizer_config.json: 100%|██████████| 604/604 [00:00<?, ?B/s] 
Downloading 0_CLIPModel/vocab.json: 100%|██████████| 961k/961k [00:00<00:00, 11.6MB/s]
Downloading README.md: 100%|██████████| 1.88k/1.88k [00:00<?, ?B/s]
Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<?, ?B/s] 
Downloading modules.json: 100%|██████████| 122/122 [00:00<?, ?B/s] 


In [8]:
def image_extract(folder, pdf_name, score_threshold=0.25, max_captions=3):
    """Extract every image of a PDF and caption it with text from the same page.

    For each page, the extracted text is split into lines with
    ``preprocess_context``. Every image on the page is embedded with the
    module-level CLIP ``model`` and compared against those lines; the best
    matching lines whose cosine similarity reaches ``score_threshold`` become
    the image's captions.

    Side effects: each image is saved into ``folder`` as
    ``page_{page_index}_image_{image_index}.jpg`` and the captions are written
    to ``caption.json`` in the same folder. Images without any caption above
    the threshold are saved but get no entry in the JSON file.

    Args:
        folder: Directory that contains the PDF and receives the outputs.
        pdf_name: File name of the PDF inside ``folder``.
        score_threshold: Minimum cosine similarity for a line to be kept as a
            caption. Defaults to 0.25, the previously hard-coded value.
        max_captions: Maximum number of caption lines per image. Defaults to
            3, the previously hard-coded value.
    """
    reader = PdfReader(os.path.join(folder, pdf_name))
    caption = defaultdict(list)

    for i, page in enumerate(reader.pages):
        context = preprocess_context(page.extract_text())
        top_k = min(len(context), max_captions)
        # Encoded lazily: only pages that have both text and at least one
        # image need the text embeddings.
        corpus_emb = None

        for count, image_file_object in enumerate(page.images):
            file_name = f"page_{i}_image_{count}.jpg"
            image = Image.open(io.BytesIO(image_file_object.data)).convert('RGB')

            # A page without extractable text has nothing to caption with;
            # the image is still saved below.
            if top_k > 0:
                if corpus_emb is None:
                    corpus_emb = model.encode(context, convert_to_tensor=True)
                image_emb = model.encode(image, convert_to_tensor=True)
                scores, indices = get_similar_sentences(image_emb, corpus_emb, top_k=top_k)
                # Scores arrive in descending order, so the first one below the
                # threshold ends the search.
                for score, idx in zip(scores.tolist(), indices.tolist()):
                    if score < score_threshold:
                        break
                    caption[file_name].append(context[idx])

            image.save(os.path.join(folder, file_name))

    # Context manager so the file is flushed and closed deterministically.
    with open(os.path.join(folder, "caption.json"), "w") as caption_file:
        json.dump(caption, caption_file)

In [6]:
# Company whose reports are processed; used as the prefix of each per-year
# folder name.
company = "Unilever"
# Directory holding one "<company>_<year>" sub-folder per report; relative to
# the notebook's working directory.
root_folder = "webApp2/data/result"

In [9]:
# Run the image/caption extraction on the Q4 announcement of every year in
# the inclusive range FIRST_YEAR..LAST_YEAR.
FIRST_YEAR, LAST_YEAR = 2016, 2022
for year in range(FIRST_YEAR, LAST_YEAR + 1):
    pdf_name = f"ir-q4-{year}-full-announcement.pdf"
    folder = os.path.join(root_folder, f"{company}_{year}")
    image_extract(folder, pdf_name)