In [11]:
from sentence_transformers import SentenceTransformer, util
import torch
from pypdf import PdfReader
from PIL import Image
import os
import io
from collections import defaultdict
import json
import re

In [12]:
def preprocess_context(context:str):
    context = re.sub(r"\\u[0-9A-Fa-f]{4}", "", context)
    rows = context.split('\n')
    rows = [row[:77] for row in rows if row != '']
    return rows

In [13]:
#Load CLIP model
model = SentenceTransformer('clip-ViT-B-32')
def get_similar_sentences(image_emb, corpus_emb, top_k=1):

    # We use cosine-similarity and torch.topk to find the highest 5 scores
    cos_scores = util.cos_sim(image_emb, corpus_emb)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    return top_results

In [15]:
def image_extract(folder, pdf_name):
    reader = PdfReader(os.path.join(folder, pdf_name))
    caption = defaultdict(list)

    for i, page in enumerate(reader.pages[:]):
        candidate_images = []
        try:
            for image_file_object in page.images:                
                image = Image.open(io.BytesIO(image_file_object.data))
                width, height = image.size
                if width < 100 or height < 100:
                    continue
                image = image.convert('RGB')
                file_name = "page_{}_image_{}.png".format(i, len(candidate_images))
                image.save(os.path.join(folder,file_name))
                candidate_images.append(image)

        except Exception as e:
            print(f"image extracton failed in {pdf_name}, page:{i}")
        if len(candidate_images) == 0:
            continue

        raw_context = page.extract_text() # extract text from this page
        context = preprocess_context(raw_context)
        if len(context) == 0:
            continue
        top_k = min(len(context), 3)
        corpus_emb = model.encode(context, convert_to_tensor=True)

        image_emb = model.encode(candidate_images, convert_to_tensor=True)
        top_results = util.semantic_search(image_emb, corpus_emb, top_k=top_k)
        for j in range(len(candidate_images)):
            file_name = f"page_{i}_image_{j}.png"
            candidate_images[j].save(os.path.join(folder,file_name))
            results = top_results[j]
            for k in range(top_k):
                if results[k]["score"] < 0.3:
                    break
                caption[file_name].append(context[results[k]["corpus_id"]])
    json.dump(caption, open(os.path.join(folder, "caption.json"), "w"))

In [16]:
def delete_image_files(directory):
    # List all files in the given directory
    files = os.listdir(directory)

    for file in files:
        if "page" in file and "image" in file and file.endswith('.png'):
            file_path = os.path.join(directory, file)

            # Delete the file
            os.remove(file_path)
        if "caption" in file:
            file_path = os.path.join(directory, file)

            # Delete the file
            os.remove(file_path)

In [10]:
root_folder = "webApp2/data/result"
for company in ["Colgate", "L_Oreal", "Nestle", "P&G"]:
    for year in range(2015, 2023):
        folder = os.path.join(root_folder, f"{company}_{year}")
        pdf_name = f"{company}_{year}.pdf"
        delete_image_files(folder)
        image_extract(folder, pdf_name)

FileNotFoundError: [WinError 2] The system cannot find the file specified: 'webApp2/data/result\\Colgate_2015\\caption.json'