In [47]:
import os
import fitz  # PyMuPDF
import io
from PIL import Image
import spacy
import pandas as pd
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import spacy
nlp = spacy.load('en_core_web_sm')

In [77]:
# file path you want to extract images from
file = "nke-20210531-(2)-Exhibits_IR.pdf"
# open the file
pdf_file = fitz.open(file)

# Output directory for the extracted images
output_dir = "extracted_images"
# Desired output image format
output_format = "png"
# Minimum width and height for extracted images
min_width = 50
min_height = 50
# Create the output directory if it does not exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    
# Iterate over PDF pages
for page_index in range(len(pdf_file)):
    # Get the page itself
    page = pdf_file[page_index]
    # Get image list
    image_list = page.get_images(full=True)
    # Print the number of images found on this page
    # if image_list:
        # print(f"[+] Found a total of {len(image_list)} images in page {page_index}")
    # else:
        # print(f"[!] No images found on page {page_index}")
    # Iterate over the images on the page
    for image_index, img in enumerate(image_list, start=1):
        # Get the XREF of the image
        xref = img[0]
        # Extract the image bytes
        base_image = pdf_file.extract_image(xref)
        image_bytes = base_image["image"]
        # Get the image extension
        image_ext = base_image["ext"]
        # Load it to PIL
        image = Image.open(io.BytesIO(image_bytes))
        # Check if the image meets the minimum dimensions and save it
        if image.width >= min_width and image.height >= min_height:
            image.save(
                open(os.path.join(output_dir, f"image{page_index + 1}_{image_index}.{output_format}"), "wb"),
                format=output_format.upper())
        else:
            print(f"[-] Skipping image {image_index} on page {page_index} due to its small size.")

In [96]:
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

raw_image = Image.open("extracted_images/image1_1.png").convert('RGB')

# conditional image captioning
text = "trademark of"
inputs = processor(raw_image, text, return_tensors="pt")

out = model.generate(**inputs)
text = processor.decode(out[0], skip_special_tokens=True)

inputs = processor(raw_image, return_tensors="pt")

out = model.generate(**inputs)
text2 = processor.decode(out[0], skip_special_tokens=True)



In [97]:
text = text.replace('trademark of','')
text

' the nike logo'

In [98]:
text2 = text.replace('trademark of','')
text2

' the nike logo'

In [99]:
# Load English tokenizer, tagger, parser and NER
nlp = spacy.load("en_core_web_sm")

doc = nlp(text)

# Find named entities, phrases and concepts
entities = [entity.text for entity in doc.ents]
 
nouns = [token.lemma_ for token in doc if token.pos_ == "NOUN"]

In [100]:
entities

['nike']

In [101]:
keyword = entities[0]

In [102]:
import fitz
doc = fitz.open('nke-20210531-(2)-Exhibits_IR.pdf')
text = ""
for page in doc:
    text+=page.get_text()
text = ' '.join(text.split()).lower()

In [103]:
relevant = []

for t in text.split('. '):
    if keyword in t:
        relevant.append(t)

In [119]:
# relevant

In [105]:
dependency = []
for txt in relevant:
    doc = nlp(txt)

    chunks = []
    for chunk in doc.noun_chunks:
        out = {}
        root = chunk.root
        out[root.pos_] = root
        for tok in chunk:
            if tok != root:
                out[tok.pos_] = tok
        chunks.append(out)
    
    count = 0
    for chunk in chunks:
        if keyword in str(chunk):
            count += len(chunk)
    dependency.append(count)
    count = 0

In [106]:
chart = pd.DataFrame(columns = [])
chart['RelevantText'] = relevant
chart['DependencyCount'] = dependency

In [107]:
chart.sort_values(by='DependencyCount',ascending=False)

Unnamed: 0,RelevantText,DependencyCount
84,the risks and uncertainties are detailed from ...,21
247,nike entities primarily purchase product in tw...,10
145,"revenues $ 44,538 $ 37,403 19 % 17 % $ 39,117 ...",10
115,air manufacturing innovation manufactures cush...,9
19,the company's reportable operating segments fo...,8
...,...,...
141,nike brand wholesale equivalent revenues consi...,0
143,"(1) (2),(3) 2021 form 10-k 30 table of content...",0
147,(3) corporate revenues primarily consist of fo...,0
151,higher revenues in north america contributed a...,0


In [118]:
chart['RelevantText'][19]

"the company's reportable operating segments for the nike brand are: north america; europe, middle east & africa (emea); greater china; and asia pacific & latin america (apla), and include results for the nike and jordan brands"

In [108]:
# https://stackoverflow.com/questions/67821137/spacy-how-to-get-all-words-that-describe-a-noun
# https://www.thepythoncode.com/article/extract-pdf-images-in-python
# https://neurondai.medium.com/how-to-extract-text-from-a-pdf-using-pymupdf-and-python-caa8487cf9d