In [1]:
import os
import fitz  # PyMuPDF
import io
from PIL import Image
import spacy

import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

2023-06-22 12:13:42.317451: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Path of the PDF to extract images from
file = "267980655.pdf"
# Open the PDF with PyMuPDF
pdf_file = fitz.open(file)

# Output directory for the extracted images
output_dir = "extracted_images"
# Desired output image format
output_format = "png"
# Minimum width and height for extracted images
min_width = 1000
min_height = 1000
# Create the output directory; exist_ok makes this a no-op when it already
# exists and avoids the separate check-then-create step.
os.makedirs(output_dir, exist_ok=True)

# Iterate over PDF pages
for page_index in range(len(pdf_file)):
    # Get the page itself
    page = pdf_file[page_index]
    # Iterate over the images referenced by the page (numbered from 1)
    for image_index, img in enumerate(page.get_images(full=True), start=1):
        # The first item of each entry is the image's XREF
        xref = img[0]
        # Extract the raw image bytes and load them into PIL
        base_image = pdf_file.extract_image(xref)
        image = Image.open(io.BytesIO(base_image["image"]))
        # Save only images that meet the minimum dimensions
        if image.width >= min_width and image.height >= min_height:
            # File names use the 1-based page number (page_index + 1)
            out_path = os.path.join(
                output_dir, f"image{page_index + 1}_{image_index}.{output_format}"
            )
            # Pillow's PNG writer cannot encode CMYK data, so convert first
            if image.mode == "CMYK" and output_format.lower() == "png":
                image = image.convert("RGB")
            # Passing a path (not an open file object) lets Pillow open and
            # close the file itself, so no handle is leaked per image.
            image.save(out_path, format=output_format.upper())
        else:
            # Note: this message reports the 0-based page index, unlike the
            # 1-based page number used in the saved file names.
            print(f"[-] Skipping image {image_index} on page {page_index} due to its small size.")

[-] Skipping image 1 on page 0 due to its small size.
[-] Skipping image 2 on page 0 due to its small size.
[-] Skipping image 1 on page 7 due to its small size.
[-] Skipping image 2 on page 7 due to its small size.
[-] Skipping image 3 on page 7 due to its small size.
[-] Skipping image 4 on page 7 due to its small size.
[-] Skipping image 1 on page 8 due to its small size.
[-] Skipping image 2 on page 8 due to its small size.
[-] Skipping image 1 on page 52 due to its small size.
[-] Skipping image 2 on page 52 due to its small size.
[-] Skipping image 3 on page 52 due to its small size.
[-] Skipping image 4 on page 52 due to its small size.
[-] Skipping image 1 on page 53 due to its small size.
[-] Skipping image 2 on page 53 due to its small size.


In [3]:
# Both the processor and the model are loaded from the same BLIP checkpoint.
blip_checkpoint = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(blip_checkpoint)
model = BlipForConditionalGeneration.from_pretrained(blip_checkpoint)

raw_image = Image.open("extracted_images/image8_3.png").convert('RGB')

# Conditional captioning: the image is processed together with a text prompt.
prompt = "trademark of"
inputs = processor(raw_image, prompt, return_tensors="pt")
out = model.generate(**inputs)
text = processor.decode(out[0], skip_special_tokens=True)

# Unconditional captioning: the image is processed on its own.
inputs = processor(raw_image, return_tensors="pt")
out = model.generate(**inputs)
text2 = processor.decode(out[0], skip_special_tokens=True)



In [4]:
# Remove the conditioning prompt text from the caption, then display it.
prompt_prefix = 'trademark of'
text = text.replace(prompt_prefix, '')
text

' the starbucks logo'

In [5]:
# Load the small English spaCy pipeline and run it over the caption.
nlp = spacy.load("en_core_web_sm")

doc = nlp(text)

# Print each named entity found in the caption together with its label.
for entity in doc.ents:
    print(entity.text, entity.label_)

# Collect the lemma of every token tagged NOUN, in document order.
nouns = []
for token in doc:
    if token.pos_ == "NOUN":
        nouns.append(token.lemma_)

In [6]:
# Display the first noun lemma from the caption; a later cell searches the
# PDF text for this value.
nouns[0]

'starbuck'

In [7]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Token attributes printed per row, in this order, separated by spaces.
token_fields = ("text", "lemma_", "pos_", "tag_", "dep_",
                "shape_", "is_alpha", "is_stop")
for token in doc:
    print(*(getattr(token, field) for field in token_fields))

Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP dobj X.X. False False
startup startup NOUN NN dep xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


In [8]:
import fitz

# Collect the text of every page. The context manager closes the document
# once the text has been read.
# NOTE: this rebinds `text` (previously the image caption) and `doc`.
with fitz.open('267980655.pdf') as doc:
    # join() assembles the string in one pass instead of repeated `+=` copies
    text = "".join(page.get_text() for page in doc)
# Collapse every whitespace run to a single space and lowercase the result
text = ' '.join(text.split()).lower()

In [9]:
# Keep every '. '-delimited fragment of the PDF text that contains the
# first caption noun as a substring.
keyword = nouns[0]
relevant = [fragment for fragment in text.split('. ') if keyword in fragment]

In [10]:
# Display the fragments of the PDF text that contain the keyword.
relevant

['research in conceptual metaphor theory finds, though, that metaphor is “primarily a matter of thought and action and only derivatively a matter of language.” indeed, brands rely not just on verbal metaphor, but also on visual metaphor to differentiate themselves from competitors in the marketplace (e.g., target’s “bullseye” and starbucks’s “siren”)',
 '2018] visual metaphor and distinctiveness 773 marketplace.15 examples include apple’s logo,16 starbucks’s siren,17 and nike’s swoosh.18 as with verbal metaphor in the word mark context, use of visual metaphor enables an image mark to serve as an inherent source identifier by (1) denoting (referring literally to) a brand, as well as (2) 15',
 'starbucks corp., starbucks logo, 2011; audio2visual contest, a siren’s call: a little bit closer, contest image, 2014, https://www.flickr.com/photos/rinoa_cathcart [https://perma.cc/bv7q-8zlt]',
 'consider the common examples mentioned in the introduction: starbucks’s siren logo, nike’s swoosh, an

In [11]:
import spacy
nlp = spacy.load('en_core_web_sm')
txt = relevant[2]
doc = nlp(txt)


def chunk_pos_map(chunk):
    """Return a {POS tag: token} dict for one noun chunk.

    The chunk's root token is inserted first, then every other token of the
    chunk in order. The dict is keyed by POS tag, so a later token replaces
    an earlier one carrying the same tag, including the root.
    NOTE(review): confirm that this overwrite is intended.
    """
    root = chunk.root
    mapping = {root.pos_: root}
    for tok in chunk:
        if tok != root:
            mapping[tok.pos_] = tok
    return mapping


# One POS map per noun chunk of the selected fragment.
chunks = [chunk_pos_map(chunk) for chunk in doc.noun_chunks]
print(chunks)

[{'PROPN': starbucks}, {'PROPN': .}, {'PROPN': logo}, {'NOUN': contest, 'ADJ': audio2visual}, {'NOUN': call, 'DET': a, 'ADJ': siren, 'PART': ’s}, {'NOUN': bit, 'DET': a, 'ADJ': little, 'ADV': closer, 'PUNCT': ,, 'VERB': contest}]


In [34]:
import spacy
nlp = spacy.load('en_core_web_sm')
txt = relevant[3]
doc = nlp(txt)

# For each noun chunk build a {POS tag: token} dict seeded with the chunk
# root, then filled from the remaining tokens in order. Keys are POS tags,
# so a later token with the same tag replaces the earlier entry.
chunks = []
for chunk in doc.noun_chunks:
    root = chunk.root
    out = {root.pos_: root}
    out.update({tok.pos_: tok for tok in chunk if tok != root})
    chunks.append(out)
print(chunks)

[{'NOUN': examples, 'DET': the, 'ADJ': common}, {'NOUN': introduction, 'DET': the}, {'NOUN': starbucks, 'PART': ’s, 'ADJ': siren}, {'PROPN': nike}, {'NOUN': apple, 'DET': the}]


In [37]:
# Number of distinct POS keys recorded for the third noun chunk.
len(chunks[2])

3