In [1]:
import fitz

# Source PDF to parse; the bare file name is resolved against the current working directory.
pdf_file = "McGraw-Hills Dictionary of American Idioms_Main.pdf"
# Alternate input file: uncomment to parse it instead of the one above.
# pdf_file = "McGraw-Hills Dictionary of American Idioms_Main_Fix.pdf"
# Open the document with PyMuPDF (`fitz`); the cells below iterate `doc` page by page.
doc = fitz.open(pdf_file)

# Observing what the spans look like.

In [None]:
# Walk every page and gather each text span (a dict) into one flat list,
# echoing the span as it is collected so its structure can be inspected.
extracted_text = []

for page in doc:
    for block in page.get_text("dict")["blocks"]:
        # Blocks without a "lines" entry carry no spans and are skipped.
        if "lines" not in block:
            continue
        for line in block["lines"]:
            for span in line["spans"]:
                print(span)
                extracted_text.append(span)

# Checking the frequency of each type label (figurative, literal, proverb, and so on)

In [3]:
from collections import Counter

# Every entry of `extracted_text` is a span dict, and dicts are unhashable, so
# `set(extracted_text)` raises TypeError. Tally the span text instead; Counter
# also does it in a single pass rather than one list.count() scan per distinct value.
freq = dict(Counter(span["text"] for span in extracted_text))

In [4]:
# Re-key the tally so that iteration runs from the highest count to the lowest.
sorted_freq = {
    key: count
    for key, count in sorted(freq.items(), key=lambda pair: pair[1], reverse=True)
}

In [5]:
# One "<key> : <count>" line per entry, in the order stored in sorted_freq.
for key, count in sorted_freq.items():
    print(f"{key} : {count}")

In [6]:
# Label strings of interest, written exactly as they are matched against span text.
category = [
    "Fig.",
    "Lit.",
    "Prov.",
    "Sl.",
    "Inf.",
    "Rur.",
    "Cliché",
    "Euph.",
]

# Leaving out unnecessary spans.

In [None]:
# Show every span whose font size is exactly 8, then report how many there are.
cnt = 0

for page in doc:
    for block in page.get_text("dict")["blocks"]:
        # Blocks without a "lines" entry contribute nothing.
        for line in block.get("lines", ()):
            matching = [span for span in line["spans"] if span["size"] == 8]
            for span in matching:
                print(span)
            cnt += len(matching)

print(cnt)

In [None]:
# Show every span whose font size is 11 or larger, then report how many there are.
cnt = 0

for page in doc:
    for block in page.get_text("dict")["blocks"]:
        # Blocks without a "lines" entry contribute nothing.
        for line in block.get("lines", ()):
            matching = [span for span in line["spans"] if span["size"] >= 11]
            for span in matching:
                print(span)
            cnt += len(matching)

print(cnt)

## You can start from here if you don't need to see the details and frequencies

In [1]:
import fitz

# Source PDF to parse; the bare file name is resolved against the current working directory.
pdf_file = "McGraw-Hills Dictionary of American Idioms_Main.pdf"
# Alternate input file: uncomment to parse it instead of the one above.
# pdf_file = "McGraw-Hills Dictionary of American Idioms_Main_Fix.pdf"
# Open the document with PyMuPDF (`fitz`); the next cell iterates `doc` page by page.
doc = fitz.open(pdf_file)

In [2]:
# Build the flat, reading-ordered span list used by the parsing helpers:
# per page, all kept left-column spans first, then all kept right-column spans.
refined_spans = []

for page_num, page in enumerate(doc, start=1):
    # The x-coordinate separating the two columns differs on odd and even pages.
    margin = 250.0 if page_num % 2 else 245.0
    left_col, right_col = [], []

    for block in page.get_text("dict")["blocks"]:
        for line in block.get("lines", ()):
            for span in line["spans"]:
                size = span["size"]

                # Sizes of exactly 8, or 11 and above, are dropped.
                if size == 8 or size >= 11:
                    continue
                # Digit-only 8.5pt Formata-Regular spans are dropped as well
                # (presumably page numbers; TODO confirm).
                if size == 8.5 and span["font"] == "Formata-Regular" and span["text"].strip().isdigit():
                    continue

                refined = {
                    "text": span["text"],
                    "size": size,
                    "font": span["font"],
                    "origin": span["origin"],
                }
                # origin[0] is the span's x-coordinate; it decides the column.
                column = left_col if refined["origin"][0] < margin else right_col
                column.append(refined)

    refined_spans.extend(left_col)
    refined_spans.extend(right_col)

print(len(refined_spans))

258030


# Refine

In [None]:
# Dump every kept span, one dict per line, for manual inspection.
# NOTE(review): the build cell above reported 258030 spans, so this output is very long.
for span in refined_spans:
    print(span)

In [11]:
# for span in refined_spans:
#     # if span["size"] == 8.64999008178711 and ";" in span["text"]:
#     if "previous." in span["text"]:
#     # if ";" in span["text"] and span["font"] == "Formata-Medium":
#         print(span)

In [4]:
def print_span(spans, idx=None):
    """Join the text of ``spans``, print it, and return the joined string.

    Args:
        spans: Iterable of span dicts; only each span's ``"text"`` value is read.
        idx: Optional position to print on its own line before the text.
            Any value other than ``None`` is printed, including ``0``.

    Returns:
        The concatenated text; an empty string when ``spans`` is empty or every
        span has empty text (in which case ``"check for errors"`` is printed).
    """
    text = "".join(span["text"] for span in spans)

    # Compare against None explicitly: a plain truthiness test would silently
    # skip a legitimate index of 0.
    if idx is not None:
        print(idx)

    if text:
        print(text)
    else:
        print("check for errors")
    return text

In [5]:
import re
# Label strings of interest, written exactly as they are matched against span text.
category = [
    "Fig.",
    "Lit.",
    "Prov.",
    "Sl.",
    "Inf.",
    "Rur.",
    "Cliché",
    "Euph.",
]

def multi_meaning(idx):
    """Report whether the span at ``idx`` contains a sense number from 2 to 10.

    The text of ``refined_spans[idx]`` is searched for ``10`` or a single digit
    2-9 immediately followed by a period; ``1.`` on its own does not count.

    Returns:
        ``True`` when such a number occurs anywhere in the text, else ``False``.
    """
    span_text = refined_spans[idx]["text"]
    return re.search(r'(?:10|[2-9])\.', span_text) is not None

In [6]:
def synonym_checker(idx):
    """Detect a "Go to" cross-reference beginning at span ``idx``.

    Only 8.5pt Minion-Regular spans qualify. The phrase may sit entirely in the
    span's text, or be split so that this span contains "Go" and the following
    span contains "to". The last span of ``refined_spans`` (and any index
    beyond it) is never reported.

    Returns:
        ``idx`` on a match, otherwise ``False``. Note that a match at index 0
        returns ``0``, which is falsy, so callers cannot use a plain truth test.
    """
    if idx >= len(refined_spans) - 1:
        return False

    span = refined_spans[idx]
    if span["size"] != 8.5 or span["font"] != "Minion-Regular":
        return False

    text = span["text"]
    if "Go to" in text:
        return idx
    if "Go" in text and "to" in refined_spans[idx + 1]["text"]:
        return idx
    return False

def synonym_process(idx):
    """Unimplemented placeholder: calling it has no effect and returns None.

    NOTE(review): presumably meant to handle the "Go to" cross-references that
    synonym_checker detects; confirm the intent before implementing.
    """
    # Bare float literal: evaluated and discarded. The same value appears as a
    # span size in the commented-out condition inside meaning_extract, so it is
    # presumably a note of the font size to match (TODO confirm).
    8.14999008178711

In [7]:
def sentence_start(idx):
    """Report whether the span at ``idx`` is an example-sentence marker.

    A marker is a span in the MathematicalPi-Six font whose text contains the
    control character ``\\x02``.

    Returns:
        ``True`` for a marker span, otherwise ``False``.
    """
    span = refined_spans[idx]
    # "\x02" is not whitespace, so stripping the text first cannot change the result.
    return span["font"] == "MathematicalPi-Six" and "\x02" in span["text"]

def sentence_extract(idx):
    """Collect the run of consecutive Minion-Italic spans starting at ``idx``.

    Args:
        idx: Index into the global ``refined_spans`` at which the scan starts.
            An index at or past the end yields an empty run.

    Returns:
        ``(spans, next_idx)``: the collected span dicts and the index of the
        first span that is not part of the run (``len(refined_spans)`` when the
        run reaches the end of the list).

    Side effects:
        Prints ``next_idx`` and the joined text through ``print_span``.
    """
    text = []
    # Bounded scan: without the length check, an italic run that reaches the
    # last span indexes past the end of the list and raises IndexError.
    while idx < len(refined_spans) and refined_spans[idx]["font"] == "Minion-Italic":
        text.append(refined_spans[idx])
        idx += 1

    print_span(text, idx)
    return text, idx

In [35]:
def idiom_start(idx):
    """Report whether the span at ``idx`` can begin a new idiom headword.

    Two kinds of span qualify:
      * a Formata-Medium span at size 8.64999008178711, or
      * an 8.5pt Minion-Regular span whose text, lowercased and stripped,
        is exactly "a", "an" or "the".

    Returns:
        ``True`` when either rule matches, otherwise ``False``.
    """
    span = refined_spans[idx]
    size, font = span["size"], span["font"]

    if font == "Formata-Medium" and size == 8.64999008178711:
        return True

    if font == "Minion-Regular" and size == 8.5:
        article = span["text"].lower().strip()
        if article in ("a", "an", "the"):
            return True

    return False

def idiom_check(idx):
    """Report whether the span at ``idx`` belongs to an idiom headword.

    A span for which ``multi_meaning`` is true never qualifies. Otherwise the
    span qualifies when any of these holds:
      * Formata-Medium at size 8.64999008178711 or 4.7350664138793945;
      * Formata-Condensed at size 8.297584533691406;
      * Formata-LightCondensed at any size;
      * 8.5pt Minion-Regular whose text, lowercased and stripped, is
        "a", "an" or "the".

    Returns:
        ``True`` when the span is part of a headword, otherwise ``False``.
    """
    if multi_meaning(idx):
        return False

    span = refined_spans[idx]
    size, font = span["size"], span["font"]

    if font == "Formata-Medium":
        if size == 8.64999008178711 or size == 4.7350664138793945:
            return True
    elif font == "Formata-Condensed":
        if size == 8.297584533691406:
            return True
    elif font == "Formata-LightCondensed":
        return True
    elif font == "Minion-Regular":
        if size == 8.5 and span["text"].lower().strip() in ("a", "an", "the"):
            return True

    return False

def and_check(idx):
    """Report whether the span at ``idx`` has a font size of exactly 7.

    Returns:
        ``True`` for a size-7 span, otherwise ``False``.
    """
    return refined_spans[idx]["size"] == 7

def idiom_extract(idx):
    """Collect the run of consecutive headword spans starting at ``idx``.

    A span belongs to the run while ``idiom_check`` accepts it.

    Args:
        idx: Index into the global ``refined_spans`` at which the scan starts.
            An index at or past the end yields an empty run.

    Returns:
        ``(spans, next_idx)``: the collected span dicts and the index of the
        first span that is not part of the run (``len(refined_spans)`` when the
        run reaches the end of the list).

    Side effects:
        Prints ``next_idx`` and the joined text through ``print_span``.
    """
    text = []
    # Bounded scan: without the length check, a headword run that reaches the
    # last span makes idiom_check index past the end and raise IndexError.
    while idx < len(refined_spans) and idiom_check(idx):
        text.append(refined_spans[idx])
        idx += 1

    print_span(text, idx)
    return text, idx

In [42]:
def meaning_extract(idx):
    """Collect the spans that make up one meaning, starting at ``idx``.

    The function first finds the next example-sentence marker at or after
    ``idx`` (or the end of ``refined_spans`` when there is none). Then:

      * If any span between ``idx`` and that position has size
        10.714475631713867 (the "*Typically ..." variation case), every span
        up to that position is taken, headword-styled spans included.
      * Otherwise spans are taken until the first one accepted by
        ``sentence_start`` or ``idiom_start``.

    NOTE(review): the marker that is found may belong to a later entry when the
    current one has no example sentence, so the variation test can look
    further ahead than the current entry. Verify against the data.

    Args:
        idx: Index into the global ``refined_spans`` at which the scan starts.

    Returns:
        ``(spans, next_idx)``: the collected span dicts and the index of the
        first span not consumed (``len(refined_spans)`` at the end of the list).

    Side effects:
        Prints ``next_idx`` and the joined text through ``print_span``.
    """
    total = len(refined_spans)

    # Both scans are bounded by the list length; previously an entry with no
    # following marker ran past the last span and raised IndexError.
    boundary = idx
    while boundary < total and not sentence_start(boundary):
        boundary += 1

    # Check if there are any variations for this idiom ("*Typically blah blah blah")
    variation = any(
        refined_spans[pos]["size"] == 10.714475631713867
        for pos in range(idx, boundary)
    )

    if variation:
        # A variation implies boundary > idx, so this always makes progress.
        text = refined_spans[idx:boundary]
        idx = boundary
    else:
        text = []
        while idx < total and not sentence_start(idx) and not idiom_start(idx):
            text.append(refined_spans[idx])
            idx += 1

    print_span(text, idx)
    return text, idx

In [None]:
# Walk refined_spans once and split it into dictionary entries. The three
# result lists are parallel: element i of each belongs to the same entry.
idioms = []
meanings = []
sentences = []

# Every direct lookup in this cell is guarded by this length. Without the
# guards the walk ran past the last span and raised IndexError before the
# final entry had been appended.
n_spans = len(refined_spans)

idx = 0
while idx < n_spans:
    I = []  # headword variants of the current entry
    M = []  # meanings of the current entry
    S = []  # example sentences of the current entry

    # First, collect the idiom part
    if idiom_check(idx):
        idiom, idx = idiom_extract(idx)
        # print_span is used here for the joined text it returns; it prints too.
        I.append(print_span(idiom).strip())

    # A size-7 span (see and_check) separates an alternative wording of the headword.
    if idx < n_spans and and_check(idx):
        idx += 1
        idiom, idx = idiom_extract(idx)
        I.append(print_span(idiom).strip())

    # Extract meanings and sentences
    while idx < n_spans:
        if multi_meaning(idx):
            # The sense-number span itself becomes the prefix of the meaning text.
            mean = refined_spans[idx]["text"]
            idx += 1
            if idx < n_spans and and_check(idx):
                idx += 1
                idiom, idx = idiom_extract(idx)
                I.append(print_span(idiom).strip())

            m, idx = meaning_extract(idx)
            mean = mean + print_span(m)

        else:
            mean, idx = meaning_extract(idx)
            mean = print_span(mean)

            if idx < n_spans and and_check(idx):
                idx += 1
                idiom, idx = idiom_extract(idx)
                I.append(print_span(idiom).strip())
                m, idx = meaning_extract(idx)
                mean = mean + print_span(m)

        M.append(mean.strip())
        # The entry ends at the next headword or at the end of the spans.
        if idx >= n_spans or idiom_start(idx):
            break

        # Each marker span is stepped over, then the italic run after it is one sentence.
        while idx < n_spans and sentence_start(idx):
            idx += 1
            sen, idx = sentence_extract(idx)
            S.append(print_span(sen).strip())

        if idx >= n_spans or idiom_start(idx):
            break

    idioms.append(I)
    meanings.append(M)
    sentences.append(S)

In [None]:
# Show the collected headword list of every entry, one entry per line.
for idiom in idioms:
    print(idiom)

In [None]:
# Show the collected meaning list of every entry, one entry per line.
for meaning in meanings:
    print(meaning)

In [None]:
# Show the collected example-sentence list of every entry, one entry per line.
for sen in sentences:
    print(sen)

In [39]:
# Spot check of a single span: index 16 holds a " Go to " cross-reference (see the output below).
print(refined_spans[16])

{'text': ' Go to ', 'size': 8.5, 'font': 'Minion-Regular', 'origin': (218.39076232910156, 103.21337890625)}


# Save the results

In [47]:
from datasets import Dataset

# Bundle the three parallel result lists as named columns and write them to CSV.
columns = {
    "Idioms": idioms,
    "Meanings": meanings,
    "Sentences": sentences,
}
Dataset.from_dict(columns).to_csv("McGraw-Hills_complete.csv")

Creating CSV from Arrow format:   0%|          | 0/25 [00:00<?, ?ba/s]

4927299

# Test Code (Practice)

In [88]:
# NOTE: running this cell resets the three result lists.
idioms = []
meanings = []
sentences = []

def refine_function(start=0):
    """Split ``refined_spans`` into entries, starting at index ``start``.

    For every entry, one list each is appended to the global ``idioms``,
    ``meanings`` and ``sentences``. Unlike the main extraction loop, the
    elements stored here are lists of span dicts, not joined strings.

    Changes from the earlier recursive version:
      * Entries are processed in a loop. One recursive call per entry exceeds
        Python's default recursion limit on a long span list.
      * The sentence marker span is stepped over before ``sentence_extract``
        is called. Without that step the extractor returned immediately
        without advancing and the sentence loop never ended.
      * A sense-number span (``multi_meaning``) is consumed as the first span
        of its meaning, mirroring the main extraction loop. Such a span can be
        rejected by ``idiom_check`` yet stop ``meaning_extract``, which left
        the scan stuck on it.
      * Direct index lookups are guarded against the end of the list.

    Args:
        start: Index of the first span to process; nothing happens when it is
            at or past the end of ``refined_spans``.

    Returns:
        None.
    """
    idx = start
    total = len(refined_spans)

    while idx < total:
        I = []  # headword variants (span lists)
        M = []  # meanings (span lists)
        S = []  # example sentences (span lists)

        # First, collect the idiom part
        if idiom_check(idx):
            idiom, idx = idiom_extract(idx)
            I.append(idiom)

        if idx < total and and_check(idx):
            idx += 1
            idiom, idx = idiom_extract(idx)
            I.append(idiom)

        # Meanings and their sentences, up to the next headword span.
        while idx < total and not idiom_check(idx):
            if multi_meaning(idx):
                number_span = refined_spans[idx]
                idx += 1
                mean, idx = meaning_extract(idx)
                mean = [number_span] + mean
            else:
                mean, idx = meaning_extract(idx)
            M.append(mean)

            while idx < total and sentence_start(idx):
                idx += 1  # step over the marker span itself
                sen, idx = sentence_extract(idx)
                S.append(sen)

        idioms.append(I)
        meanings.append(M)
        sentences.append(S)

In [None]:
import re
# Scratch check: does the sense-number pattern (1-10 followed by a period)
# fire on a headword that carries "1."?
pattern = re.compile(r'(?:10|[1-9])\.')
prac = "abandon ship 1. "
found = pattern.search(prac)
if found:
    print(found)

In [None]:
# Practice variant of the span filter: keeps document order and stores only
# size, font and text (no origin, no column split).
# NOTE: this rebinds the global refined_spans.
refined_spans = []

for page in doc:
    for block in page.get_text("dict")["blocks"]:
        for line in block.get("lines", ()):
            for span in line["spans"]:
                size = span["size"]
                excluded_size = size == 8 or size >= 11
                numeric_label = (
                    size == 8.5
                    and span["font"] == "Formata-Regular"
                    and span["text"].strip().isdigit()
                )
                if excluded_size or numeric_label:
                    continue
                refined_spans.append({key: span[key] for key in ("size", "font", "text")})

print(len(refined_spans))

In [83]:
# Apply the span filter to a single page (index 9) so the result is small
# enough to read in full.
page_text = []

page = doc[9]

for block in page.get_text("dict")["blocks"]:
    if "lines" not in block:
        continue
    for line in block["lines"]:
        for span in line["spans"]:
            size, font = span["size"], span["font"]
            if size == 8 or size >= 11:
                continue
            if size == 8.5 and font == "Formata-Regular" and span["text"].strip().isdigit():
                continue
            page_text.append({"size": size, "font": font, "text": span["text"]})

print(len(page_text))

309


In [84]:
# Print the filtered spans of the sample page as one list.
print(page_text)

[{'size': 8.64999008178711, 'font': 'Formata-Medium', 'text': 'all dressed up and nowhere to go '}, {'size': 7.0, 'font': 'Minion-Regular', 'text': 'and '}, {'size': 8.64999008178711, 'font': 'Formata-Medium', 'text': 'all dressed'}, {'size': 8.64999008178711, 'font': 'Formata-Medium', 'text': 'up with nowhere to go'}, {'size': 8.5, 'font': 'Minion-Regular', 'text': ' completely ready for some-'}, {'size': 8.5, 'font': 'Minion-Regular', 'text': 'thing that has been postponed or has failed to material-'}, {'size': 8.5, 'font': 'Minion-Regular', 'text': 'ize. (May be literal or figurative.) '}, {'size': 8.5, 'font': 'MathematicalPi-Six', 'text': '\x02'}, {'size': 8.5, 'font': 'Minion-Italic', 'text': ' Tom: I just heard that'}, {'size': 8.5, 'font': 'Minion-Italic', 'text': 'your company is closed today. Fred: Gee, I’m all dressed up'}, {'size': 8.5, 'font': 'Minion-Italic', 'text': 'and nowhere to go. '}, {'size': 8.5, 'font': 'MathematicalPi-Six', 'text': '\x02'}, {'size': 8.5, 'font':

In [None]:
# Probe: list every span whose text contains "; to".
# (Alternative probe kept for reference: span["font"] == "Minion-RegularSC".)
for span in refined_spans:
    if "; to" not in span["text"]:
        continue
    print(span)