In [None]:
import json
import os
from pprint import pprint

from IPython.display import display
from paddleocr import PaddleOCR,draw_ocr
from PIL import Image

In [None]:
# Run OCR over the input document and collect the recognized text,
# one recognized line per output line.
ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log = False)
# EDIT ME: change the value of img_path to your file's location.
img_path = 'data/1996-393-011.pdf'
result = ocr.ocr(img_path, cls=True)

# `result` holds one entry per page. PaddleOCR may report a page with no
# detected text as None, so such pages are skipped instead of iterated.
# Each line's last element is indexed as (text, ...); only the text is kept
# (per PaddleOCR's documented result layout -- confirm if the version changes).
ocr_text = ''.join(
    line[-1][0] + '\n'
    for page in (result or [])
    if page
    for line in page
)

print(ocr_text)

In [None]:
from refined.data_types.base_types import Span, Entity
from refined.inference.processor import Refined

In [None]:
# Load the pretrained ReFinED model used below for entity linking.
# Descriptions are not precomputed (use_precomputed_descriptions=False).
refined = Refined.from_pretrained(
    model_name='wikipedia_model',
    entity_set='wikipedia',
    use_precomputed_descriptions=False,
)

In [None]:
def construct_wikidata_url(id):
    """Return the Wikidata page URL for the entity identifier ``id``.

    Defined before the loop below so the cell runs on a fresh kernel.
    The parameter name is kept as ``id`` for compatibility with keyword callers.
    """
    return "https://www.wikidata.org/wiki/" + id


text = ocr_text
spans = refined.process_text(text)

# Collect (Wikipedia title, Wikidata URL) pairs for every span that was
# linked to an entity carrying both a title and a Wikidata identifier.
named_entities = []
for span in spans:
    entity = span.predicted_entity
    # Guard against spans without a prediction (assumed possible -- the
    # attribute is treated as optional here; harmless if it never is None).
    if entity is None:
        continue

    # Local names avoid rebinding the builtin `id` at notebook scope.
    entity_id = entity.wikidata_entity_id
    entity_title = entity.wikipedia_entity_title

    if entity_title is not None and entity_id is not None:
        named_entities.append((entity_title, construct_wikidata_url(entity_id)))

In [None]:
# EDIT ME: If you have an OpenAI API key and want to use it, set the
# OPENAI_API_KEY environment variable before starting Jupyter.
# Otherwise, leave it unset: this stays an empty string and the prompt is
# printed for manual use instead of being sent to the API.
# The key is read from the environment rather than typed here so that it is
# never saved into the notebook file.
openai_api_key = os.environ.get("OPENAI_API_KEY", "")

# Assemble the LLM prompt from three pieces: the fixed instructions, the
# linked named entities, and the OCR text. The last two are each wrapped in
# triple single quotes so the model can tell them apart from the instructions.
prompt_instructions = (
    "Below is the OCR text from a document from UCLA's Campaign Literature collection, "
    " along with a list of named entities known to be associated with the document. "
    "Provide a Dublin Core 'Title', 'Subject', and 'Description' field for a metadata record for the original document. "
    "Use 'sentence case' for the 'Title' field. "
    "Return only Dublin Core fields. Enclose each individual field in curly braces. "
    "Use the given wikidata links in the Subject field if appropriate. \n \n"
)
prompt_entities = f"The named entities are enclosed in three single quotes below: \n '''{named_entities}''' \n \n"
prompt_document = f"The text from the document is enclosed in three single quotes below: \n '''{ocr_text}'''"

prompt = "".join([prompt_instructions, prompt_entities, prompt_document])



In [None]:
def get_gpt_as_dict(client, prompt, temperature = 0.1, top_p = 0.3) -> dict:
    """Send ``prompt`` as a single user message and parse the reply as JSON.

    Args:
        client: Object exposing ``chat.completions.create`` (an
            ``openai.OpenAI`` client in this notebook).
        prompt: Text sent as the content of the only message.
        temperature: Sampling temperature forwarded to the API.
        top_p: Nucleus-sampling value forwarded to the API.

    Returns:
        The value decoded from the first choice's message content.

    Raises:
        ValueError: If the reply carries no text content, or (as
            ``json.JSONDecodeError``) if the content is not valid JSON.
    """
    completion = client.chat.completions.create(
        model="gpt-4-turbo",
        messages=[
            {
                "role": "user",
                "content": prompt,
            },
        ],
        temperature=temperature,
        top_p=top_p,
    )
    response = completion.choices[0].message.content
    if response is None:
        # Without this check json.loads would fail with an opaque TypeError.
        raise ValueError("The model returned no text content to parse.")

    payload = response.strip()
    # Chat models often wrap JSON in a Markdown code fence (```json ... ```);
    # drop the opening fence line and the closing fence before decoding.
    if payload.startswith("```"):
        fenced_lines = payload.splitlines()[1:]
        if fenced_lines and fenced_lines[-1].strip().startswith("```"):
            fenced_lines = fenced_lines[:-1]
        payload = "\n".join(fenced_lines)

    return json.loads(payload)

# With a key configured, query the API and pretty-print the parsed record;
# otherwise print the prompt so it can be pasted into another LLM by hand.
if openai_api_key:
    # Imported lazily so the openai package is only required on this path.
    from openai import OpenAI
    client = OpenAI(api_key=openai_api_key)
    response = get_gpt_as_dict(client, prompt)
    # pprint is the standard-library pretty printer (from pprint import pprint).
    pprint(response)

else:
    print("Copy and paste the below prompt into your chosen LLM:")
    print()
    print(prompt)
    
