In [17]:
# %pip install --upgrade openai --quiet
# %pip install --upgrade nlpia2-wikipedia --quiet
# %pip install --upgrade tenacity --quiet

In [18]:
# pip install openai==0.28

In [19]:
import json
import logging
import os

import openai
import wikipedia

from typing import Optional
from IPython.display import display, Markdown
from tenacity import retry, wait_random_exponential, stop_after_attempt

logging.basicConfig(level=logging.INFO, format=' %(asctime)s - %(levelname)s - %(message)s')

OPENAI_MODEL = 'gpt-3.5-turbo-0613'
openai.api_key = "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"

In [20]:
labels = [
    "PER",      # people, including fictional characters
    "ORG",         # organizations, companies, agencies, institutions
    "LOC",         # non-gpe locations
    "MISC"
]

In [21]:
def system_message(labels):
    return f"""
You are an expert in Natural Language Processing. Your task is to identify common Named Entities (NER) in a given text.
The possible common Named Entities (NER) types are exclusively: ({", ".join(labels)})."""

In [22]:
def assisstant_message():
    return f"""
EXAMPLE:
    Text: '" our concern is to get out there and play proper cricket , " sri lanka captain arjuna ranatunga told a news conference on the eve of a warmup match between the world cup champions and a world xi team scheduled for saturday .'
    {{
        "ORG": ["world xi"],
        "LOC": ["sri lanka"],
        "PER": ["arjuna ranatunga"],
        "MISC": ["cup champions"]
    }}
--"""

In [23]:
def user_message(text):
    return f"""
TASK:
    Text: {text}
"""

In [24]:
@retry(wait=wait_random_exponential(min=1, max=10), stop=stop_after_attempt(5))
def find_link(entity: str) -> Optional[str]:
    """
    Finds a Wikipedia link for a given entity.
    """
    # try:
    #     titles = wikipedia.search(entity)
    #     if titles:
    #         # naively consider the first result as the best
    #         page = wikipedia.page(titles[0])
    #         return page.url
    # except (wikipedia.exceptions.WikipediaException) as ex:
    #     logging.error(f'Error occurred while searching for Wikipedia link for entity {entity}: {str(ex)}')

    return None

In [25]:
def find_all_links(label_entities:dict) -> dict:
    """ 
    Finds all Wikipedia links for the dictionary entities in the whitelist label list.
    """
    whitelist = ['LOC', 'ORG', 'PER']
    
    return {e: find_link(e) for label, entities in label_entities.items() 
                            for e in entities
                            if label in whitelist}

In [26]:
def enrich_entities(text: str, label_entities: dict) -> str:
    """
    Enriches text with knowledge base links.
    """
    entity_link_dict = find_all_links(label_entities)
    logging.info(f"entity_link_dict: {entity_link_dict}")
    
    for entity, link in entity_link_dict.items():
        text = text.replace(entity, f"[{entity}]({link})")

    return text

In [27]:
def generate_functions(labels: dict) -> list:
    return [
        {
            "name": "enrich_entities",
            "description": "Enrich Text with Knowledge Base Links",
            "parameters": {
                "type": "object",
                    "properties": {
                        "r'^(?:' + '|'.join({labels}) + ')$'": 
                        {
                            "type": "array",
                            "items": {
                                "type": "string"
                            }
                        }
                    },
                    "additionalProperties": False
            },
        }
    ]

In [28]:
@retry(wait=wait_random_exponential(min=1, max=10), stop=stop_after_attempt(5))
def run_openai_task(labels, text):
    messages = [
          {"role": "system", "content": system_message(labels=labels)},
          {"role": "assistant", "content": assisstant_message()},
          {"role": "user", "content": user_message(text=text)}
      ]

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-0613",
        messages=messages,
        functions=generate_functions(labels),
        function_call={"name": "enrich_entities"}, 
        temperature=0,
        frequency_penalty=0,
        presence_penalty=0,
    )

    response_message = response["choices"][0]["message"]
    
    available_functions = {"enrich_entities": enrich_entities}  
    function_name = response_message["function_call"]["name"]
    
    function_to_call = available_functions[function_name]
    logging.info(f"function_to_call: {function_to_call}")

    function_args = json.loads(response_message["function_call"]["arguments"])
    logging.info(f"function_args: {function_args}")

    function_response = function_to_call(text, function_args)

    return {"model_response": response, 
            "function_response": function_response}

In [29]:
text = """soccer - japan get lucky win , china in surprise defeat ."""
result = run_openai_task(labels, text)

 2023-11-20 12:45:33,169 - INFO - function_to_call: <function enrich_entities at 0x7f5d667616c0>
 2023-11-20 12:45:33,170 - INFO - function_args: {'PER': [], 'ORG': [], 'LOC': [], 'MISC': []}
 2023-11-20 12:45:33,170 - INFO - entity_link_dict: {}


In [30]:
display(Markdown(f"""**Text:** {text}   
                     **Enriched_Text:** {result['function_response']}"""))

**Text:** soccer - japan get lucky win , china in surprise defeat .   
                     **Enriched_Text:** soccer - japan get lucky win , china in surprise defeat .

In [31]:
# estimate inference cost assuming gpt-3.5-turbo (4K context)
i_tokens  = result["model_response"]["usage"]["prompt_tokens"] 
o_tokens = result["model_response"]["usage"]["completion_tokens"] 

i_cost = (i_tokens / 1000) * 0.0015
o_cost = (o_tokens / 1000) * 0.002

print(f"""Token Usage
    Prompt: {i_tokens} tokens
    Completion: {o_tokens} tokens
    Cost estimation: ${round(i_cost + o_cost, 5)}""")

Token Usage
    Prompt: 243 tokens
    Completion: 23 tokens
    Cost estimation: $0.00041


In [32]:
print(result['model_response']['choices'][0]['message']['function_call']['arguments'])

{
  "PER": [],
  "ORG": [],
  "LOC": [],
  "MISC": []
}


In [35]:
"""The Beatles were an English rock band formed in Liverpool in 1960, comprising John Lennon, Paul McCartney, George Harrison, and Ringo Starr.""".lower()

'the beatles were an english rock band formed in liverpool in 1960, comprising john lennon, paul mccartney, george harrison, and ringo starr.'