In [2]:
from openai import OpenAI, RateLimitError, APITimeoutError
from tqdm import tqdm
import tiktoken
import concurrent
import time
import json
from pprint import pprint
openai_client = OpenAI(api_key=open("api_key").read().strip())
def save_json(data, filename):
    with open (filename, 'w') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

def request_gpt(
    client, messages, model="gpt-4o-mini", temperature=0.5, format=None, seed=None
):
    with open("request_log.txt", "a", encoding="utf-8") as f:
        f.write(f"model: {model}, temperature: {temperature}, format: {format}\n")
        f.write(json.dumps(messages, ensure_ascii=False) + "\n")
        f.write("=====================================\n")
    try:
        if format == "json":
            response = client.chat.completions.create(
                    model=model,
                    messages=messages,
                    response_format={"type": "json_object"},
                    temperature=temperature,
                    seed=seed,
                )
            

        else:
            response = client.chat.completions.create(
                model=model, messages=messages, temperature=temperature, seed=seed
            )
        return response.choices[0].message.content
    except RateLimitError as e:
        print("RateLimitError")
        print(e)
        time.sleep(5)
        return request_gpt(client, messages, model, temperature, format)
    except APITimeoutError as e:
        print("APITimeoutError")
        print(messages)
        time.sleep(5)
        return request_gpt(client, messages, model, temperature, format)

def multithread_prompts(
    client,
    prompts,
    model="gpt-4o-mini",
    temperature=0.5,
    format=None,
    seed=None,
):
    l = len(prompts)
    # results = np.zeros(l)
    with tqdm(total=l) as pbar:
        executor = concurrent.futures.ThreadPoolExecutor(max_workers=100)
        futures = [
            executor.submit(
                request_gpt, client, prompt, model, temperature, format, seed
            )
            for prompt in prompts
        ]
        for _ in concurrent.futures.as_completed(futures):
            pbar.update(1)
    concurrent.futures.wait(futures)
    return [future.result() for future in futures]

def read_jsonl(filename):
    with open(filename, "r") as f:
        return [json.loads(line) for line in f]
def flatten(xss):
    return [x for xs in xss for x in xs]

In [None]:
%pip install pandas

In [7]:
import random
redfm_test_set = read_jsonl("REDFM/data/test.en.jsonl")
redfm_dev_set = read_jsonl("REDFM/data/dev.en.jsonl")
all_relations = json.load(open("REDFM/data/all_relations.json"))
all_relation_indices = { v: k for k, v in all_relations.items() }
# note: subject and object are exact matches in the text, predicate is one of the relations in all_relations and may not appear in the text
few_shot_examples = [{
    "sentence": t['text'],
    "entities": [
        {
            "name": e['surfaceform'],
            "type": e['type']
        } 
        for e in t['entities']
    ],
    "triplets": [
        {
            "subject": r['subject']['surfaceform'],
            "predicate": int(all_relation_indices[r['predicate']['surfaceform']]),
            "object": r["object"]["surfaceform"]
        } for r in t['relations']
    ]
} for t in random.sample(redfm_test_set, 10)]

## single prompt

In [12]:
all_relations_str = "\n".join([f"{i}: {r}" for i, r in all_relations.items()])
system_prompt = {
    "role": "system",
    "content": """You are a knowledge graph constructor. 
        You will be given a sentence, you need to extract a triplet of the form (subject, predicate, object) from the sentence.
        The predicate should be one of the following relations:
        {all_relations_str}
        Reply with the following JSON format:
        {{
            triplets: [
                {{
                    subject: (string),
                    predicate: (int, index of the relation in the list),
                    object: (string)
                }}
            ]
        }}
    """.format(all_relations_str=all_relations_str)
}
user_prompt = lambda sentence: {
    "role": "user",
    "content": f"""Sentence: {sentence}"""
}
few_shot_prompts = flatten([
    [
        {
            "role": "user",
            "content": example['sentence'], 
        },
        {
            "role": "assistant",
            "content": json.dumps({
                "triplets": example['triplets']
            }),
        }
    ]
    for example in few_shot_examples
])

In [13]:
prompts = [[system_prompt] + few_shot_prompts + [user_prompt(test_sample['text'])] for test_sample in redfm_test_set]
prompts = prompts
answers = [
    [
        {
            "subject": r['subject']['surfaceform'],
            "predicate": int(all_relation_indices[r['predicate']['surfaceform']]),
            "object": r["object"]["surfaceform"]
        }
        for r in test_sample['relations']
    ]
    for test_sample in redfm_test_set]
responses = multithread_prompts(openai_client, prompts, model="gpt-4o-mini", temperature=0, format="json")
responses = [json.loads(response)['triplets'] for response in responses]
save_json(responses, "single_prompt_responses.json")

100%|██████████| 446/446 [00:12<00:00, 37.06it/s]


## Decomposed prompt

In [15]:
# entity extraction
system_prompt = {
    "role": "system",
    "content": """You are an entity extractor. You will be given a sentence, you need to extract all the entities from the sentence. 
    The entities have to match exactly with the text.
    Reply with the following JSON format: 
        {{
            entities: [(string)]
        }}
    """
}
user_prompt = lambda sentence: {
    "role": "user",
    "content": f"""Sentence: {sentence}"""
}
few_shot_prompts = flatten([
    [
        {
            "role": "user",
            "content": example['sentence'], 
        },
        {
            "role": "assistant",
            "content": json.dumps({
                "entities": example['entities']
            }),
        }
    ]
    for example in few_shot_examples
])
prompts = [[system_prompt] + few_shot_prompts + [user_prompt(test_sample['text'])] for test_sample in redfm_test_set]
prompts = prompts
answers = [
    [
        e['surfaceform']
        for e in test_sample['entities']
    ]
    for test_sample in redfm_test_set]
responses = multithread_prompts(openai_client, prompts, model="gpt-4o-mini", temperature=0, format="json")
responses = [json.loads(response)['entities'] for response in responses]
save_json(responses, "decomposed_prompt_entities.json")

100%|██████████| 446/446 [00:15<00:00, 27.96it/s]


In [18]:
# relation extraction
all_relations_str = "\n".join([f"{i}: {r}" for i, r in all_relations.items()])
system_prompt = {
    "role": "system",
    "content": """You are a relation extractor
        You will be given a sentence and a list of entities appearing in the sentence.
        You need to assign a relation between any two entities in the form of a triplet (subject, predicate, object).
        The relation should be one of the following relations:
        {all_relations_str}
        Reply with the following JSON format:
        {{
            triplets: [
                {{
                    subject: (string),
                    predicate: (int, index of the relation in the list),
                    object: (string)
                }}
            ]
        }}
    """.format(all_relations_str=all_relations_str)
}
user_prompt = lambda sentence, entities: {
    "role": "user",
    "content": f"""Sentence: {sentence} \n Entities: {entities}"""
}
few_shot_prompts = flatten([
    [
        {
            "role": "user",
            "content": f"Sentence: {example['sentence']} \n Entities: {example['entities']}", 
        },
        {
            "role": "assistant",
            "content": json.dumps({
                "triplets": example['triplets']
            }),
        }
    ]
    for example in few_shot_examples
])
entities = json.load(open("decomposed_prompt_entities.json"))
prompts = []
for test_sample, entity_responses in zip(redfm_test_set, entities):
   # for each combination of entity pairs
    prompts.append([system_prompt] + few_shot_prompts + [user_prompt(test_sample['text'], entity_responses)])
prompts = prompts
answers = [
    [
        {
            "subject": r['subject']['surfaceform'],
            "predicate": int(all_relation_indices[r['predicate']['surfaceform']]),
            "object": r["object"]["surfaceform"]
        }
        for r in test_sample['relations']
    ]
    for test_sample in redfm_test_set]
responses = multithread_prompts(openai_client, prompts, model="gpt-4o-mini", temperature=0, format="json")
responses = [json.loads(response)['triplets'] for response in responses]
save_json(responses, "decomposed_prompt_relations.json")

100%|██████████| 446/446 [00:13<00:00, 32.04it/s]


## Decomposed Prompt -- v2

In [None]:
# entity extraction
# entity_types = ['EVE', 'ORG', 'NUMBER', 'DATE', 'TIME', 'PER', 'CEL', 'Concept', 'MEDIA', 'LOC', 'MISC', 'UNK']
# entity_types = ['event', 'organization', 'number', 'date', 'time', 'person', 'celestial', 'concept', 'media', 'location', 'miscellaneous', 'unknown']
entity_type_dict = {
    "EVE": "event",
    "ORG": "organization",
    "NUMBER": "number",
    "DATE": "date",
    "TIME": "time",
    "PER": "person",
    "CEL": "celestial",
    "Concept": "concept",
    "MEDIA": "media",
    "LOC": "location",
    "MISC": "miscellaneous",
    "UNK": "unknown"
}
system_prompt = {
    "role": "system",
    "content": """You are an entity extractor. You will be given a sentence, you need to extract all the entities from the sentence. 
    The entities must be in one of these types: {entity_types}
    The entities extracted have to match exactly with the text. 
    Reply with the following JSON format: 
        {{
            entities: [
                "text": (string)
                "type": (string, one of the entity types)
                ]
        }}
    """.format(entity_types=list(entity_type_dict.values()))
}
user_prompt = lambda sentence: {
    "role": "user",
    "content": f"""Sentence: {sentence}"""
}
few_shot_prompts = flatten([
    [
        {
            "role": "user",
            "content": example['sentence'], 
        },
        {
            "role": "assistant",
            "content": json.dumps({
                "entities": [
                    {
                        "text": e['name'],
                        "type": entity_type_dict[e['type']]
                    }
                for e in example['entities']
                ]
            }),
        }
    ]
    for example in few_shot_examples
])
prompts = [[system_prompt] + few_shot_prompts + [user_prompt(test_sample['text'])] for test_sample in redfm_test_set]
prompts = prompts
answers = [
    [
        e['surfaceform']
        for e in test_sample['entities']
    ]
    for test_sample in redfm_test_set]
responses = multithread_prompts(openai_client, prompts, model="gpt-4o-mini", temperature=0, format="json")
responses = [json.loads(response)['entities'] for response in responses]
save_json(responses, "decomposed_prompt_entities_v2.json")

100%|██████████| 446/446 [00:19<00:00, 23.19it/s]


In [21]:
## relation extraction
# relation extraction
all_relations_str = "\n".join([f"{i}: {r}" for i, r in all_relations.items()])
system_prompt = {
    "role": "system",
    "content": """You are a relation extractor
        You will be given a sentence and a list of entities appearing in the sentence.
        You need to assign a relation between any two entities in the form of a triplet (subject, predicate, object).
        The relation should be one of the following relations:
        {all_relations_str}
        Reply with the following JSON format:
        {{
            triplets: [
                {{
                    subject: (string),
                    predicate: (int, index of the relation in the list),
                    object: (string)
                }}
            ]
        }}
    """.format(all_relations_str=all_relations_str)
}
user_prompt = lambda sentence, entities: {
    "role": "user",
    "content": f"""Sentence: {sentence} \n Entities: {entities}"""
}
few_shot_prompts = flatten([
    [
        {
            "role": "user",
            "content": f"Sentence: {example['sentence']} \n Entities: {example['entities']}", 
        },
        {
            "role": "assistant",
            "content": json.dumps({
                "triplets": example['triplets']
            }),
        }
    ]
    for example in few_shot_examples
])
entities = json.load(open("decomposed_prompt_entities_v2.json"))
prompts = []
for test_sample, entity_responses in zip(redfm_test_set, entities):
    entity_responses = [e['text'] for e in entity_responses]
   # for each combination of entity pairs
    prompts.append([system_prompt] + few_shot_prompts + [user_prompt(test_sample['text'], entity_responses)])
prompts = prompts
answers = [
    [
        {
            "subject": r['subject']['surfaceform'],
            "predicate": int(all_relation_indices[r['predicate']['surfaceform']]),
            "object": r["object"]["surfaceform"]
        }
        for r in test_sample['relations']
    ]
    for test_sample in redfm_test_set]
responses = multithread_prompts(openai_client, prompts, model="gpt-4o-mini", temperature=0, format="json")
responses = [json.loads(response)['triplets'] for response in responses]
save_json(responses, "decomposed_prompt_relations_v2.json")

100%|██████████| 446/446 [00:11<00:00, 38.51it/s]


## Evaluation

In [26]:
def triplet_correct(predictions, options):
    def _one_triplet_cmp(pred, gold):
        return pred['subject'] == gold['subject'] and pred['object'] == gold['object'] and pred['predicate'] == gold['predicate']
    # if the prediction matches any one of the options, return True
    correct_sum = 0
    for pred in predictions:
        if any([_one_triplet_cmp(pred, gold) for gold in options]):
            # print("prediction: ", pred)
            # print("options:", options)
            correct_sum += 1
    return correct_sum/len(options)

def recall_triplet(predictions, golds):
    return sum([triplet_correct(pred, gold) for pred, gold in zip(predictions, golds)]) / len(golds)


single_prompt_responses = json.load(open("single_prompt_responses.json"))
decomposed_prompt_responses = json.load(open("decomposed_prompt_relations.json"))
decomposed_prompt_responses_v2 = json.load(open("decomposed_prompt_relations_v2.json"))
print(recall_triplet(single_prompt_responses, answers))
print(recall_triplet(decomposed_prompt_responses, answers))
print(recall_triplet(decomposed_prompt_responses_v2, answers))

# print(recall_triplet(responses, answers))
for i, (response, answer) in enumerate(zip(decomposed_prompt_responses_v2, answers)):
    print(f"Sample {i}: {redfm_test_set[i]['text']}")
    print("Response:")
    pprint(response)
    print("Answer:")
    print([{
        "subject": a['subject'],
        "predicate": a['predicate'],
        "object": a['object']
    } for a in answer])
    print("=====================================")

0.1309882942169938
0.14979301341408965
0.16245405690472953
Sample 0: The Porsche Panamera is a mid/full-sized luxury vehicle (E-segment in Europe) manufactured by the German automobile manufacturer Porsche. It is front-engined and has a rear-wheel-drive layout, with all-wheel drive versions also available.
Response:
[{'object': 'mid/full-sized luxury vehicle',
  'predicate': 4,
  'subject': 'Porsche Panamera'},
 {'object': 'Porsche', 'predicate': 19, 'subject': 'Porsche Panamera'}]
Answer:
[{'subject': 'Porsche Panamera', 'predicate': 19, 'object': 'Porsche'}, {'subject': 'Porsche', 'predicate': 0, 'object': 'German'}]
Sample 1: Paramount+ (pronounced Paramount Plus; currently named 10 All Access in Australia) is an American subscription video on-demand over-the-top streaming service owned and operated by ViacomCBS Streaming, a division of ViacomCBS. It offers original content, including newly aired CBS broadcast properties, and content from the ViacomCBS library. In the United States,

In [10]:
entity_type_set = set()
for sample in redfm_test_set:
    entity_types = [e['type'] for e in sample['entities']]
    entity_type_set.update(entity_types)
print(entity_type_set)

{'EVE', 'ORG', 'NUMBER', 'DATE', 'TIME', 'PER', 'CEL', 'Concept', 'MEDIA', 'LOC', 'MISC', 'UNK'}
