In [41]:
import csv
import json
import pandas as pd
import numpy as np
import argparse

from IPython.display import display, HTML
from utils.data_utils import get_data
from utils.constants import *
from utils.prompt_utils import * 
from utils.result_utils import *
from utils.run_utils import *

from main import *

# Set a 14px font size for every table cell rendered in the notebook output.
# A plain string is used: nothing is interpolated, so the braces need no escaping.
display(HTML("<style>table td, table th { font-size: 14px; }</style>"))

# Load the data

### Loading the Magellan datasets

Assuming the datasets have been downloaded (```python utils/download_datasets.py```), they can be found in the ```Input``` folder. For a full list of available datasets, see the README.

In [3]:
dataset = "dblp_acm"

# Columns kept for the small example: title and year of both records plus the
# ground-truth label in string form.
example_columns = ["title_x", "year_x", "title_y", "year_y", "label_str"]

# Only the first 10 candidate pairs are used. The explicit .copy() detaches the
# sample from the full frame, so writing a prediction column to it later is
# unambiguous and does not raise pandas' SettingWithCopyWarning.
candidate_pairs = get_data(dataset)[example_columns][:10].copy()

# Running Experiments

### Small Example

In [13]:
# Choose model (gpt-3, gpt-4, gpt-4o) and prompt method (natural, json, yaml etc.)
model = MODEL_NAMES["gpt-3"]
prompt_method = PROMPT_METHODS["natural"]

dataset_type = DATASET_TYPE[dataset]
title_col = DATASET_TITLES[dataset]
total_pairs = len(candidate_pairs)

# Per-trial metric containers. This cell does not fill them in; they are kept
# because later cells may read them.
trial_metrics = {
    "prec": [],
    "rec": [],
    "f1": [],
    "acc": [],
    "time": [],
    "details": [],
}

# Predictions for the pairs that were answered successfully, together with the
# positional index of each of those pairs so the ground truth can be aligned.
preds = []
scored_positions = []
# One slot per pair; stays None for pairs whose request failed.
predicted_labels = [None] * total_pairs

# Go through each pair of candidates and prompt the model for a prediction
for i in range(total_pairs):
    print(f"Run {i+1}/{total_pairs}")

    pair = candidate_pairs.iloc[i]

    # Get the prompt in the desired format
    prompt_data = prompt_method(pair, COLUMNS[dataset], dataset_type)
    # Pass the prompt as a list of messages
    messages = prompt(data = prompt_data, category = dataset_type, title_col = title_col)

    prompt_str = messages[0]["content"]
    print(f"{prompt_str}\n")

    try:
        # Get the completion from the Model
        response_raw, logprob = get_completion_from_messages(messages, model)
        # Make sure the answer is Yes or No (extracted once and reused)
        pred = extract_answer(response_raw.lower())
    except openai.OpenAIError as e:
        # The pair is skipped: nothing is appended to preds, and its position is
        # not recorded, so it is also left out of the ground truth below.
        print(f"Service Unavailable Error: {e}")
        continue

    preds.append(pred)
    scored_positions.append(i)
    predicted_labels[i] = pred

    # The ground truth label in string format
    actual_label = pair["label_str"].lower()

    # Log the prediction vs. actual
    print(f"Prediction: {pred} | Actual: {actual_label}\n\n")

# Store all predictions in one positional assignment. This does not depend on
# the frame's index labels, unlike a label-based `.at[i, ...]` write.
candidate_pairs["predicted_label"] = predicted_labels

if len(preds) < total_pairs:
    print(f"Skipped {total_pairs - len(preds)} of {total_pairs} pairs due to API errors")

# Extract Ground Truth Labels, restricted to the pairs that received a
# prediction so that `preds` and `gt` always have the same length and order.
gt = candidate_pairs["label_str"].iloc[scored_positions]

# Compute Metrics
# NOTE(review): compute_metrics still receives the full candidate_pairs frame,
# as before; confirm how it uses that argument when some pairs were skipped.
prec, rec, acc, f1 = compute_metrics(preds, gt, candidate_pairs)

print(f"F1-Score: {f1}")
print(f"Precision: {prec}")
print(f"Recall: {rec}")
print(f"Accuracy: {acc}")

Run 1/10
Are Publication A and Publication B the same? Yes or No? Publication A is titled 'a query language and optimization techniques for unstructured data', 1996. Publication B is titled 'fundamental techniques for order optimization', 1996. Are Publication A and Publication B the same?

Prediction: no | Actual: no


Run 2/10
Are Publication A and Publication B the same? Yes or No? Publication A is titled 'secure transaction processing in firm real-time database systems', 1997. Publication B is titled 'a database system for real-time event aggregation in telecommunication', 1998. Are Publication A and Publication B the same?

Prediction: no | Actual: no


Run 3/10
Are Publication A and Publication B the same? Yes or No? Publication A is titled 'navigating large-scale semi-structured data in business portals', 2001. Publication B is titled 'navigating large-scale semi-structured data in business portals', 2001. Are Publication A and Publication B the same?

Prediction: yes | Actual: 

### Larger example

##### Define the arguments

In [48]:
# Settings for the larger run, collected in a mapping first and then turned
# into the argparse.Namespace that is passed to run_entity_matching.
experiment_config = {
    # Folder to store results
    "output_folder": "output",
    # Dataset to use
    "dataset": "dblp_gs",
    # Format to use (json, tabular, yaml, natural)
    "prompt_format": "natural",
    # Improvements (short additional message to guide the model, basic = no message)
    "improvement": "basic",
    # Number of examples to use in prompt
    "k": 0,
    # Number of pairs to compare (Dataset size is default)
    "num_pairs": 50,
    # The model to use
    "llm": "gpt-3",
    # Changes 'Entity' to suitable description ('Publication', 'Product', 'Song' etc.)
    "context": True,
    # Chain of thought: ask the model to extract 'yes' or 'no' from the answer
    "chain_of_thought": False,
    # Use the training data instead for prompting
    "do_train": False,
    "category_pair": None,
    "category_distribution": None,
}

args = argparse.Namespace(**experiment_config)

##### Run the matching

In [49]:
# Run the matching with the settings in `args`. As the last expression in the
# cell, the call's return value is shown as the cell output.
run_entity_matching(args)

       [94mINFO[0m     [92m2024-12-20 17:58:01,835[0m [[95mutils.data_utils[0m]  	[[94mINFO[0m] - Loading data from input/Structured/DBLP-GoogleScholar
       [94mINFO[0m     [92m2024-12-20 17:58:01,992[0m [[95mroot[0m]  	[[94mINFO[0m] - Running the matching on 50 examples
       [94mINFO[0m     [92m2024-12-20 17:58:01,992[0m [[95mroot[0m]  	[[94mINFO[0m] - Run 1/50
       [94mINFO[0m     [92m2024-12-20 17:58:01,993[0m [[95mutils.run_utils[0m]  	[[94mINFO[0m] -               
<<<<<<<<<<< PROMPT START >>>>>>>>>>>

              Are Publication A and Publication B the same? Yes or No? Publication A is titled 'interview with jim gray', 2003. Publication B is titled 'interview with authors', e larson, san diego . Are Publication A and Publication B the same?
              
<<<<<<<<<<< PROMPT END >>>>>>>>>>>


       [94mINFO[0m     [92m2024-12-20 17:58:02,522[0m [[95mutils.run_utils[0m]  	[[94mINFO[0m] - Prediction: no | Actual: no
       [94mINFO[

       [94mINFO[0m     [92m2024-12-20 17:58:08,968[0m [[95mutils.run_utils[0m]  	[[94mINFO[0m] - Prediction: yes | Actual: yes
       [94mINFO[0m     [92m2024-12-20 17:58:08,972[0m [[95mroot[0m]  	[[94mINFO[0m] - Run 11/50
       [94mINFO[0m     [92m2024-12-20 17:58:08,974[0m [[95mutils.run_utils[0m]  	[[94mINFO[0m] -               
<<<<<<<<<<< PROMPT START >>>>>>>>>>>

              Are Publication A and Publication B the same? Yes or No? Publication A is titled 'opt + + : an object-oriented implementation for extensible database query optimization', n kabra , d dewitt, vldb j., 1999. Publication B is titled 'description logics for semantic query optimization in object-oriented database systems', d beneventano , s bergamaschi , c sartori, acm transactions on database systems ,, 2003.0. Are Publication A and Publication B the same?
              
<<<<<<<<<<< PROMPT END >>>>>>>>>>>


       [94mINFO[0m     [92m2024-12-20 17:58:09,281[0m [[95mutils.run_utils

       [94mINFO[0m     [92m2024-12-20 17:58:12,687[0m [[95mutils.run_utils[0m]  	[[94mINFO[0m] - Prediction: no | Actual: no
       [94mINFO[0m     [92m2024-12-20 17:58:12,689[0m [[95mroot[0m]  	[[94mINFO[0m] - Run 21/50
       [94mINFO[0m     [92m2024-12-20 17:58:12,695[0m [[95mutils.run_utils[0m]  	[[94mINFO[0m] -               
<<<<<<<<<<< PROMPT START >>>>>>>>>>>

              Are Publication A and Publication B the same? Yes or No? Publication A is titled 'materialized views selection in a multidimensional database', e baralis , s paraboschi , e teniente, vldb, 1997. Publication B is titled 'a foundation for multi-dimensional databases', m gyssens , lvs lakshmanan. Are Publication A and Publication B the same?
              
<<<<<<<<<<< PROMPT END >>>>>>>>>>>


       [94mINFO[0m     [92m2024-12-20 17:58:13,065[0m [[95mutils.run_utils[0m]  	[[94mINFO[0m] - Prediction: no | Actual: no
       [94mINFO[0m     [92m2024-12-20 17:58:13,067[0m [[95mr

       [94mINFO[0m     [92m2024-12-20 17:58:16,557[0m [[95mroot[0m]  	[[94mINFO[0m] - Run 31/50
       [94mINFO[0m     [92m2024-12-20 17:58:16,560[0m [[95mutils.run_utils[0m]  	[[94mINFO[0m] -               
<<<<<<<<<<< PROMPT START >>>>>>>>>>>

              Are Publication A and Publication B the same? Yes or No? Publication A is titled 'dynamic load balancing in hierarchical parallel database systems', l bouganim , d florescu , p valduriez, vldb, 1996. Publication B is titled 'dynamic load balancing in parallel database systems', e rahm. Are Publication A and Publication B the same?
              
<<<<<<<<<<< PROMPT END >>>>>>>>>>>


       [94mINFO[0m     [92m2024-12-20 17:58:17,259[0m [[95mutils.run_utils[0m]  	[[94mINFO[0m] - Prediction: no | Actual: no
       [94mINFO[0m     [92m2024-12-20 17:58:17,260[0m [[95mroot[0m]  	[[94mINFO[0m] - Run 32/50
       [94mINFO[0m     [92m2024-12-20 17:58:17,262[0m [[95mutils.run_utils[0m]  	[[94mINFO[0m]

       [94mINFO[0m     [92m2024-12-20 17:58:20,405[0m [[95mutils.run_utils[0m]  	[[94mINFO[0m] - Prediction: no | Actual: no
       [94mINFO[0m     [92m2024-12-20 17:58:20,408[0m [[95mroot[0m]  	[[94mINFO[0m] - Run 41/50
       [94mINFO[0m     [92m2024-12-20 17:58:20,410[0m [[95mutils.run_utils[0m]  	[[94mINFO[0m] -               
<<<<<<<<<<< PROMPT START >>>>>>>>>>>

              Are Publication A and Publication B the same? Yes or No? Publication A is titled 'scalable parallel data mining for association rules', e han , g karypis , v kumar, sigmod conference, 1997. Publication B is titled 'mind : a scalable mining for classifier in relational databases', m wang , b iyer , js vitter, proceedings of the acm sigmod workshop on research issues on & hellip ; . Are Publication A and Publication B the same?
              
<<<<<<<<<<< PROMPT END >>>>>>>>>>>


       [94mINFO[0m     [92m2024-12-20 17:58:23,106[0m [[95mutils.run_utils[0m]  	[[94mINFO[0m] - Pred

       [94mINFO[0m     [92m2024-12-20 17:58:28,625[0m [[95mutils.run_utils[0m]  	[[94mINFO[0m] - Prediction: no | Actual: no
       [94mINFO[0m     [92m2024-12-20 17:58:28,626[0m [[95mutils.run_utils[0m]  	[[94mINFO[0m] - Final Metrics {
                "prec": [
                                1.0
                ],
                "rec": [
                                0.375
                ],
                "f1": [
                                0.5454545454545454
                ],
                "acc": [
                                0.9
                ],
                "time": [
                                26.634288541999922
                ]
}
       [94mINFO[0m     [92m2024-12-20 17:58:28,627[0m [[95mroot[0m]  	[[94mINFO[0m] - Getting file path


'output/output_basic_ctx/dblp_gs/gpt-3_k_0_n_50_natural_00.json'

### Show results