In [10]:
import pandas as pd
from ollama import Client
from tqdm import tqdm
import time

In [4]:
# === Setup ===
ollama_client = Client()
OLLAMA_MODEL = "llama3.2:latest"  # Change to your preferred local model

In [8]:
USE_ORIGINAL_TABLES = False       # Set to False to skip original tables and use only candidate texts

# === CONFIGURATION ===
OLLAMA_MODEL = "llama3.2:latest"  # Change to any model you pulled with `ollama pull`
TABLE1_ID_COL = "custom_id"   # Replace with your actual ID column for table1
TABLE2_ID_COL = "custom_id"   # Replace with your actual ID column for table2
LEFT_ID_COL = "left_id"   # Column in candidate set that maps to table1
RIGHT_ID_COL = "right_id" # Column in candidate set that maps to table2

candidates = pd.read_csv("entity_matches.csv")
# === Load data ===
if USE_ORIGINAL_TABLES:
    table1 = pd.read_csv("walmart_amazon/tableA.csv")        # replace with your file
    table2 = pd.read_csv("walmart_amazon/tableB.csv")      # replace with your file

In [12]:

# === Ollama client ===
ollama_client = Client()

# === Row lookup ===
def get_row_by_id(df, id_col, id_val):
    return df[df[id_col] == id_val].iloc[0].to_dict()

# === Format prompt ===
def format_prompt(rowA, rowB):
    def format_row(row):
        return "\n".join([f"{k}: {str(v)}" for k, v in row.items() if pd.notnull(v) and str(v).strip() != ""])
    
    return f"""
Compare the following two products and determine if they refer to the same real-world item. Answer "Yes" if they match, otherwise "No".

Product A:
{format_row(rowA)}

Product B:
{format_row(rowB)}

Do these products refer to the same item?
""".strip()

# === Format prompt from candidate set directly ===
def format_prompt_from_candidate(row):
    return f"""
Do the two product descriptions refer to the same real-world product? Answer "Yes" if they match, otherwise "No".

Product A:
{row['table1_text']}

Product B:
{row['table2_text']}

Do these products refer to the same item?
""".strip()

# === Query LLM ===
def query_llm(prompt):
    response = ollama_client.chat(model=OLLAMA_MODEL, messages=[
        {"role": "user", "content": prompt}
    ])
    answer = response['message']['content'].strip().lower()
    return answer.startswith("yes")

# === Run matching ===
results = []
total_start = time.time()

for _, row in tqdm(candidates.iterrows(), total=len(candidates)):
    try:
        start_time = time.time()
        
        if USE_ORIGINAL_TABLES:
            rowA = get_row_by_id(table1, TABLE1_ID_COL, row[LEFT_ID_COL])
            rowB = get_row_by_id(table2, TABLE2_ID_COL, row[RIGHT_ID_COL])
            prompt = format_prompt(rowA, rowB)
        else:
            prompt = format_prompt_from_candidate(row)

        is_match = query_llm(prompt)
        elapsed = time.time() - start_time

        match_result = {
            LEFT_ID_COL: row[LEFT_ID_COL],
            RIGHT_ID_COL: row[RIGHT_ID_COL],
            "is_match": is_match,
            "time_sec": round(elapsed, 2)
        }
        results.append(match_result)

        print(f"Matched ({row[LEFT_ID_COL]}, {row[RIGHT_ID_COL]}) -> {'✅ Match' if is_match else '❌ No Match'} [Time: {elapsed:.2f}s]")

    except Exception as e:
        print(f"Error processing pair ({row[LEFT_ID_COL]}, {row[RIGHT_ID_COL]}): {e}")

# === Save results ===
output_df = pd.DataFrame(results)
output_df.to_csv("llm_matching_results.csv", index=False)

total_elapsed = time.time() - total_start
print(f"✅ Matching complete. Saved to llm_matching_results.csv.")
print(f"⏱️ Total matching time: {total_elapsed:.2f} seconds.")

  0%|                                                        | 1/25540 [00:02<15:15:24,  2.15s/it]

Matched (1, 2990) -> ❌ No Match [Time: 2.14s]


  0%|                                                        | 2/25540 [00:03<10:56:44,  1.54s/it]

Matched (1, 21844) -> ❌ No Match [Time: 1.12s]


  0%|                                                         | 3/25540 [00:04<8:31:05,  1.20s/it]

Matched (1, 18985) -> ❌ No Match [Time: 0.79s]


  0%|                                                         | 4/25540 [00:05<9:33:53,  1.35s/it]

Matched (1, 5786) -> ❌ No Match [Time: 1.57s]


  0%|                                                         | 5/25540 [00:06<7:59:18,  1.13s/it]

Matched (1, 9227) -> ❌ No Match [Time: 0.73s]


  0%|                                                         | 6/25540 [00:08<9:20:04,  1.32s/it]

Matched (1, 3248) -> ❌ No Match [Time: 1.68s]


  0%|                                                         | 7/25540 [00:09<8:44:01,  1.23s/it]

Matched (1, 9667) -> ❌ No Match [Time: 1.05s]


  0%|                                                         | 8/25540 [00:10<8:06:13,  1.14s/it]

Matched (1, 14986) -> ❌ No Match [Time: 0.95s]


  0%|                                                         | 9/25540 [00:10<7:31:11,  1.06s/it]

Matched (1, 13757) -> ❌ No Match [Time: 0.88s]


  0%|                                                        | 10/25540 [00:11<6:20:51,  1.12it/s]

Matched (1, 4408) -> ❌ No Match [Time: 0.52s]


  0%|                                                        | 11/25540 [00:12<6:51:31,  1.03it/s]

Matched (2, 7148) -> ❌ No Match [Time: 1.13s]


  0%|                                                        | 12/25540 [00:14<9:12:52,  1.30s/it]

Matched (2, 16721) -> ❌ No Match [Time: 2.06s]


  0%|                                                        | 13/25540 [00:15<9:17:54,  1.31s/it]

Matched (2, 3173) -> ❌ No Match [Time: 1.34s]


  0%|                                                        | 14/25540 [00:17<8:47:22,  1.24s/it]

Matched (2, 13162) -> ❌ No Match [Time: 1.07s]


  0%|                                                       | 15/25540 [00:20<12:45:04,  1.80s/it]

Matched (2, 16448) -> ❌ No Match [Time: 3.09s]


  0%|                                                       | 16/25540 [00:23<15:48:09,  2.23s/it]

Matched (2, 3879) -> ❌ No Match [Time: 3.23s]


  0%|                                                       | 17/25540 [00:24<13:47:28,  1.95s/it]

Matched (2, 1716) -> ❌ No Match [Time: 1.28s]


  0%|                                                       | 18/25540 [00:26<12:45:33,  1.80s/it]

Matched (2, 13159) -> ❌ No Match [Time: 1.46s]


  0%|                                                       | 19/25540 [00:27<11:59:55,  1.69s/it]

Matched (2, 21493) -> ❌ No Match [Time: 1.44s]


  0%|                                                       | 20/25540 [00:29<12:22:43,  1.75s/it]

Matched (2, 1178) -> ❌ No Match [Time: 1.87s]


  0%|                                                       | 21/25540 [00:30<11:34:51,  1.63s/it]

Matched (3, 18029) -> ❌ No Match [Time: 1.37s]


  0%|                                                       | 22/25540 [00:32<12:30:19,  1.76s/it]

Matched (3, 16496) -> ❌ No Match [Time: 2.07s]


  0%|                                                       | 23/25540 [00:34<11:29:57,  1.62s/it]

Matched (3, 16497) -> ❌ No Match [Time: 1.29s]


  0%|                                                       | 24/25540 [00:35<11:10:53,  1.58s/it]

Matched (3, 18605) -> ❌ No Match [Time: 1.47s]


  0%|                                                       | 25/25540 [00:37<11:44:05,  1.66s/it]

Matched (3, 18323) -> ✅ Match [Time: 1.84s]


  0%|                                                       | 26/25540 [00:38<10:40:53,  1.51s/it]

Matched (3, 18322) -> ❌ No Match [Time: 1.16s]


  0%|                                                       | 27/25540 [00:40<10:46:02,  1.52s/it]

Matched (3, 18030) -> ❌ No Match [Time: 1.55s]


  0%|                                                       | 28/25540 [00:43<14:33:25,  2.05s/it]

Matched (3, 14447) -> ❌ No Match [Time: 3.30s]


  0%|                                                       | 29/25540 [00:46<15:47:25,  2.23s/it]

Matched (3, 15627) -> ❌ No Match [Time: 2.63s]


  0%|                                                       | 30/25540 [00:47<12:59:41,  1.83s/it]

Matched (3, 18950) -> ❌ No Match [Time: 0.91s]


  0%|                                                       | 31/25540 [00:50<17:00:18,  2.40s/it]

Matched (4, 13215) -> ❌ No Match [Time: 3.72s]


  0%|                                                       | 32/25540 [00:52<14:40:46,  2.07s/it]

Matched (4, 21425) -> ❌ No Match [Time: 1.30s]


  0%|                                                       | 33/25540 [00:53<14:07:15,  1.99s/it]

Matched (4, 4378) -> ❌ No Match [Time: 1.81s]


  0%|                                                       | 34/25540 [00:55<13:40:13,  1.93s/it]

Matched (4, 4379) -> ❌ No Match [Time: 1.78s]


  0%|                                                       | 35/25540 [00:58<14:46:21,  2.09s/it]

Matched (4, 21754) -> ❌ No Match [Time: 2.45s]


  0%|                                                       | 36/25540 [00:59<13:33:55,  1.91s/it]

Matched (4, 2566) -> ❌ No Match [Time: 1.52s]


  0%|                                                       | 37/25540 [01:01<12:38:55,  1.79s/it]

Matched (4, 15326) -> ❌ No Match [Time: 1.48s]


  0%|                                                       | 38/25540 [01:05<19:11:37,  2.71s/it]

Matched (4, 5304) -> ❌ No Match [Time: 4.86s]


  0%|                                                       | 39/25540 [01:08<18:57:22,  2.68s/it]

Matched (4, 14382) -> ❌ No Match [Time: 2.60s]


  0%|                                                       | 40/25540 [01:09<15:58:30,  2.26s/it]

Matched (4, 7384) -> ❌ No Match [Time: 1.27s]


  0%|                                                       | 41/25540 [01:14<20:56:57,  2.96s/it]

Matched (5, 10085) -> ❌ No Match [Time: 4.59s]


  0%|                                                       | 42/25540 [01:16<19:33:07,  2.76s/it]

Matched (5, 10217) -> ❌ No Match [Time: 2.30s]


  0%|                                                       | 43/25540 [01:19<18:38:47,  2.63s/it]

Matched (5, 20783) -> ❌ No Match [Time: 2.33s]


  0%|                                                       | 44/25540 [01:22<20:04:16,  2.83s/it]

Matched (5, 8301) -> ❌ No Match [Time: 3.30s]


  0%|                                                       | 45/25540 [01:23<16:38:29,  2.35s/it]

Matched (5, 7449) -> ❌ No Match [Time: 1.22s]


  0%|                                                       | 46/25540 [01:24<14:30:29,  2.05s/it]

Matched (5, 11425) -> ❌ No Match [Time: 1.34s]


  0%|                                                       | 47/25540 [01:27<15:40:29,  2.21s/it]

Matched (5, 1364) -> ❌ No Match [Time: 2.59s]


  0%|                                                       | 48/25540 [01:30<16:18:22,  2.30s/it]

Matched (5, 9354) -> ❌ No Match [Time: 2.51s]


  0%|                                                       | 49/25540 [01:31<14:34:10,  2.06s/it]

Matched (5, 20111) -> ❌ No Match [Time: 1.48s]


  0%|                                                       | 50/25540 [01:32<12:47:28,  1.81s/it]

Matched (5, 9357) -> ❌ No Match [Time: 1.22s]


  0%|                                                       | 51/25540 [01:34<12:17:22,  1.74s/it]

Matched (6, 20933) -> ❌ No Match [Time: 1.57s]


  0%|                                                       | 52/25540 [01:35<12:01:15,  1.70s/it]

Matched (6, 10423) -> ❌ No Match [Time: 1.61s]


  0%|                                                       | 53/25540 [01:37<11:53:49,  1.68s/it]

Matched (6, 8885) -> ❌ No Match [Time: 1.64s]


  0%|                                                       | 54/25540 [01:39<12:08:29,  1.72s/it]

Matched (6, 12426) -> ❌ No Match [Time: 1.79s]


  0%|                                                       | 55/25540 [01:40<11:00:25,  1.55s/it]

Matched (6, 7867) -> ❌ No Match [Time: 1.18s]


  0%|                                                        | 56/25540 [01:41<9:47:00,  1.38s/it]

Matched (6, 20229) -> ❌ No Match [Time: 0.98s]


  0%|                                                       | 57/25540 [01:43<10:33:53,  1.49s/it]

Matched (6, 6048) -> ❌ No Match [Time: 1.75s]


  0%|▏                                                       | 58/25540 [01:44<9:27:30,  1.34s/it]

Matched (6, 1145) -> ❌ No Match [Time: 0.97s]


  0%|▏                                                       | 59/25540 [01:45<9:55:06,  1.40s/it]

Matched (6, 6393) -> ❌ No Match [Time: 1.55s]


  0%|▏                                                      | 60/25540 [01:47<10:39:38,  1.51s/it]

Matched (6, 17879) -> ❌ No Match [Time: 1.75s]


  0%|▏                                                      | 61/25540 [01:50<14:33:23,  2.06s/it]

Matched (7, 4981) -> ❌ No Match [Time: 3.34s]


  0%|▏                                                      | 62/25540 [01:52<13:42:32,  1.94s/it]

Matched (7, 1049) -> ❌ No Match [Time: 1.65s]


  0%|▏                                                      | 63/25540 [01:53<12:36:59,  1.78s/it]

Matched (7, 3063) -> ❌ No Match [Time: 1.42s]


  0%|▏                                                      | 64/25540 [01:57<15:47:09,  2.23s/it]

Matched (7, 4979) -> ❌ No Match [Time: 3.27s]


  0%|▏                                                      | 65/25540 [01:58<14:32:00,  2.05s/it]

Matched (7, 11872) -> ❌ No Match [Time: 1.64s]


  0%|▏                                                      | 66/25540 [02:00<13:02:56,  1.84s/it]

Matched (7, 3048) -> ❌ No Match [Time: 1.35s]


  0%|▏                                                      | 67/25540 [02:01<12:14:49,  1.73s/it]

Matched (7, 3682) -> ❌ No Match [Time: 1.46s]


  0%|▏                                                      | 68/25540 [02:03<12:47:45,  1.81s/it]

Matched (7, 1044) -> ❌ No Match [Time: 1.99s]


  0%|▏                                                      | 69/25540 [02:06<14:20:44,  2.03s/it]

Matched (7, 5847) -> ❌ No Match [Time: 2.54s]


  0%|▏                                                      | 70/25540 [02:07<13:15:08,  1.87s/it]

Matched (7, 14915) -> ❌ No Match [Time: 1.51s]


  0%|▏                                                      | 71/25540 [02:10<15:58:44,  2.26s/it]

Matched (8, 1304) -> ❌ No Match [Time: 3.16s]


  0%|▏                                                      | 72/25540 [02:15<19:58:25,  2.82s/it]

Matched (8, 9285) -> ❌ No Match [Time: 4.14s]


  0%|▏                                                      | 73/25540 [02:17<18:06:51,  2.56s/it]

Matched (8, 13269) -> ❌ No Match [Time: 1.93s]


  0%|▏                                                      | 73/25540 [02:18<13:24:53,  1.90s/it]


KeyboardInterrupt: 