In [2]:
import os
import re
import time
import numpy as np
import pandas as pd
import faiss
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import openai
from dotenv import load_dotenv
import pickle
import hashlib

In [4]:
# Load .env file
load_dotenv()

# Set API key from environment
openai.api_key = os.getenv("OPENAI_API_KEY")

if openai.api_key is None:
    raise ValueError("OpenAI API key not found. Make sure you have a .env file with OPENAI_API_KEY set.")

In [5]:
def encode_texts_openai(texts, cache_path="embeddings_cache.pkl", model="text-embedding-3-large"):
    print("\nEncoding texts using OpenAI model...")
    
    # Load cache if exists
    if os.path.exists(cache_path):
        with open(cache_path, "rb") as f:
            cache = pickle.load(f)
    else:
        cache = {}

    embeddings = []
    new_count = 0

    for text in tqdm(texts, desc="Embedding"):
        key = hashlib.md5(text.encode('utf-8')).hexdigest()
        if key in cache:
            embeddings.append(cache[key])
        else:
            try:
                response = openai.embeddings.create(input=text, model=model)
                vec = response.data[0].embedding
                cache[key] = vec
                embeddings.append(vec)
                new_count += 1
            except Exception as e:
                print(f"OpenAI API failed on: {text[:50]}...\n{e}")
                vec = [0.0] * 1536  # fallback, dim of text-embedding-3-small
                embeddings.append(vec)

    # Save updated cache
    with open(cache_path, "wb") as f:
        pickle.dump(cache, f)

    print(f"Embedding complete. {new_count} new vectors generated, {len(texts) - new_count} loaded from cache.")
    return np.array(embeddings)

In [7]:
def load_data(tb1_path, tb2_path):
    # table1 = pd.read_csv(tb1_path)
    # table2 = pd.read_csv(tb2_path)
    table1 = pd.read_csv(tb1_path, encoding='cp1252')
    table2 = pd.read_csv(tb2_path, encoding='cp1252')
    # table1 = pd.read_csv(tb1_path, encoding='latin1')
    # table2 = pd.read_csv(tb2_path, encoding='utf-8-sig')  # fixes BOM issue
    

    # Strip quotes and whitespace from column names
    # table1.columns = table1.columns.str.strip().str.replace('"', '')
    # table2.columns = table2.columns.str.strip().str.replace('"', '')
    return table1, table2

def preprocess(text):
    if not isinstance(text, str):
        return ""
    text = re.sub(r'[^\w\s-]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def preprocess_table(df, columns_to_concat):
    # Fill NaN for each selected column
    for col in columns_to_concat:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found in dataframe.")
        df[col] = df[col].fillna('')

    # Apply preprocess and concatenate
    processed_text = ''
    for col in columns_to_concat:
        processed_text += df[col].apply(preprocess) + ' '
    
    df['processed'] = processed_text.str.strip()
    df['processed'] = df['processed'].replace('', 'unknown')
    return df

def preprocess_tables(table1, table2, table1_columns, table2_columns):
    table1 = preprocess_table(table1, table1_columns)
    table2 = preprocess_table(table2, table2_columns)
    return table1, table2

def encode_texts(model, texts, batch_size=64):
    print("\nEncoding texts into embeddings...")
    start_time = time.time()
    embeddings = model.encode(texts, show_progress_bar=True, batch_size=batch_size)
    elapsed = time.time() - start_time
    print(f"Embedding generation completed in {elapsed:.2f} seconds.")

    if np.isnan(embeddings).any():
        print("Warning: NaN values detected in embeddings - replacing with zeros")
        embeddings = np.nan_to_num(embeddings)
    return embeddings

def normalize_embeddings(embeddings):
    norms = np.linalg.norm(embeddings, axis=1)
    norms[norms == 0] = 1e-10
    return embeddings / norms[:, np.newaxis]

def build_faiss_index(embeddings, batch_size=1000):
    print("\nBuilding FAISS index...")
    start_time = time.time()
    dimension = embeddings.shape[1]
    if dimension == 0:
        raise ValueError("Embedding dimension is 0 - check your input data")
    
    try:
        index = faiss.IndexFlatIP(dimension)
        for i in tqdm(range(0, len(embeddings), batch_size), desc="Adding batches to FAISS"):
            batch = embeddings[i:i+batch_size]
            if not np.isnan(batch).any():
                index.add(batch)
            else:
                print(f"Skipping batch {i} due to NaN values")
        elapsed = time.time() - start_time
        print(f"FAISS index built in {elapsed:.2f} seconds.")
        return index
    except Exception as e:
        print(f"Error creating FAISS index: {e}")
        print("Using brute-force matching instead")
        return None

def find_matches(query_embeddings, table1, target_df, target_embeddings, index=None, top_k=30):
    if index is None:
        raise ValueError("FAISS index is not available. Matching cannot proceed.")
    print("\nFinding matches...")
    matches = []
    start_time = time.time()
    for i, query_embedding in enumerate(tqdm(query_embeddings, desc="Matching queries")):
        if np.isnan(query_embedding).any():
            print(f"Skipping query {i} due to NaN values")
            continue

        if index:
            query_embedding = query_embedding.reshape(1, -1)
            distances, indices = index.search(query_embedding, top_k)
            distances = distances[0]
            indices = indices[0]
        # else:
        #     similarities = np.dot(target_embeddings, query_embedding)
        #     indices = np.argsort(similarities)[-top_k:][::-1]
        #     distances = similarities[indices]

        for score, idx in zip(distances, indices):
            matches.append({
                'left_id': table1.iloc[i]['id'],
                'right_id': target_df.iloc[idx]['id'],
                'similarity_score': score,
                'table1_text': table1.iloc[i]['processed'],
                'table2_text': target_df.iloc[idx]['processed']
            })
    elapsed = time.time() - start_time
    print(f"Matching completed in {elapsed:.2f} seconds.")
    return matches

def save_results(matches_df, index=None, matches_file="entity_matches.csv", index_file="entity_matching_index.faiss"):
    matches_df.to_csv(matches_file, index=False)
    print(f"\nSaved matches to {matches_file}.")
    if index:
        faiss.write_index(index, index_file)
        print(f"Saved FAISS index to {index_file}.")



In [8]:
# eval for apt-buy
import os
import pandas as pd
import time
from sentence_transformers import SentenceTransformer

def main():
    # File paths
    tb1_path = os.path.join("Abt-Buy", "Abt.csv")
    tb2_path = os.path.join("Abt-Buy", "Buy.csv")
    ground_truth_path = os.path.join("Abt-Buy", "abt_buy_perfectMapping.csv")

    # Load and preprocess data
    table1, table2 = load_data(tb1_path, tb2_path)
    table1_columns = ["name"]
    table2_columns = ["name"]
    table1, table2 = preprocess_tables(table1, table2, table1_columns, table2_columns)

    # Load ground truth
    ground_truth = pd.read_csv(ground_truth_path)
    ground_truth_set = set(zip(ground_truth['idAbt'], ground_truth['idBuy']))

    # Load model
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Generate and normalize embeddings (with timing)
    start_time = time.time()
    embeddings1 = encode_texts(model, table1['processed'].tolist())
    embeddings2 = encode_texts(model, table2['processed'].tolist())
    embeddings1 = normalize_embeddings(embeddings1)
    embeddings2 = normalize_embeddings(embeddings2)
    embedding_time = time.time() - start_time

    # Test different k values
    k_values = [1, 5, 10, 15, 20, 25, 30]
    results = []

    for k in k_values:
        # Build FAISS index (with timing)
        start_time = time.time()
        index = build_faiss_index(embeddings2)
        build_time = time.time() - start_time

        # Find matches (with timing)
        start_time = time.time()
        matches = find_matches(embeddings1, table1, table2, embeddings2, index=index, top_k=k)
        search_time = time.time() - start_time

        matches_df = pd.DataFrame(matches)
        predicted_set = set(zip(matches_df['left_id'], matches_df['right_id']))

        # Compute metrics
        true_positives = predicted_set & ground_truth_set
        recall = len(true_positives) / len(ground_truth_set)
        precision = len(true_positives) / len(predicted_set) if len(predicted_set) > 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        results.append({
            'k': k,
            'recall': recall,
            'precision': precision,
            'f1_score': f1_score,
            'embedding_time': embedding_time,
            'build_time': build_time,
            'search_time': search_time,
            'total_pairs': len(predicted_set),
            'true_positives': len(true_positives)
        })

    # Print results
    results_df = pd.DataFrame(results)
    print("\nEvaluation Results:")
    print(results_df.to_string(index=False))

    # Optionally save results
    results_df.to_csv("blocking_evaluation_results_apt-buy.csv", index=False)

if __name__ == "__main__":
    main()


Encoding texts into embeddings...


Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Embedding generation completed in 1.21 seconds.

Encoding texts into embeddings...


Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Embedding generation completed in 0.67 seconds.

Building FAISS index...


Adding batches to FAISS: 100%|█████████████████████████████████████| 2/2 [00:00<00:00, 2114.06it/s]


FAISS index built in 0.00 seconds.

Finding matches...


Matching queries: 100%|█████████████████████████████████████| 1081/1081 [00:00<00:00, 10470.02it/s]


Matching completed in 0.10 seconds.

Building FAISS index...


Adding batches to FAISS: 100%|█████████████████████████████████████| 2/2 [00:00<00:00, 2020.38it/s]


FAISS index built in 0.00 seconds.

Finding matches...


Matching queries: 100%|██████████████████████████████████████| 1081/1081 [00:00<00:00, 3249.08it/s]


Matching completed in 0.33 seconds.

Building FAISS index...


Adding batches to FAISS: 100%|█████████████████████████████████████| 2/2 [00:00<00:00, 2254.40it/s]


FAISS index built in 0.00 seconds.

Finding matches...


Matching queries: 100%|██████████████████████████████████████| 1081/1081 [00:00<00:00, 1877.70it/s]


Matching completed in 0.58 seconds.

Building FAISS index...


Adding batches to FAISS: 100%|█████████████████████████████████████| 2/2 [00:00<00:00, 2944.40it/s]


FAISS index built in 0.00 seconds.

Finding matches...


Matching queries: 100%|██████████████████████████████████████| 1081/1081 [00:00<00:00, 1290.67it/s]


Matching completed in 0.84 seconds.

Building FAISS index...


Adding batches to FAISS: 100%|█████████████████████████████████████| 2/2 [00:00<00:00, 3663.15it/s]


FAISS index built in 0.00 seconds.

Finding matches...


Matching queries: 100%|███████████████████████████████████████| 1081/1081 [00:01<00:00, 984.35it/s]


Matching completed in 1.10 seconds.

Building FAISS index...


Adding batches to FAISS: 100%|█████████████████████████████████████| 2/2 [00:00<00:00, 2816.86it/s]


FAISS index built in 0.00 seconds.

Finding matches...


Matching queries: 100%|███████████████████████████████████████| 1081/1081 [00:01<00:00, 793.87it/s]


Matching completed in 1.36 seconds.

Building FAISS index...


Adding batches to FAISS: 100%|█████████████████████████████████████| 2/2 [00:00<00:00, 2989.53it/s]


FAISS index built in 0.00 seconds.

Finding matches...


Matching queries: 100%|███████████████████████████████████████| 1081/1081 [00:01<00:00, 665.93it/s]

Matching completed in 1.62 seconds.

Evaluation Results:
 k   recall  precision  f1_score  embedding_time  build_time  search_time  total_pairs  true_positives     cssr
 1 0.705561   0.716004  0.710744         1.88559    0.003065     0.104206         1081             774 0.000916
 5 0.919781   0.186679  0.310366         1.88559    0.002463     0.334103         5405            1009 0.004579
10 0.960802   0.097502  0.177039         1.88559    0.002046     0.576956        10810            1054 0.009158
15 0.974476   0.065927  0.123498         1.88559    0.001831     0.839056        16215            1069 0.013736
20 0.980857   0.049769  0.094731         1.88559    0.001469     1.099950        21620            1076 0.018315
25 0.987238   0.040074  0.077022         1.88559    0.001909     1.363760        27025            1083 0.022894
30 0.990884   0.033518  0.064843         1.88559    0.001621     1.625386        32430            1087 0.027473





In [22]:
# eval for amazon-best buy
import os
import pandas as pd
import time
from sentence_transformers import SentenceTransformer

def main():
    # File paths
    tb1_path = os.path.join("files", "amazon.csv")
    tb2_path = os.path.join("files", "best_buy.csv")
    ground_truth_path = os.path.join("files", "labeled_data.csv")

    # Load and preprocess data
    table1, table2 = load_data(tb1_path, tb2_path)
    table1_columns = ["Brand", "Name"]
    table2_columns = ["Brand", "Name"]
    table1, table2 = preprocess_tables(table1, table2, table1_columns, table2_columns)

    # Load ground truth (with special handling for header row)
    ground_truth_df = pd.read_csv(ground_truth_path, skiprows=5)
    ground_truth_matches = set(zip(
        ground_truth_df.loc[ground_truth_df['gold'] == 1, 'ltable.ID'],
        ground_truth_df.loc[ground_truth_df['gold'] == 1, 'rtable.ID']
    ))
    n_ltable = ground_truth_df['ltable.ID'].nunique()
    n_rtable = ground_truth_df['rtable.ID'].nunique()
    total_possible_pairs = n_ltable * n_rtable

    # Load model
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Time embedding generation
    start_time = time.time()
    embeddings1 = encode_texts(model, table1['processed'].tolist())
    embeddings2 = encode_texts(model, table2['processed'].tolist())
    embeddings1 = normalize_embeddings(embeddings1)
    embeddings2 = normalize_embeddings(embeddings2)
    embedding_time = time.time() - start_time

    # Test different k values
    k_values = [1, 5, 10, 15, 20, 25, 30]
    results = []

    for k in k_values:
        # Time index building
        start_time = time.time()
        index = build_faiss_index(embeddings2)
        build_time = time.time() - start_time

        # Time matching
        start_time = time.time()
        matches = find_matches(embeddings1, table1, table2, embeddings2, index=index, top_k=k)
        search_time = time.time() - start_time

        matches_df = pd.DataFrame(matches)
        candidate_set = set(zip(matches_df['left_id'], matches_df['right_id']))

        # Compute metrics
        true_positives = ground_truth_matches.intersection(candidate_set)
        recall = len(true_positives) / len(ground_truth_matches) if len(ground_truth_matches) > 0 else 0.0
        precision = len(true_positives) / len(candidate_set) if len(candidate_set) > 0 else 0.0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
        reduction_ratio = 1 - (len(candidate_set) / total_possible_pairs)

        results.append({
            'k': k,
            'recall': recall,
            'precision': precision,
            'f1_score': f1_score,
            'reduction_ratio': reduction_ratio,
            'embedding_time': embedding_time,
            'build_time': build_time,
            'search_time': search_time,
            'total_pairs': len(candidate_set),
            'true_positives': len(true_positives)
        })

        # Show sample matches for k=30 (original value)
        if k == 30:
            print("\nSample matches (k=30):")
            print(matches_df.sort_values(by='similarity_score', ascending=False).head())

    # Print results
    results_df = pd.DataFrame(results)
    print(f"\nTotal ground-truth matches: {len(ground_truth_matches)}")
    print("\nEvaluation Results:")
    print(results_df.to_string(index=False, float_format="{:0.4f}".format))

    # Save results
    results_df.to_csv("amazon_bestbuy_evaluation_results.csv", index=False)
    print("\nResults saved to amazon_bestbuy_evaluation_results.csv")

if __name__ == "__main__":
    main()


Encoding texts into embeddings...


Batches:   0%|          | 0/67 [00:00<?, ?it/s]

Embedding generation completed in 6.49 seconds.

Encoding texts into embeddings...


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Embedding generation completed in 2.76 seconds.

Building FAISS index...


Adding batches to FAISS: 100%|███████████████████████████████| 6/6 [00:00<00:00, 1455.93it/s]


FAISS index built in 0.01 seconds.

Finding matches...


Matching queries: 100%|████████████████████████████████| 4259/4259 [00:00<00:00, 4382.98it/s]


Matching completed in 0.97 seconds.

Building FAISS index...


Adding batches to FAISS: 100%|███████████████████████████████| 6/6 [00:00<00:00, 1676.49it/s]


FAISS index built in 0.01 seconds.

Finding matches...


Matching queries: 100%|████████████████████████████████| 4259/4259 [00:02<00:00, 1758.04it/s]


Matching completed in 2.42 seconds.

Building FAISS index...


Adding batches to FAISS: 100%|███████████████████████████████| 6/6 [00:00<00:00, 2934.11it/s]


FAISS index built in 0.00 seconds.

Finding matches...


Matching queries: 100%|████████████████████████████████| 4259/4259 [00:03<00:00, 1328.50it/s]


Matching completed in 3.21 seconds.

Building FAISS index...


Adding batches to FAISS: 100%|███████████████████████████████| 6/6 [00:00<00:00, 2508.56it/s]


FAISS index built in 0.00 seconds.

Finding matches...


Matching queries: 100%|████████████████████████████████| 4259/4259 [00:04<00:00, 1004.93it/s]


Matching completed in 4.24 seconds.

Building FAISS index...


Adding batches to FAISS: 100%|███████████████████████████████| 6/6 [00:00<00:00, 2383.80it/s]


FAISS index built in 0.00 seconds.

Finding matches...


Matching queries: 100%|█████████████████████████████████| 4259/4259 [00:05<00:00, 807.33it/s]


Matching completed in 5.28 seconds.

Building FAISS index...


Adding batches to FAISS: 100%|███████████████████████████████| 6/6 [00:00<00:00, 2110.34it/s]


FAISS index built in 0.00 seconds.

Finding matches...


Matching queries: 100%|█████████████████████████████████| 4259/4259 [00:06<00:00, 656.40it/s]


Matching completed in 6.49 seconds.

Building FAISS index...


Adding batches to FAISS: 100%|███████████████████████████████| 6/6 [00:00<00:00, 2148.35it/s]


FAISS index built in 0.00 seconds.

Finding matches...


Matching queries: 100%|█████████████████████████████████| 4259/4259 [00:07<00:00, 568.72it/s]


Matching completed in 7.49 seconds.

Sample matches (k=30):
       left_id  right_id  similarity_score  \
59610     1988      3563          0.983699   
71850     2396      2461          0.970228   
30780     1027      2542          0.967111   
60390     2014      2461          0.963338   
23880      797      2459          0.959631   

                                             table1_text  \
59610  other 3m privacy filter for widescreen laptop ...   
71850  other case logic pls-13 neoprene 133-inch neop...   
30780  other case logic laps-114 14-inch laptop sleev...   
60390  other case logic pls-14 14-inch neoprene lapto...   
23880  other case logic laps-117 17 - 173 -inch lapto...   

                                             table2_text  
59610  other 3m privacy filter for widescreen laptop ...  
71850  other case logic neoprene laptop sleeve black ...  
30780    other case logic laptop sleeve black laps-114bl  
60390  other case logic neoprene laptop sleeve black ...  
23880  

In [19]:
# eval for DBLP-Scholar

import os
import pandas as pd
import time
from sentence_transformers import SentenceTransformer

def main():
    # File paths
    tb1_path = os.path.join("DBLP-Scholar", "DBLP1.csv")
    tb2_path = os.path.join("DBLP-Scholar", "Scholar.csv")
    ground_truth_path = os.path.join("DBLP-Scholar", "DBLP-Scholar_perfectMapping.csv")

    # Load and preprocess data
    table1, table2 = load_data(tb1_path, tb2_path)
    table1_columns = ["title", "authors"]
    table2_columns = ["title", "authors"]
    table1, table2 = preprocess_tables(table1, table2, table1_columns, table2_columns)

    # Load ground truth
    ground_truth = pd.read_csv(ground_truth_path)
    ground_truth_set = set(zip(ground_truth['idDBLP'], ground_truth['idScholar']))

    # Load model
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Time embedding generation
    start_time = time.time()
    embeddings1 = encode_texts(model, table1['processed'].tolist())
    embeddings2 = encode_texts(model, table2['processed'].tolist())
    embeddings1 = normalize_embeddings(embeddings1)
    embeddings2 = normalize_embeddings(embeddings2)
    embedding_time = time.time() - start_time

    # Test different k values
    k_values = [1, 5, 10, 15, 20, 25, 30]
    results = []

    for k in k_values:
        # Time index building
        start_time = time.time()
        index = build_faiss_index(embeddings2)
        build_time = time.time() - start_time

        # Time matching
        start_time = time.time()
        matches = find_matches(embeddings1, table1, table2, embeddings2, index=index, top_k=k)
        search_time = time.time() - start_time

        matches_df = pd.DataFrame(matches)
        predicted_set = set(zip(matches_df['left_id'], matches_df['right_id']))

        # Compute metrics
        true_positives = predicted_set & ground_truth_set
        recall = len(true_positives) / len(ground_truth_set) if len(ground_truth_set) > 0 else 0
        precision = len(true_positives) / len(predicted_set) if len(predicted_set) > 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        results.append({
            'k': k,
            'recall': recall,
            'precision': precision,
            'f1_score': f1_score,
            'embedding_time': embedding_time,
            'build_time': build_time,
            'search_time': search_time,
            'total_pairs': len(predicted_set),
            'true_positives': len(true_positives)
        })

        # Show sample matches for k=10
        if k == 10:
            print("\nSample matches (k=10):")
            print(matches_df.sort_values(by='similarity_score', ascending=False).head())

    # Print results
    results_df = pd.DataFrame(results)
    print("\nEvaluation Results:")
    print(results_df.to_string(index=False, float_format="{:0.4f}".format))

    # Save results
    results_df.to_csv("dblp_scholar_evaluation_results_DBLP-Scholar.csv", index=False)
    print("\nResults saved to dblp_scholar_evaluation_results.csv")

if __name__ == "__main__":
    main()


Encoding texts into embeddings...


Batches:   0%|          | 0/41 [00:00<?, ?it/s]

Embedding generation completed in 1.96 seconds.

Encoding texts into embeddings...


Batches:   0%|          | 0/1005 [00:00<?, ?it/s]

Embedding generation completed in 30.40 seconds.

Building FAISS index...


Adding batches to FAISS: 100%|██████████████████████████████| 65/65 [00:00<00:00, 957.68it/s]


FAISS index built in 0.07 seconds.

Finding matches...


Matching queries: 100%|█████████████████████████████████| 2616/2616 [00:06<00:00, 408.63it/s]


Matching completed in 6.40 seconds.

Building FAISS index...


Adding batches to FAISS: 100%|██████████████████████████████| 65/65 [00:00<00:00, 463.82it/s]


FAISS index built in 0.14 seconds.

Finding matches...


Matching queries: 100%|█████████████████████████████████| 2616/2616 [00:06<00:00, 391.25it/s]


Matching completed in 6.69 seconds.

Building FAISS index...


Adding batches to FAISS: 100%|█████████████████████████████| 65/65 [00:00<00:00, 1623.94it/s]


FAISS index built in 0.04 seconds.

Finding matches...


Matching queries: 100%|█████████████████████████████████| 2616/2616 [00:07<00:00, 352.11it/s]


Matching completed in 7.43 seconds.

Sample matches (k=10):
                            left_id  \
7580       journals/sigmod/Sidell96   
8640         journals/sigmod/Fong95   
25470     journals/vldb/PeckhamMD95   
10140     journals/sigmod/YeungHL94   
1570   journals/sigmod/EisenbergM01   

                                                right_id  similarity_score  \
7580   url:http://portal.acm.org/ft_gateway.cfm%3Fid%...               1.0   
8640                                        zw3-t-veNvAJ               1.0   
25470                                       zOfVTNgEwz0J               1.0   
10140                                       yQv94p3tE1IJ               1.0   
1570                                        GKUflKzgAVEJ               1.0   

                                             table1_text  \
7580   the mariposa distributed database management s...   
8640   mapping extended entity relationship model to ...   
25470  data model for extensible support of explicit ...

Adding batches to FAISS: 100%|█████████████████████████████| 65/65 [00:00<00:00, 1017.79it/s]


FAISS index built in 0.07 seconds.

Finding matches...


Matching queries: 100%|█████████████████████████████████| 2616/2616 [00:08<00:00, 320.99it/s]


Matching completed in 8.15 seconds.

Building FAISS index...


Adding batches to FAISS: 100%|█████████████████████████████| 65/65 [00:00<00:00, 1509.56it/s]


FAISS index built in 0.04 seconds.

Finding matches...


Matching queries: 100%|█████████████████████████████████| 2616/2616 [00:08<00:00, 293.19it/s]


Matching completed in 8.92 seconds.

Building FAISS index...


Adding batches to FAISS: 100%|█████████████████████████████| 65/65 [00:00<00:00, 1306.99it/s]


FAISS index built in 0.05 seconds.

Finding matches...


Matching queries: 100%|█████████████████████████████████| 2616/2616 [00:09<00:00, 273.86it/s]


Matching completed in 9.55 seconds.

Building FAISS index...


Adding batches to FAISS: 100%|█████████████████████████████| 65/65 [00:00<00:00, 1196.13it/s]


FAISS index built in 0.06 seconds.

Finding matches...


Matching queries: 100%|█████████████████████████████████| 2616/2616 [00:10<00:00, 253.26it/s]


Matching completed in 10.33 seconds.

Evaluation Results:
 k  recall  precision  f1_score  embedding_time  build_time  search_time  total_pairs  true_positives
 1  0.4410     0.9014    0.5922         32.4345      0.0710       6.4050         2616            2358
 5  0.8837     0.3612    0.5128         32.4345      0.1436       6.6885        13080            4725
10  0.9609     0.1964    0.3261         32.4345      0.0438       7.4321        26160            5138
15  0.9792     0.1334    0.2349         32.4345      0.0667       8.1529        39240            5236
20  0.9856     0.1007    0.1828         32.4345      0.0478       8.9266        52320            5270
25  0.9878     0.0808    0.1493         32.4345      0.0535       9.5574        65400            5282
30  0.9892     0.0674    0.1262         32.4345      0.0579      10.3348        78480            5289

Results saved to dblp_scholar_evaluation_results.csv


In [31]:
# eval for walmart_amazon
import os
import pandas as pd
import time
from sentence_transformers import SentenceTransformer

def main():
    # File paths
    tb1_path = os.path.join("walmart_amazon", "tableA.csv")
    tb2_path = os.path.join("walmart_amazon", "tableB.csv")
    ground_truth_path = os.path.join("walmart_amazon", "matches.csv")

    # Load and preprocess data
    table1, table2 = load_data(tb1_path, tb2_path)
    table1_columns = ["brand", "groupname", "title", "shortdescr"]
    table2_columns = ["brand", "category1", "category2", "title", "proddescrshort"]
    table1, table2 = preprocess_tables(table1, table2, table1_columns, table2_columns)
    # print(table1.columns)
    # print(table2.columns)
    # Load ground truth
    ground_truth = pd.read_csv(ground_truth_path)
    ground_truth_set = set(zip(ground_truth['id1'], ground_truth['id2']))
    total_entities_left = table1['custom_id'].nunique()
    total_entities_right = table2['custom_id'].nunique()
    total_possible_pairs = total_entities_left * total_entities_right

    # Load model
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Time embedding generation
    start_time = time.time()
    embeddings1 = encode_texts(model, table1['processed'].tolist())
    embeddings2 = encode_texts(model, table2['processed'].tolist())
    embeddings1 = normalize_embeddings(embeddings1)
    embeddings2 = normalize_embeddings(embeddings2)
    embedding_time = time.time() - start_time

    # Test different k values
    k_values = [1, 5, 10, 15, 20, 25, 30]
    results = []

    for k in k_values:
        # Time index building
        start_time = time.time()
        index = build_faiss_index(embeddings2)
        build_time = time.time() - start_time

        # Time matching
        start_time = time.time()
        matches = find_matches(embeddings1, table1, table2, embeddings2, index=index, top_k=k)
        search_time = time.time() - start_time

        matches_df = pd.DataFrame(matches)
        predicted_set = set(zip(matches_df['left_id'], matches_df['right_id']))

        # Compute metrics
        true_positives = predicted_set & ground_truth_set
        recall = len(true_positives) / len(ground_truth_set)
        precision = len(true_positives) / len(predicted_set) if len(predicted_set) > 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        reduction_ratio = 1 - (len(predicted_set) / total_possible_pairs)
        pairs_quality = precision  # Same as precision

        results.append({
            'k': k,
            'recall': recall,
            'precision': precision,
            'f1_score': f1_score,
            'reduction_ratio': reduction_ratio,
            'pairs_quality': pairs_quality,
            'embedding_time': embedding_time,
            'build_time': build_time,
            'search_time': search_time,
            'total_pairs': len(predicted_set),
            'true_positives': len(true_positives),
            'false_positives': len(predicted_set - ground_truth_set),
            'false_negatives': len(ground_truth_set - predicted_set)
        })

        # Show sample matches for k=10 (original value)
        if k == 10:
            print("\nSample matches (k=10):")
            print(matches_df.sort_values(by='similarity_score', ascending=False).head())

    # Print results
    results_df = pd.DataFrame(results)
    print(f"\nTotal ground-truth matches: {len(ground_truth_set)}")
    print(f"Total possible pairs: {total_possible_pairs}")
    print("\nEvaluation Results:")
    print(results_df.to_string(index=False, float_format="{:0.4f}".format))

    # Save results
    results_df.to_csv("walmart_amazon_evaluation_results.csv", index=False)
    print("\nResults saved to walmart_amazon_evaluation_results.csv")

if __name__ == "__main__":
    main()


Encoding texts into embeddings...


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Embedding generation completed in 3.78 seconds.

Encoding texts into embeddings...


Batches:   0%|          | 0/345 [00:00<?, ?it/s]

Embedding generation completed in 23.26 seconds.

Building FAISS index...


Adding batches to FAISS: 100%|█████████████████████████████| 23/23 [00:00<00:00, 1049.57it/s]


FAISS index built in 0.02 seconds.

Finding matches...


Matching queries: 100%|█████████████████████████████████| 2554/2554 [00:02<00:00, 862.96it/s]


Matching completed in 2.96 seconds.

Building FAISS index...


Adding batches to FAISS: 100%|█████████████████████████████| 23/23 [00:00<00:00, 1387.20it/s]


FAISS index built in 0.02 seconds.

Finding matches...


Matching queries: 100%|█████████████████████████████████| 2554/2554 [00:03<00:00, 810.65it/s]


Matching completed in 3.15 seconds.

Building FAISS index...


Adding batches to FAISS: 100%|█████████████████████████████| 23/23 [00:00<00:00, 2103.92it/s]


FAISS index built in 0.01 seconds.

Finding matches...


Matching queries: 100%|█████████████████████████████████| 2554/2554 [00:04<00:00, 621.91it/s]


Matching completed in 4.11 seconds.

Sample matches (k=10):
       left_id  right_id  similarity_score  \
15780     1579      3324          0.990323   
12820     1283      1241          0.985462   
9610       962     14318          0.983938   
7220       723     12996          0.970382   
10730     1074      2523          0.969655   

                                             table1_text  \
15780              v7 mice v7 3-button optical mouse usb   
12820  d-link networking d-link systems powerline av ...   
9610   amped wireless networking amped wireless high ...   
7220   corsair memory corsair memory vs2gbkit400c3 2 ...   
10730  corsair memory corsair hx3x12g1333c9 12 gb xms...   

                                             table2_text  
15780             v7 mice  v7 3 button usb optical mouse  
12820  d-link powerline network adapters  d-link syst...  
9610   amped wireless routers  amped wireless high po...  
7220   corsair memory  corsair memory vs2gbkit400c3 2...  
10730  

Adding batches to FAISS: 100%|█████████████████████████████| 23/23 [00:00<00:00, 2324.61it/s]


FAISS index built in 0.01 seconds.

Finding matches...


Matching queries: 100%|█████████████████████████████████| 2554/2554 [00:04<00:00, 515.24it/s]


Matching completed in 4.96 seconds.

Building FAISS index...


Adding batches to FAISS: 100%|█████████████████████████████| 23/23 [00:00<00:00, 2334.79it/s]


FAISS index built in 0.01 seconds.

Finding matches...


Matching queries: 100%|█████████████████████████████████| 2554/2554 [00:05<00:00, 428.58it/s]


Matching completed in 5.96 seconds.

Building FAISS index...


Adding batches to FAISS: 100%|█████████████████████████████| 23/23 [00:00<00:00, 2228.49it/s]


FAISS index built in 0.01 seconds.

Finding matches...


Matching queries: 100%|█████████████████████████████████| 2554/2554 [00:06<00:00, 370.11it/s]


Matching completed in 6.90 seconds.

Building FAISS index...


Adding batches to FAISS: 100%|█████████████████████████████| 23/23 [00:00<00:00, 2264.42it/s]


FAISS index built in 0.01 seconds.

Finding matches...


Matching queries: 100%|█████████████████████████████████| 2554/2554 [00:07<00:00, 324.88it/s]


Matching completed in 7.86 seconds.

Total ground-truth matches: 1154
Total possible pairs: 56376996

Evaluation Results:
 k  recall  precision  f1_score  reduction_ratio  pairs_quality  embedding_time  build_time  search_time  total_pairs  true_positives  false_positives  false_negatives
 1  0.4913     0.2220    0.3058           1.0000         0.2220         27.0644      0.0235       2.9622         2554             567             1987              587
 5  0.8007     0.0724    0.1327           0.9998         0.0724         27.0644      0.0192       3.1529        12770             924            11846              230
10  0.8951     0.0404    0.0774           0.9995         0.0404         27.0644      0.0128       4.1099        25540            1033            24507              121
15  0.9272     0.0279    0.0542           0.9993         0.0279         27.0644      0.0120       4.9608        38310            1070            37240               84
20  0.9437     0.0213    0.0417       