In [22]:
import pandas as pd
import random
from difflib import SequenceMatcher
from tqdm import tqdm

In [23]:

# ---- Parameters ----
# Default similarity cut-off used by generate_pairs: two rows whose author
# names score at least this ratio are labelled a synthetic same-author pair.
NAME_SIM_THRESHOLD = 0.65

# Parquet file the author table is read from.
INPUT_PARQUET = "authors_data_10000.parquet"

# CSV file the labelled pairs are written to.
OUTPUT_CSV = "pairs_10000_0-65.csv"

In [24]:
def name_similarity(name1, name2):
    """Return the case-insensitive similarity ratio of two names.

    Both names are lower-cased and compared with ``difflib.SequenceMatcher``;
    the result is its ``ratio()``, a float in [0, 1] where 1.0 means the
    lower-cased strings are identical.

    Args:
        name1: First name (must support ``.lower()``).
        name2: Second name (must support ``.lower()``).

    Returns:
        The ``SequenceMatcher`` ratio of the two lower-cased names.
    """
    first = name1.lower()
    second = name2.lower()
    # Argument order is kept as (name1, name2): the matcher is not guaranteed
    # to give the same ratio when its inputs are swapped.
    matcher = SequenceMatcher(None, first, second)
    return matcher.ratio()


In [25]:
def generate_pairs(df, name_sim_threshold=NAME_SIM_THRESHOLD, seed=None):
    """Build a labelled table of author-ID pairs.

    Positive pairs (``same_author == 1``) are every unordered pair of rows
    whose ``author`` values score at least ``name_sim_threshold`` under
    ``name_similarity``. Negative pairs (``same_author == 0``) are then drawn
    at random, aiming for one negative per positive.

    Args:
        df: DataFrame with ``author_id`` and ``author`` columns. Rows are
            paired by position, so the index may hold any labels (including
            duplicated or unsorted ones).
        name_sim_threshold: Minimum similarity for a pair to be labelled
            as the same author.
        seed: Optional seed for the negative sampling. ``None`` (the default)
            draws from the global ``random`` state, as before.

    Returns:
        DataFrame with columns ``author_id_1``, ``author_id_2`` and
        ``same_author``; positives come first, then negatives. The columns
        are present even when no pair was produced.
    """
    columns = ["author_id_1", "author_id_2", "same_author"]

    # Pull the two columns out once: plain lists are far cheaper to walk than
    # a nested df.iterrows(), and positional pairing does not depend on the
    # index being unique or orderable.
    author_ids = df["author_id"].tolist()
    names = df["author"].tolist()
    n_authors = len(author_ids)

    pairs = []
    # Positions (i, j), i < j, already used; keeps negatives from reusing a
    # pair that was labelled positive.
    used_positions = set()

    print("Generating same-author (synthetic) pairs...")
    for i in tqdm(range(n_authors), total=n_authors):
        for j in range(i + 1, n_authors):
            sim = name_similarity(names[i], names[j])
            if sim >= name_sim_threshold:
                pairs.append({
                    "author_id_1": author_ids[i],
                    "author_id_2": author_ids[j],
                    "same_author": 1
                })
                used_positions.add((i, j))

    print("Generating different-author (negative) pairs...")
    n_positive = len(pairs)
    total_candidates = n_authors * (n_authors - 1) // 2
    # Cannot draw more distinct negatives than there are non-positive pairs.
    n_negative = min(n_positive, total_candidates - n_positive)

    rng = random if seed is None else random.Random(seed)
    # Bounded retries: guarantees termination even when most remaining
    # candidates are rejected (e.g. many rows share an author_id).
    attempts_left = 50 * n_negative
    drawn = 0
    with tqdm(total=n_negative) as progress:
        while drawn < n_negative and attempts_left > 0:
            attempts_left -= 1
            # Sorted so negatives use the same row order as positives; the
            # column order then carries no information about the label.
            i, j = sorted(rng.sample(range(n_authors), 2))
            if (i, j) in used_positions or author_ids[i] == author_ids[j]:
                continue
            used_positions.add((i, j))
            pairs.append({
                "author_id_1": author_ids[i],
                "author_id_2": author_ids[j],
                "same_author": 0
            })
            drawn += 1
            progress.update(1)

    return pd.DataFrame(pairs, columns=columns)

In [26]:
# Driver: read the author table, build the labelled pairs, and write them to CSV.
if __name__ == "__main__":
    print(f"Reading author data from: {INPUT_PARQUET}")
    df = pd.read_parquet(INPUT_PARQUET)

    print("Generating labeled pairs...")
    # Called with the default name_sim_threshold (NAME_SIM_THRESHOLD).
    pairs_df = generate_pairs(df)

    print(f"Saving to: {OUTPUT_CSV}")
    # index=False keeps the DataFrame index out of the CSV.
    pairs_df.to_csv(OUTPUT_CSV, index=False)
    print(f"Done. {len(pairs_df)} pairs saved.")

  0%|          | 2/1699 [00:00<02:10, 13.00it/s]

Reading author data from: authors_data_10000.parquet
Generating labeled pairs...
Generating same-author (synthetic) pairs...


100%|██████████| 1699/1699 [01:23<00:00, 20.43it/s]
100%|██████████| 693/693 [00:00<00:00, 703960.44it/s]

Generating different-author (negative) pairs...
Saving to: pairs_10000_0-65.csv
Done. 1386 pairs saved.



