In [19]:
from tqdm import tqdm
import pandas as pd
import numpy as np

In [20]:
import os

from openai import OpenAI

# Read the key from the OPENAI_API_KEY environment variable so that a real key
# never has to be saved inside the notebook. The placeholder is kept as a
# fallback so the cell still runs when the variable is not set.
API_KEY = os.environ.get("OPENAI_API_KEY", "yourkeyhere")
client = OpenAI(api_key=API_KEY)

In [21]:
def get_embeddings(publisher_strings: list, ndim=1000, model="text-embedding-3-small"):
    """Request embeddings for publisher names and return them keyed by name.

    Args:
        publisher_strings: Iterable of publisher names (list, NumPy array,
            generator, ...). It is materialised into a list before use.
        ndim: Forwarded as the ``dimensions`` argument of the embeddings request.
        model: Forwarded as the ``model`` argument of the embeddings request.

    Returns:
        dict mapping each publisher string to its embedding. Strings and
        embeddings are paired by position with ``response.data``; duplicate
        strings collapse into a single key. Empty input yields ``{}`` without
        issuing a request.
    """
    # Materialise once: the same sequence is sent to the client and zipped with
    # the response, so a one-shot iterable would otherwise be exhausted by the
    # first use, and array-like inputs are handed over as a plain list.
    publisher_strings = list(publisher_strings)

    # Nothing to embed, so there is no reason to issue a request.
    if not publisher_strings:
        return {}

    response = client.embeddings.create(
        input=publisher_strings,
        model=model,
        dimensions=ndim,
    )
    embeddings_list = [x.embedding for x in response.data]
    return dict(zip(publisher_strings, embeddings_list))

def cosine_similarity(vector1, vector2):
    """Calculate the cosine similarity between two vectors.

    Args:
        vector1: First vector (any sequence accepted by ``np.dot``).
        vector2: Second vector, same length as ``vector1``.

    Returns:
        The dot product divided by the product of the two Euclidean norms.
        When either vector has zero length the similarity is undefined; 0.0 is
        returned instead of dividing by zero (which would give NaN plus a
        RuntimeWarning).
    """
    norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    if norm_product == 0:
        return 0.0
    return np.dot(vector1, vector2) / norm_product

def vector_based_clustering(embedding_dict, cosine_similarity_threshold=0.85, tqdm_message=""):
    """Greedily cluster publishers by the cosine similarity of their vectors.

    Publishers are visited in the iteration order of ``embedding_dict``. Each
    one joins the first existing group whose *average* cosine similarity to
    all current members reaches the threshold; otherwise it starts a new group.

    Args:
        embedding_dict: Mapping of publisher name -> embedding vector.
        cosine_similarity_threshold: Minimum average similarity required to
            join an existing group.
        tqdm_message: Description shown on the progress bar.

    Returns:
        list of groups, each a list of publisher names in insertion order.

    Notes:
        Every vector is normalised once and each group keeps the running sum
        of its members' unit vectors. The average cosine similarity to a group
        is then ``dot(unit, group_sum) / len(group)``, which equals the mean of
        the pairwise cosine similarities up to floating-point rounding, without
        recomputing norms for every member on every comparison. Zero-length
        vectors are treated as having similarity 0 to everything.
    """
    publishers = list(embedding_dict)

    # Normalise each embedding exactly once (this also converts plain Python
    # lists to arrays a single time instead of on every comparison).
    unit_vectors = {}
    for publisher in publishers:
        vector = np.asarray(embedding_dict[publisher], dtype=float)
        norm = np.linalg.norm(vector)
        unit_vectors[publisher] = vector / norm if norm > 0 else np.zeros_like(vector)

    groups = []
    group_sums = []  # group_sums[k] is the sum of unit vectors of groups[k]
    for publisher in tqdm(publishers, desc=tqdm_message):
        unit = unit_vectors[publisher]

        for index, group in enumerate(groups):
            average_similarity = np.dot(unit, group_sums[index]) / len(group)

            if average_similarity >= cosine_similarity_threshold:
                group.append(publisher)
                group_sums[index] = group_sums[index] + unit
                break
        else:
            # No existing group was similar enough: start a new one.
            groups.append([publisher])
            group_sums.append(unit)

    return groups

In [22]:
def create_publisher_similarity_mapping(df, locations, cosine_similarity_threshold, output_path,
                                        batch_size=2048, ndim=1000, model="text-embedding-3-small"):
    """Cluster the publishers of each location and append the result to a TSV file.

    For every location the unique values of ``publisher_harmonized`` are
    embedded with ``get_embeddings`` (in batches of ``batch_size``), clustered
    with ``vector_based_clustering`` and written as one line per publisher:
    ``location<TAB>publisher<TAB>representative``. The representative of a
    group is ``min(group)``, i.e. its lexicographically smallest name.

    Args:
        df: DataFrame with the columns ``publication_place_harmonized`` and
            ``publisher_harmonized``.
        locations: Iterable of location names to process, in order.
        cosine_similarity_threshold: Threshold passed to the clustering step.
        output_path: TSV file opened in append mode. The header line is only
            written when the file does not exist yet or is empty, so re-running
            does not insert duplicate header lines in the middle of the file.
        batch_size: Maximum number of publisher strings per embeddings request.
        ndim: Embedding dimensionality forwarded to ``get_embeddings``.
        model: Embedding model name forwarded to ``get_embeddings``.

    Raises:
        KeyError: if ``df`` has no ``publication_place_harmonized`` column.
        Errors raised while processing a single location are caught, reported
        with ``print`` and do not stop the remaining locations.
    """
    import os  # local import: only needed for the header check below

    locations = list(locations)

    # Width of the longest location name, used to pad the progress-bar text.
    max_whitespace = max((len(location) for location in locations), default=0)

    # Group once instead of rebuilding the groupby for every location.
    grouped = df.groupby("publication_place_harmonized")

    header_needed = not os.path.exists(output_path) or os.path.getsize(output_path) == 0

    with open(output_path, "a", encoding="utf8") as f:
        if header_needed:
            f.write("publication_place_harmonized\tpublisher_harmonized\tpublisher_similarity_group\n")

        for i, location in enumerate(locations):

            try:
                location_df = grouped.get_group(location)
                publishers = list(location_df["publisher_harmonized"].unique())

                # Request the embeddings batch by batch; a short list simply
                # results in a single request.
                embeddings = {}
                for j in range(0, len(publishers), batch_size):
                    batch = publishers[j:j + batch_size]
                    embeddings.update(get_embeddings(batch, ndim=ndim, model=model))

                groups = vector_based_clustering(embeddings,
                                                 cosine_similarity_threshold,
                                                 tqdm_message=f"{i}/{len(locations)} - {location} {' '*(max_whitespace - len(location))}")

                # Largest groups first; every member is mapped to the
                # lexicographically smallest name of its group.
                for group in sorted(groups, key=len, reverse=True):
                    representative = min(group)
                    for publisher in group:
                        f.write(f"{location}\t{publisher}\t{representative}\n")

                # Flush data to disk after each location
                f.flush()

            except Exception as e:
                # Best effort: report the failure and continue with the next location.
                print(f"Failed processing {location}: {e}")

    print("Finished!")


In [12]:
# Vector-based clustering starts from the current state of the curated dataset
# and works on the harmonized publication places and publishers.
enb = pd.read_parquet("../data/curated/enb_books.parquet")

# Distinct (place, publisher) pairs; rows with a missing value are dropped.
df = (
    enb[["publication_place_harmonized", "publisher_harmonized"]]
    .drop_duplicates()
    .dropna()
)

# Keep only places that occur in more than one pair, then reverse the
# value_counts() ordering of the resulting index.
locations = (
    df["publication_place_harmonized"]
    .value_counts()
    .loc[lambda counts: counts > 1]
    .index[::-1]
)

In [None]:
# Running this cell requests embeddings from the OpenAI API and performs the
# clustering for every location; the output goes to publisher_similarity_groups.tsv.
create_publisher_similarity_mapping(
    df=df,
    locations=locations,
    cosine_similarity_threshold=0.7,
    output_path="publisher_similarity_groups.tsv",
)