In [8]:
import pandas as pd
import pandas as pd
import numpy as np
import os

# Data loading

In [16]:
# Read the dataset CSV into `df` and preview its first three rows.
df = pd.read_csv(filepath_or_buffer='nl_full_all_columns.csv')
df.head(n=3)

Unnamed: 0.1,Unnamed: 0,id,title,type,region,latitude,longitude,clean_text,geohash,country_code,thumbnail_link,thumbnail_file_name,thumbnail_author,thumbnail_license,thumbnail_file_extension,generated_text,category_output,level,main_category,subcategories
0,0,72,Arnhem,city,NL,51.983333,5.916667,"Arnhem (uitspraak: of , Arnhems: Ernem) is ee...",u1hpwxk,NL,https://upload.wikimedia.org/wikipedia/commons...,23_april_2016_CS_Arnhem.jpg,Hellendijk81,CC BY-SA 4.0,jpg,"Arnhem, the capital of Gelderland, is a city w...","[1, city, [region, history]]\n",1.0,city,"region, history"
1,1,112,Amersfoort,city,NL,52.156389,5.389722,Amersfoort () is een stad en gemeente in het o...,u17bdwu,NL,https://upload.wikimedia.org/wikipedia/commons...,Koppelpoort_Night.jpg,Richywiseman,CC BY-SA 4.0,jpg,"Amersfoort, nestled in the eastern part of the...","[1, city, [history, monument]]\n",1.0,city,"history, monument"
2,2,227,Bergen op Zoom,city,NL,51.5,4.3,"Bergen op Zoom (; Bergs: Bèrrege, Frans: Berg-...",u15hcz7,NL,,,,,,Bergen op Zoom is a city in the westernmost pa...,"[1, city, [region, history]]\n",1.0,city,"region, history"


In [6]:
# Show the column labels of the loaded DataFrame.
df.columns

Index(['Unnamed: 0', 'id', 'title', 'type', 'region', 'latitude', 'longitude',
       'clean_text', 'geohash', 'country_code', 'thumbnail_link',
       'thumbnail_file_name', 'thumbnail_author', 'thumbnail_license',
       'thumbnail_file_extension', 'generated_text', 'category_output',
       'level', 'main_category', 'subcategories'],
      dtype='object')

# Making embeddings

## all-MiniLM-L6-v2
Starting with a small local model as a baseline comparison. It should be fast, but we will see how the quality is.

In [1]:
from sentence_transformers import SentenceTransformer
import json

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Baseline encoder: the small all-MiniLM-L6-v2 sentence-transformers model, bound to the notebook-level name `model`.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def generate_embeddings(df, text_column='generated_text', id_column='id', save_path="embeddings/", file_name="embeddings.json", encoder=None):
    """Embed one text column of ``df`` and write vectors plus metadata to a JSON file.

    Args:
        df: DataFrame holding the texts, the id column and (optionally) the
            metadata columns ``title``, ``longitude``, ``latitude``,
            ``main_category`` and ``subcategories``. Missing metadata columns
            are written as ``null``.
        text_column: Column whose values are passed to the encoder.
        id_column: Column copied into each record's ``"id"`` field.
        save_path: Output directory; created if it does not exist.
        file_name: Name of the JSON file written inside ``save_path``.
        encoder: Object exposing ``encode(texts, show_progress_bar=...)`` that
            returns one vector per text. Defaults to the notebook-level
            ``model`` so existing calls keep working.

    Returns:
        Path of the written JSON file.
    """
    if encoder is None:
        encoder = model  # fall back to the notebook-level model
    os.makedirs(save_path, exist_ok=True)

    def _json_safe(value):
        """Unwrap numpy scalars and map NaN to None so the output is valid JSON."""
        if type(value).__module__ == "numpy" and hasattr(value, "item"):
            value = value.item()
        if isinstance(value, float) and value != value:  # NaN is the only float != itself
            return None
        return value

    # Generate embeddings: one vector per text, in DataFrame row order
    embeddings = encoder.encode(df[text_column].tolist(), show_progress_bar=True)

    # Prepare data for JSON
    metadata_fields = ("title", "longitude", "latitude", "main_category", "subcategories")
    records = []
    # enumerate() yields the row *position*; the index label from iterrows()
    # is not a valid array offset once the frame has been filtered or re-indexed.
    for position, (_, row) in enumerate(df.iterrows()):
        records.append({
            "id": _json_safe(row[id_column]),
            "embedding": embeddings[position].tolist(),
            "metadata": {field: _json_safe(row.get(field)) for field in metadata_fields},
        })

    # Save to JSON file
    save_file = os.path.join(save_path, file_name)
    with open(save_file, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=4)

    print(f"Embeddings saved to {save_file}")
    return save_file

# Embed the `generated_text` column, write the JSON file, and keep the returned file path.
embedding_file = generate_embeddings(df, text_column='generated_text')


Batches:   0%|          | 0/1254 [00:00<?, ?it/s]

Embeddings saved to embeddings/embeddings.json


## all-mpnet-base-v2
A larger model, which should produce better embeddings than the small baseline above.

In [15]:
import os
import json
import pandas as pd

In [7]:
# Load the model with GPU support if available
import torch  # required for the CUDA check below; no earlier cell imports it

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device=device)

def generate_embeddings(df, text_column='generated_text', id_column='id', save_path="embeddings/", file_name="all-mpnet-base-v2_embeddings.json", encoder=None):
    """Embed one text column of ``df`` and write vectors plus metadata to a JSON file.

    The encoder is asked for a tensor, which is then moved to the CPU and
    converted to a NumPy array before serialisation.

    Args:
        df: DataFrame holding the texts, the id column and (optionally) the
            metadata columns ``title``, ``longitude``, ``latitude``,
            ``main_category`` and ``subcategories``. Missing metadata columns
            are written as ``null``.
        text_column: Column whose values are passed to the encoder.
        id_column: Column copied into each record's ``"id"`` field.
        save_path: Output directory; created if it does not exist.
        file_name: Name of the JSON file written inside ``save_path``.
        encoder: Object exposing
            ``encode(texts, show_progress_bar=..., convert_to_tensor=...)``
            whose result supports ``.cpu().numpy()``. Defaults to the
            notebook-level ``model`` so existing calls keep working.

    Returns:
        Path of the written JSON file.
    """
    if encoder is None:
        encoder = model  # fall back to the notebook-level model
    os.makedirs(save_path, exist_ok=True)

    def _json_safe(value):
        """Unwrap numpy scalars and map NaN to None so the output is valid JSON."""
        if type(value).__module__ == "numpy" and hasattr(value, "item"):
            value = value.item()
        if isinstance(value, float) and value != value:  # NaN is the only float != itself
            return None
        return value

    # Generate embeddings on the encoder's device, then bring them back to the CPU for saving
    embeddings = encoder.encode(df[text_column].tolist(), show_progress_bar=True, convert_to_tensor=True)
    embeddings = embeddings.cpu().numpy()

    # Prepare data for JSON
    metadata_fields = ("title", "longitude", "latitude", "main_category", "subcategories")
    records = []
    # enumerate() yields the row *position*; the index label from iterrows()
    # is not a valid array offset once the frame has been filtered or re-indexed.
    for position, (_, row) in enumerate(df.iterrows()):
        records.append({
            "id": _json_safe(row[id_column]),
            "embedding": embeddings[position].tolist(),
            "metadata": {field: _json_safe(row.get(field)) for field in metadata_fields},
        })

    # Save to JSON file
    save_file = os.path.join(save_path, file_name)
    with open(save_file, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=4)

    print(f"Embeddings saved to {save_file}")
    return save_file

# Report which kind of device is available for encoding
if not torch.cuda.is_available():
    print("Using CPU for embedding generation. Consider using a GPU for faster processing.")
else:
    print("Using GPU for embedding generation.")

# Run the embedding pipeline over the frame and keep the path of the written file
embedding_file = generate_embeddings(df=df, text_column='generated_text')

Using GPU for embedding generation.


Batches: 100%|██████████| 1254/1254 [04:51<00:00,  4.30it/s]


Embeddings saved to embeddings/all-mpnet-base-v2_embeddings.json


## NovaSearch/stella_en_1.5B_v5

https://huggingface.co/spaces/mteb/leaderboard

I considered using Google's embedding model, but people complain about it, and it is also surprisingly costly. It produces 768-dimensional embeddings, the same size as the model above.
I decided to run this model instead, which is currently ranked 7th on the leaderboard and produces 1024-dimensional embeddings.
If I were to generate the embeddings locally, it would take 5 days, so I am using Google Colab with a Tesla T4 GPU, where it takes 4.5 hours.
Querying afterwards will be faster and can be done locally.

In [21]:
from sentence_transformers import SentenceTransformer
import os
import json
import pandas as pd
import torch
import numpy as np  

In [25]:
import os
import json
import pandas as pd

def convert_space_separated_embedding(embedding_str):
    """Parse a bracketed, whitespace-separated vector string into a list of floats.

    Accepts strings such as ``"[0.1  0.2\\n 0.3]"``; surrounding whitespace and
    comma separators are tolerated as well.

    Args:
        embedding_str: Text form of the vector.

    Returns:
        The parsed floats, or an empty list when the value cannot be parsed,
        for example when it is not a string or contains a literal ``...``
        from a truncated array repr. A message is printed in that case.
    """
    shown = embedding_str  # what the error message previews if parsing fails
    try:
        # Drop outer whitespace first so the brackets are really at the ends,
        # then remove the brackets themselves.
        shown = embedding_str.strip().strip("[]")
        # Treat commas like whitespace so "[1, 2]" parses the same as "[1 2]".
        return [float(token) for token in shown.replace(",", " ").split()]
    except (AttributeError, TypeError, ValueError) as e:
        # str() keeps the handler itself from failing on non-string input.
        print(f"Error parsing embedding: {str(shown)[:50]}... -> {e}")
        return []  # Return empty list if conversion fails

def save_embeddings_to_json(df, embeddings_file, id_column='id', save_path="embeddings/", file_name="stella_en_1.5B_v5_embeddings.json"):
    """Join embeddings stored in a CSV onto ``df`` and write them as JSON records.

    Args:
        df: DataFrame with the id column and (optionally) the metadata columns
            ``title``, ``longitude``, ``latitude``, ``main_category`` and
            ``subcategories``.
        embeddings_file: CSV containing ``id_column`` and an ``embedding``
            column of space-separated vector strings.
        id_column: Key used to join ``df`` with the CSV.
        save_path: Output directory; created if it does not exist.
        file_name: Name of the JSON file written inside ``save_path``.

    Returns:
        Path of the written JSON file. Rows whose embedding is missing or
        cannot be parsed get an empty list, and a warning with their count is
        printed.
    """
    os.makedirs(save_path, exist_ok=True)

    def _json_safe(value):
        """Unwrap numpy scalars and map NaN to None so the output is valid JSON."""
        if type(value).__module__ == "numpy" and hasattr(value, "item"):
            value = value.item()
        if isinstance(value, float) and value != value:  # NaN is the only float != itself
            return None
        return value

    # Load embeddings from CSV; keep only the join key and the vector so other
    # CSV columns cannot collide with columns of `df` during the merge.
    embeddings_df = pd.read_csv(embeddings_file)[[id_column, 'embedding']]

    # Parse the space-separated strings. Missing cells become an empty list
    # rather than being stringified to "nan" and parsed as a one-element vector.
    embeddings_df = embeddings_df.assign(
        embedding=[
            [] if pd.isna(raw) else convert_space_separated_embedding(str(raw))
            for raw in embeddings_df['embedding']
        ]
    )

    # Merge embeddings with metadata
    merged = df.merge(embeddings_df, on=id_column, how='left')

    # Prepare data for JSON
    metadata_fields = ("title", "longitude", "latitude", "main_category", "subcategories")
    records = []
    empty_count = 0
    for _, row in merged.iterrows():
        embedding = row["embedding"]
        if not isinstance(embedding, list):
            embedding = []  # id had no match in the CSV (left merge yields NaN)
        if not embedding:
            empty_count += 1
        records.append({
            "id": _json_safe(row[id_column]),
            "embedding": embedding,
            "metadata": {field: _json_safe(row.get(field)) for field in metadata_fields},
        })

    if empty_count:
        print(f"Warning: {empty_count} of {len(records)} records have no usable embedding")

    # Save to JSON file
    save_file = os.path.join(save_path, file_name)
    with open(save_file, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=4)

    print(f"Embeddings saved to {save_file}")
    return save_file

# Example usage: join the embeddings stored in the CSV onto `df` and write them out as JSON.
# NOTE(review): the recorded output of this cell shows rows failing to parse because the CSV
# cells hold a truncated array repr containing a literal '...'; the CSV presumably has to be
# regenerated with the full vectors before this produces usable embeddings — TODO confirm.
embeddings_csv = "embeddings/embeddings.csv"
embedding_file = save_embeddings_to_json(df, embeddings_csv)


Error parsing embedding: -0.51359147  0.34303078  0.1313565  ... -0.3159386... -> could not convert string to float: '...'
Error parsing embedding: -0.22574395  0.8070596   0.20274551 ...  0.0463788... -> could not convert string to float: '...'
Error parsing embedding: -0.38148203  0.35940802  0.20563067 ... -0.3902692... -> could not convert string to float: '...'
Error parsing embedding: -0.42317134  0.2903743  -0.21283899 ...  0.3725879... -> could not convert string to float: '...'
Error parsing embedding:  0.13874565 -0.3041305   0.26780203 ... -0.4350951... -> could not convert string to float: '...'
Error parsing embedding: -0.54513705  0.27219346  0.08262409 ...  0.9977526... -> could not convert string to float: '...'
Error parsing embedding: -0.5016144   0.41484746  0.42675057 ... -0.0312602... -> could not convert string to float: '...'
Error parsing embedding: -0.7271013   0.5439979  -0.4557619  ...  0.0375515... -> could not convert string to float: '...'
Error parsing em

In [24]:
import pandas as pd

# Read the embeddings CSV and print its first five rows for a quick look
embeddings_df = pd.read_csv(filepath_or_buffer="embeddings/embeddings.csv")
print(embeddings_df.head(n=5))

   Unnamed: 0   id                                     generated_text  \
0           0   72  Arnhem, the capital of Gelderland, is a city w...   
1           1  112  Amersfoort, nestled in the eastern part of the...   
2           2  227  Bergen op Zoom is a city in the westernmost pa...   
3           3  345  Dorestad: A Thriving Medieval Trading Post\n\n...   
4           4  371  Drenthe, a province in the north of the Nether...   

                                           embedding  
0  [-0.51359147  0.34303078  0.1313565  ... -0.31...  
1  [-0.22574395  0.8070596   0.20274551 ...  0.04...  
2  [-0.38148203  0.35940802  0.20563067 ... -0.39...  
3  [-0.42317134  0.2903743  -0.21283899 ...  0.37...  
4  [ 0.13874565 -0.3041305   0.26780203 ... -0.43...  


In [28]:
# Look at the raw string stored for the first embedding to see exactly what the CSV contains.
embeddings_df['embedding'][0]

'[-0.51359147  0.34303078  0.1313565  ... -0.31593865  0.08821227\n  1.4397588 ]'

In [10]:
# Encode the first `generated_text` entry and cast the result to float32.
# NOTE(review): `model` is whatever the kernel currently holds; presumably the stella model
# loaded in a cell not shown here — TODO confirm.
query_vector = model.encode([df['generated_text'][0]]).astype("float32")
query_vector

array([[-0.51359266,  0.34304222,  0.13135456, ..., -0.31594902,
         0.08820565,  1.4397745 ]], shape=(1, 1024), dtype=float32)

In [13]:
# Export only the id and text columns. index=False keeps the row index out of
# the file, so reading it back does not add an extra "Unnamed: 0" column.
df[['id', 'generated_text']].to_csv('small_test.csv', index=False)

In [14]:
# Read 'small_test.csv' back into a separate DataFrame.
df2 = pd.read_csv('small_test.csv')