In [4]:
import pandas as pd
import pandas as pd
import numpy as np
import os

# Data loading

In [5]:
# Read the dataset from a CSV file located in the current working directory
df = pd.read_csv('nl_full_all_columns.csv')
# Display the first three rows as a quick sanity check of the load
df.head(3)

Unnamed: 0.1,Unnamed: 0,id,title,type,region,latitude,longitude,clean_text,geohash,country_code,thumbnail_link,thumbnail_file_name,thumbnail_author,thumbnail_license,thumbnail_file_extension,generated_text,category_output,level,main_category,subcategories
0,0,72,Arnhem,city,NL,51.983333,5.916667,"Arnhem (uitspraak: of , Arnhems: Ernem) is ee...",u1hpwxk,NL,https://upload.wikimedia.org/wikipedia/commons...,23_april_2016_CS_Arnhem.jpg,Hellendijk81,CC BY-SA 4.0,jpg,"Arnhem, the capital of Gelderland, is a city w...","[1, city, [region, history]]\n",1.0,city,"region, history"
1,1,112,Amersfoort,city,NL,52.156389,5.389722,Amersfoort () is een stad en gemeente in het o...,u17bdwu,NL,https://upload.wikimedia.org/wikipedia/commons...,Koppelpoort_Night.jpg,Richywiseman,CC BY-SA 4.0,jpg,"Amersfoort, nestled in the eastern part of the...","[1, city, [history, monument]]\n",1.0,city,"history, monument"
2,2,227,Bergen op Zoom,city,NL,51.5,4.3,"Bergen op Zoom (; Bergs: Bèrrege, Frans: Berg-...",u15hcz7,NL,,,,,,Bergen op Zoom is a city in the westernmost pa...,"[1, city, [region, history]]\n",1.0,city,"region, history"


In [6]:
# Show the column labels of the loaded frame
df.columns

Index(['Unnamed: 0', 'id', 'title', 'type', 'region', 'latitude', 'longitude',
       'clean_text', 'geohash', 'country_code', 'thumbnail_link',
       'thumbnail_file_name', 'thumbnail_author', 'thumbnail_license',
       'thumbnail_file_extension', 'generated_text', 'category_output',
       'level', 'main_category', 'subcategories'],
      dtype='object')

# Making embeddings

## all-MiniLM-L6-v2
We start with a small local model as a baseline for comparison. It should be fast; the open question is how good its embeddings are.

In [1]:
from sentence_transformers import SentenceTransformer
import json

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the all-MiniLM-L6-v2 checkpoint; generate_embeddings below reads this module-level `model`
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def generate_embeddings(df, text_column='generated_text', id_column='id', save_path="embeddings/", file_name="embeddings.json", encoder=None):
    """Embed one text column of ``df`` and write vectors plus metadata to a JSON file.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the texts and ids. The columns ``title``, ``longitude``,
        ``latitude``, ``main_category`` and ``subcategories`` are copied into
        each record's ``metadata``; a column that is absent is stored as null.
    text_column : str
        Name of the column whose values are embedded.
    id_column : str
        Name of the column used as each record's ``id``.
    save_path : str
        Output directory; created if it does not exist.
    file_name : str
        Name of the JSON file written inside ``save_path``.
    encoder : object, optional
        Object exposing ``encode(texts, show_progress_bar=...)`` that returns one
        vector per text. When omitted, the notebook-level ``model`` is used.

    Returns
    -------
    str
        Path of the JSON file that was written.
    """
    if encoder is None:
        # Backward-compatible default: fall back to the global set up in the notebook.
        encoder = model

    os.makedirs(save_path, exist_ok=True)

    # One embedding per row, in row order
    embeddings = encoder.encode(df[text_column].tolist(), show_progress_bar=True)

    # Pair rows with embeddings by POSITION. The index labels yielded by
    # iterrows() are not positions once the frame has been filtered, sorted or
    # re-indexed, so they must not be used to index the embedding array.
    records = []
    for position, (_, row) in enumerate(df.iterrows()):
        records.append({
            "id": row[id_column],
            "embedding": embeddings[position].tolist(),
            "metadata": {
                "title": row.get("title"),
                "longitude": row.get("longitude"),
                "latitude": row.get("latitude"),
                "main_category": row.get("main_category"),
                "subcategories": row.get("subcategories"),
            },
        })

    # NOTE: json.dump's default allow_nan=True writes missing values as the bare
    # token NaN, which strict JSON parsers reject.
    save_file = os.path.join(save_path, file_name)
    with open(save_file, "w") as f:
        json.dump(records, f, indent=4)

    print(f"Embeddings saved to {save_file}")
    return save_file

# Embed the 'generated_text' column and keep the path of the JSON file that was written
embedding_file = generate_embeddings(df, text_column='generated_text')


Batches:   0%|          | 0/1254 [00:00<?, ?it/s]

Embeddings saved to embeddings/embeddings.json


## all-mpnet-base-v2
A larger model, which should produce higher-quality embeddings than the MiniLM baseline.

In [2]:
import pandas as pd 
import torch

In [7]:
# Prefer the GPU when torch can see one; otherwise run on the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Instantiate the larger mpnet checkpoint on the chosen device
model = SentenceTransformer(
    'sentence-transformers/all-mpnet-base-v2',
    device=device,
)

def generate_embeddings(df, text_column='generated_text', id_column='id', save_path="embeddings/", file_name="all-mpnet-base-v2_embeddings.json", encoder=None):
    """Embed one text column of ``df`` and write vectors plus metadata to a JSON file.

    The encoder is asked for a tensor, which is then moved to the CPU and
    converted to a NumPy array before serialisation.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the texts and ids. The columns ``title``, ``longitude``,
        ``latitude``, ``main_category`` and ``subcategories`` are copied into
        each record's ``metadata``; a column that is absent is stored as null.
    text_column : str
        Name of the column whose values are embedded.
    id_column : str
        Name of the column used as each record's ``id``.
    save_path : str
        Output directory; created if it does not exist.
    file_name : str
        Name of the JSON file written inside ``save_path``.
    encoder : object, optional
        Object exposing ``encode(texts, show_progress_bar=..., convert_to_tensor=...)``
        whose result supports ``.cpu().numpy()``. When omitted, the
        notebook-level ``model`` is used.

    Returns
    -------
    str
        Path of the JSON file that was written.
    """
    if encoder is None:
        # Backward-compatible default: fall back to the global set up in the notebook.
        encoder = model

    os.makedirs(save_path, exist_ok=True)

    # Encode on the encoder's device, then bring the result back to host memory
    embeddings = encoder.encode(df[text_column].tolist(), show_progress_bar=True, convert_to_tensor=True)
    embeddings = embeddings.cpu().numpy()

    # Pair rows with embeddings by POSITION. The index labels yielded by
    # iterrows() are not positions once the frame has been filtered, sorted or
    # re-indexed, so they must not be used to index the embedding array.
    records = []
    for position, (_, row) in enumerate(df.iterrows()):
        records.append({
            "id": row[id_column],
            "embedding": embeddings[position].tolist(),
            "metadata": {
                "title": row.get("title"),
                "longitude": row.get("longitude"),
                "latitude": row.get("latitude"),
                "main_category": row.get("main_category"),
                "subcategories": row.get("subcategories"),
            },
        })

    # NOTE: json.dump's default allow_nan=True writes missing values as the bare
    # token NaN, which strict JSON parsers reject.
    save_file = os.path.join(save_path, file_name)
    with open(save_file, "w") as f:
        json.dump(records, f, indent=4)

    print(f"Embeddings saved to {save_file}")
    return save_file

# Tell the reader which device the encoding is about to run on
print(
    "Using GPU for embedding generation."
    if torch.cuda.is_available()
    else "Using CPU for embedding generation. Consider using a GPU for faster processing."
)

# Embed the 'generated_text' column and keep the path of the written JSON file
embedding_file = generate_embeddings(df, text_column='generated_text')

Using GPU for embedding generation.


Batches: 100%|██████████| 1254/1254 [04:51<00:00,  4.30it/s]


Embeddings saved to embeddings/all-mpnet-base-v2_embeddings.json
