In [1]:
import json
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance
from tqdm import tqdm
from httpx import ReadTimeout
import time
from sentence_transformers import SentenceTransformer



# Qdrat

## all-MiniLM-L6-v2_embeddings.json

In [3]:
# 1. Load the JSON File
with open('embeddings/all-MiniLM-L6-v2_embeddings.json', 'r') as f:
    data = json.load(f)

In [4]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import time
from httpx import ReadTimeout

# 2. Initialize Qdrant Client
client = QdrantClient(host="localhost", port=6333)

# 3. Create Collection in Qdrant
collection_name = "all-MiniLM-L6-v2_collection"
vector_dim = len(data[0]['embedding'])  # Dimension of the embeddings

client.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=vector_dim, distance=Distance.COSINE),
)


def upload_batch(client, collection_name, points, batch_index, total_batches, retries=3):
    """Uploads a single batch to Qdrant with retry logic."""
    for attempt in range(retries):
        try:
            client.upsert(collection_name=collection_name, points=points)
            return True  # Upload successful
        except ReadTimeout:
            print(f"Timeout for batch {batch_index}/{total_batches}. Retrying ({attempt + 1}/{retries})...")
            time.sleep(2)  # Wait before retrying
    print(f"Failed to upload batch {batch_index}/{total_batches} after {retries} attempts.")
    return False  # Upload failed after retries


def upload_to_qdrant(data, collection_name, client, batch_size=50, max_workers=8):
    """Uploads data to Qdrant in parallel using multiple threads."""
    num_batches = (len(data) + batch_size - 1) // batch_size  # Total number of batches
    batches = [
        data[i:i + batch_size]
        for i in range(0, len(data), batch_size)
    ]
    
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        with tqdm(total=num_batches, desc="Uploading to Qdrant") as pbar:
            for batch_index, batch in enumerate(batches, start=1):
                points = [
                    {
                        "id": item["id"],
                        "vector": item["embedding"],
                        "payload": item["metadata"],
                    }
                    for item in batch
                ]
                # Submit each batch upload task to the executor
                futures.append(
                    executor.submit(upload_batch, client, collection_name, points, batch_index, num_batches)
                )
            
            # Process completed tasks and update the progress bar
            for future in as_completed(futures):
                pbar.update(1)  # Increment progress bar for each completed task
                if not future.result():
                    print("A batch failed to upload.")
    
    print(f"Data uploaded to Qdrant collection: {collection_name}")


# Upload embeddings to Qdrant using parallel threads
upload_to_qdrant(data, collection_name, client, batch_size=50, max_workers=4)


  client.recreate_collection(
Uploading to Qdrant: 100%|██████████| 803/803 [06:53<00:00,  1.94it/s]

Data uploaded to Qdrant collection: all-MiniLM-L6-v2_collection





In [5]:

# Initialize the embedding model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Function to Query Qdrant with Text
def query_qdrant_with_text(client, collection_name, query_text, top_k=3):
    # Step 1: Convert query text to a vector
    query_vector = model.encode(query_text)

    # Step 2: Search the Qdrant collection
    search_result = client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=top_k,
    )

    # Step 3: Return search results
    return search_result

# Example Usage
query_text = "cool architecture in Amsterdam"
results = query_qdrant_with_text(client, collection_name, query_text, top_k=15)

# Display Results
print(f"Query: {query_text}\n")
for result in results:
    print(f"Result ID: {result.id}, Score: {result.score}")
    print(f"Metadata: {result.payload['title'],result.payload['main_category']}\n")

  search_result = client.search(


Query: cool architecture in Amsterdam

Result ID: 4823183, Score: 0.7274457
Metadata: ('Lijnbaansgracht 18A-C', 'building')

Result ID: 5395888, Score: 0.70786786
Metadata: ('Nieuwe Looiersstraat 68-70', 'building')

Result ID: 2287299, Score: 0.7032949
Metadata: ('Huize Sint-Jan', 'monument')

Result ID: 5433672, Score: 0.69894695
Metadata: ('De Scheppende Hand & Chaos', 'statue')

Result ID: 3838800, Score: 0.6983609
Metadata: ('Marisstraat', 'street')

Result ID: 3838806, Score: 0.688607
Metadata: ('Mesdagplein', 'square')

Result ID: 3843778, Score: 0.6884556
Metadata: ('Turfstraat 28a-32 (Baarn)', 'building')

Result ID: 4818690, Score: 0.68700856
Metadata: ('Paviljoen Noorderplantsoen', 'building')

Result ID: 5226629, Score: 0.68608326
Metadata: ('Memories (Bastardilla)', 'street art')

Result ID: 1375486, Score: 0.685442
Metadata: ('Kruiskerk van Delfzijl', 'church')

Result ID: 5679631, Score: 0.68539655
Metadata: ('Lycka', 'building')

Result ID: 5641956, Score: 0.68520623
Me

In [2]:
import folium
from IPython.display import display

def display_map_inline(results):
    # Initialize a map centered around the first result's coordinates
    first_location = results[0].payload
    map_center = [first_location['latitude'], first_location['longitude']]
    map_object = folium.Map(location=map_center, zoom_start=12)

    # Add markers for each result
    for result in results:
        metadata = result.payload
        title = metadata.get('title', 'Unknown')
        latitude = metadata['latitude']
        longitude = metadata['longitude']
        main_category = metadata.get('main_category', 'N/A')
        subcategories = metadata.get('subcategories', 'N/A')

        # Add a marker with a popup
        popup_content = f"<b>{title}</b><br>Main Category: {main_category}<br>Subcategories: {subcategories}"
        folium.Marker(
            location=[latitude, longitude],
            popup=popup_content,
            tooltip=title,
        ).add_to(map_object)

    # Display the map directly in the notebook
    display(map_object)
    return map_object

# Example: Generate and display map
#map_object = display_map_inline(results)


# FAISS
Trying FAISS if it is easier to use

## all-MiniLM-L6-v2

In [7]:
import json
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

In [12]:
# Load embeddings from JSON file
embeddings_file = "embeddings/all-MiniLM-L6-v2_embeddings.json"
with open(embeddings_file, "r") as f:
    embeddings_data = json.load(f)

# Extract embeddings and IDs
embeddings = np.array([item["embedding"] for item in embeddings_data], dtype="float32")
ids = [item["id"] for item in embeddings_data]

# Load the metadata DataFrame
metadata_file = "nl_full_all_columns.csv"
metadata_df = pd.read_csv(metadata_file)

filtered_metadata_df = metadata_df[metadata_df["id"].isin(ids)].set_index("id")


In [17]:
# Normalize embeddings for cosine similarity
def normalize(vectors):
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    return vectors / norms

embeddings_normalized = normalize(embeddings)

# Create a FAISS index for cosine similarity
dimension = embeddings_normalized.shape[1]  # Size of embedding vectors
index = faiss.IndexFlatIP(dimension)  # Inner product for cosine similarity
index.add(embeddings_normalized)

# Save the index to disk (optional)
faiss.write_index(index, "embeddings/all-MiniLM-L6-v2_faiss_index.index")


In [4]:
def query_faiss(index, query_text, model, metadata_df, ids, top_k=5):
    """
    Query the FAISS index and retrieve limited metadata.

    Args:
        index: FAISS index instance.
        query_text: Text query for similarity search.
        model: SentenceTransformer instance for generating query embeddings.
        metadata_df: Pandas DataFrame containing metadata indexed by ID.
        ids: List of IDs corresponding to embeddings.
        top_k: Number of top results to retrieve.

    Returns:
        List of results containing ID, score, and selected metadata fields.
    """
    # Step 1: Convert query text to vector
    query_vector = model.encode([query_text]).astype("float32")
    query_vector = query_vector / np.linalg.norm(query_vector, axis=1, keepdims=True)  # Normalize for cosine similarity

    # Step 2: Search FAISS index
    distances, indices = index.search(query_vector, top_k)

    # Step 3: Retrieve limited metadata for results
    results = []
    for idx, dist in zip(indices[0], distances[0]):
        if idx == -1:  # No result
            continue
        result_id = ids[idx]
        metadata_row = metadata_df.loc[result_id]
        metadata = {
            "title": metadata_row["title"],
            "main_category": metadata_row["main_category"],
            "subcategories": metadata_row["subcategories"],
            "latitude": metadata_row["latitude"],
            "longitude": metadata_row["longitude"],
            "generated_text": metadata_row["generated_text"],
        }
        results.append({
            "id": result_id,
            "score": dist,
            "metadata": metadata,
        })
    return results

In [15]:
# Load the Sentence Transformer model
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")



# Example Query
query_text = "modern architecture in Rotterdam"
top_k = 8
results = query_faiss(index, query_text, model, filtered_metadata_df, ids, top_k)

# Display Results
print(f"Query: {query_text}\n")
for result in results:
    print(f"Result ID: {result['id']}, Score: {result['score']}")
    print(f"Metadata: {result['metadata']}\n")


Query: modern architecture in Rotterdam

Result ID: 5134637, Score: 0.7322934865951538
Metadata: {'title': 'Up:Town', 'main_category': 'skyscraper', 'subcategories': 'unique architecture, city', 'latitude': np.float64(51.916575), 'longitude': np.float64(4.486402777777778), 'generated_text': "Let's explore the Up:Town building, a striking addition to Rotterdam's skyline.  Standing at 107 meters tall, it's a significant structure on the Wijnhaveneiland.  Completed in late 2018,  this impressive building boasts 34 floors, housing a mix of rental and ownership apartments.  The top eight floors offer luxury apartments for purchase, while the rest are available for rent, providing a variety of living options within this modern high-rise.  Its design seamlessly blends residential space with the vibrant urban environment of Rotterdam. The building is a true testament to modern architecture, and contributes significantly to Rotterdam's ever-evolving cityscape.\n"}

Result ID: 459691, Score: 0.7

In [11]:
import folium
from IPython.display import display

def display_map_inline(results):
    """
    Display a map with markers for the retrieved locations.

    Args:
        results: List of query results from FAISS, each containing:
                 - id
                 - score
                 - metadata with fields: title, main_category, subcategories, latitude, longitude.
    Returns:
        A Folium map object.
    """
    # Ensure results exist
    if not results:
        print("No results to display.")
        return None

    # Initialize a map centered around the first result's coordinates
    first_location = results[0]['metadata']
    map_center = [first_location['latitude'], first_location['longitude']]
    map_object = folium.Map(location=map_center, zoom_start=12)

    # Add markers for each result
    for result in results:
        metadata = result['metadata']
        title = metadata.get('title', 'Unknown')
        latitude = metadata['latitude']
        longitude = metadata['longitude']
        main_category = metadata.get('main_category', 'N/A')
        subcategories = metadata.get('subcategories', 'N/A')

        # Add a marker with a popup
        popup_content = (
            f"<b>{title}</b><br>"
            f"Main Category: {main_category}<br>"
            f"Subcategories: {subcategories}"
        )
        folium.Marker(
            location=[latitude, longitude],
            popup=popup_content,
            tooltip=title,
        ).add_to(map_object)

    # Display the map directly in the notebook
    display(map_object)
    return map_object

In [16]:


# Example: Generate and display map
map_object = display_map_inline(results)


## all-mpnet-base-v2

In [6]:
import json
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

In [7]:
# Load embeddings from JSON file
embeddings_file = "embeddings/all-mpnet-base-v2_embeddings.json"
with open(embeddings_file, "r") as f:
    embeddings_data = json.load(f)

# Extract embeddings and IDs
embeddings = np.array([item["embedding"] for item in embeddings_data], dtype="float32")
ids = [item["id"] for item in embeddings_data]

# Load the metadata DataFrame
metadata_file = "nl_full_all_columns.csv"
metadata_df = pd.read_csv(metadata_file)

filtered_metadata_df = metadata_df[metadata_df["id"].isin(ids)].set_index("id")

# Normalize embeddings for cosine similarity
def normalize(vectors):
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    return vectors / norms

embeddings_normalized = normalize(embeddings)

# Create a FAISS index for cosine similarity
dimension = embeddings_normalized.shape[1]  # Size of embedding vectors
index = faiss.IndexFlatIP(dimension)  # Inner product for cosine similarity
index.add(embeddings_normalized)

# Save the index to disk (optional)
faiss.write_index(index, "embeddings/all-mpnet-base-v2_faiss_index.index")



In [8]:
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Example Query
query_text = "What should a Rembrandt lover see in Leiden"
top_k = 20
results = query_faiss(index, query_text, model, filtered_metadata_df, ids, top_k)

# Display Results
print(f"Query: {query_text}\n")
for result in results:
    print(f"Result ID: {result['id']}, Score: {result['score']}")
    print(f"Metadata: {result['metadata']}\n")


Query: What should a Rembrandt lover see in Leiden

Result ID: 549276, Score: 0.7298647165298462
Metadata: {'title': 'Museum De Lakenhal', 'main_category': 'museum', 'subcategories': 'art, history', 'latitude': np.float64(52.16305555555555), 'longitude': np.float64(4.4875), 'generated_text': 'Step into Museum De Lakenhal, Leiden\'s municipal art museum since 1874. Housed in a stunning 1640 building, formerly the city\'s cloth hall, the museum beautifully blends Leiden\'s rich history with its remarkable art collection.  The museum\'s collection is primarily focused on Leiden\'s history, showcasing pieces depicting pivotal moments such as the Siege of Leiden in 1573-1574 and its subsequent relief.  You\'ll find masterpieces by renowned artists, including Rembrandt van Rijn, whose early work, "The Brillenverkoper,"  is one of the museum’s treasures.  Also on display are significant works by Lucas van Leyden, Jan Steen, Gerrit Dou, and many other significant Leiden artists.  The museum al

In [12]:
t = display_map_inline(results)

## NovaSearch/stella_en_1.5B_v5

In [7]:
import json
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer


In [8]:
# Load embeddings from JSON file
embeddings_file = "embeddings/stella_en_1_5B_v5_embeddings.json"
with open(embeddings_file, "r") as f:
    embeddings_data = json.load(f)

# Extract embeddings and IDs
embeddings = np.array([item["embedding"] for item in embeddings_data], dtype="float32")
ids = [item["id"] for item in embeddings_data]

# Load the metadata DataFrame
metadata_file = "nl_full_all_columns.csv"
metadata_df = pd.read_csv(metadata_file)

filtered_metadata_df = metadata_df[metadata_df["id"].isin(ids)].set_index("id")

# Normalize embeddings for cosine similarity
def normalize(vectors):
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    return vectors / norms

embeddings_normalized = normalize(embeddings)

# Create a FAISS index for cosine similarity
dimension = embeddings_normalized.shape[1]  # Size of embedding vectors
index = faiss.IndexFlatIP(dimension)  # Inner product for cosine similarity
index.add(embeddings_normalized)

# Save the index to disk (optional)
faiss.write_index(index, "embeddings/stella_en_1_5B_v5_embeddings_faiss_index.index")



In [12]:
model = SentenceTransformer("NovaSearch/stella_en_1.5B_v5")

# Example Query
query_text = "brutalist architecture Amsterdam"
top_k = 20
results = query_faiss(index, query_text, model, filtered_metadata_df, ids, top_k)

# Display Results
print(f"Query: {query_text}\n")
for result in results:
    print(f"Result ID: {result['id']}, Score: {result['score']}")
    print(f"Metadata: {result['metadata']}\n")


Query: brutalist architecture Amsterdam

Result ID: 1765510, Score: 0.6337276697158813
Metadata: {'title': 'ARCAM', 'main_category': 'museum', 'subcategories': 'art, architecture', 'latitude': np.float64(52.37145), 'longitude': np.float64(4.912647222222223), 'generated_text': "ARCAM, the Amsterdam Centre for Architecture, is a fascinating institution dedicated to architecture, urban planning, and landscape architecture in and around Amsterdam.  Founded in 1985 by Maarten Kloos, it arose from a need to coordinate the fragmented knowledge within the architectural field in the city.  Initially located in the Academy of Architecture building, its growth necessitated a move to a pavilion near the NEMO Science Museum in the late 1990s, where it remains today.\n\nARCAM’s activities are diverse and engaging.  They’ve overseen projects ranging from creating comprehensive maps charting the architectural landscape, to organizing exhibitions and debates focusing on Amsterdam's future development –