In [7]:
from FlagEmbedding import BGEM3FlagModel
import flatgeobuf as fgb 
import geopandas as gpd 
import numpy as np
from numpy.linalg import norm
import ast

def cos_sim(a, b):
    """
    Cosine similarity between two vectors.

    Evaluates ``(a @ b.T) / (norm(a) * norm(b))``: the dot product of the
    inputs divided by the product of their norms. ``norm`` is applied to each
    input as a whole (no axis argument), so the inputs are expected to be
    single vectors.

    Args:
        a: numpy array - first vector
        b: numpy array - second vector

    Returns:
        The quotient described above (1.0 for parallel vectors, 0.0 for
        orthogonal ones).
    """
    dot_product = a @ b.T
    magnitude = norm(a) * norm(b)
    return dot_product / magnitude

# Instantiate the embedding model from the 'BAAI/bge-m3' identifier; it is used below
# through model.encode(...)['dense_vecs'] to embed the query text.
# use_fp16=True presumably selects half-precision inference - confirm against the FlagEmbedding docs.
model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [8]:
import numpy as np
from sklearn.random_projection import GaussianRandomProjection

def reduce_to_2d(vector, random_state=42):
    """
    Project a high-dimensional vector down to two coordinates with a
    Gaussian Random Projection.

    Attention: random_state needs to match the value used when writing the
    data, otherwise the resulting coordinates are not comparable.

    Args:
        vector: numpy array or list - Input vector to reduce (flattened into a single row)
        random_state: int - Random seed for reproducibility

    Returns:
        tuple (x, y) - 2D coordinates of the reduced vector
    """
    # sklearn expects a 2D (samples x features) array, so present the vector as one row
    as_row = np.array(vector).reshape(1, -1)

    # A new projection is fitted on every call; the fixed seed is what keeps
    # the result reproducible between calls
    projector = GaussianRandomProjection(n_components=2, random_state=random_state)

    # The single projected row holds exactly the two output coordinates
    x, y = projector.fit_transform(as_row)[0]
    return x, y

# Quick check of the helper on a small 7-element example vector (default random_state).
reduce_to_2d([0,1,2,3,4,9,5])

(7.480072735693709, -7.089641506373143)

In [9]:
def create_bbox_from_point(coords, span=0.1):
    """
    Build a square bounding box centred on a point.

    Args:
        coords (tuple): Center point coordinates (x, y)
        span (float): Distance from the center to each edge of the box; it is
                     subtracted from / added to both coordinates

    Returns:
        tuple: Bounding box coordinates (min_x, min_y, max_x, max_y)
    """
    center_x, center_y = coords
    lower_corner = (center_x - span, center_y - span)
    upper_corner = (center_x + span, center_y + span)
    return lower_corner + upper_corner

# Quick check of the helper using the default span of 0.1.
create_bbox_from_point((-1.058773621913573, -0.49202262345718906))

(-1.1587736219135731,
 -0.592022623457189,
 -0.9587736219135731,
 -0.3920226234571891)

In [34]:
# Alternative example queries - uncomment one to try it.
#QUERY = "enjoying nature, plants and vegetation"
#QUERY = "eating tasty food at the table, enjoying dinner"
#QUERY = "reading about the industrial revolution"
QUERY = "observing animals in the wild"

# Embed the query, project it to 2D and build the search window around that point.
query_emb = model.encode(QUERY, batch_size=12, max_length=2048)['dense_vecs']
query_2d = reduce_to_2d(query_emb)
query_bbox = create_bbox_from_point(query_2d, 0.3)

# Retrieve only the parts of the file that fall inside the bbox (from disk or via http request).
with open("alice.fgb", "rb") as f:
    features = [feature for feature in fgb.Reader(f, bbox=query_bbox)]

# Without this guard an empty result fails later with an opaque KeyError: 'properties'.
if not features:
    raise ValueError(f"No features found inside bbox {query_bbox}; try a larger span.")

# Convert the query embedding once instead of once per retrieved row.
query_vec = np.array(query_emb)

gdf = gpd.GeoDataFrame(features)
properties = gdf["properties"]
gdf["text"] = properties.apply(lambda props: props["text"])
# Dirty trick: fgb does not support arrays as a type (but json, need to explore), so the
# embedding arrives as text; double spaces are turned into commas so ast.literal_eval can parse it.
gdf["cosine_similarity"] = properties.apply(
    lambda props: cos_sim(
        np.array(ast.literal_eval(props["embeddings"].replace("  ", ","))),
        query_vec,
    )
)
del gdf["properties"]

print("Retrieved datapoints: ", len(gdf))
# Rebind instead of sorting in place; gdf stays sorted (best match first) for later cells.
gdf = gdf.sort_values("cosine_similarity", ascending=False)
gdf

Retrieved datapoints:  25


Unnamed: 0,type,geometry,text,cosine_similarity
24,Feature,POINT (-1.6834 -1.09001),birds and animals that,0.728023
17,Feature,POINT (-1.3311 -0.92575),"wandering, when a sharp",0.680383
15,Feature,POINT (-1.31957 -1.06279),"of it; so, after hunting",0.633657
8,Feature,POINT (-1.17947 -0.82297),thoughtfully at the mushroom,0.581669
10,Feature,POINT (-1.11244 -0.95903),the fan and the pair,0.571449
23,Feature,POINT (-1.5916 -0.98052),"mushroom, and her eyes",0.569117
6,Feature,POINT (-1.14737 -0.76465),"growing near her, about",0.568814
4,Feature,POINT (-1.33792 -0.73858),was walking hand in hand,0.562929
22,Feature,POINT (-1.47002 -0.97506),back the wandering hair,0.562181
2,Feature,POINT (-1.34482 -0.77),"the mushroom, and crawled",0.55456


In [35]:
# Print the first five rows of gdf (text and cosine_similarity columns only) as a Markdown table.
print(gdf[["text","cosine_similarity"]].head(5).to_markdown())

|    | text                         |   cosine_similarity |
|---:|:-----------------------------|--------------------:|
| 24 | birds and animals that       |            0.728023 |
| 17 | wandering, when a sharp      |            0.680383 |
| 15 | of it; so, after hunting     |            0.633657 |
|  8 | thoughtfully at the mushroom |            0.581669 |
| 10 | the fan and the pair         |            0.571449 |


In [None]:
# NOTE(review): this cell repeats the bbox retrieval cell with a much larger span;
# consider factoring the shared steps into one function that takes the span as a parameter.
QUERY = "observing animals in the wild"
query_emb = model.encode(QUERY, batch_size=12, max_length=2048)['dense_vecs']
query_2d = reduce_to_2d(query_emb)
query_bbox = create_bbox_from_point(query_2d,100) # very large value, all data points are in here!

# Read every feature that falls inside the (very large) bbox.
with open("alice.fgb", "rb") as f:
    features = [feature for feature in fgb.Reader(f, bbox=query_bbox)]

# Without this guard an empty result fails later with an opaque KeyError: 'properties'.
if not features:
    raise ValueError(f"No features found inside bbox {query_bbox}; try a larger span.")

# Convert the query embedding once instead of once per retrieved row.
query_vec = np.array(query_emb)

gdf = gpd.GeoDataFrame(features)
properties = gdf["properties"]
gdf["text"] = properties.apply(lambda props: props["text"])
# The embedding arrives as text; double spaces are turned into commas so
# ast.literal_eval can parse it into a list before the similarity is computed.
gdf["cosine_similarity"] = properties.apply(
    lambda props: cos_sim(
        np.array(ast.literal_eval(props["embeddings"].replace("  ", ","))),
        query_vec,
    )
)
del gdf["properties"]

print("Retrieved datapoints: ", len(gdf))
# Display a sorted copy (best match first); gdf itself keeps the retrieval order.
gdf.sort_values("cosine_similarity",ascending=False)

Retrieved datapoints:  2905


Unnamed: 0,type,geometry,text,cosine_similarity
2278,Feature,POINT (-0.91857 -0.01612),the look of the creature,0.739605
2112,Feature,POINT (-1.39575 0.02426),up by wild beasts and,0.728607
2902,Feature,POINT (-1.6834 -1.09001),birds and animals that,0.728023
1609,Feature,POINT (-0.94805 0.33132),Turtle in the distance,0.721267
2596,Feature,POINT (-0.67203 -0.5654),Rabbit coming to look,0.720095
...,...,...,...,...
790,Feature,POINT (0.63945 0.67866),"” she said to herself, (not in a very hopeful ...",0.291587
475,Feature,POINT (0.49955 -0.22684),Then again—‘before she had this fit—’ you neve...,0.286018
516,Feature,POINT (1.27404 -0.60829),"“If you didn’t sign it,” said the King, “that ...",0.285998
2506,Feature,POINT (-0.27514 -0.269),Pennyworth only of beautiful Soup?,0.274283
