# Analyse Embeddings from AirBnB data to see if they can predict gentrification (using the Manchester Gentrification Index)

  - Data from [Inside AirBnB](https://insideairbnb.com/)
  - [Manchester gentrification index](https://www.common-wealth.org/interactive/the-greater-manchester-gentrification-index)

## Imports

In [None]:
import os
import urllib
import urllib.error
import urllib.request
import time
import folium
import pickle

import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import branca
import branca.colormap as cm
from shapely.geometry import Point
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KernelDensity
from sklearn.model_selection import train_test_split, GridSearchCV
import folium, rasterio
from rasterio.transform import from_bounds


from sentence_transformers import SentenceTransformer



## Data

Data for the Greater Manchester Gentrification Index were provided privately. They have not been included in the github repository.

Airbnb data for Manchester are from [Inside AirBnB](https://insideairbnb.com/get-the-data/). Specifically four different snapshots:
  - 2024-03-28
  - 2024-06-26
  - 2024-09-23
  - 2024-12-25

The code will try to download the data if they don't exist.

From each snapshot, we use three files:
   - `listings.csv.gz`
   - `reviews.csv.gz`
   - `neighbourhoods.geojson`

The downloaded files need to be placed in a directory called [../data/airbnb-manchester](../data/airbnb-manchester) .

### Download and prepare AirBnB data

In [None]:
# Download the Inside Airbnb snapshots for Greater Manchester.
#
# Each snapshot date gets its own sub-directory of `data_dir`. Files are
# checked one by one (rather than skipping a date as soon as its directory
# exists) so that a snapshot left incomplete by a failed download is repaired
# the next time this cell runs.
dates = [ '2024-03-28', '2024-06-26', '2024-09-23', '2024-12-25']
files = ["listings.csv.gz", "reviews.csv.gz"]
data_dir = os.path.join("..", "data", "airbnb-manchester")
root_url = "https://data.insideairbnb.com/united-kingdom/england/greater-manchester/"
neighbourhoods_url = "https://data.insideairbnb.com/united-kingdom/england/greater-manchester/2024-12-25/visualisations/neighbourhoods.geojson"

for d in dates:
    snapshot_dir = os.path.join(data_dir, d)
    os.makedirs(snapshot_dir, exist_ok=True)
    missing = [f for f in files
               if not os.path.isfile(os.path.join(snapshot_dir, f))]
    if not missing:
        print(f"Directory {d} already exists.")
        continue
    print(f"Downloading data for {d} ...")
    for f in missing:
        url = f"{root_url}{d}/data/{f}"
        target = os.path.join(snapshot_dir, f)
        try:
            print(f"\tDownloading {f} from {url} ", end="")
            urllib.request.urlretrieve(url, target)
            print("...done.")
        except urllib.error.URLError as e:
            # URLError is the base class of HTTPError and of the error raised
            # for truncated downloads, so network failures are reported too.
            print(f"Error downloading {f} for {d}: {e}")
            # Do not leave a partial file behind: it would be mistaken for a
            # complete download on the next run.
            if os.path.exists(target):
                os.remove(target)
        # Sleep for a few seconds so not to abuse their server
        time.sleep(3)

# Finally get the neighbourhoods file, if we haven't done so already
neighbourhoods_path = os.path.join(data_dir, "neighbourhoods.geojson")
if not os.path.isfile(neighbourhoods_path):
    try:
        print(f"Downloading neighbourhoods from {neighbourhoods_url} ", end="")
        urllib.request.urlretrieve(neighbourhoods_url, neighbourhoods_path)
        print(" ...done.")
    except urllib.error.URLError as e:
        print(f"Error downloading neighbourhoods: {e}")
        if os.path.exists(neighbourhoods_path):
            os.remove(neighbourhoods_path)

Open the listings files and create a single pandas dataframe.

Note that I drop rows that are identical, but there are still some properties that are duplicated because the description, name, or neighbourhood overview change. I leave them in for now but could just drop those with identical `host_id`, keeping only the most recent one.


In [None]:
# Read the listings file of every snapshot and stack them into one frame.
dfs = [pd.read_csv(os.path.join(data_dir, d, "listings.csv.gz")) for d in dates]
full_listings_df = pd.concat(dfs)

# Keep only the columns needed later, remove rows that are exact duplicates
# across snapshots, and discard listings that lack either a property
# description or a neighbourhood overview.
wanted_columns = ['id', 'name', 'description', 'neighborhood_overview', 'host_id', 'latitude', 'longitude']
listings_df = (
    full_listings_df.loc[:, wanted_columns]
    .drop_duplicates()
    .dropna(subset=['description', 'neighborhood_overview'], how="any")
)

# The text that gets embedded: property description followed by the
# neighbourhood overview, separated by a single space.
listings_df['text'] = listings_df['description'] + " " + listings_df['neighborhood_overview']

listings_df

Useful to see the full set of listings data:

In [None]:
# Display the unfiltered concatenation of all snapshots (all columns).
full_listings_df

## Read and prepare the Greater Manchester Gentrification Index data and LSOA data

GMGI data:

In [None]:
# Read the GMGI summary table, discarding its first column
# (presumably a saved row index -- TODO confirm against the CSV).
gmgi = pd.read_csv("../data/gmgi_data/lsoa_summary_jan25.csv").iloc[:, 1:]
gmgi

LSOA data (from the [ONS Open Geography portal](https://geoportal.statistics.gov.uk/)).
Annoyingly can't find data just for GM, so extract from larger E&W set. Only run this once.

Read the LSOA data

In [None]:
# Names of the ten Greater Manchester local authority districts, combined
# into a single regular expression ("Manchester|Rochdale|...").
manc_lads = ['Manchester', 'Rochdale', 'Bolton', 'Bury', 'Wigan', 'Oldham',  'Trafford', 'Salford', 'Tameside', 'Stockport']
manc_lads_pattern = '|'.join(manc_lads)

# Load the England & Wales LSOA boundaries and keep those whose name contains
# one of the district names (case-sensitive match anywhere in the name).
lsoas = gpd.read_file('../data/LSOAs_2011/LSOA_2011_EW_BSC_V4.shp')
in_greater_manchester = lsoas['LSOA11NMW'].str.contains(manc_lads_pattern)

# Reproject the selection to EPSG:4326 (the CRS used for the listings later).
gm_lsoa = lsoas[in_greater_manchester].to_crs(epsg=4326)
gm_lsoa.plot()

In [None]:
"""
NO LONGER RUNNING THIS, BUT CAN'T FIGURE OUT HOW TO MAKE IT raw ON PYCHARM

# ChatGPD says these are the Local Authority District codes for all LADs in Greater Manchester
greater_manchester_lads = [
    "E08000001",  # Bolton
    "E08000002",  # Bury
    "E08000003",  # Manchester
    "E08000004",  # Oldham
    "E08000005",  # Rochdale
    "E08000006",  # Salford
    "E08000007",  # Stockport
    "E08000008",  # Tameside
    "E08000009",  # Trafford
    "E08000010",  # Wigan
]
# Lookup
lookup = pd.read_csv("/Users/geonsm/research/projects/current/integrate/data/Lower_Layer_Super_Output_Area_(2021)_to_Ward_(2023)_to_LAD_(2023)_Lookup_in_England_and_Wales.csv")

# Need to extract Greater Manchester from E&W
ew_lsoa = gpd.read_file("/Users/geonsm/research/projects/current/integrate/data/Lower_layer_Super_Output_Areas_December_2021_Boundaries_EW_BSC_V4_-4299016806856585929.geojson")
# Merge LSOA geometries with LAD codes
ew_lsoa_with_lad = ew_lsoa.merge(lookup[['LSOA21CD', 'LAD23CD']], on='LSOA21CD')
# Filter for Greater Manchester LADs
gm_lsoa = ew_lsoa_with_lad[ew_lsoa_with_lad['LAD23CD'].isin(greater_manchester_lads)]

# Plot to check it looks OK
ax = ew_lsoa.plot(color='lightgrey', linewidth=0.1, edgecolor='white', figsize=(10, 10))
gm_lsoa.plot(ax=ax, color='red', linewidth=0.2, edgecolor='black')
plt.show()

# It's fine, save it
gm_lsoa.to_file(os.path.join(data_dir, "greater_manchester_lsoas.geojson"), driver="GeoJSON")

gm_lsoa = gpd.read_file(os.path.join(data_dir, "greater_manchester_lsoas.geojson"))
gm_lsoa
"""


In [None]:
# Display the coordinate reference system of the Greater Manchester LSOAs.
gm_lsoa.crs

In [None]:
# Join the GMGI table onto the LSOA polygons using the LSOA code shared by
# both tables. pd.merge defaults to an inner join, so only LSOAs present in
# both inputs are kept.
lsoa_code_column = "LSOA11CD"
gm_gmgi_lsoa = pd.merge(
    left=gm_lsoa,
    right=gmgi,
    left_on=lsoa_code_column,
    right_on=lsoa_code_column,
)

# Sanity check: choropleth of the gentrification index
gm_gmgi_lsoa.plot(column="gi_n")

Attach GI to the airbnb data based on spatial location of the parent LSOA

In [None]:
# Turn each listing's (longitude, latitude) pair into a point geometry and
# wrap the listings table in a GeoDataFrame in EPSG:4326.
listing_points = list(map(Point, zip(listings_df['longitude'], listings_df['latitude'])))
listings_gdf = gpd.GeoDataFrame(listings_df, geometry=listing_points, crs="EPSG:4326")
listings_gdf.plot()


In [None]:
# Give every listing the attributes (including gi_n) of the LSOA polygon it
# lies within. how="left" keeps listings that match no polygon; their LSOA
# columns are then empty. ("intersects" is an alternative predicate.)
listings_gdf_gmgi = listings_gdf.sjoin(gm_gmgi_lsoa, how="left", predicate="within")

# Draw the listings on top of the original LSOA index scores
lsoa_axes = gm_gmgi_lsoa.plot(column="gi_n")
point_style = {
    "markersize": 15,       # smaller points
    "edgecolor": "black",   # black outline
    "linewidth": 0.2,       # outline thickness
}
listings_gdf_gmgi.plot(ax=lsoa_axes, column="gi_n", legend=True, **point_style)


Interactive plot to check it is sensible (thanks chatgpt, I've not even read this code!)

In [None]:
# ---------------------------------------------------------------------------
# Interactive gentrification-index map for Greater Manchester (Folium)
#
# Shows the LSOA polygons and the Airbnb listings, both coloured by gi_n, so
# that the spatial join can be checked by eye.
# ---------------------------------------------------------------------------


# 1.  Drop rows without a gi_n value and make sure both layers are in lon/lat
gm_gmgi_lsoa_temp      = gm_gmgi_lsoa.dropna(subset=["gi_n"]).to_crs(4326)
listings_gdf_gmgi_temp = listings_gdf_gmgi.dropna(subset=["gi_n"]).to_crs(4326)

# 2.  Base map centred on the study area
centre = gm_gmgi_lsoa_temp.unary_union.centroid
m = folium.Map(
    location=[centre.y, centre.x],
    zoom_start=10,
    tiles="cartodbpositron"
)

# 3.  Continuous colour scale spanning the observed gi_n range.
#     gi_n is the gentrification index, so the legend is captioned as such
#     (the previous caption wrongly called it a green-infrastructure index).
min_gi, max_gi = gm_gmgi_lsoa_temp["gi_n"].min(), gm_gmgi_lsoa_temp["gi_n"].max()
cmap = cm.LinearColormap(
    colors=["#fee8c8", "#a1dab4", "#41b6c4", "#2c7fb8", "#253494"],
    index=np.linspace(min_gi, max_gi, 5),   # strictly increasing
    vmin=min_gi,
    vmax=max_gi,
    caption="Gentrification index (gi_n)"
)

# 4.  LSOA polygons layer, filled according to each polygon's gi_n
folium.GeoJson(
    gm_gmgi_lsoa_temp,
    style_function=lambda feat: {
        "fillColor"   : cmap(feat["properties"]["gi_n"]),
        "color"       : "black",
        "weight"      : 0.3,
        "fillOpacity" : 0.7
    },
    tooltip=folium.GeoJsonTooltip(
        fields=["LSOA11CD", "gi_n"],
        aliases=["LSOA", "GI_n"],
        sticky=False
    ),
    name="LSOA GI scores"
).add_to(m)

# 5.  One circle marker per listing, using the same colour scale
for _, row in listings_gdf_gmgi_temp.iterrows():
    folium.CircleMarker(
        location=[row.geometry.y, row.geometry.x],
        radius      = 4,
        color       = "black",
        weight      = 0.4,
        fill        = True,
        fill_color  = cmap(row["gi_n"]),
        fill_opacity= 0.9,
        popup       = folium.Popup(f"GI_n = {row['gi_n']:.2f}", show=False)
    ).add_to(m)

# 6.  Legend & controls
cmap.add_to(m)
folium.LayerControl(collapsed=False).add_to(m)

# cleanup: the temporary copies are no longer needed
del gm_gmgi_lsoa_temp, listings_gdf_gmgi_temp

# 7.  Display inline in Jupyter / IPython
m

# If you want a standalone file instead (or as well):
# m.save("gm_gmgi_interactive_map.html")
# display(m._repr_html_())   # show inline *and* write file

## Calculate the Airbnb embeddings

TODO: have a look for the most appropriate embeddings model, this is just a ChatGPT recommendation


In [None]:
# Sentence-embedding model (also reused further down for San Francisco)
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# The embeddings are cached on disk; they are only computed when no cache
# file is found.
embeddings_cache = "../data/airbnb-manchester/cached_embeddings.npz"
if not os.path.exists(embeddings_cache):
    print("Calculating embeddings")
    listing_texts = listings_df['text'].tolist()
    embeddings = model.encode(listing_texts, show_progress_bar=True)
    np.savez_compressed(embeddings_cache, embeddings=embeddings)
else:
    print(f"Loading embeddings from {embeddings_cache}")
    embeddings = np.load(embeddings_cache)["embeddings"]

# The cache is not tied to the listings table, so at least check the row
# counts agree.
assert len(embeddings) == len(listings_df), "The number of embeddings does not match the number of listings."


Run a PCA to convert these embeddings into size 3, so that they can be mapped to RGB colour space

In [None]:
# Reduce the embeddings to three principal components
X = np.asarray(embeddings)
xyz = PCA(n_components=3, random_state=0).fit_transform(X)

# Min-max scale each component to [0, 1] (the small epsilon avoids a zero
# denominator), stretch to [0, 255] and truncate to integers
xyz_min = xyz.min(axis=0)
xyz_max = xyz.max(axis=0)
unit_scaled = (xyz - xyz_min) / (xyz_max - xyz_min + 1e-9)
rgb_255 = (unit_scaled * 255).astype(int)

# One "#rrggbb" string per listing
hex_colors = ['#{:02x}{:02x}{:02x}'.format(r, g, b) for r, g, b in rgb_255]

# Store the colours on a fresh copy of the listings table
listings_df = listings_df.copy()
listings_df['color'] = hex_colors

Static map of the PCA embedding colours

In [None]:
#plt.figure(figsize=(8, 6))
#plt.scatter(listings_df['longitude'], listings_df['latitude'],
#            c=listings_df['color'], s=10)
#plt.xlabel('Longitude')
#plt.ylabel('Latitude')
#plt.title('Listings coloured by embedding (PCA→RGB)')
#plt.show()

Interactive map

In [None]:
# Map centred on the mean listing position, one marker per listing drawn in
# its PCA-derived colour.
map_centre = [listings_df['latitude'].mean(), listings_df['longitude'].mean()]
m = folium.Map(location=map_centre, zoom_start=10)

marker_columns = listings_df[['latitude', 'longitude', 'color']]
for lat, lon, col in marker_columns.itertuples(index=False, name=None):
    marker = folium.CircleMarker(
        [lat, lon],
        radius=3,
        color=col,
        fill=True,
        fill_color=col,
        fill_opacity=0.8,
    )
    marker.add_to(m)
m

## Random forest to predict gentrification from the embeddings

  - Listings data with GMGI: `listings_gdf_gmgi`
  - Associated matrix of embeddings: `embeddings`

Use k-fold cross-validation and tune the hyperparameters

In [None]:


# --- Prepare data ---

# X and y keep one row per listing so that later cells can predict for, and
# attach results to, every row of `listings_gdf_gmgi`.
X = embeddings
y = listings_gdf_gmgi.loc[:, "gi_n"].values  # target: gentrification score
assert len(X) == len(y) # checked earlier, just making sure

# The left spatial join leaves gi_n empty for any listing that did not fall
# inside an LSOA with a GMGI score, and scikit-learn rejects NaN targets, so
# only rows with a valid score take part in training and testing.
valid_mask = ~pd.isna(y)
print(f"{valid_mask.sum()} of {len(y)} listings have a valid gi_n")

# Split data into training (used for parameter tuning) and hold-out test sets (for testing afterwards)
# NOTE(review): the listings come from several snapshots and only exact
# duplicate rows were dropped, so near-identical texts of the same property
# can land on both sides of this random split. Consider a split grouped by
# listing id before trusting the hold-out scores.
X_train, X_test, y_train, y_test = train_test_split(
    X[valid_mask], y[valid_mask], test_size=0.2, random_state=42
)

# --- Find the best model after hyper parameter tuning ---

model_path = os.path.join("..", "data", "airbnb-manchester", "cached_rf_gridsearch.pkl")  # Optionally cache the final model

if os.path.exists(model_path):
    print("Loading cached model...")
    # Unpickling executes code from the file: only load caches written by
    # this notebook.
    with open(model_path, "rb") as f:
        grid_search = pickle.load(f)

else:
    print("Training model, this will take some time")
    param_grid = {
        'n_estimators': [100, 200, 300],         # Number of trees in the forest. More trees = better performance, but slower.
        'max_depth': [None, 10, 20],        # Maximum depth of each tree. None allows nodes to expand until all leaves are pure.
        'min_samples_split': [2, 5],        # Minimum number of samples required to split an internal node.
        'min_samples_leaf': [1, 2],         # Minimum number of samples required to be at a leaf node.
        'max_features': ['sqrt', 'log2'],   # Number of features to consider when looking for the best split.
        'bootstrap': [True, False]          # Whether bootstrap samples are used when building trees.
    }

    rf = RandomForestRegressor(random_state=42, n_jobs=-1)

    grid_search = GridSearchCV(
        rf,
        param_grid,
        cv=5,  # k for cross-validation
        scoring='neg_mean_squared_error',
        n_jobs=-1,  # Run on all cores
        verbose = 1  # Show progress (not as good as a progress bar, but much simpler)
    )

    grid_search.fit(X_train, y_train)
    with open(model_path, "wb") as f:
        pickle.dump(grid_search, f)
    print("Model trained and saved to cache.")

print("Param grid: ", grid_search.param_grid)  # To check the cached one is correct (compare to grid defined above)
print("Best params:", grid_search.best_params_)
print("Best score (neg MSE):", grid_search.best_score_)

best_model = grid_search.best_estimator_

In [None]:
# --- Score the tuned random forest on the hold-out test set ---
y_pred = best_model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

for metric_line in (f"Mean Squared Error: {mse:.4f}", f"R² Score: {r2:.4f}"):
    print(metric_line)



In [None]:
# --- Plot predicted vs actual ---
# Limits of the y = x reference line. The NaN-aware min/max are used because
# `y` holds the gi_n of every listing, and a single missing score would
# otherwise turn both limits into NaN and silently hide the line.
gi_low, gi_high = np.nanmin(y), np.nanmax(y)

plt.figure(figsize=(6, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot([gi_low, gi_high], [gi_low, gi_high], 'r--')
plt.xlabel("Actual gi_n")
plt.ylabel("Predicted gi_n")
plt.title("Random Forest Predictions")
plt.grid(True)
plt.tight_layout()
plt.show()

Cool. Now map the errors

In [None]:
# Predict for every listing (training and test rows alike) and store both the
# prediction and its signed error (prediction minus observed gi_n).
predictions = best_model.predict(X)
listings_gdf_gmgi['gi_pred'] = predictions
listings_gdf_gmgi['gi_pred_error'] = listings_gdf_gmgi['gi_pred'] - listings_gdf_gmgi['gi_n']

# Distribution of the errors
error_column = listings_gdf_gmgi['gi_pred_error']
error_column.hist(bins=20)

In [None]:
# Map the listings coloured by their prediction error
error_map_options = {
    "column": "gi_pred_error",
    "markersize": 2,    # smaller points
    "legend": True,
}
listings_gdf_gmgi.plot(**error_map_options)

I'm concerned that because the embedding matrix is so large that the model may be over-specified.

Sanity check: can I fit any RF to a dataset with a large number of parameters (embedding dimension) and a relatively small number of observations (listings)?

To test, create a random embedding matrix and re-calculate the model.

ChatGPT generated the code below. Seems like the model doesn't work, which is good news!

_The random model is very poor, so the gentrification model may be OK_.


In [None]:
# ------------------------------------------------------------------
# 1.  Create a reproducible random embedding matrix (same shape as the
#     real embeddings)
# ------------------------------------------------------------------
rng = np.random.RandomState(42)
random_embeddings = rng.normal(size=embeddings.shape)

# ------------------------------------------------------------------
# 2.  Prepare data (only rows with a valid GI score)
#     Listings that matched no LSOA in the left spatial join have no gi_n;
#     they are removed here because scikit-learn rejects NaN targets.
# ------------------------------------------------------------------
gi_all = listings_gdf_gmgi.loc[:, "gi_n"].values
has_gi = ~pd.isna(gi_all)
X_rand = random_embeddings[has_gi]
y_rand = gi_all[has_gi]

X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_rand, y_rand, test_size=0.2, random_state=42
)

# ------------------------------------------------------------------
# 3.  Train a random forest model (default hyper-parameters, 100 trees)
# ------------------------------------------------------------------
rf_rand = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)
rf_rand.fit(X_train_r, y_train_r)

# ------------------------------------------------------------------
# 4.  Evaluate performance on the hold-out rows
# ------------------------------------------------------------------
y_pred_r = rf_rand.predict(X_test_r)
mse_r = mean_squared_error(y_test_r, y_pred_r)
r2_r = r2_score(y_test_r, y_pred_r)

print(f"Random embeddings baseline — MSE: {mse_r:.4f}")
print(f"Random embeddings baseline — R²:  {r2_r:.4f}")

# ------------------------------------------------------------------
# 5.  Plot predicted vs. actual, with a dashed y = x reference line
# ------------------------------------------------------------------
plt.figure(figsize=(6, 6))
plt.scatter(y_test_r, y_pred_r, alpha=0.5)
plt.plot(
    [y_rand.min(), y_rand.max()],
    [y_rand.min(), y_rand.max()],
    linestyle="--", linewidth=1
)
plt.xlabel("Actual gi_n")
plt.ylabel("Predicted gi_n (random)")
plt.title("Random‑embedding baseline")
plt.grid(True)
plt.tight_layout()
plt.show()

## NN to predict gentrification

Repeat with a neural network to see whether it is any better

In [None]:
# Define model cache path
model_path = os.path.join("..", "data", "airbnb-manchester", "cached_nn_gridsearch.pkl")

# Load or train model
if os.path.exists(model_path):
    print("Loading cached model...")
    # Unpickling executes code from the file: only load caches written by
    # this notebook.
    with open(model_path, "rb") as f:
        grid_search_nn = pickle.load(f)
else:
    print("Training neural network...")

    # Grid search parameters. The "mlp__" prefix routes each setting to the
    # 'mlp' step of the pipeline defined below.
    shared_grid = {
        'mlp__hidden_layer_sizes': [
            (50,), (100,), (100, 50), (200,), (200, 100), (100, 100)
        ],
        'mlp__activation': ['relu', 'tanh'],
        'mlp__alpha': [1e-5, 1e-4, 1e-3],  # L2 regularization
    }
    # learning_rate_init is only used by the 'sgd' and 'adam' solvers, so it
    # is varied for adam alone. Varying it for lbfgs as well would refit the
    # same lbfgs model three times per combination.
    param_grid = [
        {
            **shared_grid,
            'mlp__solver': ['adam'],
            'mlp__learning_rate_init': [0.0001, 0.001, 0.01],
        },
        {
            **shared_grid,
            'mlp__solver': ['lbfgs'],  # lbfgs often works better for smaller datasets
        },
    ]

    # Create the NN. Run the data through a standard scaler first.
    nn = Pipeline([
        ('scaler', StandardScaler()),
        ('mlp', MLPRegressor(max_iter=2000, random_state=42))
    ])

    grid_search_nn = GridSearchCV(
        nn,
        param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=1
    )

    grid_search_nn.fit(X_train, y_train)

    with open(model_path, "wb") as f:
        pickle.dump(grid_search_nn, f)
    print("Model trained and saved to cache.")

# Print best model info
print("Best params:", grid_search_nn.best_params_)
print("Best score (neg MSE):", grid_search_nn.best_score_)

best_model_nn = grid_search_nn.best_estimator_

In [None]:
# --- Predict and evaluate ---
best_model_nn = grid_search_nn.best_estimator_
y_pred_nn = best_model_nn.predict(X_test)

# Score the neural network's own predictions. (These metrics were previously
# computed from `y_pred`, the random forest predictions, so they simply
# repeated the random forest results.)
mse_nn = mean_squared_error(y_test, y_pred_nn)
r2_nn = r2_score(y_test, y_pred_nn)

print(f"Mean Squared Error: {mse_nn:.4f}")
print(f"R² Score: {r2_nn:.4f}")



In [None]:
# --- Plot predicted vs actual ---
# Limits of the y = x reference line. The NaN-aware min/max are used because
# `y` holds the gi_n of every listing, and a single missing score would
# otherwise turn both limits into NaN and silently hide the line.
gi_low, gi_high = np.nanmin(y), np.nanmax(y)

plt.figure(figsize=(6, 6))
plt.scatter(y_test, y_pred_nn, alpha=0.5)
plt.plot([gi_low, gi_high], [gi_low, gi_high], 'r--')
plt.xlabel("Actual gi_n")
plt.ylabel("Predicted gi_n")
plt.title("Neural Network Predictions")
plt.grid(True)
plt.tight_layout()
plt.show()

It doesn't work as well as the random forest, so let's use that.

# Apply the Manchester Model to San Francisco

The model that predicts gentrification in Manchester works very well. Let's see if it works in San Francisco (a place with a history of gentrification-related research and abundant data).

Method:

 - Download AirBnB data for San Francisco
 - Calculate the embeddings
 - Use the previous model to predict gentrification
 - Compare to San Francisco gentrification data, i.e. Landis, UDP and Freeman classifications published [here](https://pmc.ncbi.nlm.nih.gov/articles/PMC6616964/). (_Note that those are categorical classification so will need to convert model output scores to categories_).

## San Francisco AirBnB data



Download the data from Inside Airbnb (if required)

In [None]:
# Download the Inside Airbnb snapshots for San Francisco (same procedure as
# for the Manchester data).
#
# NOTE: this re-points `dates`, `files` and `data_dir` at San Francisco, so
# cells further up that read them will use these values if re-run afterwards.
#
# Files are checked one by one (rather than skipping a date as soon as its
# directory exists) so that a snapshot left incomplete by a failed download
# is repaired the next time this cell runs.

dates = [ '2025-03-01', '2024-12-04', '2024-09-04', '2024-06-04' ]
files = ["listings.csv.gz", "reviews.csv.gz"]
data_dir = os.path.join("..", "data", "airbnb-sanfrancisco")
root_url = "https://data.insideairbnb.com/united-states/ca/san-francisco/"
neighbourhoods_url = "https://data.insideairbnb.com/united-states/ca/san-francisco/2025-03-01/visualisations/neighbourhoods.geojson"

for d in dates:
    snapshot_dir = os.path.join(data_dir, d)
    os.makedirs(snapshot_dir, exist_ok=True)
    missing = [f for f in files
               if not os.path.isfile(os.path.join(snapshot_dir, f))]
    if not missing:
        print(f"Directory {d} already exists.")
        continue
    print(f"Downloading data for {d} ...")
    for f in missing:
        url = f"{root_url}{d}/data/{f}"
        target = os.path.join(snapshot_dir, f)
        try:
            print(f"\tDownloading {f} from {url} ", end="")
            urllib.request.urlretrieve(url, target)
            print("...done.")
        except urllib.error.URLError as e:
            # URLError is the base class of HTTPError and of the error raised
            # for truncated downloads, so network failures are reported too.
            print(f"Error downloading {f} for {d}: {e}")
            # Do not leave a partial file behind: it would be mistaken for a
            # complete download on the next run.
            if os.path.exists(target):
                os.remove(target)
        # Sleep for a few seconds so not to abuse their server
        time.sleep(3)


# Finally get the neighbourhoods file, if we haven't done so already
neighbourhoods_path = os.path.join(data_dir, "neighbourhoods.geojson")
if not os.path.isfile(neighbourhoods_path):
    try:
        print(f"Downloading neighbourhoods from {neighbourhoods_url} ", end="")
        urllib.request.urlretrieve(neighbourhoods_url, neighbourhoods_path)
        print(" ...done.")
    except urllib.error.URLError as e:
        print(f"Error downloading neighbourhoods: {e}")
        if os.path.exists(neighbourhoods_path):
            os.remove(neighbourhoods_path)


Read the data

https://data.insideairbnb.com/united-states/ca/san-francisco/2024-09-04/data/listings.csv.gz

In [None]:
# Read the listings file of every San Francisco snapshot and stack them
dfs = [pd.read_csv(os.path.join(data_dir, d, "listings.csv.gz")) for d in dates]
sf_full_listings_df = pd.concat(dfs)

# Keep the columns of interest, drop exact duplicate rows, then drop listings
# missing either the property description or the neighbourhood overview
wanted_columns = ['id', 'name', 'description', 'neighborhood_overview', 'host_id', 'latitude', 'longitude']
sf_listings_df = sf_full_listings_df.loc[:, wanted_columns]
sf_listings_df = sf_listings_df.drop_duplicates()
sf_listings_df = sf_listings_df.dropna(subset=['description', 'neighborhood_overview'], how="any")

# Text to embed: property description, a space, then the neighbourhood overview
sf_listings_df['text'] = sf_listings_df['description'] + " " + sf_listings_df['neighborhood_overview']

sf_listings_df

Spatialise the listings data

In [None]:
# Build a point geometry from each listing's (longitude, latitude) and wrap
# the San Francisco listings in a GeoDataFrame in EPSG:4326.
sf_points = list(map(Point, zip(sf_listings_df['longitude'], sf_listings_df['latitude'])))
sf_listings_gdf = gpd.GeoDataFrame(sf_listings_df, geometry=sf_points, crs="EPSG:4326")
sf_listings_gdf.plot(markersize=0.8)


## Calculate the SF embeddings

In [None]:
# Reuse the sentence-embedding model loaded for the Manchester listings
print(f"Using model {model}")

# Compute the San Francisco embeddings only when no cache file is found
embeddings_cache = "../data/airbnb-sanfrancisco/cached_embeddings.npz"
if not os.path.exists(embeddings_cache):
    print("Calculating embeddings")
    sf_texts = sf_listings_df['text'].tolist()
    sf_embeddings = model.encode(sf_texts, show_progress_bar=True)
    np.savez_compressed(embeddings_cache, embeddings=sf_embeddings)
else:
    print(f"Loading embeddings from {embeddings_cache}")
    sf_embeddings = np.load(embeddings_cache)["embeddings"]

# The cache is not tied to the listings table, so check the row counts agree
assert len(sf_embeddings) == len(sf_listings_df), \
    "The number of embeddings does not match the number of listings. Do you need to recreate the cache?"


Visualise the embeddings (map PCA reduction to three components: R, G, B)

In [None]:
# Reduce the San Francisco embeddings to three principal components.
# (Note that this rebinds the notebook-level name `X`.)
X = np.asarray(sf_embeddings)
xyz = PCA(n_components=3, random_state=0).fit_transform(X)

# Min-max scale each component to [0, 1] (the small epsilon avoids a zero
# denominator), stretch to [0, 255] and truncate to integers
xyz_min = xyz.min(axis=0)
xyz_max = xyz.max(axis=0)
unit_scaled = (xyz - xyz_min) / (xyz_max - xyz_min + 1e-9)
rgb_255 = (unit_scaled * 255).astype(int)

# One "#rrggbb" string per listing
hex_colors = ['#{:02x}{:02x}{:02x}'.format(r, g, b) for r, g, b in rgb_255]

# Store the colours on the listings table
sf_listings_df['color'] = hex_colors

In [None]:
# Static map

#plt.figure(figsize=(8, 6))
#plt.scatter(sf_listings_df['longitude'], sf_listings_df['latitude'],
#            c=sf_listings_df['color'], s=10)
#plt.xlabel('Longitude')
#plt.ylabel('Latitude')
#plt.title('Listings coloured by embedding (PCA→RGB)')
#plt.show()

In [None]:
# Map centred on the mean listing position, one marker per listing drawn in
# its PCA-derived colour.
map_centre = [sf_listings_df['latitude'].mean(), sf_listings_df['longitude'].mean()]
m = folium.Map(location=map_centre, zoom_start=13)

marker_columns = sf_listings_df[['latitude', 'longitude', 'color']]
for lat, lon, col in marker_columns.itertuples(index=False, name=None):
    marker = folium.CircleMarker(
        [lat, lon],
        radius=2,
        color=col,
        fill=True,
        fill_color=col,
        fill_opacity=0.8,
    )
    marker.add_to(m)
m

## Predict SF gentrification

Start by predicting scores, then convert these into categories so that they can be compared to the published data.

In [None]:
# Apply the random forest tuned on Manchester to the San Francisco
# embeddings, then keep the scores alongside the listings.
sf_gentrification_pred = best_model.predict(sf_embeddings)
sf_listings_df['gentrification_pred'] = sf_gentrification_pred

In [None]:
var_to_map = 'gentrification_pred'  # Column to draw; named once to avoid repetition

# Viridis colour scale stretched over the observed range of that column
mapped_values = sf_listings_df[var_to_map]
colormap = cm.linear.viridis.scale(min(mapped_values), max(mapped_values))

# Map centred on the mean listing position
map_centre = [sf_listings_df['latitude'].mean(), sf_listings_df['longitude'].mean()]
m = folium.Map(location=map_centre, zoom_start=13)

# One marker per listing, outline and fill both taken from the colour scale
for lat, lon, val in zip(sf_listings_df['latitude'],
                         sf_listings_df['longitude'],
                         mapped_values):
    marker_colour = colormap(val)
    folium.CircleMarker(
        [lat, lon],
        radius=2,
        color=marker_colour,
        fill=True,
        fill_color=marker_colour,
        fill_opacity=0.8,
    ).add_to(m)

# Show the colour scale as a legend on the map
colormap.caption = 'Gentrification Score'
colormap.add_to(m)

m

In [None]:
# Display the raw predicted scores for the San Francisco listings.
sf_gentrification_pred

## Compare SF gentrification categories to real data

XXXX HERE Try to get the data from this publication and somehow(?) compare it to these scores

https://pmc.ncbi.nlm.nih.gov/articles/PMC6616964/

In the [Mujahid paper](https://pmc.ncbi.nlm.nih.gov/articles/PMC6616964/), the Landis and Freeman are largely consistent (or at least somewhat consistent) whereas the UDP is quite different. So we compare our SF prediction to Landis and Freeman. This isn't ideal because those are categorical classifications whereas the Manchester one is numeric, so we simply divide our data into decile ranges that loosely represent the categories.

![Gentrification classifications (from Mujahid et al. (2019))](img/sf_gentrification.png)

The following table shows the Landis and Freeman classification categories, and how we divide the Manchester index to loosely correspond to those (the percentile ranges match the thresholds used in the code below):

| Name | Landis      | Freeman     | Manchester Index Percentile Range |
|------|-------------|-------------|-----------------------------------|
| B    | Stable      | Stable      | 30-69                             |
| A    | Declining   | Excluded    | 0-29                              |
| C    | Gentrifying | Gentrifying | 70-100                            |



In [None]:
# Categorise each listing by where its predicted score falls in the
# distribution of all San Francisco predictions.

# Percentile thresholds and the label of each band between them
thresholds = [0, 30, 70, 100]  # Adjust these as needed
labels = ['A', 'B', 'C']

# Convert percentiles to quantile fractions
quantile_values = [t / 100 for t in thresholds]

# Compute the actual threshold values from the data
cut_points = np.array(
    sf_listings_df['gentrification_pred'].quantile(quantile_values).values,
    dtype=float,
)
print(f"Cut points: {cut_points}")

# With right=False every bin is closed on the left and open on the right, so
# a value equal to the last edge (the maximum prediction) would fall outside
# all bins and be labelled NaN. Nudge the last edge up by the smallest
# possible amount so that the maximum lands in the top category.
bin_edges = cut_points.copy()
bin_edges[-1] = np.nextafter(bin_edges[-1], np.inf)

# Assign categories
sf_listings_df['gentrification_cat'] = pd.cut(
    sf_listings_df['gentrification_pred'],
    bins=bin_edges,
    labels=labels,
    include_lowest=True,
    right=False  # to match "0-29", "30-69", "70-100"
)
sf_listings_df.loc[:,['gentrification_pred', 'gentrification_cat', 'text']]

In [None]:
# Map the listings coloured by gentrification category

var_to_map = 'gentrification_cat'

# One colour per category present in the data, taken from the Set1 palette
categories = sorted(sf_listings_df[var_to_map].dropna().unique())
colormap = branca.colormap.linear.Set1_03.scale(0, len(categories) - 1)
color_dict = {}
for position, cat in enumerate(categories):
    color_dict[cat] = colormap(position)

# Map centred on the mean listing position
map_centre = [sf_listings_df['latitude'].mean(), sf_listings_df['longitude'].mean()]
m = folium.Map(location=map_centre, zoom_start=13)

# One marker per listing; listings without a category are drawn in gray
for lat, lon, val in zip(sf_listings_df['latitude'],
                         sf_listings_df['longitude'],
                         sf_listings_df[var_to_map]):
    color = color_dict.get(val, 'gray')
    marker = folium.CircleMarker(
        location=[lat, lon],
        radius=2,
        color=color,
        fill=True,
        fill_color=color,
        fill_opacity=0.8
    )
    marker.add_to(m)

# Hand-built HTML legend: a colour swatch followed by the category name
legend_rows = [
    f'<i style="background:{color_dict[c]};width:10px;height:10px;display:inline-block;margin-right:5px;"></i> {c}<br>'
    for c in categories
]
legend_items = ''.join(legend_rows)
legend_html = f"""
<div style="position: fixed; bottom: 30px; left: 30px; width: 160px;
            background-color: white; padding: 10px; border: 1px solid grey; z-index:9999;">
<b>Gentrification Category</b><br>{legend_items}
</div>
"""
m.get_root().html.add_child(folium.Element(legend_html))

m

I don't think it's working. I.e. the Manchester prediction model doesn't work well in San Francisco.

One last thing to try is make a kernel-weighted moving-average map from the raw prediction values to see if this shows a trend.

In [None]:
# Kernel-weighted moving average of the predicted scores, rendered as a PNG
# that can be laid over a web map.

# --- Convert DataFrame → GeoDataFrame (lat/lon → geometry) ---
gdf = gpd.GeoDataFrame(
    sf_listings_df,
    geometry=gpd.points_from_xy(sf_listings_df['longitude'],
                                sf_listings_df['latitude']),
    crs="EPSG:4326"
).to_crs(3857)  # Need a metric CRS (metres)

# --- Kernel-weighted mean surface of gentrification_pred ---
coords   = np.stack([gdf.geometry.x, gdf.geometry.y], axis=1)
values   = gdf['gentrification_pred'].to_numpy()
bw       = 400                                    # metres
kde_den  = KernelDensity(bandwidth=bw).fit(coords)
# NOTE(review): the predictions are used as sample weights; presumably these
# must be positive for KernelDensity -- confirm for the score range in use.
kde_num  = KernelDensity(bandwidth=bw).fit(coords, sample_weight=values)

xmin, ymin, xmax, ymax = gdf.total_bounds
#n = 400                                           # grid size
n = 100                                           # grid size
xx, yy = np.meshgrid(np.linspace(xmin, xmax, n),
                     np.linspace(ymin, ymax, n))
grid_pts = np.c_[xx.ravel(), yy.ravel()]

# score_samples returns log-densities, so the difference of the two is the
# log of (weighted density / unweighted density).
avg = np.exp(kde_num.score_samples(grid_pts) -
             kde_den.score_samples(grid_pts)).reshape(n, n)

# Each density is normalised by its own total weight, so the ratio above is
# the local mean relative to the overall mean. Multiplying by the overall
# mean turns it into a kernel-weighted average in prediction units.
avg = avg * values.mean()

# Row 0 of the grid is ymin (the southern edge) but row 0 of an image is its
# top edge, so flip the rows to get a north-up raster.
avg = np.flipud(avg)

# --- Render RGBA raster in memory ---
cmap, norm = plt.cm.viridis, plt.Normalize(avg.min(), avg.max())
rgba = (cmap(norm(avg)) * 255).astype(np.uint8)

# --- Save PNG with geotransform ---
out_png = "/tmp/gentri_surface.png"
transform = from_bounds(xmin, ymin, xmax, ymax, n, n)
with rasterio.open(out_png, "w", driver="PNG",
                   width=n, height=n, count=4, dtype=rgba.dtype,
                   transform=transform, crs="EPSG:3857") as dst:
    # One band per RGBA channel (rasterio band numbers start at 1)
    for i in range(4):
        dst.write(rgba[:, :, i], i + 1)


In [None]:

# --- Folium overlay ---
m = folium.Map(location=[sf_listings_df['latitude'].mean(),
                         sf_listings_df['longitude'].mean()],
               zoom_start=13, tiles='CartoDB positron')

# Lon/lat bounding box of the listings, used to position the image.
# Reproject once and read all four limits from total_bounds
# (minx, miny, maxx, maxy) instead of reprojecting for every corner.
west, south, east, north = (float(v) for v in gdf.to_crs(4326).total_bounds)

folium.raster_layers.ImageOverlay(
    image=out_png,
    bounds=[[south, west],
            [north, east]],
    opacity=0.5,
    name='Avg gentrification_pred',
).add_to(m)

folium.LayerControl().add_to(m)
m

# Extras

TODO:
 - [X] Tune the model
 - [X] k-Fold test/validation
 - [X] Ranger RF (implementation in scikit-learn)
 - [ ] Convert above to hex heat map

Apply elsewhere - San Francisco (and/or UK)
 - [London](https://trustforlondon.org.uk/data/gentrification-across-london/) (and through [CDRC>](https://www.cdrc.ac.uk/quantifying-state-led-gentrification-in-london/))

Apply to other text (i.e. recalculate embeddings and apply trained RF)
  - Twitter? Reddit? Insta?