In [1]:
%reload_ext autotime
import pandas as pd
import geopandas as gpd
from glob import glob
import os
from tqdm.auto import tqdm
import statsmodels.formula.api as smf
tqdm.pandas()

In [2]:
pd.set_option('display.max_columns', None)
df = pd.read_csv("results.csv")
LLM_results = pd.read_csv("LLM_results.csv")
new_cols = LLM_results.columns[~LLM_results.columns.isin(df.columns)].tolist()
df = df.merge(LLM_results[new_cols + ["panoid"]], on="panoid")
df = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.longitude, df.latitude), crs="EPSG:4326").to_crs(2193)
df.to_csv("merged_results.csv", index=False)
df

Unnamed: 0,Index,pid,n,time,anxiousness,latitude,longitude,geometry,panoid,panolat,panolon,panodate,panothirdparty,source,uploader,green,environment,water,obscured,people,cars,bikes
0,0,P20001,1,2023-04-25T02:51:42Z,0,-36.924795,174.738044,POINT (1754803.112 5912059.517),IvrcS0W1RlFAlnci-p39XA,-36.924665,174.737914,2012-04,False,launch,,70,residential,0,30,0,1,0
1,1,P20001,2,2023-04-25T08:43:13Z,0,-36.924801,174.738076,POINT (1754806.011 5912058.736),IvrcS0W1RlFAlnci-p39XA,-36.924665,174.737914,2012-04,False,launch,,70,residential,0,30,0,1,0
2,2,P20001,3,2023-04-26T02:22:26Z,0,-36.924783,174.738071,POINT (1754805.575 5912060.734),IvrcS0W1RlFAlnci-p39XA,-36.924665,174.737914,2012-04,False,launch,,70,residential,0,30,0,1,0
3,3,P20001,4,2023-04-26T09:00:53Z,0,-36.924797,174.738123,POINT (1754810.215 5912059.092),IvrcS0W1RlFAlnci-p39XA,-36.924665,174.737914,2012-04,False,launch,,70,residential,0,30,0,1,0
4,4,P20001,5,2023-04-26T20:36:19Z,3,-36.924771,174.738084,POINT (1754806.75 5912062.109),IvrcS0W1RlFAlnci-p39XA,-36.924665,174.737914,2012-04,False,launch,,70,residential,0,30,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1318,1426,P20721,1,2024-05-05T02:00:52Z,1,-36.893455,174.728262,POINT (1753994.828 5915552.341),CfRtPfDMNhfXHTNvMwnYRw,-36.893394,174.728062,2024-06,False,launch,,40,residential,0,20,1,1,0
1319,1427,P20721,2,2024-05-05T07:36:22Z,1,-36.893483,174.728253,POINT (1753993.98 5915549.306),CfRtPfDMNhfXHTNvMwnYRw,-36.893394,174.728062,2024-06,False,launch,,40,residential,0,20,1,1,0
1320,1428,P20721,3,2024-05-05T23:06:27Z,2,-36.845252,174.759951,POINT (1756917.266 5920848.722),AF1QipN2FD2eYEmK8bRpEgoM7fFl5-nUstwWujnRj0gv,-36.845292,174.759939,2022-06-24,True,photos:street_view_publish_api,Mint Design,10,built up,0,60,0,3,0
1321,1429,P20721,4,2024-05-06T07:04:57Z,0,-36.845165,174.759885,POINT (1756911.566 5920858.442),AF1QipNj6yheGtCvR6Gk2Svq_lG_fuaGPjehPV8kouy8,-36.845177,174.759792,2022-06-24,True,photos:street_view_publish_api,Mint Design,0,Built up,0,100,0,0,0


In [3]:
files = glob("nz-*-topo-150k/*.gpkg")
files

['nz-river-polygons-topo-150k/nz-river-polygons-topo-150k.gpkg',
 'nz-lake-polygons-topo-150k/nz-lake-polygons-topo-150k.gpkg',
 'nz-coastlines-topo-150k/nz-coastlines-topo-150k.gpkg',
 'nz-pond-polygons-topo-150k/nz-pond-polygons-topo-150k.gpkg',
 'nz-spring-points-topo-150k/nz-spring-points-topo-150k.gpkg',
 'nz-river-centrelines-topo-150k/nz-river-centrelines-topo-150k.gpkg',
 'nz-swamp-polygons-topo-150k/nz-swamp-polygons-topo-150k.gpkg']

In [4]:
water_dict = {}
for f in tqdm(files):
  name = os.path.dirname(f).replace("nz-", "").replace("-topo-150k", "")
  water_gdf = gpd.read_file(f)
  water_gdf.sindex
  water_dict[name] = water_gdf

  0%|          | 0/7 [00:00<?, ?it/s]

In [5]:
def get_closest_water(point):
  closest_distance = 1e10
  closest_name = "?"
  for name, water_gdf in water_dict.items():
    distance = water_gdf.sindex.nearest(point, return_distance=True)[1][0]
    if distance < closest_distance:
      closest_distance = distance
      closest_name = name
  return closest_name, closest_distance

df["water_type"], df["water_distance"] = zip(*df.geometry.progress_apply(get_closest_water))

  0%|          | 0/1323 [00:00<?, ?it/s]

In [6]:
df.water_type.value_counts()

water_type
river-centrelines    564
lake-polygons        409
coastlines           341
pond-polygons          5
river-polygons         4
Name: count, dtype: int64

In [7]:
df.water_distance.describe().apply("{0:.5f}".format)

count       1323.00000
mean        9402.82600
std       126620.10526
min            4.32646
25%          384.24734
50%          690.59940
75%         1101.74075
max      1885737.33112
Name: water_distance, dtype: object

In [8]:
df.to_csv("merged_results.csv", index=False)

In [9]:
smf.ols(formula="anxiousness ~ green + environment + water + obscured + people + cars + bikes + water_distance + water_type", data=df).fit().summary()

0,1,2,3
Dep. Variable:,anxiousness,R-squared:,0.023
Model:,OLS,Adj. R-squared:,0.009
Method:,Least Squares,F-statistic:,1.672
Date:,"Fri, 13 Dec 2024",Prob (F-statistic):,0.0381
Time:,12:12:55,Log-Likelihood:,-3125.5
No. Observations:,1323,AIC:,6289.0
Df Residuals:,1304,BIC:,6388.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.3122,0.329,7.034,0.000,1.667,2.957
environment[T.built up],-0.5638,0.452,-1.246,0.213,-1.451,0.324
environment[T.cafes],-2.2787,1.334,-1.708,0.088,-4.896,0.338
environment[T.green],-0.6358,1.344,-0.473,0.636,-3.272,2.000
environment[T.park],-2.1825,2.617,-0.834,0.405,-7.317,2.952
environment[T.residential],0.2727,0.300,0.909,0.363,-0.316,0.861
environment[T.rural],-2.2408,2.610,-0.859,0.391,-7.360,2.879
environment[T.shops],-0.8079,0.574,-1.406,0.160,-1.935,0.319
water_type[T.lake-polygons],-0.2273,0.193,-1.175,0.240,-0.607,0.152

0,1,2,3
Omnibus:,135.42,Durbin-Watson:,0.828
Prob(Omnibus):,0.0,Jarque-Bera (JB):,178.652
Skew:,0.898,Prob(JB):,1.61e-39
Kurtosis:,2.889,Cond. No.,4710000.0
