Note:



In [1]:
import pandas as pd
import geopandas as gpd
from shapely import wkt
from shapely.geometry import Point
from scipy.spatial import cKDTree
import numpy as np
import requests
import json
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
tqdm.pandas()  # "tqdm>=4.9.0"

  from .autonotebook import tqdm as notebook_tqdm


# property data

In [2]:
property_data_path = "../data/curated/rental-17-24.csv"
output_dir = "../data/raw/time_park.csv"
property_df = pd.read_csv(property_data_path)
coords = property_df[['lat', 'lng']].drop_duplicates()

# Parks

In [9]:
parks_path = "../data/curated/parks-and-reserves1.csv"
parks_df = pd.read_csv(parks_path)[['latitude', 'longitude']]
parks_df.rename(columns={'latitude': 'lat', 'longitude': 'lng'}, inplace=True)

In [11]:
num_to_find = 1
tree = cKDTree(parks_df[['lng', 'lat']].values)
property_coords = coords[['lng', 'lat']].values
distances, indices = tree.query(property_coords, k=1)
coords["park_index"] = list(indices)

In [12]:
coords["park_index"].value_counts()

park_index
1351    191606
2112     67516
2689     31368
2383     13297
1616     12795
         ...  
412          1
1246         1
1202         1
2672         1
396          1
Name: count, Length: 2136, dtype: int64

In [13]:
from collections import defaultdict
err_count = defaultdict(int)
def get_time_proximity(coordinates):
    url = "http://localhost:8080/ors/v2/directions/driving-car"
    body = {"coordinates": coordinates}
    # print(body)
    response = requests.post(url, json=body)
    try:
        if response.status_code == 200:
            summary = response.json()["routes"][0]["summary"]
            if "duration" in summary:
                return summary["duration"]
            return 0
        else:
            
            if(response.json()["error"]["code"] == 2004):
                # distance is too long (> 100000.0 m)
                err_count[2004] += 1
                return -1
            if(response.json()["error"]["code"] == 2010):
                err_count[2010] += 1
                # "error":{"code":2010,"message":"Could not find routable point within a radius of 400.0 meters of specified coordinate
                return -1
            print(response.text)
            print(body)
            
            raise Exception("Error in request")
    except Exception as e:
        print(response.json())
        raise e

# # Example usage
# directions = get_time_proximity([[144.96332, -37.8140], [144.96332, -37.8120]])
# print(directions)

In [14]:
def get_time_proximity_from_property(row, cities):
    city = cities.iloc[int(row['city_index'])]
    coordinates = [[row['lng'], row['lat']], [city['lng'], city['lat']]]
    time = get_time_proximity(coordinates)
    # print(time)
    return time

In [15]:
coords.shape

(394888, 4)

In [18]:
# dist time matching
coords["time_city"] = coords.progress_apply(lambda row: get_time_proximity_from_property(row, parks_df), axis=1)
# property_df.to_csv(f"../data/raw/property/rent_{i}_ptv.csv", index=False)

100%|██████████| 394888/394888 [15:39<00:00, 420.28it/s]


In [19]:
err_count

defaultdict(int, {2010: 140})

In [21]:
coords.to_csv(output_dir, index=False)

# join to orginal dataset

## read all

In [28]:
import sys, os
import pandas as pd
import geopandas as gpd
sys.path.append("../")
from scripts.proximity import proximity_hard_join, proximity_sjoin

In [33]:
cleaned_df = pd.read_csv(property_data_path)
city_df = pd.read_csv(city_output)

In [None]:
cleaned_df

In [34]:
proximity_sjoin(cleaned_df, city_df).shape




(106402, 20)

In [5]:
city_df['time_city'].value_counts()

time_city
-1.0       44211
 1979.6       37
 1952.0       36
 2044.8       36
 1991.7       36
           ...  
 3717.8        1
 3726.9        1
 3516.5        1
 3411.1        1
 4702.7        1
Name: count, Length: 49128, dtype: int64

In [82]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
city_coords = pd.read_csv(f"../data/raw/time_city.csv")
gdf_city_coords = gpd.GeoDataFrame(city_coords, geometry=gpd.points_from_xy(city_coords.lng, city_coords.lat))
gpd_cleaned_df = gpd.GeoDataFrame(cleaned_df, geometry=gpd.points_from_xy(cleaned_df.lng, cleaned_df.lat))

# Ensure both GeoDataFrames have the same CRS (Coordinate Reference System)
gdf_city_coords = gdf_city_coords.set_crs("EPSG:4326")
gpd_cleaned_df = gpd_cleaned_df.set_crs("EPSG:4326")

# Perform the nearest spatial join
joined_gdf = gpd.sjoin_nearest(gpd_cleaned_df,gdf_city_coords[['geometry','time_city']], how="left",rsuffix='city_coords')
joined_gdf.drop(columns=['index_city_coords'],inplace=True)





ValueError: 'index_right' cannot be a column name in the frames being joined

In [94]:
gpd_cleaned_df


Unnamed: 0,lat,lng,address,bed,bath,car,type,rented_price,date,year,geometry,sa2_code_left,region,median_income,index_right,sa2_code_right,sa2_name,population,cpi,unemployment_rate
0,-37.813730,144.955580,"201/560 LONSDALE STREET, MELBOURNE",2.0,2.0,1.0,Unit/apmt,800,2023-08-01,2023,POINT (144.95558 -37.81373),206041505.0,Melbourne CBD - West,43780.105621,11621.0,206041505.0,Melbourne CBD - West,20027.0,4.1,3.691667
1,-37.813730,144.955580,"201/560 LONSDALE STREET, MELBOURNE",2.0,2.0,1.0,Unit/apmt,800,2023-07-01,2023,POINT (144.95558 -37.81373),206041505.0,Melbourne CBD - West,43780.105621,11621.0,206041505.0,Melbourne CBD - West,20027.0,4.1,3.691667
2,-37.813730,144.955580,"201/560 LONSDALE STREET, MELBOURNE",2.0,2.0,1.0,Unit/apmt,540,2021-08-01,2021,POINT (144.95558 -37.81373),206041505.0,Melbourne CBD - West,39300.000000,10577.0,206041505.0,Melbourne CBD - West,16098.0,3.5,4.200000
3,-37.813730,144.955580,"1702/560 LONSDALE STREET, MELBOURNE",2.0,1.0,0.0,Unit/apmt,720,2023-08-01,2023,POINT (144.95558 -37.81373),206041505.0,Melbourne CBD - West,43780.105621,11621.0,206041505.0,Melbourne CBD - West,20027.0,4.1,3.691667
4,-37.813730,144.955580,"1702/560 LONSDALE STREET, MELBOURNE",2.0,1.0,0.0,Unit/apmt,650,2023-06-01,2023,POINT (144.95558 -37.81373),206041505.0,Melbourne CBD - West,43780.105621,11621.0,206041505.0,Melbourne CBD - West,20027.0,4.1,3.691667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1531551,-37.883631,144.732710,"96 BOARDWALK BOULEVARD, POINT COOK",3.0,2.0,2.0,Unit/apmt,370,2019-09-01,2019,POINT (144.73271 -37.88363),213051581.0,Point Cook - North West,57433.000000,9841.0,213051581.0,Point Cook - North West,9739.0,1.8,5.175000
1531552,-37.883631,144.732710,"96 BOARDWALK BOULEVARD, POINT COOK",3.0,2.0,2.0,Unit/apmt,370,2019-08-01,2019,POINT (144.73271 -37.88363),213051581.0,Point Cook - North West,57433.000000,9841.0,213051581.0,Point Cook - North West,9739.0,1.8,5.175000
1531553,-37.883631,144.732710,"96 BOARDWALK BOULEVARD, POINT COOK",3.0,2.0,2.0,Unit/apmt,370,2019-07-01,2019,POINT (144.73271 -37.88363),213051581.0,Point Cook - North West,57433.000000,9841.0,213051581.0,Point Cook - North West,9739.0,1.8,5.175000
1531554,-37.883631,144.732710,"96 BOARDWALK BOULEVARD, POINT COOK",3.0,2.0,2.0,Unit/apmt,365,2018-06-01,2018,POINT (144.73271 -37.88363),213051581.0,Point Cook - North West,57001.000000,9319.0,213051581.0,Point Cook - North West,9678.0,1.8,5.283333


In [32]:
cleaned_df.dtypes

lat                                    float64
lng                                    float64
address                                 object
bed                                    float64
bath                                   float64
car                                    float64
type                                    object
rented_price                             int64
date                                    object
year                                     int64
geometry                                object
sa2_code                                 int64
region                                  object
median_income                          float64
population                               int64
Statistical Areas Level 2 2021 name     object
cpi_x                                  float64
cpi_y                                  float64
unemployment_rate                      float64
dtype: object