In [91]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from geopy.geocoders import Nominatim
from shapely.geometry import Polygon, Point
import geopandas as gpd
from tqdm import tqdm
import regex as re
from pathlib import Path
import json


In [213]:
DATA_DIR = Path("data_bomen")
INCIDENT_DATA_PATH = DATA_DIR / 'Stormdata & FireStations geadresseerd.csv'
TREE_DATA_PATH =  DATA_DIR / "BOMEN_DATA.csv"
TREE_DATA_WITH_ZIP_PATH = DATA_DIR / "BOMEN_DATA_WITH_ZIP.csv"
ZIPCODE_JSON_PATH = DATA_DIR / "zipcodes_boxes.json"
TREE_DATA_CLEAN_PATH = DATA_DIR / "tree_geo_data_clean.csv"
GRID_DATA_PATH = DATA_DIR / "grid_enriched.csv"
INCIDENTS_WEATHER_PATH = DATA_DIR / "incidents_weather.csv"
INCIDENTS_WEATHER_GEO_PATH = DATA_DIR / "incidents_weather_geo.csv"

ZIP_KEY = "Zipcode"
ZIP4_KEY = "Zip4"

DATE_WINDOW = 7

TREE_COLUMNS = [
    "id",
    "soortnaamKort",        # andere soortnamen??
    "boomhoogte",
    "stamdiameter",
    "jaarVanAanleg",
    "typeObject",
    "standplaatsGedetailleerd",
    'SDVIEW',
    "RADIUS",
    "location",
    "grid_id",
]

MAP_BOOMHOOGTE = {
    'a. tot 6 m.' : "0-6",
    'b. 6 tot 9 m.': "6-9",
    'c. 9 tot 12 m.': "9-12",
    'd. 12 tot 15 m.': "12-15",
    'e. 15 tot 18 m.': "15-18",
    'f. 18 tot 24 m.': "18-25",
    'g. 24 m. en hoger': "24",
    'q. Niet van toepassing': "hQ"
}

MAP_STAMDIAMETER = {
    '0,1 tot 0,2 m.': "0.1-0.2",
    '0,2 tot 0,3 m.' : "0.2-0.3",
    '0,3 tot 0,5 m.': "0.3-0.5",
    '0,5 tot 1 m.': "0.5-1.0",
    '1,0 tot 1,5 m.': "1.0-1.5",
    '1,5 m. en grot': "1.5",
    'Onbekend': "dQ",
}

SERVICE_AREAS_OUT_OF_SCOPE = [
    "Amstelveen",
    "Aalsmeer",
    "Uithoorn"
]

RF_INCIDENT_COLUMNS = [
    "Incident_ID",
    "Service_Area",
    "grid_id",
    "Date",
    "Hour",
    "temperature_2m",
    "relative_humidity_2m",
    "dew_point_2m",
    "apparent_temperature",
    "precipitation",
    "rain",
    "snowfall",
    "snow_depth",
    "weather_code",
    "pressure_msl",
    "surface_pressure",
    "wind_speed_10m",
    "wind_direction_10m",
    "wind_gusts_10m",
    "soil_temperature_0_to_7cm",
    "soil_temperature_7_to_28cm",
    "soil_temperature_28_to_100cm",
    "soil_temperature_100_to_255cm",
    "soil_moisture_0_to_7cm",
    "soil_moisture_7_to_28cm",
    "soil_moisture_28_to_100cm",
    "soil_moisture_100_to_255cm",
]

RF_TREE_COLUMNS = [
    "tree_id",
    "grid_id",
    "soortnaamKort",
    "boomhoogte",
    "stamdiameter",
    "jaarVanAanleg",
    "typeObject",
    "standplaatsGedetailleerd",
]

RF_GRID_COLUMNS = [
    "grid_id",
    "has_tree",
    "avg_height",
    "avg_diameter",
    'avg_year',
    'Fraxinus', 
    'Salix', 
    'Alnus', 
    'Quercus', 
    'Tilia', 
    'Acer',
    'Populus', 
    'Betula', 
    'Prunus', 
    'Platanus', 
    'Malus', 
    'Robinia',
    'Crataegus', 
    'Ulmus', 
    'Carpinus', 
    'Overig', 
    'Onbekend'
]

In [93]:
# read storm_data
df = pd.read_csv(INCIDENT_DATA_PATH, sep=",", encoding="utf-8")
df = df.drop(['Unnamed: 0'], axis=1)
df = df.set_index('Incident_ID')

# read tree data
trees = pd.read_csv(TREE_DATA_PATH, sep=",", encoding="utf-8")

In [94]:
incidents_weather_df = pd.read_csv(INCIDENTS_WEATHER_PATH, sep=",", encoding="utf-8")
# Filter on areas in scope
incidents_weather_df = incidents_weather_df[~incidents_weather_df.Service_Area.isin(SERVICE_AREAS_OUT_OF_SCOPE)]


In [95]:
#Extracts the first 4 digits of zipcode '1010 AA' --> '1010' 
def extract_zip_4(
    df,
    zip_col,
    new_col,
):
    # Strips off last two number from zipcode (1010 AA - > 1010)
    pattern = r'\d{4}'
    df[new_col] = [re.match(pattern, zip_code)[0] if isinstance(zip_code, str) else None for zip_code in df[zip_col].values]
    return df


def add_zipcode(
    df
):
    # Adds zipcode based on lat and long coordinate values
    geolocator = Nominatim(user_agent="my_request")
    for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing"):
        Latitude = row['LAT']
        Longitude = row['LNG']
        
        location = geolocator.reverse(f"{Latitude},{Longitude}")
        address = location.raw['address']
    
        zipcode = str(address.get('postcode'))  
        
        df.at[index, 'Zipcode'] = zipcode
    return df

In [96]:
df_tree_incidents = df[df["Damage_Type"]=="Tree"]
df_tree_incidents = extract_zip_4(df=df_tree_incidents, zip_col=ZIP_KEY, new_col=ZIP4_KEY)
df_tree_incidents



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0_level_0,Incident_Starttime,Incident_Endtime,Incident_Duration,Incident_Priority,Service_Area,Damage_Type,Municipality,LON,LAT,Day,Month,Year,City,Suburb,Residential,Road,Zipcode,Zip4
Incident_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
511,08:25:18,10:00:56,95.633,2.0,Amstelveen,Tree,Amstelveen,4.838685,52.281552,7,9,2018,,,,Bouwerij,1185 XW,1185
738,16:46:38,18:00:55,74.283,2.0,Victor,Tree,Amsterdam,4.930968,52.359724,10,9,2018,Amsterdam,Oost,Dapperbuurt,Linnaeusplantsoen,1093 KL,1093
1493,06:59:05,08:21:25,82.333,2.0,Amstelveen,Tree,Amstelveen,4.879741,52.301365,21,9,2018,,,,Oranjebaan,1183 PL,1183
1502,08:29:22,10:06:43,97.350,2.0,Dirk,Tree,Amsterdam,4.870656,52.355189,21,9,2018,Amsterdam,Zuid,Museumkwartier,Van Breestraat,1071 ZM,1071
1507,09:25:55,11:43:54,137.983,2.0,Teunis,Tree,Amsterdam,4.832333,52.380748,21,9,2018,Amsterdam,Nieuw-West,,M.C. Addicksstraat,1063 VZ,1063
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
436479,18:47:00,20:36:09,109.150,5.0,Victor,Tree,Amsterdam,4.944491,52.352064,31,7,2023,Amsterdam,Oost,Middenmeer,Rutherfordstraat,1098 TM,1098
436603,14:10:34,15:13:30,62.933,2.0,Osdorp,Tree,Amsterdam,4.824533,52.376163,2,8,2023,Amsterdam,Nieuw-West,,Louis Couperusstraat,1064 CE,1064
436834,03:38:27,06:29:11,170.733,5.0,Nico,Tree,Amsterdam,4.928197,52.390568,6,8,2023,Amsterdam,Noord,Tuindorp Buiksloot,Nieuwendammerdijk,1025 LP,1025
436904,21:52:41,22:16:29,23.800,2.0,Nico,Tree,Amsterdam,4.917106,52.368343,6,8,2023,Amsterdam,Centrum,,Hoogte Kadijk,1018 BV,1018


In [97]:
#Map zips to coordinates
def map_zip_to_crs(
    df
):
    geolocator = Nominatim(user_agent="my_request")
    zipcodes = df['Zipcode'].unique()
    zip_dict = {}
    for code in tqdm(zipcodes, desc="Processing ZIP codes"):
        result = geolocator.geocode(query={'postalcode': code, 'country': 'Nederland'}, addressdetails=True)
        if result is not None and 'boundingbox' in result.raw:
            zip_dict[code] = result.raw['boundingbox']

    with open(ZIPCODE_JSON_PATH, 'w') as f:
        json.dump(zip_dict, f)



In [98]:
def find_zip(
    point,
    zip_dict
):
    lon, lat = float(point[0]), float(point[1])
    for zip_code, box in zip_dict.items():
        min_lat, max_lat, min_lon, max_lon = map(float, box)
        if min_lat <= lat <= max_lat and min_lon <= lon <= max_lon:
            return zip_code
    return None

def map_point_to_zip(
    df,
    zip_dict
):
    for i, row in tqdm(df.iterrows()):
        p = (row['LNG'], row["LAT"])
        zip_code = find_zip(p, zip_dict)
        df.at[i, 'Zipcode'] = zip_code
    return df

In [99]:
# map trees to zipcode

# takes a while only run if necessary
# map_zip_to_crs(df=df_tree_incidents)

# TODO: Werkt niet goed, ergens zit een probleem met de zips


with open(ZIPCODE_JSON_PATH, 'r') as f:
    zip_dict = json.load(f)

trees = map_point_to_zip(trees, zip_dict)
trees = extract_zip_4(df=trees, zip_col='Zipcode', new_col='Zip4')

trees.to_csv(TREE_DATA_WITH_ZIP_PATH, sep=",", encoding="utf-8")


Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '1185 XW' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.

267974it [00:11, 24311.10it/s]


### Grid - Tree mapping

Nog niet perfect maar methode werkt.

In [100]:

geolocator = Nominatim(user_agent="my_geocoder")

# Get coordinates for Amsterdam
location = geolocator.geocode("Amsterdam, Netherlands")
amsterdam_lat, amsterdam_lon = location.latitude, location.longitude

amsterdam_bbox = (52.26618, 4.64663, 52.475115999999994, 5.150491999999999)

# Define grid size in meters
grid_size = 100

# Calculate grid bounds
lat_step = grid_size / 111000  # 1 degree of latitude is approximately 111 kilometers
lon_step = (grid_size / 111000) / np.cos(np.radians(amsterdam_lat))  # Correct for latitude

grid_polygons = []
for lat in np.arange(amsterdam_bbox[0], amsterdam_bbox[2], lat_step):
    for lon in np.arange(amsterdam_bbox[1], amsterdam_bbox[3], lon_step):
        polygon = Polygon([
            (lon, lat),
            (lon + lon_step, lat),
            (lon + lon_step, lat + lat_step),
            (lon, lat + lat_step),
            (lon, lat),
        ])
        grid_polygons.append(polygon)

grid_gdf = gpd.GeoDataFrame(geometry=grid_polygons, crs="EPSG:4326")

In [101]:
# create gdf from trees
tree_gdf = gpd.GeoDataFrame(trees, geometry=gpd.points_from_xy(trees['LNG'], trees['LAT']), crs="EPSG:4326")
# create gdf from indicents
incident_gdf = gpd.GeoDataFrame(incidents_weather_df, geometry=gpd.points_from_xy(incidents_weather_df['LON'], incidents_weather_df['LAT']), crs="EPSG:4326")
#join with grid gdf
tree_gdf = gpd.sjoin(tree_gdf, grid_gdf, how="left", op="within")
incident_gdf = gpd.sjoin(incident_gdf, grid_gdf, how="left", op="within")



The `op` parameter is deprecated and will be removed in a future release. Please use the `predicate` parameter instead.


The `op` parameter is deprecated and will be removed in a future release. Please use the `predicate` parameter instead.



In [102]:
#clean up gdf
tree_gdf = tree_gdf.rename(columns={"index_right" : "grid_id", "geometry" : "location"})
incident_gdf = incident_gdf.rename(columns={"index_right" : "grid_id", "geometry" : "location"})

#rename categories in new col, in place so only run once
tree_gdf['boomhoogte'] = [MAP_BOOMHOOGTE[klasse] if not klasse is np.nan else np.nan for klasse in tree_gdf.boomhoogteklasseActueel.values]
tree_gdf['stamdiameter'] = [MAP_STAMDIAMETER[klasse] if not klasse is np.nan else np.nan for klasse in tree_gdf.stamdiameterklasse.values]

# get rid of unnecessary columns
tree_gdf = tree_gdf[TREE_COLUMNS]

# save to new df
tree_gdf.to_csv(TREE_DATA_CLEAN_PATH, sep=",", encoding="utf-8")

In [103]:
incident_gdf

Unnamed: 0,Incident_ID,Date,Incident_Starttime,Hour,Date_time,Incident_Endtime,Incident_Duration,Incident_Priority,Service_Area,Municipality,...,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm,Suburb,Residential,Road,Zipcode,location,grid_id
1,738,2018-09-10,16:46:38,16,2018/09/10 16:46,18:00:55,01:14:17,2.0,Victor,Amsterdam,...,0.527,0.484,0.459,0.617,Oost,Dapperbuurt,Linnaeusplantsoen,1093 KL,POINT (4.93097 52.35972),35418
3,1502,2018-09-21,08:29:22,8,2018/09/21 08:29,10:06:43,01:37:21,2.0,Dirk,Amsterdam,...,0.580,0.548,0.494,0.623,Zuid,Museumkwartier,Van Breestraat,1071 ZM,POINT (4.87066 52.35519),33667
4,1507,2018-09-21,09:25:55,9,2018/09/21 09:25,11:43:54,02:17:59,2.0,Teunis,Amsterdam,...,0.521,0.498,0.465,0.619,Nieuw-West,,M.C. Addicksstraat,1063 VZ,POINT (4.83233 52.38075),43559
5,1548,2018-09-21,17:53:27,17,2018/09/21 17:53,20:14:56,02:21:29,2.0,Pieter,Amsterdam,...,0.582,0.546,0.494,0.623,,,,,POINT (4.83505 52.36686),38089
7,1552,2018-09-21,18:25:03,18,2018/09/21 18:25,20:15:21,01:50:18,2.0,Willem,Amsterdam,...,0.593,0.546,0.494,0.623,Zuid,,Amsteldijk,1079 LK,POINT (4.90538 52.33572),26509
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2420,443469,2023-11-02,19:06:07,19,2023/11/02 19:06,19:06:38,00:00:31,2.0,Weesp,Amsterdam,...,0.732,0.737,0.724,0.635,,,,,POINT (5.07611 52.29911),12603
2421,443472,2023-11-02,19:45:17,19,2023/11/02 19:45,20:27:36,00:42:19,2.0,Willem,Amsterdam,...,0.740,0.743,0.708,0.632,,,,,POINT (4.90822 52.32678),23091
2422,443474,2023-11-02,19:53:02,19,2023/11/02 19:53,23:41:51,03:48:49,2.0,Pieter,Amsterdam,...,0.740,0.743,0.708,0.632,,,,,POINT (4.82213 52.34233),28846
2423,443476,2023-11-02,20:00:19,20,2023/11/02 20:00,20:52:15,00:51:56,2.0,Osdorp,Amsterdam,...,0.740,0.742,0.709,0.632,,,,,POINT (4.78724 52.36278),36689


In [104]:
import plotly.express as px
def plot_spacial_data(
    grid_gdf,
    tree_gdf,
    incident_gdf,
    plot_trees = True,
    plot_incidents = True
):
    # Create a plotly figure
    fig = px.choropleth_mapbox(grid_gdf, 
                                geojson=grid_gdf.geometry.__geo_interface__, 
                                locations=grid_gdf.index,
                                mapbox_style="open-street-map",
                                zoom=11, center={"lat": amsterdam_bbox[0], "lon": amsterdam_bbox[1]},
                                opacity=0.1,
                                )

    if plot_trees:
        # Add scatter plot for tree points
        fig.add_scattermapbox(
            lat=tree_gdf.location.y,
            lon=tree_gdf.location.x,
            mode='markers',
            marker=dict(
                size=4,
                color='green',
                opacity=0.7,
            ),
            text=tree_gdf['id'].astype(str),
            name='Trees'
        )
    if plot_incidents:
        # Add scatter plot for tree points
        fig.add_scattermapbox(
            lat=incident_gdf.location.y,
            lon=incident_gdf.location.x,
            mode='markers',
            marker=dict(
                size=4,
                color='red',
                opacity=0.7,
            ),
            text=incident_gdf.Incident_ID.astype(str),
            name='Incidents'
        )
    # Update the layout to make it interactive
    fig.update_layout(mapbox_style="carto-positron")
    fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
    fig.show()
    

In [105]:
incident_gdf

Unnamed: 0,Incident_ID,Date,Incident_Starttime,Hour,Date_time,Incident_Endtime,Incident_Duration,Incident_Priority,Service_Area,Municipality,...,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm,Suburb,Residential,Road,Zipcode,location,grid_id
1,738,2018-09-10,16:46:38,16,2018/09/10 16:46,18:00:55,01:14:17,2.0,Victor,Amsterdam,...,0.527,0.484,0.459,0.617,Oost,Dapperbuurt,Linnaeusplantsoen,1093 KL,POINT (4.93097 52.35972),35418
3,1502,2018-09-21,08:29:22,8,2018/09/21 08:29,10:06:43,01:37:21,2.0,Dirk,Amsterdam,...,0.580,0.548,0.494,0.623,Zuid,Museumkwartier,Van Breestraat,1071 ZM,POINT (4.87066 52.35519),33667
4,1507,2018-09-21,09:25:55,9,2018/09/21 09:25,11:43:54,02:17:59,2.0,Teunis,Amsterdam,...,0.521,0.498,0.465,0.619,Nieuw-West,,M.C. Addicksstraat,1063 VZ,POINT (4.83233 52.38075),43559
5,1548,2018-09-21,17:53:27,17,2018/09/21 17:53,20:14:56,02:21:29,2.0,Pieter,Amsterdam,...,0.582,0.546,0.494,0.623,,,,,POINT (4.83505 52.36686),38089
7,1552,2018-09-21,18:25:03,18,2018/09/21 18:25,20:15:21,01:50:18,2.0,Willem,Amsterdam,...,0.593,0.546,0.494,0.623,Zuid,,Amsteldijk,1079 LK,POINT (4.90538 52.33572),26509
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2420,443469,2023-11-02,19:06:07,19,2023/11/02 19:06,19:06:38,00:00:31,2.0,Weesp,Amsterdam,...,0.732,0.737,0.724,0.635,,,,,POINT (5.07611 52.29911),12603
2421,443472,2023-11-02,19:45:17,19,2023/11/02 19:45,20:27:36,00:42:19,2.0,Willem,Amsterdam,...,0.740,0.743,0.708,0.632,,,,,POINT (4.90822 52.32678),23091
2422,443474,2023-11-02,19:53:02,19,2023/11/02 19:53,23:41:51,03:48:49,2.0,Pieter,Amsterdam,...,0.740,0.743,0.708,0.632,,,,,POINT (4.82213 52.34233),28846
2423,443476,2023-11-02,20:00:19,20,2023/11/02 20:00,20:52:15,00:51:56,2.0,Osdorp,Amsterdam,...,0.740,0.742,0.709,0.632,,,,,POINT (4.78724 52.36278),36689


### Enrich grid gdf

In [106]:
tree_gdf[tree_gdf.grid_id == 31296]

''' 
grid gdf cols
grid_id, geometery, has_tree, [tree_counts], [averages] 
'''

def convert_cat_to_avg(
    cat_values,
    delimeter = "-"
):
    ''' 
    Converts categorical values to means of type float
    Splits cat values on delimter, computes the mean for each cat
    Returns mean of all means of the categories
    '''
    means = []
    for cat in cat_values:
        if not isinstance(cat, str):
            continue
        if not delimeter in cat:
            continue
        vals = cat.split(delimeter)
        means.append(np.mean([float(val) for val in vals]))
    m = round(np.mean(means), 3)
    return 0 if np.isnan(m) else m


def enrich_grid_df(
    grid_gdf,
    tree_gdf
):
    for i in grid_gdf.index:
        tree_sub_df = tree_gdf[tree_gdf.grid_id == i]
        if len(tree_sub_df)>0:
            # Compute and add averages for height, diameter and year
            grid_gdf.at[i, "avg_height"] = convert_cat_to_avg(tree_sub_df.boomhoogte.values)
            grid_gdf.at[i, "avg_diameter"] = convert_cat_to_avg(tree_sub_df.stamdiameter.values)
            grid_gdf.at[i, "avg_year"] = round(np.mean(tree_sub_df.jaarVanAanleg.values), 3)
            # Add soortnaam counts
            for name, count in tree_sub_df.soortnaamKort.value_counts().items():
                grid_gdf.at[i, "has_tree"] = True
                grid_gdf.at[i, name] = count
        else:
            grid_gdf.at[i, "has_tree"] = False


enrich_grid_df(grid_gdf=grid_gdf, tree_gdf=tree_gdf)


Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value 'False' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.


Mean of empty slice.


invalid value encountered in scalar divide


Mean of empty slice.


invalid value encountered in scalar divide


Mean of empty slice.


invalid value encountered in scalar divide


Mean of empty slice.


invalid value encountered in scalar divide


Mean of empty slice.


invalid value encountered in scalar divide


Mean of empty slice.


invalid value encountered in scalar divide


Mean of empty slice.


invalid value encountered in scalar divide


Mean of empty slice.


invalid value encountered in scalar divide


Mean of empty slice.


invalid value encountered in scalar divide


Mean of empty slice.


invalid value encountered in scalar divide


Mean of empty slice.


invalid value encountered in scalar divide


Mean of empty slice.


invalid value encountered i

In [107]:
incident_gdf.to_csv(INCIDENTS_WEATHER_GEO_PATH, sep=",", encoding="utf-8")

In [108]:
grid_gdf = grid_gdf.fillna(0)
grid_gdf[grid_gdf.has_tree == True]

grid_gdf.to_csv(GRID_DATA_PATH, sep=",", encoding="utf-8")


ExtensionArray.fillna added a 'copy' keyword in pandas 2.1.0. In a future version, ExtensionArray subclasses will need to implement this keyword or an exception will be raised. In the interim, the keyword is ignored by GeometryArray.



In [None]:
# plot_spacial_data(grid_gdf=grid_gdf, tree_gdf=tree_gdf, incident_gdf=incident_gdf)

### Create train / test sets

In [197]:
def display_full_df(
    df
):
    pd.set_option('display.max_rows', None)
    print(df.to_string(index=False))
    pd.reset_option('display.max_rows')

In [198]:
incident_gdf

Unnamed: 0,Incident_ID,Date,Incident_Starttime,Hour,Date_time,Incident_Endtime,Incident_Duration,Incident_Priority,Service_Area,Municipality,...,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm,Suburb,Residential,Road,Zipcode,location,grid_id
1,738,2018-09-10,16:46:38,16,2018/09/10 16:46,18:00:55,01:14:17,2.0,Victor,Amsterdam,...,0.527,0.484,0.459,0.617,Oost,Dapperbuurt,Linnaeusplantsoen,1093 KL,POINT (4.93097 52.35972),35418
3,1502,2018-09-21,08:29:22,8,2018/09/21 08:29,10:06:43,01:37:21,2.0,Dirk,Amsterdam,...,0.580,0.548,0.494,0.623,Zuid,Museumkwartier,Van Breestraat,1071 ZM,POINT (4.87066 52.35519),33667
4,1507,2018-09-21,09:25:55,9,2018/09/21 09:25,11:43:54,02:17:59,2.0,Teunis,Amsterdam,...,0.521,0.498,0.465,0.619,Nieuw-West,,M.C. Addicksstraat,1063 VZ,POINT (4.83233 52.38075),43559
5,1548,2018-09-21,17:53:27,17,2018/09/21 17:53,20:14:56,02:21:29,2.0,Pieter,Amsterdam,...,0.582,0.546,0.494,0.623,,,,,POINT (4.83505 52.36686),38089
7,1552,2018-09-21,18:25:03,18,2018/09/21 18:25,20:15:21,01:50:18,2.0,Willem,Amsterdam,...,0.593,0.546,0.494,0.623,Zuid,,Amsteldijk,1079 LK,POINT (4.90538 52.33572),26509
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2420,443469,2023-11-02,19:06:07,19,2023/11/02 19:06,19:06:38,00:00:31,2.0,Weesp,Amsterdam,...,0.732,0.737,0.724,0.635,,,,,POINT (5.07611 52.29911),12603
2421,443472,2023-11-02,19:45:17,19,2023/11/02 19:45,20:27:36,00:42:19,2.0,Willem,Amsterdam,...,0.740,0.743,0.708,0.632,,,,,POINT (4.90822 52.32678),23091
2422,443474,2023-11-02,19:53:02,19,2023/11/02 19:53,23:41:51,03:48:49,2.0,Pieter,Amsterdam,...,0.740,0.743,0.708,0.632,,,,,POINT (4.82213 52.34233),28846
2423,443476,2023-11-02,20:00:19,20,2023/11/02 20:00,20:52:15,00:51:56,2.0,Osdorp,Amsterdam,...,0.740,0.742,0.709,0.632,,,,,POINT (4.78724 52.36278),36689


In [199]:
#convert dates to datetime objects
incident_gdf.Date = pd.to_datetime(incident_gdf.Date)

In [200]:
# Pick necessary columns
incident_sub_gdf = incident_gdf[RF_INCIDENT_COLUMNS]

grid_gdf['grid_id'] = grid_gdf.index
grid_sub_gdf = grid_gdf[RF_GRID_COLUMNS]

tree_gdf = tree_gdf.rename(columns={"id" : "tree_id"})
tree_sub_gdf = tree_gdf[RF_TREE_COLUMNS]

In [201]:
positive_samples = grid_sub_gdf.merge(incident_sub_gdf, on='grid_id', how='inner')

In [202]:
positive_samples.to_csv("positive_samples.csv", sep=",", encoding="utf-8")

In [203]:
display_full_df(positive_samples)

 grid_id  has_tree  avg_height  avg_diameter  avg_year  Fraxinus  Salix  Alnus  Quercus  Tilia  Acer  Populus  Betula  Prunus  Platanus  Malus  Robinia  Crataegus  Ulmus  Carpinus  Overig  Onbekend  Incident_ID              Service_Area       Date  Hour  temperature_2m  relative_humidity_2m  dew_point_2m  apparent_temperature  precipitation  rain  snowfall  snow_depth  weather_code  pressure_msl  surface_pressure  wind_speed_10m  wind_direction_10m  wind_gusts_10m  soil_temperature_0_to_7cm  soil_temperature_7_to_28cm  soil_temperature_28_to_100cm  soil_temperature_100_to_255cm  soil_moisture_0_to_7cm  soil_moisture_7_to_28cm  soil_moisture_28_to_100cm  soil_moisture_100_to_255cm
    7127      True      10.500         0.400  1984.000       0.0    0.0    1.0      0.0    0.0   0.0      0.0     0.0     0.0       0.0    0.0      0.0        0.0    0.0       0.0     0.0       0.0       394418                     Weesp 2022-02-21    12        8.243500             76.970634      4.443500      

In [204]:
p_sample_info = positive_samples[['Incident_ID', 'grid_id', 'Date', 'Hour']]

In [205]:
p_sample_info.to_csv("p_sample_info.csv", sep=",", encoding="utf-8")

In [245]:
import random
#TODO: change date window
def verify_sample(
    incidents,
    grid_id,
    date,
    window = DATE_WINDOW
):
    start_date = date - pd.DateOffset(days=window)
    end_date = date + pd.DateOffset(days=window)

    grids = incidents[(incidents['Date'] >= start_date) & (incidents['Date'] <= end_date)].values

    return False if grid_id not in grids else True


def sample_negatives(
    positives,
    incidents,
    grid
):
    grids_with_trees = list(grid[grid.has_tree == True].grid_id.values)
    negatives = positives[['Date', 'Hour']]
    negatives[RF_GRID_COLUMNS] = None

    for i, row in negatives.iterrows():
        random_grid = random.sample(grids_with_trees, 1)[0]
        while(verify_sample(incidents, random_grid, row.Date)):
            print(f"got here for {random_grid}")
            random_grid = random.sample(grids_with_trees, 1)[0]
        print(f"Found negative: {random_grid} for date {row.Date}")
        grid_data = grid[grid.grid_id == random_grid][RF_GRID_COLUMNS].reset_index(drop=True)
        negatives.loc[i, RF_GRID_COLUMNS] = grid_data.iloc[0]

    return negatives




negatives = sample_negatives(positive_samples, incident_sub_gdf, grid_sub_gdf)

Found negative: 45262 for date 2022-02-21 00:00:00




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [246]:
negatives

Unnamed: 0,Date,Hour,grid_id,has_tree,avg_height,avg_diameter,avg_year,Fraxinus,Salix,Alnus,...,Betula,Prunus,Platanus,Malus,Robinia,Crataegus,Ulmus,Carpinus,Overig,Onbekend
0,2022-02-21,12,45262,True,7.269,0.0,920.769,0.0,2.0,3.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2023-11-02,17,,,,,,,,,...,,,,,,,,,,
2,2022-11-17,2,,,,,,,,,...,,,,,,,,,,
3,2017-09-13,15,,,,,,,,,...,,,,,,,,,,
4,2023-07-05,11,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1959,2023-07-06,17,,,,,,,,,...,,,,,,,,,,
1960,2019-08-27,21,,,,,,,,,...,,,,,,,,,,
1961,2023-11-02,16,,,,,,,,,...,,,,,,,,,,
1962,2023-07-05,9,,,,,,,,,...,,,,,,,,,,


In [227]:
grid_gdf[grid_gdf.grid_id == 27820][RF_GRID_COLUMNS].iloc[0]

grid_id          27820
has_tree          True
avg_height       8.318
avg_diameter       0.0
avg_year        2012.0
Fraxinus           0.0
Salix              0.0
Alnus              0.0
Quercus            0.0
Tilia              0.0
Acer               0.0
Populus            0.0
Betula             0.0
Prunus             0.0
Platanus           0.0
Malus              0.0
Robinia            0.0
Crataegus          0.0
Ulmus              0.0
Carpinus           0.0
Overig            11.0
Onbekend           0.0
Name: 27820, dtype: object