In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
from sksurv.ensemble import RandomSurvivalForest
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter, CoxPHFitter
from shapely.geometry import box
from shapely.ops import nearest_points
from shapely.geometry import box
from shapely.ops import unary_union
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from shapely import wkt
from scipy.spatial import cKDTree

In [2]:
# Load the raw inputs from the local 'middelhav/' directory.
# Per the file names: a gridded burned-area shapefile (2008-2023, >10 ha),
# four climate CSVs (wind, temperature, relative humidity, precipitation)
# and a shapefile of static predictors.
# NOTE(review): the output recorded for this cell is a pandas ParserError
# ("Calling read(nbytes) on source failed"), i.e. one of the CSV reads did not
# complete in that run; later cells need all six objects to be loaded.
burn_areas = gpd.read_file('middelhav/EFFIS_Burned_Areas_2008-2023_greater10ha_grided.shp')
df_wind = pd.read_csv('middelhav/wind-2008-2023.csv')
df_temp = pd.read_csv('middelhav/temp-2008-2023.csv')
df_rhum = pd.read_csv('middelhav/rhum-2008-2023.csv')
df_prcp = pd.read_csv('middelhav/prcp-2008-2023.csv')
static_pred = gpd.read_file('middelhav/static_predictors.shp')


ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [102]:
def _rolling_over_days(df, window, how):
    """Apply a trailing rolling statistic across the day columns of a wide table.

    Day columns are those whose (string) name consists only of digits; every
    other column is treated as metadata and passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table with one row per grid cell and one column per day.
    window : int
        Window length in days. ``min_periods=1`` is used, so the first
        ``window - 1`` days are computed from the days available so far.
    how : {"sum", "mean"}
        Statistic to compute within each window.

    Returns
    -------
    pandas.DataFrame
        Metadata columns followed by the rolled day columns.
    """
    day_cols = [col for col in df.columns if col.isdigit()]
    day_set = set(day_cols)
    meta_cols = [col for col in df.columns if col not in day_set]

    # Transpose so that days run down the rows: rolling() then walks through
    # time for each grid cell, and the result is transposed back.
    windowed = df[day_cols].T.rolling(window=window, min_periods=1)
    rolled = (windowed.sum() if how == "sum" else windowed.mean()).T

    return pd.concat([df[meta_cols], rolled], axis=1)


# Keeping the work inside the helper means the full-size intermediate frames
# are released as soon as each result is built instead of lingering as globals.

# Drought 20-day rolling sum (of precipitation)
df_drought20 = _rolling_over_days(df_prcp, window=20, how="sum")

# Temperature 7-day rolling mean
df_temp7 = _rolling_over_days(df_temp, window=7, how="mean")

# Humidity 7-day rolling mean
df_rhum7 = _rolling_over_days(df_rhum, window=7, how="mean")

# Wind Speed 7-day rolling mean
df_wind7 = _rolling_over_days(df_wind, window=7, how="mean")


In [103]:
# Drop the first column of every table that is used downstream
# (presumably the unnamed index column written into the CSVs — confirm).
# The wind and humidity tables must be sliced from their *rolled* versions:
# slicing the raw df_wind / df_rhum here would silently replace the 7-day
# rolling means computed above with the unsmoothed daily values.
# NOTE: this cell is not idempotent — each re-run drops one more column.
df_drought20 = df_drought20.iloc[:, 1:]
df_temp7 = df_temp7.iloc[:, 1:]
df_prcp = df_prcp.iloc[:, 1:]
df_wind7 = df_wind7.iloc[:, 1:]
df_rhum7 = df_rhum7.iloc[:, 1:]

from datetime import datetime, timedelta

def parse_zero_indexed_day_corrected(col_str):
    """Convert a ``YYYYMM`` + zero-based-day column label into ``YYYYMMDD``.

    The label is read as a 4-digit year, a 2-digit month and a 1- or 2-digit
    day offset counted from the first of that month (offset 0 -> day 01).

    Parameters
    ----------
    col_str : str
        Column label, e.g. ``"2008010"`` or ``"20080130"``.

    Returns
    -------
    str
        The corrected date formatted as ``"%Y%m%d"``. Labels that are not 7 or
        8 characters long, or that cannot be parsed, are returned unchanged.

    Notes
    -----
    An offset beyond the end of the month rolls over into the following month,
    because it is added as a ``timedelta``.
    """
    try:
        if len(col_str) not in (7, 8):
            return col_str  # unrecognized format

        year = int(col_str[:4])
        month = int(col_str[4:6])
        day_offset = int(col_str[6:])  # one digit (len 7) or two digits (len 8)
        corrected_date = datetime(year, month, 1) + timedelta(days=day_offset)
        return corrected_date.strftime("%Y%m%d")
    except (TypeError, ValueError, OverflowError):
        # Only parsing failures fall back to the original label; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return col_str

def reshape_climate_data_in_chunks(df, variable_name, chunk_size=1000):
    """Melt a wide per-grid climate table into long ``(id, value, date)`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table with an ``"id"`` column and one column per day whose label
        consists only of digits. Other columns are ignored.
    variable_name : str
        Name given to the value column of the result.
    chunk_size : int, default 1000
        Number of input rows melted at a time, which bounds the size of each
        intermediate long frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``["id", variable_name, "date"]``. Rows whose value is missing
        are dropped. Labels that do not parse as ``%Y%m%d`` yield ``NaT`` in
        ``"date"`` (``errors="coerce"``) and are kept. An empty input yields an
        empty frame with the same three columns.
    """
    # Identify date columns: only numeric labels
    date_cols = [col for col in df.columns if str(col).isdigit()]
    wide = df[["id"] + date_cols].copy()

    # Rewrite the zero-based day labels as real calendar dates before melting
    wide.columns = ["id"] + [parse_zero_indexed_day_corrected(col) for col in date_cols]

    chunks = []
    for start in range(0, len(wide), chunk_size):
        long_chunk = wide.iloc[start:start + chunk_size].melt(
            id_vars=["id"], var_name="date_str", value_name=variable_name
        )
        long_chunk["date"] = pd.to_datetime(
            long_chunk["date_str"], format="%Y%m%d", errors="coerce"
        )
        long_chunk = (
            long_chunk
            .drop(columns=["date_str"])
            .dropna(subset=[variable_name])
        )
        chunks.append(long_chunk)

    if not chunks:
        # pd.concat([]) raises on an empty list; return a typed empty frame instead.
        return pd.DataFrame({
            "id": wide["id"],
            variable_name: pd.Series(dtype="float64"),
            "date": pd.Series(dtype="datetime64[ns]"),
        })

    return pd.concat(chunks, ignore_index=True)




In [104]:

# Convert each wide climate table to long format (one row per id and date),
# naming the value column after the variable it holds.
df_temp7_long = reshape_climate_data_in_chunks(df_temp7, "temperature")
df_rhum7_long = reshape_climate_data_in_chunks(df_rhum7, "humidity")
df_wind7_long = reshape_climate_data_in_chunks(df_wind7, "wind_speed")
df_drought20_long = reshape_climate_data_in_chunks(df_drought20, "drought_20")
# Raw precipitation is not reshaped; only its 20-day sum (drought_20) is used.
# df_prcp_long = reshape_climate_data_in_chunks(df_prcp, "precipitation")

In [105]:
# Release the wide-format frames now that the long-format tables exist.
# After this cell the earlier rolling/reshape cells cannot be re-run without
# reloading the CSVs, because their inputs have been deleted.
del df_wind7, df_rhum7, df_drought20, df_temp, df_temp7, df_rhum, df_wind
import gc
gc.collect()


0

In [None]:

# 1) Long-format climate tables, keyed by the value-column name used below.
#    drought_20 is renamed so that its value column matches its key.
tables = {
    "temperature": df_temp7_long,
    "wind_speed":  df_wind7_long,
    "drought20":   df_drought20_long.rename(columns={"drought_20":"drought20"}),
    "humidity":    df_rhum7_long,
}

# 2) An overall (all-months) per-grid summary with the same four statistics
#    was computed here previously; it is currently disabled.

# 3) MONTHLY SUMMARY per grid: mean, median and quartiles of every variable
#    for each (id, calendar month) pair.
monthly_dfs = []

for var, df in tables.items():
    # Build only the three columns the aggregation needs; the value column is
    # coerced to numeric so that unparseable entries become NaN.
    monthly_input = pd.DataFrame({
        "id": df["id"],
        "month": df["date"].dt.month,
        var: pd.to_numeric(df[var], errors="coerce"),
    })

    named_aggs = {
        f"{var}_mean":   (var, "mean"),
        f"{var}_median": (var, "median"),
        f"{var}_q25":    (var, lambda x: x.quantile(0.25)),
        f"{var}_q75":    (var, lambda x: x.quantile(0.75)),
    }
    stats = monthly_input.groupby(["id", "month"]).agg(**named_aggs)

    monthly_dfs.append(stats)

# Side-by-side join on the shared (id, month) index, then flatten the index.
df_summary_monthly = pd.concat(monthly_dfs, axis=1).reset_index()

# 4) Writing the summary to "df_summary_monthly_middelhav.csv" is currently disabled.

print("Monthly summary:",  df_summary_monthly.shape)

Monthly summary: (209052, 18)


In [9]:
# Sanity check on the monthly summary size: 12 months x 17421
# (presumably the number of grid ids — confirm) equals the 209052 rows reported above.
12*17421

209052

In [48]:
# Inspect the long-format temperature table (columns: id, temperature, date).
df_temp7_long

Unnamed: 0,id,temperature,date
0,68467.0,-0.149567,2008-01-01
1,68468.0,-0.915192,2008-01-01
2,68474.0,0.080902,2008-01-01
3,68475.0,0.080902,2008-01-01
4,68476.0,-2.225738,2008-01-01
...,...,...,...
101808319,110148.0,17.755066,2023-12-31
101808320,110149.0,17.633972,2023-12-31
101808321,110150.0,18.064776,2023-12-31
101808322,110366.0,18.239999,2023-12-31


In [39]:
# Drop the monthly summary and ask the garbage collector to reclaim its memory.
del df_summary_monthly
import gc
gc.collect()


508

In [14]:
# Count how many burn_areas rows belong to grid id 37298.
print((burn_areas['id']==37298.0).sum())

2


In [None]:
# # The original event construction, without random fire grids

# import numpy as np
# import pandas as pd
# import geopandas as gpd
# from datetime import datetime, timedelta

# # ─── 0) assume you've already run:
# # df_temp_long, df_prcp_long, df_wind_long, df_drought20_long, df_rhum_long
# # each has columns ["id","date",<var>],
# # plus burn_areas GeoDataFrame, static_pred GeoDataFrame.

# # ─── 1) COMPUTE GRID RANGES (min/max date per id) ────────────────────────────

# # Use TEMP and PRCP as representatives (they should cover all your dates):
# rng_temp = df_temp7_long.groupby("id")["date"].agg(start_temp="min", end_temp="max")
# rng_prcp = df_rhum7_long.groupby("id")["date"].agg(start_pr="min", end_pr="max")

# # join the two (outer to catch any that appear in one but not the other)
# grid_ranges = rng_temp.join(rng_prcp, how="outer")

# # final start/end is the min of the _start_ columns, max of the _end_ columns
# grid_ranges["start_dt"] = grid_ranges[["start_temp","start_pr"]].min(axis=1)
# grid_ranges["stop_dt"]  = grid_ranges[["end_temp",  "end_pr"]].max(axis=1)
# grid_ranges = grid_ranges[["start_dt","stop_dt"]].reset_index()

# # ─── 2) EXTRACT FIRE DATES ────────────────────────────────────────────────────
# burn_areas["date"] = pd.to_datetime(burn_areas["initialdat"]).dt.normalize()
# fires = (
#     burn_areas
#     .groupby(["id","date"])
#     .size()
#     .rename("n_fires")
#     .reset_index()
# )

# # ─── 3) BUILD ~20k EVENT INTERVALS ────────────────────────────────────────────
# event_list = []
# np.random.seed(42)
# num_splits = 3

# for grid, row in grid_ranges.iterrows():
#     gid = row["id"]
#     start, end = row["start_dt"], row["stop_dt"]

#     # get fire dates (sorted)
#     fds = fires.loc[fires.id.eq(gid), "date"].sort_values().tolist()

#     if not fds:
#         # no fires → random splits
#         span = (end - start).days
#         if span < num_splits:
#             event_list.append([gid, start, end, 0])
#         else:
#             pts = np.sort(np.random.choice(range(1, span), size=num_splits-1, replace=False))
#             bounds = [start] + [start + timedelta(days=int(d)) for d in pts] + [end]
#             for i in range(num_splits):
#                 event_list.append([gid, bounds[i], bounds[i+1], 0])
#     else:
#         # one event per fire, then censored
#         cur = start
#         for fd in fds:
#             if fd > cur:
#                 event_list.append([gid, cur, fd, 1])
#             cur = fd + timedelta(days=1)
#         if cur < end:
#             event_list.append([gid, cur, end, 0])

# event_df = pd.DataFrame(event_list, columns=["id","start_dt","stop_dt","fire_occurred"])

# # add numeric times & seasonality
# base = grid_ranges["start_dt"].min()
# event_df["start_time"] = (event_df["start_dt"] - base).dt.days
# event_df["stop_time"]  = (event_df["stop_dt"]  - base).dt.days
# event_df["stop_date"]  = event_df["stop_dt"]
# event_df["month"]      = event_df["stop_date"].dt.month
# event_df["sin_month"]  = np.sin(2*np.pi*event_df["month"]/12)
# event_df["cos_month"]  = np.cos(2*np.pi*event_df["month"]/12)
# event_df.drop(columns="month", inplace=True)

# # ─── 4) MERGE STATIC & CLIMATE SNAPSHOTS ON ~20k EVENTS ────────────────────────
# # start by indexing event_df by id
# out = event_df.set_index("id")

# # static_pred already has index 'id'
# out = out.join(static_pred)  

# #
# # assume event_df is your DataFrame with columns:
# #    ["id","start_dt","stop_dt","fire_occurred","start_time","stop_time","stop_date","sin_month","cos_month",<statics>]

# event_df_final = event_df.copy()

# # for each climate var, build a Series indexed by (id,date), then reindex on (id,stop_date)
# for df_time, var in [
#     (df_temp7_long,      "temperature"),
#     (df_wind7_long,      "wind_speed"),
#     (df_drought20_long, "drought_20"),
#     (df_rhum7_long,      "humidity")
# ]:
#     # 1) make a Series mapping (id,date) -> var
#     s = df_time.set_index(["id","date"])[var]

#     # 2) build a MultiIndex of the exact (id,stop_date) pairs you need
#     idx = pd.MultiIndex.from_frame(
#         event_df_final[["id","stop_date"]].rename(columns={"stop_date":"date"})
#     )

#     # 3) reindex the small Series onto your ~50k keys
#     event_df_final[var] = s.reindex(idx).values

# static_df = static_pred.reset_index()  # now has a column "id"
# event_df_final = event_df_final.merge(
#     static_df,
#     on="id",
#     how="left"
# )
# event_df_final


Unnamed: 0,id,start_dt,stop_dt,fire_occurred,start_time,stop_time,stop_date,sin_month,cos_month,temperature,...,pline_den,urban,forest,agricultur,irrigated,shrubland,grassland,other,south,geometry
0,37297.0,2008-01-01,2008-06-27,0,0,178,2008-06-27,1.224647e-16,-1.000000e+00,21.207707,...,0.000536,0.16667,0.53348,0.17000,0.0,0.08556,0.00000,0.03222,0.189219,"POLYGON ((2643879.27 1965289.001, 2640140.24 1..."
1,37297.0,2008-06-27,2020-07-21,0,178,4585,2020-07-21,-5.000000e-01,-8.660254e-01,23.788761,...,0.000536,0.16667,0.53348,0.17000,0.0,0.08556,0.00000,0.03222,0.189219,"POLYGON ((2643879.27 1965289.001, 2640140.24 1..."
2,37297.0,2020-07-21,2023-12-31,0,4585,5843,2023-12-31,-2.449294e-16,1.000000e+00,13.811706,...,0.000536,0.16667,0.53348,0.17000,0.0,0.08556,0.00000,0.03222,0.189219,"POLYGON ((2643879.27 1965289.001, 2640140.24 1..."
3,37298.0,2008-01-01,2018-10-07,1,0,3932,2018-10-07,-8.660254e-01,5.000000e-01,22.666133,...,0.000343,0.15113,0.50506,0.03453,0.0,0.27495,0.02171,0.00937,0.363963,"POLYGON ((2643879.27 1955289.001, 2637571.418 ..."
4,37298.0,2018-10-08,2023-07-26,1,3933,5685,2023-07-26,-5.000000e-01,-8.660254e-01,21.471484,...,0.000343,0.15113,0.50506,0.03453,0.0,0.27495,0.02171,0.00937,0.363963,"POLYGON ((2643879.27 1955289.001, 2637571.418 ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63479,110366.0,2009-07-29,2019-06-02,0,575,4170,2019-06-02,1.224647e-16,-1.000000e+00,24.226000,...,0.000888,0.65421,0.22222,0.06542,0.0,0.00000,0.00000,0.05607,0.052189,"POLYGON ((5953879.27 1675289.001, 5953879.27 1..."
63480,110366.0,2019-06-02,2023-12-31,0,4170,5843,2023-12-31,-2.449294e-16,1.000000e+00,18.239999,...,0.000888,0.65421,0.22222,0.06542,0.0,0.00000,0.00000,0.05607,0.052189,"POLYGON ((5953879.27 1675289.001, 5953879.27 1..."
63481,110367.0,2008-01-01,2008-12-12,0,0,346,2008-12-12,-2.449294e-16,1.000000e+00,16.761950,...,0.000693,0.10039,0.06746,0.05019,0.0,0.31274,0.38224,0.08880,0.482014,"MULTIPOLYGON (((5953879.27 1675289.001, 595550..."
63482,110367.0,2008-12-12,2014-03-31,0,346,2281,2014-03-31,1.000000e+00,6.123234e-17,16.875528,...,0.000693,0.10039,0.06746,0.05019,0.0,0.31274,0.38224,0.08880,0.482014,"MULTIPOLYGON (((5953879.27 1675289.001, 595550..."


In [62]:
# Number of missing values left in each long table's value column
# (order: drought_20, temperature, humidity, wind_speed).
df_drought20_long["drought_20"].isna().sum(), df_temp7_long["temperature"].isna().sum(), df_rhum7_long["humidity"].isna().sum(), df_wind7_long["wind_speed"].isna().sum()


(np.int64(0), np.int64(0), np.int64(0), np.int64(0))

In [7]:
# First and last date available per grid id in the temperature table.
df_temp7_long.groupby("id")["date"].agg(start_temp="min", end_temp="max")

Unnamed: 0_level_0,start_temp,end_temp
id,Unnamed: 1_level_1,Unnamed: 2_level_1
37297.0,2008-01-01,2023-12-31
37298.0,2008-01-01,2023-12-31
37299.0,2008-01-01,2023-12-31
37317.0,2008-01-01,2023-12-31
37318.0,2008-01-01,2023-12-31
...,...,...
110148.0,2008-01-01,2023-12-31
110149.0,2008-01-01,2023-12-31
110150.0,2008-01-01,2023-12-31
110366.0,2008-01-01,2023-12-31


In [125]:
import pandas as pd
import numpy as np
from datetime import timedelta

# ---------------------------------------------------------
# 1) PREP: observation window per grid and de-duplicated fire dates
# ---------------------------------------------------------
# The temperature table defines each grid's observation window:
# its first and last available date, normalised to midnight.
temp_span = (
    df_temp7_long
    .groupby("id")["date"]
    .agg(start_temp="min", end_temp="max")
    .reset_index()
)
grid_range = pd.DataFrame({
    "id":       temp_span["id"],
    "start_dt": pd.to_datetime(temp_span["start_temp"]).dt.normalize(),
    "stop_dt":  pd.to_datetime(temp_span["end_temp"]).dt.normalize(),
})

# One row per (grid, ignition day). Because duplicates are dropped first,
# n_fires is always 1; only the dates are used further down.
burn_areas["date"] = pd.to_datetime(burn_areas["initialdat"]).dt.normalize()
burn_areas_unique = burn_areas.drop_duplicates(subset=['id','date'])
fires = (
    burn_areas_unique
    .groupby(["id","date"])
    .size()
    .reset_index(name="n_fires")
)

# ---------------------------------------------------------
# 2) PARAMETERS for the random interval construction
# ---------------------------------------------------------
np.random.seed(42)

num_splits     = 3    # only used when there are NO fires at all
min_days       = 2
min_nf         = 2    # minimum days for a no-fire interval
min_fire_days  = 2    # minimum days for a fire interval (for non-consecutive fires)

def split_no_fire(start, end, k, min_days=2):
    """Randomly partition the inclusive span ``[start, end]`` into ``k`` intervals.

    Every interval is at least ``min_days`` long, the intervals are contiguous
    (each starts the day after the previous one ends) and together they cover
    the whole span. Randomness comes from NumPy's global generator.

    Parameters
    ----------
    start, end : datetime-like
        First and last day of the span (both inclusive).
    k : int
        Number of intervals to produce.
    min_days : int, default 2
        Minimum length of each interval, in days.

    Returns
    -------
    list of (start, end) tuples
        ``k`` intervals, or the single interval ``[(start, end)]`` when the
        span is too short to hold ``k`` intervals of ``min_days`` each.
    """
    total_days = (end - start).days + 1
    if total_days < k * min_days:
        return [(start, end)]

    # Days left to distribute once every interval has its minimum length.
    rem = total_days - k * min_days

    # Cut points are drawn without replacement whenever enough distinct
    # positions exist. When they do not (e.g. rem == 0 with k == 3), sampling
    # without replacement would raise ValueError, so repeats are allowed.
    allow_repeats = (rem + 1) < (k - 1)
    cuts = np.sort(np.random.choice(range(rem + 1), k - 1, replace=allow_repeats))

    # Differences between consecutive cuts are the extra days per interval.
    parts, prev = [], 0
    for c in cuts:
        parts.append(int(c - prev))
        prev = c
    parts.append(int(rem - prev))
    lengths = [int(p + min_days) for p in parts]

    intervals, cur = [], start
    for length in lengths:
        e = cur + timedelta(days=length - 1)
        intervals.append((cur, e))
        cur = e + timedelta(days=1)
    return intervals

event_list = []

# Index the fire dates by grid id once, instead of re-scanning the whole
# `fires` table for every grid inside the loop.
fire_dates_by_id = {
    fire_gid: sorted(dates.tolist())
    for fire_gid, dates in fires.groupby("id")["date"]
}

# Build [id, start, stop, fire_occurred] intervals for every grid.
# Grids are visited in grid_range order, so with a fixed seed the sequence of
# random draws (and therefore the intervals) is reproducible.
# NOTE(review): fire dates are assumed to lie inside [start, end] of the grid;
# dates outside that window are not filtered here — confirm against the data.
for _, row in grid_range.iterrows():
    gid, start, end = row["id"], row["start_dt"], row["stop_dt"]
    fire_dates = fire_dates_by_id.get(gid, [])
    current = start

    if not fire_dates:
        # No fires at all → just split into no-fire chunks
        for s, e in split_no_fire(start, end, num_splits, min_days=min_nf):
            event_list.append([gid, s, e, 0])

    else:
        for fire_day in fire_dates:
            if current < fire_day:
                # Days from `current` up to and including the fire day.
                gap_days = (fire_day - current).days + 1

                if gap_days >= (min_nf + min_fire_days):
                    # carve out a no-fire interval of at least min_nf days,
                    # leaving at least min_fire_days for the fire interval
                    max_nf = gap_days - min_fire_days
                    nf_len = int(np.random.randint(min_nf, max_nf + 1))
                    nf_end = current + timedelta(days=nf_len - 1)

                    event_list.append([gid, current, nf_end, 0])
                    fire_start = nf_end + timedelta(days=1)

                else:
                    # not enough room for both minima → absorb gap into the fire
                    fire_start = current

                # fire interval always ends on the fire day
                event_list.append([gid, fire_start, fire_day, 1])

            else:
                # fire on the first available day (consecutive-day fire)
                event_list.append([gid, fire_day, fire_day, 1])

            current = fire_day + timedelta(days=1)

        # final censored tail after the last fire, kept whatever its length
        if current <= end:
            event_list.append([gid, current, end, 0])

# … (rest of your feature‐engineering & merges) …


# ---------------------------------------------------------
# 4) Assemble the event table and derive time / seasonality features
# ---------------------------------------------------------
event_df = (
    pd.DataFrame(event_list, columns=["id","start_dt","stop_dt","fire_occurred"])
    .sort_values(["id","start_dt"])
    .reset_index(drop=True)
)

# Day counts are measured from the earliest start date across all grids.
base = grid_range["start_dt"].min()

# Calendar month of the interval end, encoded on the unit circle below so that
# December and January end up adjacent.
stop_month = event_df["stop_dt"].dt.month

event_df = event_df.assign(
    start_time=(event_df["start_dt"] - base).dt.days,
    stop_time=(event_df["stop_dt"] - base).dt.days,
    stop_date=event_df["stop_dt"],
    sin_month=np.sin(2*np.pi*stop_month/12),
    cos_month=np.cos(2*np.pi*stop_month/12),
)

# ---------------------------------------------------------
# 5) Attach climate snapshots (taken on stop_date) and static predictors
# ---------------------------------------------------------
event_df_final = event_df.copy()

# The (id, stop_date) keys to look up are the same for every climate variable.
idx = pd.MultiIndex.from_frame(
    event_df_final[["id","stop_date"]].rename(columns={"stop_date":"date"})
)

climate_sources = {
    "temperature": df_temp7_long,
    "wind_speed":  df_wind7_long,
    "drought_20":  df_drought20_long,
    "humidity":    df_rhum7_long,
}
for var, df_time in climate_sources.items():
    # Series keyed by (id, date); reindexing picks the value on each event's
    # stop date and leaves NaN where no such (id, date) pair exists.
    s = df_time.set_index(["id","date"])[var]
    event_df_final[var] = s.reindex(idx).values

# Left join keeps every event even when a grid has no static predictors.
static_df = static_pred.reset_index()  # make sure 'id' is a column
event_df_final = event_df_final.merge(static_df, on="id", how="left")

# Display the assembled table
event_df_final

Unnamed: 0,id,start_dt,stop_dt,fire_occurred,start_time,stop_time,stop_date,sin_month,cos_month,temperature,...,pline_den,urban,forest,agricultur,irrigated,shrubland,grassland,other,south,geometry
0,37297.0,2008-01-01,2008-06-27,0,0,178,2008-06-27,1.224647e-16,-1.000000,21.207707,...,0.000536,0.16667,0.53348,0.17000,0.0,0.08556,0.00000,0.03222,0.189219,"POLYGON ((2643879.27 1965289.001, 2640140.24 1..."
1,37297.0,2008-06-28,2020-07-21,0,179,4585,2020-07-21,-5.000000e-01,-0.866025,23.788761,...,0.000536,0.16667,0.53348,0.17000,0.0,0.08556,0.00000,0.03222,0.189219,"POLYGON ((2643879.27 1965289.001, 2640140.24 1..."
2,37297.0,2020-07-22,2023-12-31,0,4586,5843,2023-12-31,-2.449294e-16,1.000000,13.811706,...,0.000536,0.16667,0.53348,0.17000,0.0,0.08556,0.00000,0.03222,0.189219,"POLYGON ((2643879.27 1965289.001, 2640140.24 1..."
3,37298.0,2008-01-01,2009-01-16,0,0,381,2009-01-16,5.000000e-01,0.866025,9.955301,...,0.000343,0.15113,0.50506,0.03453,0.0,0.27495,0.02171,0.00937,0.363963,"POLYGON ((2643879.27 1955289.001, 2637571.418 ..."
4,37298.0,2009-01-17,2018-10-07,1,382,3932,2018-10-07,-8.660254e-01,0.500000,22.666133,...,0.000343,0.15113,0.50506,0.03453,0.0,0.27495,0.02171,0.00937,0.363963,"POLYGON ((2643879.27 1955289.001, 2637571.418 ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84365,110366.0,2018-04-14,2019-12-18,0,3756,4369,2019-12-18,-2.449294e-16,1.000000,17.486751,...,0.000888,0.65421,0.22222,0.06542,0.0,0.00000,0.00000,0.05607,0.052189,"POLYGON ((5953879.27 1675289.001, 5953879.27 1..."
84366,110366.0,2019-12-19,2023-12-31,0,4370,5843,2023-12-31,-2.449294e-16,1.000000,18.239999,...,0.000888,0.65421,0.22222,0.06542,0.0,0.00000,0.00000,0.05607,0.052189,"POLYGON ((5953879.27 1675289.001, 5953879.27 1..."
84367,110367.0,2008-01-01,2010-08-20,0,0,962,2010-08-20,-8.660254e-01,-0.500000,30.956661,...,0.000693,0.10039,0.06746,0.05019,0.0,0.31274,0.38224,0.08880,0.482014,"MULTIPOLYGON (((5953879.27 1675289.001, 595550..."
84368,110367.0,2010-08-21,2018-05-28,0,963,3800,2018-05-28,5.000000e-01,-0.866025,24.287768,...,0.000693,0.10039,0.06746,0.05019,0.0,0.31274,0.38224,0.08880,0.482014,"MULTIPOLYGON (((5953879.27 1675289.001, 595550..."


In [127]:
# Work on an independent copy of the merged table, ordered chronologically within each grid.
event_df = event_df_final.sort_values(["id", "start_time"]).copy()

def fix_no_prior_fire(group):
    """Flag, for every row of one grid, whether no fire occurred in an earlier row.

    Rows are taken in the order given, so the caller is expected to pass the
    group sorted chronologically. ``no_prior_fire`` is 1 for the first row and
    for every row that has no fire row (``fire_occurred == 1``) before it, and
    0 from the row after the first fire onwards. A row's own fire does not
    count as "prior" for that row.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single grid with a ``fire_occurred`` column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with an integer ``no_prior_fire`` column added
        (or replaced). The input frame is left untouched.
    """
    had_fire = group["fire_occurred"].eq(1).to_numpy()

    # Running number of fires up to and including each row, minus the row's
    # own fire, gives the number of fires strictly before the row.
    fires_before = np.cumsum(had_fire) - had_fire

    return group.assign(no_prior_fire=(fires_before == 0).astype(int))

# Apply the logic per grid (grouped by 'id').
# The columns are selected explicitly so that the grouping column 'id' is
# handed to the function as part of each group: relying on apply() to include
# it implicitly is deprecated in pandas and triggers a warning.
event_df = (
    event_df
    .groupby("id", group_keys=False)[list(event_df.columns)]
    .apply(fix_no_prior_fire)
)

# Restore chronological order within each grid and a clean 0..n-1 index
event_df = event_df.sort_values(["id", "start_time"]).reset_index(drop=True)

  event_df = event_df.groupby("id", group_keys=False).apply(fix_no_prior_fire)


In [None]:
def compute_days_since_fire(group):
    """Add ``days_since_last_fire`` and ``no_prior_fire`` to one grid's events.

    The rows are processed in ``start_dt`` order. Until a fire row
    (``fire_occurred == 1``) has been passed, ``no_prior_fire`` is 1 and
    ``days_since_last_fire`` is set to the placeholder value 1. Afterwards
    ``no_prior_fire`` is 0 and ``days_since_last_fire`` is the number of days
    between the most recent fire row's ``stop_dt`` and the current row's
    ``start_dt``, floored at 0.

    Returns a sorted copy of ``group`` carrying the two columns.
    """
    ordered = group.sort_values("start_dt").copy()

    gaps = []
    flags = []
    previous_fire_end = None

    for begins, burned, ends in zip(
        ordered["start_dt"], ordered["fire_occurred"], ordered["stop_dt"]
    ):
        if previous_fire_end is None:
            # Nothing has burned yet in this grid: placeholder gap of 1 day.
            flags.append(1)
            gaps.append(1)
        else:
            flags.append(0)
            # Never negative, even if intervals overlap.
            gaps.append(max((begins - previous_fire_end).days, 0))

        # A fire row only influences the rows that come after it.
        if burned == 1:
            previous_fire_end = ends

    ordered["days_since_last_fire"] = gaps
    ordered["no_prior_fire"] = flags
    return ordered

# Apply the function per grid and store the result back in `event_df`:
# the print below and the following cell read days_since_last_fire from
# `event_df`, so assigning only to `df` fails with a KeyError on a fresh kernel.
# Columns are selected explicitly so that 'id' stays part of each group without
# relying on the deprecated implicit inclusion of grouping columns by apply().
event_df = (
    event_df.sort_values(["id", "start_dt"])
    .groupby("id", group_keys=False)[list(event_df.columns)]
    .apply(compute_days_since_fire)
    .reset_index(drop=True)
)

# Optional: ensure all `days_since_last_fire == 0` becomes 1 (if that is your convention)
event_df["days_since_last_fire"] = event_df["days_since_last_fire"].replace(0, 1)

# Kept so that code referring to the result as `df` continues to work.
df = event_df

print(event_df[[
    "id", "stop_time", "fire_occurred",
    "days_since_last_fire", "no_prior_fire"
]].head(30))

         id  stop_time  fire_occurred  days_since_last_fire  no_prior_fire
0   37297.0        178              0                     1              1
1   37297.0       4585              0                     1              1
2   37297.0       5843              0                     1              1
3   37298.0        381              0                     1              1
4   37298.0       3932              1                     1              1
5   37298.0       5263              0                  1331              0
6   37298.0       5685              1                  1753              0
7   37298.0       5843              0                   158              0
8   37299.0        742              0                     1              1
9   37299.0       3932              1                     1              1
10  37299.0       4753              0                   821              0
11  37299.0       5685              1                  1753              0
12  37299.0       5843   

  event_df.sort_values(["id", "start_dt"])


In [157]:
def fix_violation_rows(df):
    """Reset rows that claim a prior fire although none exists earlier in the grid.

    A row is a *violation* when ``days_since_last_fire > start_time`` while
    ``no_prior_fire == 0``. For every violation whose grid (``id``) has no fire
    row with a strictly smaller ``start_time``, ``no_prior_fire`` and
    ``days_since_last_fire`` are both set to 1. Violations that do have an
    earlier fire row are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``id``, ``start_time``, ``fire_occurred``,
        ``days_since_last_fire`` and ``no_prior_fire``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` sorted by ``id`` and ``start_time`` with the
        corrections applied; the input frame is not modified.

    Notes
    -----
    Prints the number of violations found (not the number actually changed).
    """
    df = df.sort_values(by=['id', 'start_time']).copy()

    # Violations: days_since_last_fire > start_time, but no_prior_fire says "fire occurred before"
    violation_mask = (
        (df['days_since_last_fire'] > df['start_time']) &
        (df['no_prior_fire'] == 0)
    )

    print(f"Found {int(violation_mask.sum())} violations to fix.")

    # "Some fire row of the same id starts strictly earlier" is equivalent to
    # "the earliest fire start of that id is strictly earlier", so one lookup
    # per id replaces a full-frame scan per violation.
    first_fire_start = (
        df.loc[df['fire_occurred'] == 1]
        .groupby('id')['start_time']
        .min()
    )
    # Grids without any fire map to NaN; NaN < x is False, i.e. "no prior fire".
    earliest_for_row = df['id'].map(first_fire_start)
    has_prior_fire = earliest_for_row < df['start_time']

    needs_reset = violation_mask & ~has_prior_fire
    df.loc[needs_reset, 'no_prior_fire'] = 1
    df.loc[needs_reset, 'days_since_last_fire'] = 1

    return df


# Run the correction on the event table and show the first 15 rows of the
# columns relevant to the "time since last fire" bookkeeping.
corrected_df = fix_violation_rows(event_df)
print(
    corrected_df.head(15)[
        ["id", "stop_time", "fire_occurred", "days_since_last_fire", "no_prior_fire"]
    ]
)

Found 1991 violations to fix.
         id  stop_time  fire_occurred  days_since_last_fire  no_prior_fire
0   37297.0        178              0                     1              1
1   37297.0       4585              0                     1              1
2   37297.0       5843              0                     1              1
3   37298.0        381              0                     1              1
4   37298.0       3932              1                     1              1
5   37298.0       5263              0                  1331              0
6   37298.0       5685              1                  1753              0
7   37298.0       5843              0                   158              0
8   37299.0        742              0                     1              1
9   37299.0       3932              1                     1              1
10  37299.0       4753              0                   821              0
11  37299.0       5685              1                  1753           

In [160]:
# Show every interval recorded for grid cell 86155.
print(df.loc[df['id'].eq(86155.0)])


            id   start_dt    stop_dt  fire_occurred  start_time  stop_time  \
74380  86155.0 2008-01-01 2009-07-24              0           0        570   
74381  86155.0 2009-07-25 2011-08-08              1         571       1315   
74382  86155.0 2011-08-09 2012-04-21              0        1316       1572   
74383  86155.0 2012-04-22 2012-07-22              1        1573       1664   
74384  86155.0 2012-07-23 2012-07-25              0        1665       1667   
74385  86155.0 2012-07-26 2017-08-09              1        1668       3508   
74386  86155.0 2017-08-10 2018-08-08              0        3509       3872   
74387  86155.0 2018-08-09 2020-08-21              1        3873       4616   
74388  86155.0 2020-08-22 2020-08-22              1        4617       4617   
74389  86155.0 2020-08-23 2020-08-25              0        4618       4620   
74390  86155.0 2020-08-26 2020-09-08              1        4621       4634   
74391  86155.0 2020-09-09 2021-02-08              0        4635 

In [159]:
# Re-check the corrected frame: a row still counts as a violation when
# days_since_last_fire exceeds start_time + 1.
# assume ed is sorted and has a no_prior_fire flag (1 if truly no previous fire)
ed = corrected_df.sort_values(["id","start_time"]).reset_index(drop=True)

# look at violations
mask_viol = ed["days_since_last_fire"] > ed["start_time"] +1
viol = ed.loc[mask_viol, ["id","start_time","fire_occurred","days_since_last_fire", "no_prior_fire"]]

print(f"{len(viol)} violations")
# DataFrame.sample raises ValueError when asked for more rows than exist, so
# cap the sample size; this keeps the cell runnable once few violations remain.
print(viol.sample(min(20, len(viol))))


1986 violations
             id  start_time  fire_occurred  days_since_last_fire  \
74385   86155.0        1668              1                  1844   
15301   43495.0        1150              1                  1839   
11212   42128.0        2431              1                  3656   
32846   54241.0        2516              1                  3698   
81989  101561.0        1629              0                  4215   
55439   71838.0         598              0                  5246   
31245   53368.0        1641              0                  4203   
72712   85487.0        1319              0                  1563   
55773   72074.0        1895              1                  5161   
13929   43002.0         956              0                  1105   
69105   83098.0        1301              0                  1801   
18651   44972.0         294              1                   316   
75010   86579.0        1282              1                  1449   
56748   72945.0        2132     

In [146]:
# Display corrected_df as the cell output (the frame returned by fix_violation_rows).
corrected_df

Unnamed: 0,id,start_dt,stop_dt,fire_occurred,start_time,stop_time,stop_date,sin_month,cos_month,temperature,...,forest,agricultur,irrigated,shrubland,grassland,other,south,geometry,no_prior_fire,days_since_last_fire
16,37318.0,2008-01-01,2014-08-06,0,0,2409,2014-08-06,-8.660254e-01,-0.5,21.829463,...,0.08279,0.30852,0.0000,0.41363,0.10925,0.03187,0.569025,"POLYGON ((2639793.233 1757437.936, 2639519.934...",1,
84313,109927.0,2008-01-01,2010-12-11,0,0,1075,2010-12-11,-2.449294e-16,1.0,16.418217,...,0.31324,0.31379,0.0000,0.28886,0.01950,0.00808,0.201401,"POLYGON ((5933879.27 1665289.001, 5933879.27 1...",1,
84310,109926.0,2008-01-01,2010-04-18,0,0,838,2010-04-18,8.660254e-01,-0.5,19.396781,...,0.12500,0.55128,0.0000,0.00000,0.00000,0.05128,0.015962,"POLYGON ((5943879.27 1675289.001, 5942479.498 ...",1,
84307,109713.0,2008-01-01,2011-08-09,0,0,1316,2011-08-09,-8.660254e-01,-0.5,28.586927,...,0.00000,0.01869,0.0000,0.91302,0.02255,0.04575,0.534180,"MULTIPOLYGON (((5924033.135 1611330.79, 592499...",1,
65,37734.0,2008-01-01,2009-08-15,0,0,592,2009-08-15,-8.660254e-01,-0.5,26.191908,...,0.19705,0.62429,0.0000,0.02785,0.00986,0.01661,0.308823,"POLYGON ((2654234.911 1995453.28, 2654646.738 ...",1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33288,54442.0,2023-12-28,2023-12-31,0,5840,5843,2023-12-31,-2.449294e-16,1.0,10.223537,...,0.10730,0.06810,0.3836,0.20970,0.00740,0.01150,0.271544,"POLYGON ((3423879.27 2115289.001, 3413879.27 2...",1,
49167,64920.0,2023-12-28,2023-12-31,0,5840,5843,2023-12-31,-2.449294e-16,1.0,10.177220,...,0.06660,0.91930,0.0000,0.00000,0.00010,0.00000,0.444850,"POLYGON ((3903879.27 2935289.001, 3893879.27 2...",1,
8726,41666.0,2023-12-28,2023-12-31,0,5840,5843,2023-12-31,-2.449294e-16,1.0,8.281294,...,0.34710,0.06760,0.0000,0.50200,0.02730,0.05340,0.477944,"POLYGON ((2843879.27 2275289.001, 2833879.27 2...",0,402.0
48770,64521.0,2023-12-29,2023-12-31,0,5841,5843,2023-12-31,-2.449294e-16,1.0,9.277387,...,0.19060,0.24310,0.0000,0.00000,0.52300,0.01410,0.388644,"POLYGON ((3883879.27 2525289.001, 3873879.27 2...",1,


In [None]:
# Same violation check on the uncorrected event table: a row is flagged when
# days_since_last_fire exceeds start_time + 1.
# assume ed is sorted and has a no_prior_fire flag (1 if truly no previous fire)
ed = event_df.sort_values(["id","start_time"]).reset_index(drop=True)

# look at violations
mask_viol = ed["days_since_last_fire"] > ed["start_time"] +1
viol = ed.loc[mask_viol, ["id","start_time","fire_occurred","days_since_last_fire"]]

print(f"{len(viol)} violations")
# DataFrame.sample raises ValueError when asked for more rows than exist, so
# cap the sample size at the number of violations found.
print(viol.sample(min(20, len(viol))))


1986 violations
             id  start_time  fire_occurred  days_since_last_fire
45670   62125.0        1321              0                  1512
81989  101561.0        1629              0                  4215
21693   46717.0        1012              1                  1108
9471    41691.0         256              0                  2031
79091   97613.0        2169              1                  4745
77770   95815.0        1702              0                  4142
24631   48518.0        1684              0                  4160
18187   44767.0        1685              0                  4159
1445    39928.0        1568              1                  1907
62529   80229.0        1503              1                  1848
3468    40609.0        2041              1                  2865
15206   43465.0        1296              0                  4548
16935   44304.0         539              0                  1057
56724   72938.0        1290              0                  4554
5833    4

In [118]:
# Select the intervals whose stop_time is exactly one day after start_time
# (start_time == stop_time - 1) and keep only the identifying columns.
bad = event_df_final.loc[
    event_df_final["stop_time"].sub(1).eq(event_df_final["start_time"]),
    ["id", "start_time", "stop_time", "fire_occurred"]
]

print(f"{len(bad)} bad intervals:\n", bad)

# Tally the selected intervals by label (0 = no fire, 1 = fire).
counts = bad["fire_occurred"].value_counts().sort_index()
print("\nCounts by fire_occurred:", counts, sep="\n")

# Labelled totals; a label that never occurs is reported as 0.
print("\nFires (==1):", counts.get(1, 0))
print("Non-fires (==0):", counts.get(0, 0))


3095 bad intervals:
              id  start_time  stop_time  fire_occurred
339     38394.0        5672       5673              0
343     38394.0        5694       5695              1
459     38614.0        5341       5342              1
562     38831.0        1377       1378              0
563     38831.0        1379       1380              1
...         ...         ...        ...            ...
83546  104841.0        4696       4697              0
83548  104841.0        4703       4704              1
83550  104841.0        4708       4709              1
83712  105105.0        5082       5083              1
84331  109929.0        5678       5679              0

[3095 rows x 4 columns]

Counts by fire_occurred:
fire_occurred
0    1170
1    1925
Name: count, dtype: int64

Fires (==1): 1925
Non-fires (==0): 1170


In [119]:
# Order rows by grid and interval start so that "previous row" means the
# preceding interval, then renumber the index.
ed = event_df_final.sort_values(["id", "start_time"]).reset_index(drop=True)

# Row-wise flags:
#   is_bad    - interval has no positive length (start_time >= stop_time)
#   prev_fire - the row directly above is a fire interval
#   same_grid - the row directly above belongs to the same grid id
is_bad     = ed["start_time"].ge(ed["stop_time"])
prev_fire  = ed["fire_occurred"].shift(1).eq(1)
same_grid  = ed["id"].eq(ed["id"].shift(1))

# Number of bad intervals whose immediate predecessor is a fire in the same grid.
bad_after_fire = (same_grid & prev_fire & is_bad).sum()

print(f"{bad_after_fire} bad intervals immediately follow a fire")


1541 bad intervals immediately follow a fire


In [None]:
# Display event_df_final as the cell output.
event_df_final

In [117]:
# Intervals whose stop_time is exactly one day after start_time.
# The row mask is built from event_df_final itself: a boolean mask taken from
# another frame (event_df) only selects the right rows while both frames share
# the same index and row order.
bad = event_df_final.loc[event_df_final["start_time"] == event_df_final["stop_time"]-1,
                   ["id","start_time","stop_time","fire_occurred"]]
print(f"{len(bad)} bad intervals:\n", bad)

3095 bad intervals:
              id  start_time  stop_time  fire_occurred
339     38394.0        5672       5673              0
343     38394.0        5694       5695              1
459     38614.0        5341       5342              1
562     38831.0        1377       1378              0
563     38831.0        1379       1380              1
...         ...         ...        ...            ...
83546  104841.0        4696       4697              0
83548  104841.0        4703       4704              1
83550  104841.0        4708       4709              1
83712  105105.0        5082       5083              1
84331  109929.0        5678       5679              0

[3095 rows x 4 columns]


In [112]:
# Zero-length intervals: start_dt and stop_dt fall on the same day.
bad = event_df_final[event_df_final['start_dt'] == event_df_final['stop_dt']]

if bad.empty:
    # .iloc[0] on an empty selection raises IndexError, so report and stop here.
    print("No intervals with start_dt == stop_dt found.")
else:
    # Use the first affected grid as a worked example.
    sample_id = bad['id'].iloc[0]

    print("Intervals for grid", sample_id)
    print(event_df_final[event_df_final['id'] == sample_id][['start_dt','stop_dt','fire_occurred']])

    # `fires` is expected to be the per-(id, date) fire table with a 'date' column.
    print("\nAll fire dates for that grid:")
    print(fires[fires['id'] == sample_id]['date'].tolist())

    print("\nGrid date range:")
    print(grid_range[grid_range['id'] == sample_id])


Intervals for grid 39080.0
      start_dt    stop_dt  fire_occurred
801 2008-01-01 2011-10-07              0
802 2011-10-08 2022-07-12              1
803 2022-07-13 2022-07-13              1
804 2022-07-14 2023-12-31              0

All fire dates for that grid:
[Timestamp('2022-07-12 00:00:00'), Timestamp('2022-07-13 00:00:00')]

Grid date range:
          id   start_dt    stop_dt
215  39080.0 2008-01-01 2023-12-31


In [94]:
# Intervals without positive length (start_time >= stop_time).
# The mask is computed on event_df_final itself; a mask taken from event_df
# only lines up while both frames share the same index and row order.
bad = event_df_final.loc[event_df_final["start_time"] >=  event_df_final["stop_time"] ,
                   ["id","start_time","stop_time","fire_occurred"]]
print(f"{len(bad)} bad intervals:\n", bad)


2880 bad intervals:
              id  start_time  stop_time  fire_occurred
342     38394.0        5693       5693              1
459     38614.0        5342       5342              1
461     38614.0        5345       5345              1
804     39080.0        5307       5307              1
846     39269.0        5709       5709              1
...         ...         ...        ...            ...
83942  104621.0        5426       5426              1
83944  104621.0        5430       5430              1
84054  104841.0        4702       4702              1
84060  104841.0        5429       5429              1
84564  107054.0         940        940              1

[2880 rows x 4 columns]


In [77]:
# The corrected event construction, with random splits also for grids that
# have fires (original Danish header: "den korrekte event med random fire grids").

# Observation window per grid: first and last date present in the temperature
# series, normalized to midnight.
grid_range = df_temp7_long.groupby("id")["date"].agg(start_temp="min", end_temp="max")
grid_range = grid_range.reset_index()
grid_range['start_dt'] = pd.to_datetime(grid_range['start_temp']).dt.normalize()
grid_range['stop_dt'] = pd.to_datetime(grid_range['end_temp']).dt.normalize()
grid_range = grid_range.drop(columns=["start_temp", "end_temp"])

# One row per (grid, day) on which at least one burned area started.
# Note: this adds a "date" column to burn_areas in place.
burn_areas["date"] = pd.to_datetime(burn_areas["initialdat"]).dt.normalize()
fires = (
    burn_areas
    .groupby(["id","date"])
    .size()
    .rename("n_fires")
    .reset_index()
)
event_list = []
np.random.seed(42)
num_splits = 3
min_days = 30  # not used in this cell

# Collect the fire dates of every grid once instead of filtering `fires`
# inside the loop for each grid.
fire_dates_by_grid = fires.groupby("id")["date"].apply(list).to_dict()

for _, row in grid_range.iterrows():
    gid = row["id"]
    start, end = row["start_dt"], row["stop_dt"]
    span = (end - start).days
    fds = sorted(fire_dates_by_grid.get(gid, []))

    if not fds:
        # no fires → random splits
        if span >= num_splits:
            pts = np.sort(np.random.choice(range(1, span), size=num_splits-1, replace=False))
        else:
            # Fewer than num_splits-1 interior days exist, so sampling without
            # replacement would raise; keep the whole window as one interval.
            pts = []
        # pd.Timedelta is used for day offsets so the cell does not depend on a
        # separate `from datetime import timedelta`.
        bounds = [start] + [start + pd.Timedelta(days=int(d)) for d in pts] + [end]
        # NOTE(review): unlike the fire branch below, intervals with s == e are
        # kept here (possible when two cut points are consecutive days).
        for i in range(len(bounds) - 1):
            s = bounds[i] if i == 0 else bounds[i] + pd.Timedelta(days=1)
            e = bounds[i + 1]
            event_list.append([gid, s, e, 0])

    else:
        # has fires → mix real fire dates with safe random cuts
        # -----------------------------------------------
        # 1) keep only fire dates strictly inside (start, end)
        fds = [fd for fd in fds if start < fd < end]
        fire_pts = set(fds)

        # 2) build safe candidates for random split days:
        #    neither on a fire day nor the day before one
        rand_candidates = [
            d for d in range(1, span)
            if (start + pd.Timedelta(days=d)) not in fire_pts
            and (start + pd.Timedelta(days=d+1)) not in fire_pts
        ]

        # 3) pick up to (num_splits-1) random days, or none if not enough
        if len(rand_candidates) >= num_splits - 1:
            rand_pts = sorted(
                np.random.choice(rand_candidates,
                                 size=num_splits-1,
                                 replace=False).tolist()
            )
        else:
            rand_pts = []

        # 4) combine the day‐offsets of fds and rand_pts, then turn into bounds
        split_days = sorted(rand_pts + [(fd - start).days for fd in fds])
        bounds = [start] + [start + pd.Timedelta(days=d) for d in split_days] + [end]

        # 5) build intervals, but only if start < stop.
        #    An interval is labelled 1 when it ends on a fire date.
        #    NOTE(review): s == e is skipped too, so a fire on the day right
        #    after another fire produces no row of its own.
        for i in range(len(bounds)-1):
            s = bounds[i] if i == 0 else bounds[i] + pd.Timedelta(days=1)
            e = bounds[i+1]
            if s < e:
                label = 1 if e in fire_pts else 0
                event_list.append([gid, s, e, label])



# Assemble the interval table, ordered by grid and interval start.
event_df = (
    pd.DataFrame(event_list, columns=["id","start_dt","stop_dt","fire_occurred"])
    .sort_values(["id", "start_dt"])
    .reset_index(drop=True)
)

# Day counts are measured from the earliest window start over all grids.
base = grid_range["start_dt"].min()

# Numeric start/stop times plus a cyclic (sin/cos) encoding of the month in
# which each interval ends.
event_df = event_df.assign(
    start_time=lambda d: (d["start_dt"] - base).dt.days,
    stop_time=lambda d: (d["stop_dt"] - base).dt.days,
    stop_date=lambda d: d["stop_dt"],
    sin_month=lambda d: np.sin(2*np.pi*d["stop_date"].dt.month/12),
    cos_month=lambda d: np.cos(2*np.pi*d["stop_date"].dt.month/12),
)


# ─── 5) MERGE STATIC & CLIMATE SNAPSHOTS ──────────────────────────────────────
event_df_final = event_df.copy()

# Every climate variable is looked up at the same key: (grid id, interval end date).
idx = pd.MultiIndex.from_arrays(
    [event_df_final["id"], event_df_final["stop_date"]],
    names=["id", "date"],
)

for var, df_time in (
    ("temperature", df_temp7_long),
    ("wind_speed",  df_wind7_long),
    ("drought_20",  df_drought20_long),
    ("humidity",    df_rhum7_long),
):
    # Series keyed by (id, date); keys missing from the series come back as NaN.
    s = df_time.set_index(["id","date"])[var]
    event_df_final[var] = s.reindex(idx).to_numpy()

# reset_index() moves the current index of static_pred into a column.
# NOTE(review): the merged frame ends up with an extra "index" column, which
# suggests static_pred already holds "id" as a regular column — confirm whether
# that extra column is wanted.
static_df = static_pred.reset_index()
event_df_final = pd.merge(event_df_final, static_df, on="id", how="left")

event_df_final



Unnamed: 0,id,start_dt,stop_dt,fire_occurred,start_time,stop_time,stop_date,sin_month,cos_month,temperature,...,pline_den,urban,forest,agricultur,irrigated,shrubland,grassland,other,south,geometry
0,37297.0,2008-01-01,2008-06-27,0,0,178,2008-06-27,1.224647e-16,-1.000000,21.207707,...,0.000536,0.16667,0.53348,0.17000,0.0,0.08556,0.00000,0.03222,0.189219,"POLYGON ((2643879.27 1965289.001, 2640140.24 1..."
1,37297.0,2008-06-28,2020-07-21,0,179,4585,2020-07-21,-5.000000e-01,-0.866025,23.788761,...,0.000536,0.16667,0.53348,0.17000,0.0,0.08556,0.00000,0.03222,0.189219,"POLYGON ((2643879.27 1965289.001, 2640140.24 1..."
2,37297.0,2020-07-22,2023-12-31,0,4586,5843,2023-12-31,-2.449294e-16,1.000000,13.811706,...,0.000536,0.16667,0.53348,0.17000,0.0,0.08556,0.00000,0.03222,0.189219,"POLYGON ((2643879.27 1965289.001, 2640140.24 1..."
3,37298.0,2008-01-01,2018-04-02,0,0,3744,2018-04-02,8.660254e-01,-0.500000,13.588775,...,0.000343,0.15113,0.50506,0.03453,0.0,0.27495,0.02171,0.00937,0.363963,"POLYGON ((2643879.27 1955289.001, 2637571.418 ..."
4,37298.0,2018-04-03,2018-10-07,1,3745,3932,2018-10-07,-8.660254e-01,0.500000,22.666133,...,0.000343,0.15113,0.50506,0.03453,0.0,0.27495,0.02171,0.00937,0.363963,"POLYGON ((2643879.27 1955289.001, 2637571.418 ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72879,110366.0,2008-03-27,2011-01-18,0,86,1113,2011-01-18,5.000000e-01,0.866025,15.904572,...,0.000888,0.65421,0.22222,0.06542,0.0,0.00000,0.00000,0.05607,0.052189,"POLYGON ((5953879.27 1675289.001, 5953879.27 1..."
72880,110366.0,2011-01-19,2023-12-31,0,1114,5843,2023-12-31,-2.449294e-16,1.000000,18.239999,...,0.000888,0.65421,0.22222,0.06542,0.0,0.00000,0.00000,0.05607,0.052189,"POLYGON ((5953879.27 1675289.001, 5953879.27 1..."
72881,110367.0,2008-01-01,2013-01-05,0,0,1831,2013-01-05,5.000000e-01,0.866025,15.069270,...,0.000693,0.10039,0.06746,0.05019,0.0,0.31274,0.38224,0.08880,0.482014,"MULTIPOLYGON (((5953879.27 1675289.001, 595550..."
72882,110367.0,2013-01-06,2021-11-13,0,1832,5065,2021-11-13,-5.000000e-01,0.866025,21.524340,...,0.000693,0.10039,0.06746,0.05019,0.0,0.31274,0.38224,0.08880,0.482014,"MULTIPOLYGON (((5953879.27 1675289.001, 595550..."


In [120]:
# Find all one-day intervals (start_time == stop_time - 1).
# The mask is computed on event_df_final itself; a mask taken from event_df
# only lines up while both frames share the same index and row order.
bad = event_df_final.loc[event_df_final["start_time"] ==  event_df_final["stop_time"] -1,
                   ["id","start_time","stop_time","fire_occurred"]]
print(f"{len(bad)} bad intervals:\n", bad)


3095 bad intervals:
              id  start_time  stop_time  fire_occurred
339     38394.0        5672       5673              0
343     38394.0        5694       5695              1
459     38614.0        5341       5342              1
562     38831.0        1377       1378              0
563     38831.0        1379       1380              1
...         ...         ...        ...            ...
83546  104841.0        4696       4697              0
83548  104841.0        4703       4704              1
83550  104841.0        4708       4709              1
83712  105105.0        5082       5083              1
84331  109929.0        5678       5679              0

[3095 rows x 4 columns]


In [121]:


# 1) interval length in days for every event (working on a copy)
df = event_df_final.copy()
df["duration"] = df["stop_time"].sub(df["start_time"])

# 2) (optional) keep only strictly positive intervals:
# df = df[df["duration"] > 0]

# 3) frequency table: one row per distinct duration with its number of events
dur_counts = df.groupby("duration").size().reset_index(name="count")

# 4) the ten shortest durations, in ascending order
low10 = dur_counts.nsmallest(10, "duration")

print("Top 10 smallest intervals and their frequencies:")
print(low10)

# 5) number of events covered by those ten duration buckets
total_in_low10 = low10["count"].sum()
print(f"\nTotal number of events in those top-10 lowest intervals: {total_in_low10}")


Top 10 smallest intervals and their frequencies:
   duration  count
0         0   1541
1         1   3095
2         2   1895
3         3   1045
4         4    838
5         5    692
6         6    587
7         7    556
8         8    497
9         9    404

Total number of events in those top-10 lowest intervals: 11150


In [None]:
# # The original event construction with random fire grids (Danish: "den oprindelige event med random fire grids")

# import numpy as np
# import pandas as pd
# import geopandas as gpd
# from datetime import datetime, timedelta

# # ─── 0) assume you've already run:
# # df_temp_long, df_prcp_long, df_wind_long, df_drought20_long, df_rhum_long
# # each has columns ["id","date",<var>],
# # plus burn_areas GeoDataFrame, static_pred GeoDataFrame.

# # ─── 1) COMPUTE GRID RANGES (min/max date per id) ────────────────────────────

# rng_temp = df_temp7_long.groupby("id")["date"].agg(start_temp="min", end_temp="max")
# rng_prcp = df_rhum7_long.groupby("id")["date"].agg(start_pr="min", end_pr="max")

# grid_ranges = rng_temp.join(rng_prcp, how="outer")
# grid_ranges["start_dt"] = grid_ranges[["start_temp","start_pr"]].min(axis=1)
# grid_ranges["stop_dt"]  = grid_ranges[["end_temp",  "end_pr"]].max(axis=1)
# grid_ranges = grid_ranges[["start_dt","stop_dt"]].reset_index()

# # ─── 2) EXTRACT FIRE DATES ────────────────────────────────────────────────────
# burn_areas["date"] = pd.to_datetime(burn_areas["initialdat"]).dt.normalize()
# fires = (
#     burn_areas
#     .groupby(["id","date"])
#     .size()
#     .rename("n_fires")
#     .reset_index()
# )

# # ─── 3) BUILD ~20k EVENT INTERVALS ────────────────────────────────────────────
# event_list = []
# np.random.seed(42)
# num_splits = 3
# MIN_DAYS = 3

# for grid, row in grid_ranges.iterrows():
#     gid = row["id"]
#     start, end = row["start_dt"], row["stop_dt"]

#     fds = fires.loc[fires.id.eq(gid), "date"].sort_values().tolist()

#     if not fds:
#         # No fires → random splits
#         span = (end - start).days
#         if span < num_splits:
#             event_list.append([gid, start, end, 0])
#         else:
#             pts = np.sort(np.random.choice(range(1, span), size=num_splits-1, replace=False))
#             bounds = [start] + [start + timedelta(days=int(d)) for d in pts] + [end]
#             for i in range(num_splits):
#                 event_list.append([gid, bounds[i], bounds[i+1], 0])
#     else:
#         # Fires exist: handle carefully
#         dates = [start] + fds + [end + timedelta(days=1)]  # (+1 day to make sure last gap is correct)

#         for i in range(len(dates)-1):
#             s = dates[i]
#             t = dates[i+1]

#             if (t - s).days <= 0:
#                 continue  # skip zero or negative intervals (should not happen)

#             if i % 2 == 0:
#                 # Even index → non-fire gap
#                 span = (t - s).days
#                 if span >= 2 * MIN_DAYS and np.random.rand() < 0.8:
#                     max_rand = span - MIN_DAYS
#                     if max_rand > MIN_DAYS:
#                         rand_start_day = np.random.randint(MIN_DAYS, max_rand)
#                         mid_point = s + timedelta(days=rand_start_day)
#                         event_list.append([gid, s, mid_point, 0])
#                         event_list.append([gid, mid_point, t, 0])
#                     else:
#                         event_list.append([gid, s, t, 0])
#                 else:
#                     event_list.append([gid, s, t, 0])
#             else:
#                 # Odd index → fire event
#                 event_list.append([gid, s, t, 1])

# event_df = pd.DataFrame(event_list, columns=["id","start_dt","stop_dt","fire_occurred"])

# event_df = event_df.sort_values(["id", "start_dt"]).reset_index(drop=True)

# # ─── 4) ADD NUMERIC TIMES & SEASONAL FEATURES ─────────────────────────────────
# base = grid_ranges["start_dt"].min()
# event_df["start_time"] = (event_df["start_dt"] - base).dt.days
# event_df["stop_time"]  = (event_df["stop_dt"]  - base).dt.days
# event_df["stop_date"]  = event_df["stop_dt"]
# event_df["month"]      = event_df["stop_date"].dt.month
# event_df["sin_month"]  = np.sin(2*np.pi*event_df["month"]/12)
# event_df["cos_month"]  = np.cos(2*np.pi*event_df["month"]/12)
# event_df.drop(columns="month", inplace=True)

# # ─── 5) MERGE STATIC & CLIMATE SNAPSHOTS ──────────────────────────────────────
# out = event_df.set_index("id")
# out = out.join(static_pred)  # static_pred already has 'id' as index

# event_df_final = event_df.copy()

# for df_time, var in [
#     (df_temp7_long,      "temperature"),
#     (df_wind7_long,      "wind_speed"),
#     (df_drought20_long, "drought_20"),
#     (df_rhum7_long,      "humidity")
# ]:
#     s = df_time.set_index(["id","date"])[var]
#     idx = pd.MultiIndex.from_frame(
#         event_df_final[["id","stop_date"]].rename(columns={"stop_date":"date"})
#     )
#     event_df_final[var] = s.reindex(idx).values

# static_df = static_pred.reset_index()  # now has a column "id"
# event_df_final = event_df_final.merge(
#     static_df,
#     on="id",
#     how="left"
# )

# event_df_final


Unnamed: 0,id,start_dt,stop_dt,fire_occurred,start_time,stop_time,stop_date,sin_month,cos_month,temperature,...,pline_den,urban,forest,agricultur,irrigated,shrubland,grassland,other,south,geometry
0,37297.0,2008-01-01,2008-06-27,0,0,178,2008-06-27,1.224647e-16,-1.000000,21.207707,...,0.000536,0.16667,0.53348,0.17000,0.0,0.08556,0.00000,0.03222,0.189219,"POLYGON ((2643879.27 1965289.001, 2640140.24 1..."
1,37297.0,2008-06-27,2020-07-21,0,178,4585,2020-07-21,-5.000000e-01,-0.866025,23.788761,...,0.000536,0.16667,0.53348,0.17000,0.0,0.08556,0.00000,0.03222,0.189219,"POLYGON ((2643879.27 1965289.001, 2640140.24 1..."
2,37297.0,2020-07-21,2023-12-31,0,4585,5843,2023-12-31,-2.449294e-16,1.000000,13.811706,...,0.000536,0.16667,0.53348,0.17000,0.0,0.08556,0.00000,0.03222,0.189219,"POLYGON ((2643879.27 1965289.001, 2640140.24 1..."
3,37298.0,2008-01-01,2010-01-14,0,0,744,2010-01-14,5.000000e-01,0.866025,11.083984,...,0.000343,0.15113,0.50506,0.03453,0.0,0.27495,0.02171,0.00937,0.363963,"POLYGON ((2643879.27 1955289.001, 2637571.418 ..."
4,37298.0,2010-01-14,2018-10-07,0,744,3932,2018-10-07,-8.660254e-01,0.500000,22.666133,...,0.000343,0.15113,0.50506,0.03453,0.0,0.27495,0.02171,0.00937,0.363963,"POLYGON ((2643879.27 1955289.001, 2637571.418 ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74801,110366.0,2019-06-11,2020-12-29,0,4179,4746,2020-12-29,-2.449294e-16,1.000000,16.893263,...,0.000888,0.65421,0.22222,0.06542,0.0,0.00000,0.00000,0.05607,0.052189,"POLYGON ((5953879.27 1675289.001, 5953879.27 1..."
74802,110366.0,2020-12-29,2023-12-31,0,4746,5843,2023-12-31,-2.449294e-16,1.000000,18.239999,...,0.000888,0.65421,0.22222,0.06542,0.0,0.00000,0.00000,0.05607,0.052189,"POLYGON ((5953879.27 1675289.001, 5953879.27 1..."
74803,110367.0,2008-01-01,2009-05-05,0,0,490,2009-05-05,5.000000e-01,-0.866025,18.252019,...,0.000693,0.10039,0.06746,0.05019,0.0,0.31274,0.38224,0.08880,0.482014,"MULTIPOLYGON (((5953879.27 1675289.001, 595550..."
74804,110367.0,2009-05-05,2010-04-15,0,490,835,2010-04-15,8.660254e-01,-0.500000,17.220777,...,0.000693,0.10039,0.06746,0.05019,0.0,0.31274,0.38224,0.08880,0.482014,"MULTIPOLYGON (((5953879.27 1675289.001, 595550..."


In [122]:
# Number of distinct grid ids in the event table (a missing id, if present,
# counts as one value, as it does with len(unique())).
event_df_final['id'].nunique(dropna=False)

17421

In [123]:
# First 20 rows, restricted to the interval bounds and the fire label.
print(event_df_final.head(20)[["id", "start_time", "stop_time", "fire_occurred"]])

         id  start_time  stop_time  fire_occurred
0   37297.0           0        178              0
1   37297.0         179       4585              0
2   37297.0        4586       5843              0
3   37298.0           0        381              0
4   37298.0         382       3932              1
5   37298.0        3933       5263              0
6   37298.0        5264       5685              1
7   37298.0        5686       5843              0
8   37299.0           0        742              0
9   37299.0         743       3932              1
10  37299.0        3933       4753              0
11  37299.0        4754       5685              1
12  37299.0        5686       5843              0
13  37317.0           0       1497              0
14  37317.0        1498       1583              0
15  37317.0        1584       5843              0
16  37318.0           0       2409              0
17  37318.0        2410       2443              0
18  37318.0        2444       5843              0


In [57]:
# Create counters
count_before = 0
count_between = 0
count_after = 0

# Non-fire events only
non_fire_events = event_df_final[event_df_final["fire_occurred"] == 0]

# Fire grids only
fire_grids = event_df_final[event_df_final["fire_occurred"] == 1]["id"].unique()

for gid in fire_grids:
    sub = event_df_final[event_df_final["id"] == gid]
    fires = sub[sub["fire_occurred"] == 1]
    nonfires = sub[sub["fire_occurred"] == 0]

    first_fire_start = fires["start_time"].min()
    last_fire_stop   = fires["stop_time"].max()

    for _, nf in nonfires.iterrows():
        nf_start = nf["start_time"]
        nf_stop = nf["stop_time"]

        if nf_stop <= first_fire_start:
            count_before += 1
        elif nf_start >= last_fire_stop:
            count_after += 1
        else:
            count_between += 1

# Show results
print("Summary of non-fire intervals placement:")
print(f"  BEFORE first fire : {count_before}")
print(f"  BETWEEN fires     : {count_between}")
print(f"  AFTER last fire   : {count_after}")


Summary of non-fire intervals placement:
  BEFORE first fire : 8450
  BETWEEN fires     : 12682
  AFTER last fire   : 2897


In [19]:
# List the column labels of the merged event table.
event_df_final.columns

Index(['id', 'start_dt', 'stop_dt', 'fire_occurred', 'start_time', 'stop_time',
       'stop_date', 'sin_month', 'cos_month', 'temperature', 'wind_speed',
       'drought_20', 'humidity', 'index', 'mean_elev', 'north', 'east', 'west',
       'slope30', 'road_den', 'pline_den', 'urban', 'forest', 'agricultur',
       'irrigated', 'shrubland', 'grassland', 'other', 'south', 'geometry'],
      dtype='object')

In [124]:
# import geopandas as gpd

# # Convert to GeoDataFrame (assuming 'geometry' column came along in the merge)
# gdf_event = gpd.GeoDataFrame(
#     event_df_final,
#     geometry="geometry",
#     crs=static_pred.crs  # or whatever CRS your static_pred uses
# )

# # Save as GeoPackage (or Shapefile, or GeoJSON)

# gdf_event.to_file("event_df_middelhav.shp")


  gdf_event.to_file("event_df_middelhav.shp")
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(


In [24]:
# First 20 rows: grid id, fire label and the temperature value attached to each interval.
print(event_df_final.head(20)[["id", "fire_occurred", "temperature"]])

         id  fire_occurred  temperature
0   37297.0              0    21.207707
1   37297.0              0    23.788761
2   37297.0              0    13.811706
3   37298.0              1    22.666133
4   37298.0              1    21.471484
5   37298.0              0    13.811706
6   37299.0              1    22.666133
7   37299.0              1    21.471484
8   37299.0              0    13.811706
9   37317.0              0    20.763964
10  37317.0              0    24.074929
11  37317.0              0    15.826494
12  37318.0              0    13.871416
13  37318.0              0    13.126814
14  37318.0              0    15.826494
15  37511.0              0    16.604762
16  37511.0              0    14.766649
17  37511.0              0    13.281852
18  37515.0              0    22.124595
19  37515.0              0    23.327894


In [11]:
# Preview the first 10 rows of event_df.
# NOTE(review): in the output below, sin_month/cos_month appear to be
# sin/cos(2*pi*month/12) of the stop_date month — TODO confirm against the cell that builds them.
event_df.head(10)

Unnamed: 0,id,start_dt,stop_dt,fire_occurred,start_time,stop_time,stop_date,sin_month,cos_month
0,37297.0,2008-01-01,2008-06-27,0,0,178,2008-06-27,1.224647e-16,-1.0
1,37297.0,2008-06-27,2020-07-21,0,178,4585,2020-07-21,-0.5,-0.866025
2,37297.0,2020-07-21,2023-12-31,0,4585,5843,2023-12-31,-2.449294e-16,1.0
3,37298.0,2008-01-01,2018-10-07,1,0,3932,2018-10-07,-0.8660254,0.5
4,37298.0,2018-10-08,2023-07-26,1,3933,5685,2023-07-26,-0.5,-0.866025
5,37298.0,2023-07-27,2023-12-31,0,5686,5843,2023-12-31,-2.449294e-16,1.0
6,37299.0,2008-01-01,2018-10-07,1,0,3932,2018-10-07,-0.8660254,0.5
7,37299.0,2018-10-08,2023-07-26,1,3933,5685,2023-07-26,-0.5,-0.866025
8,37299.0,2023-07-27,2023-12-31,0,5686,5843,2023-12-31,-2.449294e-16,1.0
9,37317.0,2008-01-01,2014-07-01,0,0,2373,2014-07-01,-0.5,-0.866025


In [31]:
# Display burn_areas (read from the EFFIS burned-areas shapefile in the loading cell,
# assuming it has not been reassigned since — TODO confirm).
# NOTE(review): in the rows shown below, `year` is 0 for the map_source == 'sentinel' records
# while `initialdat` still carries the date — presumably the year should be derived from
# `initialdat` rather than read from `year`; verify how upstream cells handle this.
burn_areas

Unnamed: 0,id,initialdat,finaldate,area_ha,iso2,admlvl1,admlvl2,admlvl3,admlvl5,map_source,year,geometry
0,68501.0,2022/01/16 12:21:00.000,2022/01/16 13:10:00.000,16,FR,Provence-Alpes-Côte d’Azur,Provence-Alpes-Côte d’Azur,Alpes-de-Haute-Provence,Fugeret,sentinel,0,"POLYGON ((4055897.62 2330024.857, 4055906.614 ..."
1,68503.0,2020/02/10 00:00:00.000,2020/02/10 00:00:00.000,26,FR,Provence-Alpes-Côte d’Azur,Provence-Alpes-Côte d’Azur,Alpes-Maritimes,Saint-Auban,sentinel,2020,"POLYGON ((4056992.338 2308130.216, 4057012.551..."
2,68503.0,2022/01/13 12:15:00.000,2022/01/13 13:16:00.000,13,FR,Provence-Alpes-Côte d’Azur,Provence-Alpes-Côte d’Azur,Alpes-Maritimes,Briançonnet,sentinel,0,"POLYGON ((4060696.301 2313002.716, 4060597.906..."
3,68503.0,2023/03/12 00:30:00.000,2023/03/12 01:21:00.000,21,FR,Provence-Alpes-Côte d’Azur,Provence-Alpes-Côte d’Azur,Alpes-Maritimes,Briançonnet,sentinel,0,"POLYGON ((4062643.325 2313621.909, 4062681.415..."
4,68506.0,2008/05/11 00:00:00.000,2008/05/11 00:00:00.000,41,FR,Provence-Alpes-Côte d’Azur,Provence-Alpes-Côte d’Azur,Var,Montauroux,modis,2008,"POLYGON ((4060158.823 2277343.775, 4060054.248..."
...,...,...,...,...,...,...,...,...,...,...,...,...
25027,109931.0,2008/07/22 00:00:00.000,2008/07/22 00:00:00.000,13117,EL,"Νησιά Αιγαίου, Κρήτη",Νότιο Αιγαίο,"Κάλυμνος, Κάρπαθος – Ηρωική Νήσος Κάσος, Κως, ...",Τοπική Κοινότητα Λαέρμων,modis,2008,"POLYGON ((5939365.14 1635289.001, 5939273.034 ..."
25028,109931.0,2023/07/18 19:13:00.000,2023/07/27 11:38:00.000,18626,EL,"Νησιά Αιγαίου, Κρήτη",Νότιο Αιγαίο,"Κάλυμνος, Κάρπαθος – Ηρωική Νήσος Κάσος, Κως, ...",Τοπική Κοινότητα Λαέρμων,sentinel,0,"MULTIPOLYGON (((5933879.27 1632176.849, 593390..."
25029,110147.0,2015/08/23 00:00:00.000,2015/08/23 00:00:00.000,78,EL,"Νησιά Αιγαίου, Κρήτη",Νότιο Αιγαίο,"Κάλυμνος, Κάρπαθος – Ηρωική Νήσος Κάσος, Κως, ...",Δημοτική Κοινότητα Κοσκινού,modis,2015,"POLYGON ((5949661.169 1674154.274, 5949873.18 ..."
25030,110147.0,2021/08/01 11:55:00.000,2021/08/03 00:20:00.000,1159,EL,"Νησιά Αιγαίου, Κρήτη",Νότιο Αιγαίο,"Κάλυμνος, Κάρπαθος – Ηρωική Νήσος Κάσος, Κως, ...",Δημοτική Κοινότητα Θεολόγου,sentinel,0,"MULTIPOLYGON (((5943879.27 1665974.209, 594393..."


In [28]:
# Display static_pred (read from static_predictors.shp in the loading cell, assuming it
# has not been reassigned since — TODO confirm). Apart from 'id', its columns are the
# same names that appear at the end of event_df_final's column list.
static_pred

Unnamed: 0,id,mean_elev,north,east,west,slope30,road_den,pline_den,urban,forest,agricultur,irrigated,shrubland,grassland,other,south,geometry
0,68467.0,897.879144,0.385969,0.101913,0.105875,0.035588,948.827135,0.000251,0.05400,0.38480,0.06980,0.0,0.00280,0.48860,0.00000,0.406244,"POLYGON ((4063879.27 2665289.001, 4053879.27 2..."
1,68468.0,941.964173,0.530761,0.079197,0.095476,0.203457,711.775649,0.000295,0.04311,0.48905,0.05744,0.0,0.00000,0.40384,0.00622,0.294517,"POLYGON ((4053882.622 2657111.816, 4053879.27 ..."
2,68474.0,371.000000,1.000000,0.000000,0.000000,0.000000,2967.761459,0.000000,0.00000,0.00000,0.00000,0.0,0.00000,0.00000,1.00000,0.000000,"POLYGON ((4059225.489 2599572.709, 4061683.468..."
3,68475.0,676.237648,0.698925,0.016813,0.100288,0.139637,2893.000000,0.000483,0.17530,0.27210,0.06080,0.0,0.00000,0.31360,0.17820,0.183975,"POLYGON ((4063879.27 2585289.001, 4053879.27 2..."
4,68476.0,1142.683776,0.408637,0.126963,0.157275,0.742900,707.000000,0.000376,0.01400,0.72680,0.03940,0.0,0.02010,0.19710,0.00260,0.307125,"POLYGON ((4063879.27 2575289.001, 4053879.27 2..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17471,110367.0,58.024957,0.343513,0.150501,0.024432,0.108967,1526.139516,0.000693,0.10039,0.06746,0.05019,0.0,0.31274,0.38224,0.08880,0.482014,"MULTIPOLYGON (((5953879.27 1675289.001, 595550..."
17472,112786.0,21.660365,0.423561,0.063399,0.047662,0.175809,18.445953,0.000000,0.00000,0.00000,0.00000,0.0,0.78417,0.00000,0.21583,0.469690,"POLYGON ((6070167.564 1678484.964, 6070567.833..."
17473,113006.0,103.777764,0.340062,0.139172,0.161357,0.407326,232.397162,0.000199,0.06101,0.00000,0.04343,0.0,0.29162,0.54602,0.05791,0.364471,"MULTIPOLYGON (((6083481.746 1676198.171, 60832..."
17474,113226.0,82.402808,0.349288,0.069858,0.036223,0.777490,371.000000,0.000000,0.00000,0.00000,0.00000,0.0,0.82000,0.00000,0.18000,0.536921,"POLYGON ((6084431.151 1677457.885, 6084487.364..."
