In [12]:
import sys
sys.path.append("/mnt/nvme2tb/ffp/code/mlfires/ML_fires_al/")
import pathlib
import pandas as pd
import xarray as xr
import numpy as np
import os
import crop_dataset
import geopandas as gpd
import random

In [22]:
def applyid(df):
    '''
    Adds a string ID column built from the x and y coordinates.

    Each coordinate is multiplied by 10000, rounded to zero decimals and
    zero-padded to at least six digits; ``id`` is the x part followed by the
    y part.

    Input
        df : pandas dataframe with numeric 'x' and 'y' columns

    Output
        The same dataframe object (modified in place) with the extra 'id' column
    '''
    def _coordpart(col):
        # Fixed-width text form of one coordinate column.
        return (df[col] * 10000).apply('{:06.0f}'.format)

    # Build the id directly instead of going through temporary 'xposst' /
    # 'yposst' columns, which would overwrite and then delete caller columns
    # that happen to use those names.
    df['id'] = _coordpart('x') + _coordpart('y')
    return df

def getcoarsedf(coarsen_c, dfday, gridfile='/mnt/nvme2tb/ffp/datasets/images/20211030_df.nc'):
    '''
    Creates a coarsened subgrid from the reference grid and keeps only the
    rows of the daily table that lie on it.

    The reference netCDF grid is block-averaged in windows of coarsen_c cells
    along y and then along x (incomplete windows at the border are trimmed).
    The resulting x/y coordinates are turned into ids with applyid and
    inner-joined with dfday on 'id'.

    Input
        coarsen_c : The number of points in the subgrid is 1/( coarsen_c * coarsen_c)
                    We need to use odd numbers to maintain existing points coordinates in the subgrid.
        dfday : pandas dataframe of the day, with a string 'id' column
        gridfile : path of the netCDF file that defines the full grid
                   (defaults to the previously hard-coded file)

    Output
        Coarsened tabular pandas dataframe that contains only the instances of
        the points in the subgrid, with the same columns as dfday
    '''
    # Open through a context manager so the file handle is released; the
    # coordinates are materialised as numpy before the dataset is closed.
    with xr.open_dataset(gridfile) as ds:
        dsc = ds.coarsen(y=coarsen_c, boundary='trim').mean().coarsen(x=coarsen_c, boundary='trim').mean()
        coordtupar = dsc.stack(dim=['x', 'y']).dim.to_numpy()

    # The stacked coordinate is an array of (x, y) tuples; unpack to a 2-column array.
    coordnp = np.array([*coordtupar])
    dfcoocrds = applyid(pd.DataFrame(coordnp, columns=['x', 'y'], dtype=float))

    # Only the ids are needed from the subgrid. Joining on them alone keeps
    # dfday's columns as they are and does not rely on dfday having 'x'/'y'
    # columns (the former suffix-and-drop approach raised KeyError otherwise).
    coarsen_df = pd.merge(dfday, dfcoocrds[['id']], on=['id'])
    return coarsen_df

def extract_day(date, extented = True):
    '''
    Builds and saves the XAI input table for one day.

    Reads <csvfolder>/<date>/<date>_norm_greece.csv, reduces it to the
    coarsened subgrid (coefficient 31) and, when extented is True, adds extra
    points sampled around every subgrid point using the predictions in
    <date>_pred_greece.csv. The result is written to
    /mnt/nvme2tb/ffp/datasets/xai/<date>/<date>_xai_ext_inp.csv.

    Input
        date : day as a string, used in folder and file names (e.g. '20230825')
        extented : when True, add the extra sampled points from getextrapoints

    Output
        The pandas dataframe that was written to the csv file
    '''
    csvfolder = '/mnt/nvme2tb/ffp/datasets/prod/'

    # Disabled cropping step, kept for reference:
    # gdfperif = crop_dataset.getperif()
    # crop_dataset.cropfile(os.path.join(csvfolder,date,'%s_norm.csv'%date),
    #                   os.path.join(csvfolder,date, gdfperif, '_greece'),
    #                   usexyid='id')

    csvfile = os.path.join(csvfolder, date, '%s_norm_greece.csv' % date)

    # csv for xai input
    coarsen_coef = 31
    # ids are digit strings with possible leading zeros, so force str
    dfday = pd.read_csv(csvfile, dtype={'id': str})
    coarsedf = getcoarsedf(coarsen_coef, dfday)

    if extented:
        predfile = os.path.join(csvfolder, date, "%s_pred_greece.csv" % date)
        dfpred = extract_xy(pd.read_csv(predfile, dtype={'id': str}))
        dfexids = getextrapoints(coarsedf, dfpred, coarsen_coef)
        # NOTE(review): an id that appears more than once in dfexids yields
        # repeated rows here - confirm whether that is intended.
        coarsedf = pd.merge(dfexids, dfday, on=['id'])

    xaifolder = '/mnt/nvme2tb/ffp/datasets/xai/%s' % date

    # exist_ok avoids the check-then-create race of isdir() + makedirs()
    os.makedirs(xaifolder, exist_ok=True)
    csvcoarse = os.path.join(xaifolder, '%s_xai_ext_inp.csv' % date)
    coarsedf.to_csv(csvcoarse, index=False)
    return coarsedf

def extract_xy(dfxai):
    '''
    Recovers the x and y coordinates from the 12-character string id.

    Characters 0-5 of 'id' give x and characters 6-11 give y; each is read as
    an integer and divided by 10000. The columns are written into the given
    dataframe, which is also returned.
    '''
    idtext = dfxai['id'].str
    for column, start in (('x', 0), ('y', 6)):
        dfxai[column] = idtext.slice(start, start + 6).astype(int) / 10000
    return dfxai

def getcenters(dfcoarse, dfpred):
    '''
    Marks which prediction points belong to the coarsened subgrid.

    Input
        dfcoarse : dataframe of the subgrid points, with an 'id' column
        dfpred : dataframe of all points, with an 'id' column

    Output
        New pandas dataframe with one row per dfpred row (in dfpred order):
        'id', then 'center' (1 when the id is present in dfcoarse, else 0),
        then the remaining dfpred columns
    '''
    # Use the merge indicator to detect membership. The earlier approach
    # inferred it from 'max_temp' being non-null after the join, which flagged
    # a genuine center as 0 whenever its own max_temp value was missing.
    dfcenter = pd.merge(dfcoarse[['id']], dfpred, on='id', how='right', indicator='center')
    flags = (dfcenter.pop('center') == 'both').astype(int)

    # Keep the flag right after 'id', where the renamed 'max_temp' column used to be.
    dfcenter.insert(1, 'center', flags)

    # The point geometries that were built here before were never used or
    # returned, so that work is dropped; the result is a plain dataframe.
    return dfcenter

def getextrapoints(dfcoarse, dfpred, coarsen_coef, seed=None, cell_size=500):
    '''
    Selects, around every subgrid point, one extra point per risk level.

    Every subgrid ("center") point gets a square window around it. From the
    points that fall in that window, one point is picked at random for each
    risk level 1..5 that differs from the center's own risk level. The center
    itself is always kept.

    Input
        dfcoarse : dataframe of the subgrid points, with an 'id' column
        dfpred : dataframe of all points with 'id', 'x', 'y' and 'risk' columns;
                 x/y are read as EPSG:4326 coordinates
        coarsen_coef : coarsening coefficient; the window half side is
                       (coarsen_coef-1)/2 * cell_size
        seed : optional seed for reproducible sampling. With None the
               module-level random generator is used, as before.
        cell_size : grid spacing in the projected units of EPSG:2100
                    (defaults to the previously hard-coded 500)

    Output
        pandas dataframe with a single 'id' column: for every center its own
        id followed by the ids sampled for it
    '''
    # Merge the points from the coarsening with the rest of the dataset and
    # mark the chosen ones in a "center" column. The merge indicator is used
    # instead of testing 'max_temp' for nulls, which missed centers whose
    # max_temp value was itself missing.
    dfcenter = pd.merge(dfcoarse[['id']], dfpred, on='id', how='right', indicator='center')
    dfcenter['center'] = (dfcenter['center'] == 'both').astype(int)

    # Create geodataframe with point geometries. Change to crs 2100 for meter coordinates
    geom = gpd.points_from_xy(dfcenter['x'], dfcenter['y'], crs=4326)
    gdfall = gpd.GeoDataFrame(dfcenter, geometry=geom).to_crs(2100)

    # Only id, risk and geometry are needed on the center side of the join.
    gdfcenters = gdfall.loc[gdfall['center'] == 1, ['id', 'risk', 'geometry']].copy()
    if gdfcenters.empty:
        return pd.DataFrame([], columns=['id'])

    # Spatial join of center points with all points around centers,
    # using a square buffer (cap_style=3) around every center.
    halfside = (coarsen_coef - 1) / 2 * cell_size
    gdfcenters['geometry'] = gdfcenters.geometry.buffer(halfside, cap_style=3)
    gdfsjoin = gdfcenters.sjoin(gdfall, how="left")

    # Candidate neighbours: not the center itself, a different risk level than
    # the center, and a risk level in 1..5. Collected in one pass, keyed by
    # (center index, risk), instead of re-filtering the whole join per pair.
    keep = ((gdfsjoin['id_left'] != gdfsjoin['id_right'])
            & (gdfsjoin['risk_left'] != gdfsjoin['risk_right'])
            & gdfsjoin['risk_right'].isin(list(range(1, 6))))
    neighbours = gdfsjoin.loc[keep]
    candidates = {}
    for centerind, risk, pointid in zip(neighbours.index,
                                        neighbours['risk_right'],
                                        neighbours['id_right']):
        candidates.setdefault((centerind, risk), []).append(pointid)

    # A private generator when a seed is given keeps the global random state untouched.
    rng = random if seed is None else random.Random(seed)

    # Find and select one point id for each risk level
    # for the points in the buffer, keeping the initial center point.
    # NOTE(review): a point may be picked for more than one center, so the
    # returned ids are not guaranteed to be unique - confirm this is intended.
    sampleids = []
    for centerind, centerid in zip(gdfcenters.index, gdfcenters['id']):
        sampleids.append(centerid)
        for risk in range(1, 6):
            pool = candidates.get((centerind, risk))
            if pool:
                sampleids.append(pool[rng.randint(0, len(pool) - 1)])
    dfextids = pd.DataFrame(sampleids, columns=['id'])
    return dfextids
    

In [23]:
# Build and save the XAI input table for one day (extended sampling is on by
# default); the returned dataframe is shown as the cell output.
extract_day('20230825')

223 144
100 92
37 36
73 17
41 30
141 82
57 52
20 12
41 18
54 10
2 1
4 2
70 52
169 54
20 18
38 26
138 35
201 8
56 33
256 101
326 283
3 0
4 3
68 38
88 70
70 50
123 102
208 89
5 2
37 5
241 28
171 169
66 2
339 157
92 34
3 1
16 8
111 54
193 97
16 13
7 0
99 82
269 108
261 95
11 3
91 8
365 284
137 98
18 17
47 23
102 19
262 49
246 14
254 145
139 23
5 1
113 90
335 169
121 67
21 11
100 90
329 140
86 84
24 14
261 10
284 41
150 56
22 2
9 4
235 21
266 212
191 6
32 10
189 16
102 94
27 11
106 80
22 1
432 139
274 30
10 8
1 0
79 15
1 0
312 172
154 102
105 35
84 28
178 30
248 61
160 150
56 1
205 173
196 40
142 7
40 18
22 0
225 127
352 72
113 8
226 203
252 229
224 173
561 285
41 2
1 0
141 124
70 39
23 0
639 528
19 6
293 196
48 33
9 0
369 132
296 115
39 30
15 1
117 8
465 358
24 2
5 0
14 10
140 25
303 237
222 74
18 1
211 54
107 89
81 23
250 222
6 2
321 79
104 8
12 1
245 116
99 2
2 1
339 262
134 63
1 0
22 17
185 171
218 195
3 2
13 9
310 25
113 5
15 14
319 220
238 26
23 5
448 414
193 150
1 0
300 142
27 16
4 

108 79
2 0
233 184
77 73
15 3
100 48
275 193
40 26
5 4
7 4
111 70
264 92
223 128
149 41
190 148
107 60
312 75
122 7
404 182
122 77
12 5
3 2
78 30
183 71
198 191
128 24
38 0
1 0
101 41
166 85
50 23
19 1
333 321
238 186
55 34
3 1
129 97
246 65
45 41
12 10
48 39
107 78
184 177
253 136
154 11
269 152
147 30
11 3
67 5
272 80
171 2
187 9
77 46
1 0
116 93
143 59
6 4
200 61
205 164
47 17
1 0
208 83
151 127
298 194
50 5
81 26
19 15
16 11
4 2
147 17
397 373
318 16
142 100
276 201
184 31
38 20
41 28
155 130
490 167
20 15
236 63
202 96
48 0
493 179
163 89
83 39
6 0
372 73
67 45
48 8
6 5
383 145
160 114
37 21
329 84
215 138
15 10
297 9
138 135
12 1
272 123
360 233
68 31
36 35
183 150
269 0
14 11
170 38
4 3
105 78
235 188
62 12
2 1
343 86
122 79
7 0
368 107
21 5
234 104
281 112
44 15
145 6
195 194
157 148
23 13
94 45
372 184
137 31
36 20
188 81
491 423
17 12
25 16
195 151
84 27
44 11
1 0
4 0
37 35
172 164
244 22
5 1
33 13
278 274
2 0
11 3
98 32
466 93
52 16
160 106
180 93
13 8
13 1
40 7
160 78
29 4


Unnamed: 0,id,x,y,dom_dir,dom_vel,res_max,dir_max,max_temp,min_temp,mean_temp,...,corine_gr4,corine_gr5,corine_gr21,corine_gr22,corine_gr23,corine_gr24,corine_gr31,corine_gr32,corine_gr33,fire
0,198017397712,0.050504,0.702992,1.0,0.061934,0.030523,1.0,0.775728,0.876531,0.824093,...,0.0,0.0,0.000000,1.000000,0.0,0.000000,0.000000,0.000000,0.000000,0
1,198326397661,0.053498,0.702266,8.0,0.082530,0.053493,2.0,0.761182,0.845185,0.806923,...,0.0,0.0,0.000000,0.000000,0.0,0.847744,0.000000,0.152256,0.000000,0
2,198739397970,0.057490,0.706622,7.0,0.122197,0.091405,7.0,0.798455,0.893466,0.840343,...,0.0,0.0,0.000000,0.095700,0.0,0.816888,0.000000,0.087412,0.000000,0
3,198584397867,0.055993,0.705170,4.0,0.072483,0.081607,7.0,0.783606,0.868262,0.824552,...,0.0,0.0,0.000000,0.029748,0.0,0.733538,0.000000,0.236713,0.000000,0
4,198790397403,0.057989,0.698636,2.0,0.087460,0.056311,2.0,0.771551,0.859031,0.818460,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,1.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2368,268323376940,0.731105,0.410425,1.0,0.234696,0.205059,1.0,0.849920,0.941994,0.902045,...,0.0,0.0,0.000000,0.000000,0.0,0.518187,0.000000,0.481813,0.000000,0
2369,269096376889,0.738590,0.409699,1.0,0.203712,0.173757,1.0,0.844061,0.981147,0.909653,...,0.0,0.0,0.283679,0.000000,0.0,0.044816,0.000000,0.000000,0.000000,0
2370,268169377559,0.729608,0.419137,1.0,0.138878,0.111331,8.0,0.754011,0.780954,0.775997,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.637379,0.362621,0.000000,0
2371,268426376476,0.732103,0.403891,1.0,0.164896,0.143897,2.0,0.798455,0.955374,0.873073,...,0.0,0.0,0.000000,0.129237,0.0,0.000000,0.000000,0.522446,0.348317,0


In [None]:
# Build the XAI input tables for days 25 to 28 of August 2023
# (the end of the range is exclusive).
for d in range(25, 29):
    date = '202308{}'.format(d)
    extract_day(date)