## "No fire" class dataset selection

In [32]:
import pandas as pd
import numpy as np
from calendar import monthrange
import random
import datetime
import bisect
from tqdm import tqdm
import xarray as xr
import os

In [7]:
# calculate temporal distribution
def get_distrib(counts):
    """Turn ``(key, count)`` pairs into a list of relative frequencies.

    Parameters
    ----------
    counts : iterable of (key, count) pairs
        When a key occurs more than once only its last count is kept.

    Returns
    -------
    list of float
        ``count / total`` for each distinct key, in order of first
        appearance. An empty input gives an empty list.
    """
    by_key = dict((pair[0], pair[1]) for pair in counts)
    total = sum(by_key.values())
    return [by_key[key] / total for key in by_key]

In [8]:
# calculate cell count for no-fires
def get_random_cell_count(dfallf, not_burned_percent):
    """Return how many no-fire cells to sample.

    Parameters
    ----------
    dfallf : sized collection
        The fire instances; only its length is used.
    not_burned_percent : float
        Used as a fraction ``f`` (not a 0-100 percentage): the result is
        ``f / (1 - f) * len(dfallf)`` truncated to an int.

    Returns
    -------
    int
    """
    odds = not_burned_percent / (1 - not_burned_percent)
    return int(odds * len(dfallf))

In [9]:
# calculate month year temporal distribution - how many cells to select for each month year
def get_random_date_distrib(dfallf):
    """Compute the per-month and per-year distribution of the fire rows.

    Rows are grouped by ``month`` and by ``year``; the size of each group is
    taken from the non-null count of the ``dom_dir`` column.

    Returns
    -------
    tuple
        ``(mon_distrib, year_distrib, month_counts_ls, year_counts_ls)``
        where the ``*_counts_ls`` items are lists of ``(key, count)`` tuples
        and the ``*_distrib`` items are the matching relative frequencies
        produced by ``get_distrib``.
    """

    def _counts_per(column):
        # (group key, count) tuples for one grouping column.
        grouped = dfallf.groupby([column]).agg('count').reset_index()
        pairs = grouped[[column, 'dom_dir']].itertuples(index=False, name=None)
        return list(pairs)

    month_counts_ls = _counts_per('month')
    mon_distrib = get_distrib(month_counts_ls)
    year_counts_ls = _counts_per('year')
    year_distrib = get_distrib(year_counts_ls)

    return mon_distrib, year_distrib, month_counts_ls, year_counts_ls

In [16]:
# create dataframe with fires instances
def get_fires_pd(calend=True,
                 csv_path='/media/toshiba-hdd/mlfires/dataset/classic/traindataset_new.csv'):
    """Load the training CSV and return only its fire rows (``fire == 1``).

    Rows containing any missing value are dropped before the fire rows are
    selected.

    Parameters
    ----------
    calend : bool
        If True, ``firedate`` is kept as a string, ``month`` is cast to int
        and a ``year`` column holding the first four characters of
        ``firedate`` (still a string) is added.
        If False, ``firedate`` is parsed and converted to ``datetime.date``
        objects; no ``year`` column is added.
    csv_path : str
        Location of the training CSV. Defaults to the path the notebook has
        always used.

    Returns
    -------
    pandas.DataFrame
        An independent copy containing only the fire rows.
    """
    if calend:
        dftrain = pd.read_csv(csv_path, dtype={'firedate': str})
    else:
        dftrain = pd.read_csv(csv_path, parse_dates=['firedate'])
        dftrain['firedate'] = dftrain['firedate'].apply(lambda x: x.date())
    dftrain.dropna(inplace=True)
    dfallf = dftrain[dftrain['fire'] == 1].copy()
    if calend:
        # The cast must target the returned frame: applying it to dftrain
        # after the copy above would never reach the fire rows.
        dfallf['month'] = dfallf['month'].astype(int)
        dfallf['year'] = dfallf['firedate'].str.slice(0, 4)
    return dfallf

In [19]:
# get a random month or year using the given distribution
def get_rand_from_distrib(start, stop, distrib):
    """Draw one integer from ``start..stop`` (inclusive) with given weights.

    Parameters
    ----------
    start, stop : int or int-like (e.g. numeric string)
        Bounds of the candidate range; both are passed through ``int()``.
    distrib : sequence of float
        Probability of each candidate, in ascending candidate order. It must
        contain exactly one entry per integer in the range.

    Returns
    -------
    The value chosen by ``np.random.choice``.
    """
    lowest = int(start)
    highest = int(stop)
    candidates = np.arange(lowest, highest + 1)
    return np.random.choice(candidates, p=distrib)

In [20]:
# get min max month year to select from
def get_min_max_year_month(dfallf):
    """Return ``(min year, max year, min month, max month)`` of the frame.

    The values come straight from the ``year`` and ``month`` columns, so
    they keep whatever type those columns hold.
    """
    years = dfallf['year']
    months = dfallf['month']
    return years.min(), years.max(), months.min(), months.max()

In [12]:
# select the dates for all the no-fire cells that will be sampled (dates may be repeated)
def select_random_dates(cntrandom, dff):
    """Draw ``cntrandom`` random dates following the fires' temporal pattern.

    Year and month are drawn independently, each according to its relative
    frequency among the fire rows of ``dff``; the day is uniform within the
    chosen month. Dates may repeat.

    Parameters
    ----------
    cntrandom : int
        Number of dates to generate; zero or a negative value gives ``[]``.
    dff : pandas.DataFrame
        Fire rows, as accepted by ``get_random_date_distrib``.

    Returns
    -------
    list of datetime.date
    """
    if cntrandom <= 0:
        return []

    mdist, ydist, mcount, ycount = get_random_date_distrib(dff)
    # Draw from the years/months that actually occur. The distributions have
    # one entry per observed key, so pairing them with a contiguous
    # min..max range fails as soon as one year or month has no fire.
    years = [int(key) for key, _ in ycount]
    months = [int(key) for key, _ in mcount]
    drawn_years = np.random.choice(years, size=cntrandom, p=ydist).tolist()
    drawn_months = np.random.choice(months, size=cntrandom, p=mdist).tolist()

    noburndatelist = []
    for year, month in zip(drawn_years, drawn_months):
        daysofmonth = monthrange(year, month)[1]
        day = random.randrange(1, daysofmonth + 1)
        noburndatelist.append(datetime.date(year=year, month=month, day=day))
    return noburndatelist

In [13]:
# add index (xyhash) to dataframe created from xarray
def addxyhash(df):
    """Index ``df`` in place by an ``xyhash`` string built from x and y.

    Each coordinate is multiplied by 10000, truncated to an integer and
    rendered as text; the x text followed by the y text forms the hash.
    The frame is modified in place and nothing is returned.
    """

    def _scaled_text(column):
        # coordinate * 10000 -> truncated int -> string
        return (df[column] * 10000).astype('float').astype('int').astype('str')

    df["xyhash"] = _scaled_text("x") + _scaled_text("y")
    df.set_index("xyhash", inplace=True)

In [22]:
# Validate the cell. A cell that burned less than two years before the candidate date is rejected
def validate_date(idc, burnids, burndict, randate):
    """Tell whether cell ``idc`` may be used as a no-fire sample on ``randate``.

    Parameters
    ----------
    idc : cell id
    burnids : sorted list
        Ids of all cells that have burned at least once.
    burndict : dict
        Maps a burned cell id to the list of its fire dates.
    randate : date
        Candidate date for the no-fire sample.

    Returns
    -------
    bool
        False when the most recent fire of the cell on or before ``randate``
        is less than two years (730.5 days) older than ``randate``;
        True otherwise, including for cells that never burned.
    """
    if not BinarySearch(burnids, idc):
        return True

    earlier_fires = [fire for fire in burndict[idc] if fire <= randate]
    if not earlier_fires:
        return True

    last_fire = max(earlier_fires)
    two_years = datetime.timedelta(days=2 * 365.25)
    return not (last_fire and randate - last_fire < two_years)

In [23]:
# Load all the fire cells in a dictionary
def get_burnid_dates(burnedrecs):
    """Group the fire dates of every burned cell by cell id.

    Parameters
    ----------
    burnedrecs : pandas.DataFrame
        Fire rows indexed by cell id, with a ``firedate`` column.

    Returns
    -------
    burndict : dict
        Cell id -> list of its fire dates, in row order.
    burnids : list
        The distinct cell ids in ascending order, ready for binary search.
    """
    burndict = {}
    # Dict grouping plus a single sort gives the same result as keeping a
    # sorted id list up to date with insort for every row, without the
    # per-row Series construction and list shifting.
    for cell_id, fire_date in zip(burnedrecs.index, burnedrecs['firedate']):
        burndict.setdefault(cell_id, []).append(fire_date)
    burnids = sorted(burndict)
    return burndict, burnids

In [24]:
def BinarySearch(a, x):
    """Return True if ``x`` is present in the ascending-sorted list ``a``.

    Uses ``bisect.bisect_left`` to find the insertion point and checks
    whether the element already sitting there equals ``x``.
    """
    pos = bisect.bisect_left(a, x)
    found = pos != len(a) and a[pos] == x
    return bool(found)

In [25]:
def valid_proximity(idc, maskedlist):
    """Return True when all eight neighbour ids of ``idc`` are in ``maskedlist``.

    ``maskedlist`` must be sorted ascending (it is binary-searched).
    The neighbour offsets are +-1 and +-2226/2227/2228 -- presumably the
    cells left/right and in the rows above/below on a grid 2227 cells wide;
    TODO confirm against the grid definition.
    """
    for offset in (-1, -2228, -2227, -2226, 1, 2228, 2227, 2226):
        if not BinarySearch(maskedlist, idc + offset):
            return False
    return True

In [212]:
# select all no fire cells (id,date) that will form the no-fire class set
def select_random_cell(dateslist, allcellfile):
    """Pick one random valid cell for every date in ``dateslist``.

    Parameters
    ----------
    dateslist : list of datetime.date
        One entry per no-fire sample to produce.
    allcellfile : str
        CSV with all candidate cells; it must provide ``x`` and ``y``
        columns, from which the ``xyhash`` index is built.

    Returns
    -------
    pandas.DataFrame
        One row per date with columns ``xyhash`` and ``firedate``
        (formatted ``YYYYMMDD``).

    Notes
    -----
    A cell is re-drawn until ``validate_date`` accepts it for the date, so
    the loop only ends once an acceptable cell has been found.
    """
    dfday = pd.read_csv(allcellfile)
    addxyhash(dfday)

    dffires = get_fires_pd(calend=False)
    addxyhash(dffires)
    burndict, burnids = get_burnid_dates(dffires)

    randomcells = []
    for randdate in tqdm(dateslist):
        while True:
            cellid = dfday.sample().index[0]
            if validate_date(cellid, burnids, burndict, randdate):
                break
        randomcells.append({'xyhash': cellid, 'firedate': randdate.strftime("%Y%m%d")})
    return pd.DataFrame(randomcells)

In [1]:
# module for parallel execution of "daysamples" function

import multiprocessing as mp
# CPU count reported by multiprocessing; used further down to size the
# number of worker processes passed to par_files.
ncpus=mp.cpu_count()
import time
import sys

def new_process(func, proclist, args):
    """Start ``func`` in a new process and register it in ``proclist``.

    Parameters
    ----------
    func : callable
        Target of the process. It is called with ``args`` followed by a
        fresh ``multiprocessing.Queue`` as its last positional argument.
    proclist : list
        Extended in place with ``{'proc': process, 'queue': queue}``.
    args : tuple
        Positional arguments for ``func`` (without the queue).
    """
    result_queue = mp.Queue()
    worker = mp.Process(target=func, args=args + (result_queue,))
    proclist.append({'proc': worker, 'queue': result_queue})
    worker.start()

def par_files(func, days, pthreads, args, total_rows=None):
    """Run ``func`` once per day in worker processes and gather the results.

    Parameters
    ----------
    func : callable
        Called in a separate process as ``func(day, *copies_of_args, queue)``.
        It is expected to put one DataFrame on ``queue`` (or nothing at all).
    days : sequence
        Days to process. They are taken from the end of the sequence first;
        the caller's sequence itself is not modified.
    pthreads : int
        Maximum number of simultaneous worker processes. Values below 1 are
        treated as 1 so that the work is never silently skipped.
    args : sequence
        Extra arguments for ``func``. Every item must offer ``.copy()``; a
        fresh copy is handed to each process.
    total_rows : int, optional
        Expected total number of rows, shown in the progress line only.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all DataFrames received from the workers; an empty
        DataFrame when nothing was received.
    """
    import queue  # local import: only queue.Empty is needed

    pending = list(days)
    ndays = len(pending)
    workers = max(1, pthreads)
    procs = []
    results = []
    finished = 0
    recs = 0

    def _start_workers():
        # Launch processes until the pool is full or no day is left.
        while pending and len(procs) < workers:
            d = pending.pop()
            new_process(func, procs, tuple([d] + [a.copy() for a in args]))

    def _drain(result_queue):
        # Return everything currently waiting in the queue, without blocking.
        items = []
        while True:
            try:
                items.append(result_queue.get_nowait())
            except queue.Empty:
                return items

    _start_workers()
    while procs:
        time.sleep(0.1)
        running = []
        for p in procs:
            # Read liveness BEFORE draining: a worker seen as finished has
            # already flushed its result, so the drain below cannot miss it.
            # Checking in the opposite order loses a result that arrives
            # between the two calls.
            alive = p['proc'].is_alive()
            for res in _drain(p['queue']):
                results.append(res)
                recs += len(res)
            if alive:
                running.append(p)
            else:
                finished += 1
        # Rebuild the list instead of removing entries while iterating it.
        procs[:] = running
        if total_rows is None:
            sys.stdout.write("\r days : %d/%d , rows : %d" % (finished, ndays, recs))
        else:
            sys.stdout.write("\r days : %d/%d , rows : %d/%d" % (finished, ndays, recs, total_rows))
        sys.stdout.flush()
        _start_workers()

    # One concatenation at the end instead of one per received result.
    return pd.concat(results) if results else pd.DataFrame()

In [219]:
def daysamples(day, dcellsall, static_af15, static_bef15, q,
               folder='/media/toshiba-hdd/mlfires/dataset/images/final_dataset/'):
    """Create the dataset for all the no-fire samples of one day.

    The function opens the daily xarray file ``<folder>/<day>_df.nc``, merges
    it with the static features of the matching period and, for each sampled
    cell of that day, extracts the data located at the cell's coordinates.

    Parameters
    ----------
    day : str
        Date of the batch as ``YYYYMMDD``; it must match the values stored in
        the ``firedate`` column of ``dcellsall``.
    dcellsall : pandas.DataFrame
        All the sampled cells, indexed by the xyhash string and carrying a
        ``firedate`` column.
    static_af15, static_bef15 : xarray dataset
        Static features used for years >= 2015 and for earlier years.
    q : queue
        Receives the resulting DataFrame, for the parallel execution.
    folder : str
        Directory holding the daily ``*_df.nc`` files.

    Returns
    -------
    None
        The batch is put on ``q``. When the daily file does not exist the
        function returns without putting anything.
    """
    # Filter on the frame that was passed in, not on a notebook global.
    dcells = dcellsall.loc[dcellsall['firedate'] == day]
    fnc = os.path.join(folder, "%s_df.nc" % day)
    if not os.path.isfile(fnc):
        return
    dsday = xr.load_dataset(fnc)
    static = static_af15 if int(day[:4]) >= 2015 else static_bef15
    dsdayall = xr.merge([dsday, static], combine_attrs='drop')

    frames = []
    for ind in dcells.index:
        # The xyhash is the digits of int(x*10000) followed by those of
        # int(y*10000); the slicing assumes the x part is six digits long.
        xp = int(ind[0:6]) / 10000
        yp = int(ind[6:]) / 10000
        # y bounds are given high-to-low -- presumably because the y
        # coordinate is stored in descending order; TODO confirm.
        dspoint = dsdayall.loc[dict(x=slice(xp - 0.0001, xp + 0.0001),
                                    y=slice(yp + 0.0001, yp - 0.0001))]
        frames.append(dspoint.to_dataframe().reset_index())

    if frames:
        # Concatenate once and tag the whole batch with its date.
        trainset = pd.concat(frames)
        trainset['firedate'] = day
    else:
        trainset = pd.DataFrame()
    q.put(trainset)

In [210]:
# Fraction f of no-fire samples wanted in the final dataset: the number of
# no-fire cells is f / (1 - f) times the number of fire rows, so 0.5 yields
# as many no-fire cells as fire rows.
not_burned_percent = 0.5
# Fire rows with string dates plus the 'year' column needed for the distributions.
dff=get_fires_pd()
cntrandom=get_random_cell_count(dff, not_burned_percent)
# One random date per no-fire cell to sample.
dl=select_random_dates(cntrandom, dff)


In [214]:
# Pick one valid random cell per sampled date from the CSV listing all candidate cells.
dfcells=select_random_cell(dl, '/media/toshiba-hdd/mlfires/dataset/classic/20190930_df_attica.csv')

100%|██████████| 13957/13957 [00:03<00:00, 4554.81it/s]


In [215]:
# Index by xyhash: daysamples reads the cell coordinates back from the index.
dfcells=dfcells.set_index('xyhash')

In [216]:
# Static feature datasets: one is merged with the daily data for years >= 2015,
# the other for earlier years.
static_af15=xr.open_dataset('/media/toshiba-hdd/mlfires/dataset/images/static/static_aft_15.nc')
static_bef15=xr.open_dataset('/media/toshiba-hdd/mlfires/dataset/images/static/static_bef_15.nc')
# Distinct sampled dates: one daily file is read per date.
dates = dfcells['firedate'].unique()
# Directory of the daily '<date>_df.nc' files.
folder='/media/toshiba-hdd/mlfires/dataset/images/final_dataset/'

In [221]:
# Leave seven CPUs free for other work, but always keep at least one worker:
# on a machine with seven or fewer CPUs 'ncpus-7' alone would be zero or
# negative and no day would be processed.
daydfs=par_files(daysamples, list(dates), max(1, ncpus-7), [dfcells, static_af15, static_bef15])

 days : 1302/1302 , rows : 13956/13957

In [223]:
# Remove the 'time' column from the sampled rows (modifies daydfs in place).
daydfs.drop(columns=['time'],inplace=True)

In [224]:
# Reload the training CSV (no dropna here, unlike get_fires_pd) and keep its fire rows.
df_train_full=pd.read_csv('/media/toshiba-hdd/mlfires/dataset/classic/traindataset_new.csv')
df_train_fire=df_train_full[df_train_full['fire']==1]
# Full dataset: sampled no-fire rows followed by the fire rows.
daydfsfull=pd.concat([daydfs,df_train_fire])

In [225]:
# Remove the 'curvature' column from the combined dataset (in place).
daydfsfull.drop(columns=['curvature'], inplace=True)

In [226]:
# Columns whose name starts with 'corine_1' ... 'corine_9'.
corine_prefixes = tuple('corine_%d' % i for i in range(1, 10))
corinecols = [ccor for ccor in daydfsfull.columns if ccor.startswith(corine_prefixes)]
# Replace missing values in those columns by 0. The result is assigned back to
# the frame: an in-place fillna on a selected column is chained assignment and
# does not reach the frame once pandas Copy-on-Write is active.
if corinecols:
    daydfsfull[corinecols] = daydfsfull[corinecols].fillna(0)

In [227]:
# Write the combined fire / no-fire dataset to disk without the index column.
daydfsfull.to_csv('/media/toshiba-hdd/mlfires/dataset/classic/train_new_sample_1_2_attica.csv', index=False)

In [207]:
# Display the rows labelled as no-fire (fire == 0).
daydfsfull[daydfsfull['fire']==0]

Unnamed: 0,y,x,dom_dir,dom_vel,res_max,dir_max,max_dew_temp,min_dew_temp,mean_dew_temp,max_temp,...,corine_gr5,corine_gr21,corine_gr22,corine_gr23,corine_gr24,corine_gr31,corine_gr32,corine_gr33,band,firedate
0,39.147559,25.935452,0.0,9.046390,9.046390,2.0,280.635254,278.464600,279.653748,290.534912,...,0.0,0.0000,0.330564,0.0,0.669436,0.000000,0.000000,0.000000,1,20161018
0,38.730052,21.884088,4.0,1.898232,1.898232,4.0,284.662842,278.752930,280.977692,294.000000,...,0.0,0.0000,0.000000,0.0,0.000000,0.407008,0.592992,0.000000,1,20180412
0,37.152804,22.368602,2.0,1.067671,1.067671,2.0,285.868896,280.009521,282.381348,296.207764,...,0.0,0.0000,0.315602,0.0,0.000000,0.684398,0.000000,0.000000,1,20161026
0,40.425852,22.894352,0.0,3.642635,3.642635,4.0,286.590820,285.130127,285.602997,292.597900,...,0.0,0.0831,0.000000,0.0,0.916900,0.000000,0.000000,0.000000,1,20151015
0,40.317610,23.353094,3.0,3.525212,3.525212,3.0,293.971680,291.457520,292.659790,300.970215,...,0.0,0.0000,0.793981,0.0,0.000000,0.000000,0.000000,0.206019,1,20100618
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,36.776532,25.306614,0.0,8.228325,8.228325,8.0,294.967285,293.925781,294.531708,298.943359,...,0.0,0.0000,0.000000,0.0,0.000000,0.000000,1.000000,0.000000,1,20120720
0,41.235094,22.409837,5.0,1.337492,1.337492,5.0,288.047119,280.603760,284.725220,306.705078,...,0.0,0.0000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,1,20120720
0,35.235364,25.512790,8.0,5.499441,5.499441,8.0,289.826172,286.600830,288.321503,301.924805,...,0.0,0.0000,0.000000,0.0,0.000000,0.000000,1.000000,0.000000,1,20120720
0,41.735072,25.559180,1.0,3.370692,3.370692,1.0,286.910156,280.678955,283.717743,305.259766,...,0.0,0.0000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,1,20120720


In [161]:
# Serial variant of the per-day sampling: for every sampled date, open the
# daily file, merge the static features of the matching period and extract
# the data of each sampled cell.
# NOTE(review): unlike daysamples, no 'firedate' column is added to the rows
# here -- confirm whether that is intended.
frames = []
countd = 0
nrows = 0
for d in dates:
    countd += 1
    dcells = dfcells.loc[dfcells['firedate'] == d]
    fnc = os.path.join(folder, "%s_df.nc" % d)
    if not os.path.isfile(fnc):
        # No daily file for this date: its cells are skipped.
        continue
    dsday = xr.load_dataset(fnc)
    static = static_af15 if int(d[:4]) >= 2015 else static_bef15
    dsdayall = xr.merge([dsday, static], combine_attrs='drop')
    for ind in dcells.index:
        # The xyhash holds int(x*10000) followed by int(y*10000); the slicing
        # assumes the x part is six digits long.
        xp = int(ind[0:6]) / 10000
        yp = int(ind[6:]) / 10000
        dspoint = dsdayall.loc[dict(x=slice(xp - 0.0001, xp + 0.0001),
                                    y=slice(yp + 0.0001, yp - 0.0001))]
        frame = dspoint.to_dataframe().reset_index()
        frames.append(frame)
        nrows += len(frame)
    sys.stdout.write("\r days : %d/%d , rows : %d/%d" % (countd, len(dates), nrows, len(dfcells)))
    sys.stdout.flush()
# Concatenate once at the end instead of re-copying the growing frame for
# every cell.
trainset = pd.concat(frames) if frames else pd.DataFrame()
trainset.to_csv('/media/toshiba-hdd/mlfires/dataset/classic/train_new_sample.csv', index=False)

 days : 1409/1409 , rows : 27949/27955