In [19]:
import re
import os
import xarray as xr
import traceback
from parallel_file_process import par_files, walkmonthdays
import time
import multiprocessing as mp
import scipy

In [20]:
def preprocess(df, dropfilters=None, fillnafilters=None, renames=None, horizfilters=None,
               addcols=None, calibfilters=None):
    """Clean and reshape a tabular frame through a fixed sequence of optional steps.

    The steps always run in this order: fill NaNs, drop columns, filter rows,
    drop every row that still contains a NaN, add constant columns, rename
    columns, recompute ("calibrate") columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process. It may be modified in place, so always use the
        returned frame rather than the argument.
    dropfilters : iterable of str, optional
        Regex patterns; every column whose name matches any of them
        (``re.search``) is dropped.
    fillnafilters : iterable of str, optional
        Regex patterns; NaNs in every matching column are replaced by 0.
        This runs before the unconditional ``dropna`` so those rows survive.
    renames : dict, optional
        ``{old_name: new_name}`` mapping handed to ``DataFrame.rename``.
    horizfilters : iterable of str, optional
        Python expressions run through ``eval``. Each may refer to the current
        frame as ``df`` and its result is used to index the frame
        (``df[result]``), i.e. it selects the rows to keep.
    addcols : dict, optional
        ``{column_name: value}``; each value is assigned as ``df[name] = value``.
    calibfilters : dict, optional
        ``{column_name: expression}``. The expression is run through ``eval``
        with the current column bound to ``v`` and its result replaces the
        column. Applied after ``renames``, so keys are post-rename names.

    Returns
    -------
    pandas.DataFrame
        The processed frame.

    Notes
    -----
    ``horizfilters`` and ``calibfilters`` are executed with ``eval`` and can
    run arbitrary code: only pass expressions from trusted configuration.
    """
    if fillnafilters is not None:
        fillnacols = [c for c in df.columns if any(re.search(p, c) for p in fillnafilters)]
        for c in fillnacols:
            # Assign the filled column back instead of calling
            # df[c].fillna(..., inplace=True): the chained in-place form acts on
            # a temporary under pandas copy-on-write and leaves df unchanged.
            df[c] = df[c].fillna(0)
    if dropfilters is not None:
        dropcols = [c for c in df.columns if any(re.search(p, c) for p in dropfilters)]
        df.drop(columns=dropcols, inplace=True)
    if horizfilters is not None:
        for hf in horizfilters:
            # NOTE(security): eval runs arbitrary Python. The expression sees
            # this function's locals, most usefully the current frame as `df`.
            cond = eval(hf)
            # Filter first, then copy: only the kept rows are duplicated and the
            # result is an independent frame that is safe to modify below.
            df = df[cond].copy()
    # Any row that still has a NaN in any remaining column is discarded.
    df.dropna(inplace=True)
    if addcols is not None:
        for addc in addcols:
            df[addc] = addcols[addc]
    if renames is not None:
        df.rename(columns=renames, inplace=True)
    if calibfilters is not None:
        for calib in calibfilters:
            # `v` must keep this exact name: the eval'ed expression refers to it.
            v = df[calib]
            # NOTE(security): eval runs arbitrary Python, trusted input only.
            df[calib] = eval(calibfilters[calib])
    return df

In [21]:
def netcdf_to_csv(ncname, statname, tfolder,
                  dropfilters=None, fillnafilters=None, renames=None, horizfilters=None,
                  addcols=None, calibfilters=None
                  ):
    """Convert one ``<firedate>_df.nc`` netCDF file to a tabular CSV.

    The netCDF file is merged with the dataset in ``statname``, flattened to a
    DataFrame, run through ``preprocess`` and written to
    ``<tfolder>/<name without .nc>.csv``. If that CSV already exists the
    conversion is skipped.

    Parameters
    ----------
    ncname : str
        Path of the netCDF file; its base name must look like
        ``<firedate>_df.nc``.
    statname : str
        Path of a second netCDF file merged into every converted dataset.
    tfolder : str
        Directory the CSV is written to.
    dropfilters, fillnafilters, renames, horizfilters, calibfilters
        Passed unchanged to ``preprocess``.
    addcols : dict, optional
        Extra constant columns for ``preprocess``. A ``'firedate'`` entry taken
        from the file name is always added; an explicit ``'firedate'`` key in
        ``addcols`` overrides it.

    Returns
    -------
    str or None
        Path of the CSV (existing or newly written), or ``None`` when the file
        name is not recognised or the conversion fails. Failures are printed
        with their traceback, not raised.
    """
    bname = os.path.basename(ncname)
    stem_match = re.search(r'^(.*?)\.nc', bname)
    date_match = re.search(r'^(.*?)_df\.nc', bname)
    if stem_match is None or date_match is None:
        # Files with other names (for example previously written CSVs living in
        # the same folder) are reported and skipped instead of crashing.
        print('Skipping %s: file name does not match <firedate>_df.nc' % ncname)
        return None
    csvname = os.path.join(tfolder, stem_match.group(1) + '.csv')
    firedate = date_match.group(1)
    if os.path.isfile(csvname):
        print('Found ready Unormalized CSV')
        return csvname
    # Write to a temporary name first: an interrupted write must not leave a
    # partial file that the existence check above would later accept as done.
    tmpname = csvname + '.tmp'
    try:
        print('Converting %s'%ncname)
        # Context managers close both files once the data is in the DataFrame.
        with xr.open_dataset(ncname) as ds, xr.open_dataset(statname) as ds_stat:
            dsdayall = xr.merge([ds, ds_stat], combine_attrs='drop')
            dfday = dsdayall.to_dataframe().reset_index()
        if isinstance(addcols, dict):
            addcols = {'firedate': firedate, **addcols}
        else:
            addcols = {'firedate': firedate}
        dfday = preprocess(dfday, dropfilters, fillnafilters, renames, horizfilters,
                           addcols, calibfilters)
        dfday.to_csv(tmpname, index=False)
        os.replace(tmpname, csvname)
        print('Done Converting %s' % csvname)
        return csvname
    except Exception:
        print('Failed to create unnormalized csv %s\n'%csvname+traceback.format_exc())
        # Best-effort cleanup of a partially written temporary file.
        try:
            os.remove(tmpname)
        except OSError:
            pass
        return None

In [22]:
# Text file listing the training dates on which fires occurred, one per line.
file_path = '/mnt/nvme2tb/ffp/datasets/siamese/complementary_files/training_dates_with_fires.txt'

# Load every line, stripped of surrounding whitespace, into a list.
with open(file_path, 'r') as dates_file:
    dates_with_fires = [raw_line.strip() for raw_line in dates_file]

In [23]:
# Collect only the per-day netCDF inputs (<date>_df.nc). The conversion cell
# writes its CSV output into this same directory, so an unfiltered listing
# would feed those CSVs back into the converter on a re-run.
dayfiles_dir = '/mnt/nvme2tb/ffp/datasets/test/2021'
dayfiles = [os.path.join(dayfiles_dir, i) for i in os.listdir(dayfiles_dir) if i.endswith('_df.nc')]

In [24]:
# Display the collected input paths as a quick sanity check.
dayfiles

['/mnt/nvme2tb/ffp/datasets/test/2021/20210704_df.nc',
 '/mnt/nvme2tb/ffp/datasets/test/2021/20210910_df.nc',
 '/mnt/nvme2tb/ffp/datasets/test/2021/20210806_df.nc',
 '/mnt/nvme2tb/ffp/datasets/test/2021/20210708_df.nc',
 '/mnt/nvme2tb/ffp/datasets/test/2021/20210824_df.nc',
 '/mnt/nvme2tb/ffp/datasets/test/2021/20210715_df.nc',
 '/mnt/nvme2tb/ffp/datasets/test/2021/20210722_df.nc',
 '/mnt/nvme2tb/ffp/datasets/test/2021/20210909_df.nc',
 '/mnt/nvme2tb/ffp/datasets/test/2021/20210607_df.nc',
 '/mnt/nvme2tb/ffp/datasets/test/2021/20210615_df.nc',
 '/mnt/nvme2tb/ffp/datasets/test/2021/20210624_df.nc',
 '/mnt/nvme2tb/ffp/datasets/test/2021/20210916_df.nc',
 '/mnt/nvme2tb/ffp/datasets/test/2021/20210826_df.nc',
 '/mnt/nvme2tb/ffp/datasets/test/2021/20210725_df.nc',
 '/mnt/nvme2tb/ffp/datasets/test/2021/20210714_df.nc',
 '/mnt/nvme2tb/ffp/datasets/test/2021/20210918_df.nc',
 '/mnt/nvme2tb/ffp/datasets/test/2021/20210630_df.nc',
 '/mnt/nvme2tb/ffp/datasets/test/2021/20210813_df.nc',
 '/mnt/nvm

In [25]:
# Static netCDF file merged into every daily file, and the CSV output folder.
statname='/mnt/nvme2tb/ffp/datasets/siamese/complementary_files/static_aft_15.nc'
targetfolder='/mnt/nvme2tb/ffp/datasets/test/2021'
# Leave four cores free, but never request fewer than one worker:
# cpu_count()-4 is zero or negative on machines with four or fewer cores.
nworkers = max(1, mp.cpu_count() - 4)
start=time.time()
print("Start Preprocessing and Converting netcdf to csv")
# The list holds the arguments of netcdf_to_csv that follow the file name, in
# positional order. NOTE(review): presumably par_files calls
# netcdf_to_csv(file, *args) for each file on `nworkers` processes - confirm
# against parallel_file_process.
proctime=par_files(netcdf_to_csv, sorted(dayfiles), nworkers,
                   [statname, targetfolder,
                    ['curvature','index'], #dropfilters
                    [r'corine_(\d+)'], #fillnafilters
                    {'tp': 'rain_7_days', 'time':'firedate'}, #renames
                     None, # horizfilters
                     None, #addcols
                     {'firedate': 'v.str.replace("-","")'} # calibfilters
                    ]
                   )
dur=time.time()-start
print("Done in %d min and %d secs"%(int(dur/60), dur%60))

Start Preprocessing and Converting netcdf to csv
Converting /mnt/nvme2tb/ffp/datasets/test/2021/20210930_df.nc
Converting /mnt/nvme2tb/ffp/datasets/test/2021/20210929_df.nc
Converting /mnt/nvme2tb/ffp/datasets/test/2021/20210928_df.nc
Converting /mnt/nvme2tb/ffp/datasets/test/2021/20210927_df.nc
Converting /mnt/nvme2tb/ffp/datasets/test/2021/20210926_df.nc
Converting /mnt/nvme2tb/ffp/datasets/test/2021/20210925_df.nc
Converting /mnt/nvme2tb/ffp/datasets/test/2021/20210924_df.nc
Converting /mnt/nvme2tb/ffp/datasets/test/2021/20210923_df.nc
Converting /mnt/nvme2tb/ffp/datasets/test/2021/20210922_df.nc
Converting /mnt/nvme2tb/ffp/datasets/test/2021/20210921_df.nc
Converting /mnt/nvme2tb/ffp/datasets/test/2021/20210920_df.nc
Converting /mnt/nvme2tb/ffp/datasets/test/2021/20210919_df.nc
Converting /mnt/nvme2tb/ffp/datasets/test/2021/20210918_df.nc
Converting /mnt/nvme2tb/ffp/datasets/test/2021/20210917_df.nc
Converting /mnt/nvme2tb/ffp/datasets/test/2021/20210916_df.nc
Converting /mnt/nvme2

Done Converting /mnt/nvme2tb/ffp/datasets/test/2021/20210811_df.csv
Converting /mnt/nvme2tb/ffp/datasets/test/2021/20210719_df.nc
Done Converting /mnt/nvme2tb/ffp/datasets/test/2021/20210806_df.csv
Converting /mnt/nvme2tb/ffp/datasets/test/2021/20210718_df.nc
Done Converting /mnt/nvme2tb/ffp/datasets/test/2021/20210808_df.csv
Converting /mnt/nvme2tb/ffp/datasets/test/2021/20210717_df.nc
Done Converting /mnt/nvme2tb/ffp/datasets/test/2021/20210802_df.csv
Converting /mnt/nvme2tb/ffp/datasets/test/2021/20210716_df.nc
Done Converting /mnt/nvme2tb/ffp/datasets/test/2021/20210804_df.csv
Converting /mnt/nvme2tb/ffp/datasets/test/2021/20210715_df.nc
Done Converting /mnt/nvme2tb/ffp/datasets/test/2021/20210803_df.csv
Converting /mnt/nvme2tb/ffp/datasets/test/2021/20210714_df.nc
Done Converting /mnt/nvme2tb/ffp/datasets/test/2021/20210809_df.csv
Converting /mnt/nvme2tb/ffp/datasets/test/2021/20210713_df.nc
Done Converting /mnt/nvme2tb/ffp/datasets/test/2021/20210731_df.csv
Converting /mnt/nvme2t