In [39]:
import os
import calendar
import time
import datatable as dt
import fileutils
import traceback

In [40]:
def load_datatable(fpath, fill=False):
    if not os.path.exists(fpath):
        print('Path not found : %s'%fpath)
        raise
    DT = dt.fread(fpath, fill=fill)
    return DT

In [41]:
def walkmonthfiles(sfolder):
    for fmonth in fileutils.find_files(sfolder, '*_norm.csv', listtype="walk"):
        #print(fmonth)
        monthdir = os.path.dirname(fmonth)
        year = os.path.basename(os.path.dirname(monthdir))
        if year[0:2] != '20':
            continue
        yield fmonth
        #extractdays(fmonth, tfolder)

In [42]:
def read_multiple_csv(sfolder, pattern):
    #pattern example '*_norm.csv'
    exfeat = ["","id", "firedate"]
    DTall = None
    for f in fileutils.find_files(sfolder, pattern, listtype="walk"):
        print(f)
        try:
            DTpart=load_datatable(f)
        except:
            print("Fail to load %s :\n"%f+traceback.format_exc())
            raise
        if DTall is None: 
            DTall = DTpart
        else:
            DTall=dt.rbind(DTall, DTpart)
    return DTall

In [43]:
def filterDT(DT, filt):
    DT=DT[:, [name for name in filt]]
    return DT
    

In [44]:
def aggrfuncs(DTaggr, func, y, pattern, agg):
    DTa=func()
    DTa['Year'] = '%d'%(y)
    DTa['Pattern'] = pattern
    DTa['Aggr'] = agg
    if DTaggr==None:
        DTaggr=DTa
    else:
        DTaggr = dt.rbind(DTaggr,DTa)
    return DTaggr

In [45]:
def statmonths(years, sfolder, pattern, filt):
    #sfolder = '/data2/ffp/datasets/daily/%s/08'
    DTmeans=None; DTmin=None; DTmax=None; DTsd=None;
    #filt=['ndvi_new','evi', 'lst_day', 'lst_night']
    #filt=['max_temp','min_temp','mean_temp','rain_7days','res_max','dom_vel','max_dew_temp','min_dew_temp','mean_dew_temp']
    #exclude onehot = ['corine', 'dir_max', 'dom_dir', 'wkd', 'month']+["C0","id", "firedate"]
    for y in years:
        print(y)
        #sfolder = '/data2/ffp/datasets/daily/%d/08'%y
        yfolder = sfolder%y
        print(yfolder)
        DT = read_multiple_csv(yfolder, pattern)
        DT = DT[:, [name for name in filt]]
        DTmeans = aggrfuncs(DTmeans, DT.mean, y, pattern, 'mean')
        DTmin = aggrfuncs(DTmin, DT.min, y, pattern, 'min')
        DTmax = aggrfuncs(DTmax, DT.max, y, pattern, 'max')
        DTsd = aggrfuncs(DTsd, DT.sd, y, pattern, 'sd')
        DT = None
    return DTmeans, DTmin, DTmax, DTsd 

In [46]:
def load_test(years, sfolder, pattern):
    for y in years:
        print(y)
        #sfolder = '/data2/ffp/datasets/daily/%d/08'%y
        yfolder = sfolder%y
        print("%s, %s: Loading"%(yfolder,pattern))
        try:
            DT = read_multiple_csv(yfolder, pattern)
            if DT is not None:
                print("%s, %s: Loaded successfully"%(yfolder,pattern))
                print(DT.names)
                wdays = len([d for d in DT.names if 'wkd_' in d ])
                months = len([d for d in DT.names if 'month_' in d ])
                corines = len([d for d in DT.names if 'corine_' in d ])
                dir_maxs = len([d for d in DT.names if 'dir_max_' in d ])
                dom_dirs = len([d for d in DT.names if 'dom_dir_' in d ])
                total_feat = len(DT.names)
                print('Total features: %d, week days: %d, months: %d, corine: %d, dom_dir: %d, dir_max: %d'%\
                     (total_feat, wdays, months, corines, dom_dirs, dir_maxs))
            else:
                print("%s, %s: No dataset"%(yfolder,pattern))
        except:
            print(traceback.format_exc())

In [139]:
read_multiple_csv('/data2/ffp/datasets/monthly/2020/', 'june_*_norm.csv')

/data2/ffp/datasets/monthly/2020/june_2020/june_2020_norm.csv
Fail to load /data2/ffp/datasets/monthly/2020/june_2020/june_2020_norm.csv :
Traceback (most recent call last):
  File "/tmp/ipykernel_21804/1220714569.py", line 8, in read_multiple_csv
    DTpart=load_datatable(f)
  File "/tmp/ipykernel_21804/2284077608.py", line 5, in load_datatable
    DT = dt.fread(fpath, fill=fill)
datatable.exceptions.IOError: Too few fields on line 2996394: expected 89 but found only 88 (with sep=','). Set fill=True to ignore this error.  <<571272.0,20200620,0.5327007906783188,0.5907605818898644,0.5644809018037349,0.18547352348715113,0.05993656615338073,0.08501656076862274,0.12788477266746798,0.13705918297059183,0.5082965918950052,0.8946559260716122,1.4285476071369019,0.16680658159838815,0.5184615384615384,0.45719332625949305,0.6244364582068118,0.6114168310863944,0.6395486230502329,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0

In [186]:
#for m in ['june','july','august','september']:
for m in ['july','august','september']:
    load_test(range(2019,2021),'/data2/ffp/datasets/monthly/%d/','%s_*_norm.csv'%m)

2019
/data2/ffp/datasets/monthly/2019/, july_*_norm.csv: Loading
/data2/ffp/datasets/monthly/2019/07/july_2019_norm.csv
/data2/ffp/datasets/monthly/2019/, july_*_norm.csv: Loaded successfully
('C0', 'id', 'firedate', 'max_temp', 'min_temp', 'mean_temp', 'res_max', 'dom_vel', 'rain_7days', 'dem', 'slope', 'curvature', 'aspect', 'ndvi_new', 'evi', 'lst_day', 'lst_night', 'max_dew_temp', 'mean_dew_temp', 'min_dew_temp', 'fire', 'dir_max_1', 'dir_max_2', 'dir_max_3', 'dir_max_4', 'dir_max_5', 'dir_max_6', 'dir_max_7', 'dir_max_8', 'dom_dir_1', 'dom_dir_2', 'dom_dir_3', 'dom_dir_4', 'dom_dir_5', 'dom_dir_6', 'dom_dir_7', 'dom_dir_8', 'corine_111', 'corine_112', 'corine_121', 'corine_122', 'corine_123', 'corine_124', 'corine_131', 'corine_132', 'corine_133', 'corine_141', 'corine_142', 'corine_211', 'corine_212', 'corine_213', 'corine_221', 'corine_222', 'corine_223', 'corine_231', 'corine_241', 'corine_242', 'corine_243', 'corine_244', 'corine_311', 'corine_312', 'corine_313', 'corine_321',

In [48]:
filt=['ndvi_new','evi', 'lst_day', 'lst_night']
#exclude onehot = ['corine', 'dir_max', 'dom_dir', 'wkd', 'month']+["C0","id", "firedate"]

In [49]:
DTmeans, DTmin, DTmax, DTsd = statmonths(range(2016,2021),'/data2/ffp/datasets/monthly/%d/', 'august_*_norm.csv', filt)

2016
/data2/ffp/datasets/monthly/2016/
/data2/ffp/datasets/monthly/2016/08/august_2016_norm.csv
2017
/data2/ffp/datasets/monthly/2017/
/data2/ffp/datasets/monthly/2017/08/august_2017_norm.csv
2018
/data2/ffp/datasets/monthly/2018/
/data2/ffp/datasets/monthly/2018/08/august_2018_norm.csv
2019
/data2/ffp/datasets/monthly/2019/
/data2/ffp/datasets/monthly/2019/08/august_2019_norm.csv
2020
/data2/ffp/datasets/monthly/2020/
/data2/ffp/datasets/monthly/2020/august_2020/august_2020_norm.csv


In [47]:
filt=['max_temp','min_temp','mean_temp','rain_7days','res_max','dom_vel','max_dew_temp','min_dew_temp','mean_dew_temp']
DTmeans, DTmin, DTmax, DTsd = statmonths(range(2016,2021),'/data2/ffp/datasets/monthly/%d/', 'july_*_norm.csv', filt)

2016
/data2/ffp/datasets/monthly/2016/
/data2/ffp/datasets/monthly/2016/07/july_2016_norm.csv
2017
/data2/ffp/datasets/monthly/2017/
/data2/ffp/datasets/monthly/2017/07/july_2017_norm.csv
Fail to load /data2/ffp/datasets/monthly/2017/07/july_2017_norm.csv :
Traceback (most recent call last):
  File "/tmp/ipykernel_26599/516241387.py", line 8, in read_multiple_csv
    DTpart=load_datatable(f)
  File "/tmp/ipykernel_26599/2284077608.py", line 5, in load_datatable
    DT = dt.fread(fpath, fill=fill)
KeyboardInterrupt



KeyboardInterrupt: 

In [62]:
DTmeans.to_csv('august_means_sat.csv')
DTmax.to_csv('august_max_sat.csv')
DTmin.to_csv('august_min_sat.csv')
DTsd.to_csv('august_sd_sat.csv')

In [13]:
DTmeans

Unnamed: 0_level_0,max_temp,min_temp,mean_temp,rain_7days,res_max,dom_vel,max_dew_temp,min_dew_temp,mean_dew_temp,Year,Pattern,Aggr
Unnamed: 0_level_1,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪
0,0.628887,0.735546,0.675471,0.0521674,0.319967,0.149258,0.736163,0.745445,0.739142,2018,july_*_norm.csv,mean
1,0.631838,0.732991,0.677387,0.0339274,0.382876,0.160683,0.709956,0.703433,0.70098,2019,july_*_norm.csv,mean
2,0.642724,0.742816,0.687063,0.0177944,0.429905,0.165823,0.709551,0.706695,0.702376,2020,july_*_norm.csv,mean


In [3]:
DT2 = load_datatable('/data2/ffp/datasets/monthly/2020/august_2020/august_2020_norm.csv')

In [35]:
DT2[dt.f["ndvi_new"] > 1, dt.sd()]

TypeError: sd() missing 1 required positional argument: 'expr'

In [34]:
DT2[dt.f["evi"] <0, dt.count(dt.f.evi) ]

Unnamed: 0_level_0,evi
Unnamed: 0_level_1,▪▪▪▪▪▪▪▪
0,0


In [None]:
DT2[dt.f["evi"] <= 1, dt.sd(dt.f["evi"])]

In [50]:
DTsd

Unnamed: 0_level_0,ndvi_new,evi,lst_day,lst_night,Year,Pattern,Aggr
Unnamed: 0_level_1,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪
0,0.147485,0.0981408,0.0497807,0.0461257,2016,august_*_norm.csv,sd
1,0.141489,0.0934132,0.0493042,0.0437773,2017,august_*_norm.csv,sd
2,0.144489,0.0975897,0.0509302,0.044631,2018,august_*_norm.csv,sd
3,0.142305,0.0988653,0.0499927,0.0431327,2019,august_*_norm.csv,sd
4,0.565736,0.0022491,0.050642,0.0434798,2020,august_*_norm.csv,sd


In [52]:
DTmeans

Unnamed: 0_level_0,ndvi_new,evi,lst_day,lst_night,Year,Pattern,Aggr
Unnamed: 0_level_1,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪
0,0.61504,0.438016,0.603699,0.565433,2016,august_*_norm.csv,mean
1,0.632167,0.445391,0.615199,0.568938,2017,august_*_norm.csv,mean
2,0.659176,0.453464,0.575333,0.557584,2018,august_*_norm.csv,mean
3,0.647192,0.461368,0.608678,0.561063,2019,august_*_norm.csv,mean
4,1.06259,0.168091,0.59755,0.566657,2020,august_*_norm.csv,mean


In [53]:
DT2 = load_datatable('/data2/ffp/datasets/monthly/2020_old/08/august_2020_norm.csv')

In [57]:
DT2[:, {'ndvi mean': dt.mean(dt.f.ndvi_new), 'ndvi sd': dt.sd(dt.f.ndvi_new)}]

Unnamed: 0_level_0,ndvi mean,ndvi sd
Unnamed: 0_level_1,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪
0,0.998264,0.592549
