In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
import xarray as xr

In [2]:
path_data = "../data/"

In [3]:
datacube = xr.open_dataset(path_data + "datacube5.nc")
#datacube = datacube.sel(
#    time=slice('2020-01-01', '2020-01-02'))

In [4]:
def print_missing_data(datacube):
    print("MISSING DATA: ")
    print("EVI: ", "\n\t percentage :", datacube["_1_km_16_days_EVI"].isnull().sum().values /datacube["_1_km_16_days_EVI"].size*100, "\n\t count :", datacube["_1_km_16_days_EVI"].isnull().sum().values.ravel()[0], " / ", datacube["_1_km_16_days_EVI"].size)
    print("LAI : ", "\n\t percentage :", datacube["Fpar_500m"].isnull().sum().values /datacube["Fpar_500m"].size*100, "\n\t count :", datacube["Fpar_500m"].isnull().sum().values.ravel()[0], " / ", datacube["Fpar_500m"].size)
    print("Evap : ", "\n\t percentage :", datacube["ET_500m"].isnull().sum().values /datacube["ET_500m"].size*100, "\n\t count :", datacube["ET_500m"].isnull().sum().values.ravel()[0], " / ", datacube["ET_500m"].size)
    print("u10 : ", "\n\t percentage :", datacube["u10"].isnull().sum().values /datacube["u10"].size*100, "\n\t count :", datacube["u10"].isnull().sum().values.ravel()[0], " / ", datacube["u10"].size)
    print("v10 : ", "\n\t percentage :", datacube["v10"].isnull().sum().values /datacube["v10"].size*100, "\n\t count :", datacube["v10"].isnull().sum().values.ravel()[0], " / ", datacube["v10"].size)
    print("t2m : ", "\n\t percentage :", datacube["t2m"].isnull().sum().values /datacube["t2m"].size*100, "\n\t count :", datacube["t2m"].isnull().sum().values.ravel()[0], " / ", datacube["t2m"].size)
    print("tp : ", "\n\t percentage :", datacube["tp"].isnull().sum().values /datacube["tp"].size*100, "\n\t count :", datacube["tp"].isnull().sum().values.ravel()[0], " / ", datacube["tp"].size)
    print("First_Day : ", "\n\t percentage :", datacube["First_Day"].isnull().sum().values /datacube["First_Day"].size*100, "\n\t count :", datacube["First_Day"].isnull().sum().values.ravel()[0], " / ", datacube["First_Day"].size)
    print("Last_Day : ", "\n\t percentage :", datacube["Last_Day"].isnull().sum().values /datacube["Last_Day"].size*100, "\n\t count :", datacube["Last_Day"].isnull().sum().values.ravel()[0], " / ", datacube["Last_Day"].size)
    print("Burn_Date : ", "\n\t percentage :", datacube["Burn_Date"].isnull().sum().values /datacube["Burn_Date"].size*100, "\n\t count :", datacube["Burn_Date"].isnull().sum().values.ravel()[0], " / ", datacube["Burn_Date"].size)
    print("FireMask : ", "\n\t percentage :", datacube["FireMask"].isnull().sum().values /datacube["FireMask"].size*100, "\n\t count :", datacube["FireMask"].isnull().sum().values.ravel()[0], " / ", datacube["FireMask"].size)
    print("Density : ", "\n\t percentage :", datacube["density"].isnull().sum().values /datacube["density"].size*100, "\n\t count :", datacube["density"].isnull().sum().values.ravel()[0], " / ", datacube["density"].size)


In [5]:
print_missing_data(datacube)

MISSING DATA: 
EVI:  
	 percentage : 0.20811654526534862 
	 count : 40992  /  19696656
LAI :  
	 percentage : 0.7860115950646648 
	 count : 154818  /  19696656
Evap :  
	 percentage : 7.261780883008771 
	 count : 1430328  /  19696656
u10 :  
	 percentage : 20.349779170636882 
	 count : 4008226  /  19696656
v10 :  
	 percentage : 20.349779170636882 
	 count : 4008226  /  19696656
t2m :  
	 percentage : 20.349779170636882 
	 count : 4008226  /  19696656
tp :  
	 percentage : 20.349779170636882 
	 count : 4008226  /  19696656
First_Day :  
	 percentage : 1.2365043081424583 
	 count : 243550  /  19696656
Last_Day :  
	 percentage : 1.2365043081424583 
	 count : 243550  /  19696656
Burn_Date :  
	 percentage : 1.2365043081424583 
	 count : 243550  /  19696656
FireMask :  
	 percentage : 0.20811654526534862 
	 count : 40992  /  19696656
Density :  
	 percentage : 6.994202467667608 
	 count : 3764  /  53816


In [11]:
def fbfill(datacube
           , dims : list = ["x","y","time"]
           , flimit : int = 1
           , blimit : int = 1
           , max_occurence : int = -1
          ):
    """
    A recursive function that remove the nan values on a datacube.
    
    It first apply a forward fill with a limit of *flimit*
    Then a bakward fill with a limit of *blimit*
    on the *coords* of a *datacube*
    untill there is no more nan or until *max_occurence* is reached
    
    Parameters
    ----------
    datacube : xarray.Dataset
        a datacube from the xarray library
        
    dims : list, default ["x", "y", "time"]
        Specifies the dimension along which to propagate values when filling.
        
    flimit : integer, default 1
        The maximum number of consecutive NaN values to forward fill. 
        In other words, if there is a gap with more than this number of consecutive NaNs, 
        it will only be partially filled. 
        Must be greater than 0 or None for no limit. 
        Must be None or greater than or equal to axis length if filling along chunked axes (dimensions).
        
    blimit : integer, default 1
        The maximum number of consecutive NaN values to backward fill.
        In other words, if there is a gap with more than this number of consecutive NaNs,
        it will only be partially filled. 
        Must be greater than 0 or None for no limit. 
        Must be None or greater than or equal to axis length if filling along chunked axes (dimensions).
        
    max_occurence : integer, default -1
        The maximum number of time the filling will be processed.
        If the number is negative the filling will be done untill there is no more Nan value in the datacube.
    
    
    Examples
    --------
    """
    _datacube = datacube.copy()
    
    #forward fill
    for _dim in dims:
        _datacube = _datacube.ffill(_dim, limit=flimit)
    #backward fill  
    for _dim in dims:
        _datacube = _datacube.bfill(_dim, limit=blimit)
    
    # If the original datacube has not been changed from the copy after the filling
    # or if the maximum occurence wanted is reached
    if _datacube.equals(datacube) or max_occurence==0:
        return datacube
    
    # recursive on the _datacube
    print("1")
    return fbfill(datacube = _datacube
                            , dims = dims
                            , flimit = flimit
                            , blimit = blimit
                            , max_occurence=max_occurence-1
                           )

In [7]:
datacube_nonan = fbfill(datacube
                       , max_occurence = 5
                       )

1
1
1
1
1


In [8]:
print_missing_data(datacube_nonan)


MISSING DATA: 
EVI:  
	 percentage : 0.0 
	 count : 0  /  19696656
LAI :  
	 percentage : 0.0 
	 count : 0  /  19696656
Evap :  
	 percentage : 2.759402408205738 
	 count : 543510  /  19696656
u10 :  
	 percentage : 12.41266537832615 
	 count : 2444880  /  19696656
v10 :  
	 percentage : 12.41266537832615 
	 count : 2444880  /  19696656
t2m :  
	 percentage : 12.41266537832615 
	 count : 2444880  /  19696656
tp :  
	 percentage : 12.41266537832615 
	 count : 2444880  /  19696656
First_Day :  
	 percentage : 0.0 
	 count : 0  /  19696656
Last_Day :  
	 percentage : 0.0 
	 count : 0  /  19696656
Burn_Date :  
	 percentage : 0.0 
	 count : 0  /  19696656
FireMask :  
	 percentage : 0.0 
	 count : 0  /  19696656
Density :  
	 percentage : 2.1071800208116547 
	 count : 1134  /  53816


In [9]:
datacube_nonan.mean()

In [10]:
datacube.mean()