# Import and setup

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
import xarray as xr

In [3]:
path_data = "../data/"

# Download Dataset

In [4]:
datacube = xr.open_dataset(path_data + "datacube.nc")
datacube_test = datacube.sel(
    time=slice('2020-01-01', '2021-01-01'))

# Get information on the dataset

In [5]:
datacube_test.info()

xarray.Dataset {
dimensions:
	x = 298 ;
	y = 253 ;
	time = 367 ;

variables:
	float64 x(x) ;
		x:axis = X ;
		x:long_name = x coordinate of projection ;
		x:standard_name = projection_x_coordinate ;
		x:units = metre ;
	float64 y(y) ;
		y:axis = Y ;
		y:long_name = y coordinate of projection ;
		y:standard_name = projection_y_coordinate ;
		y:units = metre ;
	int32 crs() ;
		crs:crs_wkt = PROJCS["unknown",GEOGCS["unknown",DATUM["unknown",SPHEROID["unknown",6371007.181,0]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]]],PROJECTION["Sinusoidal"],PARAMETER["longitude_of_center",0],PARAMETER["false_easting",0],PARAMETER["false_northing",0],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",EAST],AXIS["Northing",NORTH]] ;
		crs:semi_major_axis = 6371007.181 ;
		crs:semi_minor_axis = 6371007.181 ;
		crs:inverse_flattening = 0.0 ;
		crs:reference_ellipsoid_name = unknown ;
		crs:longitude_of_prime_meridian = 0.0 ;
		crs:prime_mer

In [6]:
datacube_test.head()

In [7]:
# transphorm all variable as float
datacubefloat = datacube_test.astype("float")
datacubefloat.head()

# Handle Missing Data

## By collumn

In [8]:
datacube_test.info()

xarray.Dataset {
dimensions:
	x = 298 ;
	y = 253 ;
	time = 367 ;

variables:
	float64 x(x) ;
		x:axis = X ;
		x:long_name = x coordinate of projection ;
		x:standard_name = projection_x_coordinate ;
		x:units = metre ;
	float64 y(y) ;
		y:axis = Y ;
		y:long_name = y coordinate of projection ;
		y:standard_name = projection_y_coordinate ;
		y:units = metre ;
	int32 crs() ;
		crs:crs_wkt = PROJCS["unknown",GEOGCS["unknown",DATUM["unknown",SPHEROID["unknown",6371007.181,0]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]]],PROJECTION["Sinusoidal"],PARAMETER["longitude_of_center",0],PARAMETER["false_easting",0],PARAMETER["false_northing",0],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",EAST],AXIS["Northing",NORTH]] ;
		crs:semi_major_axis = 6371007.181 ;
		crs:semi_minor_axis = 6371007.181 ;
		crs:inverse_flattening = 0.0 ;
		crs:reference_ellipsoid_name = unknown ;
		crs:longitude_of_prime_meridian = 0.0 ;
		crs:prime_mer

In [10]:
sum_missing_data = datacube_test.where(x <1, True, False)
sum_missing_data

NameError: name 'x' is not defined

In [None]:
percent_missing_data = df.isna().mean()
percent_missing_data

In [None]:
plt.figure(figsize=(10,8))
plt.title("missing data",
          fontsize='large',
          loc='left',
          fontweight='bold',
          family='monospace')
plt.xlabel('collumns name', fontweight='bold', loc='left')
plt.ylabel('Data point', style='italic', loc='bottom')

cols = datacube.columns[:30]
colours = ['#ff3333', '#4dffa6']
sns.heatmap(datacube[cols].isna(), cmap=sns.color_palette(colours))

In [None]:
msno.matrix(datacube.iloc[:, :])

## By rows

In [None]:
missing_by_row = datacube.isna().sum(axis='columns')
missing_by_row.hist(bins=50)

## What to do with it ?

### drop collumn ?
This technique is straightforward. We drop the entire column or feature with missing data, which will certainly cause a loss of information. So we should only perform this when we are sure that the missing data is not informative. Otherwise, we should consider other solutions

In [None]:
percent_threshold_delete_collumn = 0.3
percent_missing_data[percent_missing_data > percent_threshold_delete]

In [None]:
datacube_less_missing_cols = datacube.loc[:, percent_missing_data <= percent_threshold_delete].copy()
print(f" tha shape of the orinigal datacube : {datacube.shape}")
print(f" tha shape of the datacube when we have removed the missing collumn with at least {percent_threshold_delete}% of missing data : {datacube_less_missing_cols.shape}")

### drop rows ?
We can drop the entire row with missing data like the first technique. Again, please be aware of the loss of information when removing rows.

If we only want to drop the rows with many missing values, we can do it similarly to the previous technique.

In [None]:
sum_threshold_delete_rows = 5
datacube_less_missing_rows = datacube[missing_by_row < sum_threshold_delete_rows].copy()
print(f" tha shape of the orinigal datacube : {datacube.shape}")
print(f" tha shape of the datacube when we have removed the missing collumn with at least {sum_threshold_delete_rows} of missing data : {datacube_less_missing_rows.shape}")