# EuroCrops Demo

This notebook assists with exploring the EuroCrops demo dataset.


### Imports

In [27]:
from dataclasses import dataclass
import datetime as dt
import numpy as np
import os
import h5py
import matplotlib.pyplot as plt
import pandas as pd
import json
from tqdm import tqdm

%matplotlib inline

In [28]:
# Set up an on-disk joblib cache in ./cachedir (resolved against the current
# working directory); `memory.cache` is used below to memoise the data loading.
from joblib import Memory
location = os.path.abspath('./cachedir')
print(f'cache location: {location}')
memory = Memory(location, verbose=0)


cache location: /home/przemek/Projects/pp/eurocrops/notebooks/cachedir


### Load Data

In [29]:
# Root directory of the EuroCrops demo data. Set the EUROCROPS_ROOT_DATA_PATH
# environment variable to point the notebook at another location without editing
# it; the default is the path the notebook was originally written against.
ROOT_DATA_PATH = os.environ.get('EUROCROPS_ROOT_DATA_PATH', '/media/data/local/eurocrops/m1615987/')
# HDF5 file with the training data explored in this notebook.
H5_FILE_PATH = os.path.join(ROOT_DATA_PATH, 'HDF5s/train/AT_T33UWP_train.h5')

In [30]:
# Number of values per parcel and time step; used as the size of the last axis
# of the arrays built in get_data_for_crop_type.
NUMBER_OF_CHANNELS = 13

In [31]:
def _load_data_from_h5_file(h5_file_path, region_slice=slice(1, 3)):
    """Load the per-region data frames from a pandas HDF5 store.

    Args:
        h5_file_path: path of the ``.h5`` file, opened read-only.
        region_slice: slice applied to the list of store keys to pick the
            regions to load. The default ``slice(1, 3)`` keeps only the second
            and third key, which is what the notebook originally hard-coded.

    Returns:
        Tuple ``(df_datas, region_names)``: the list of loaded data frames and
        the store keys they were read from, in the same order.
    """
    # The context manager closes the store even if reading a region fails;
    # previously the file handle was never closed.
    with pd.HDFStore(h5_file_path, mode='r') as hdf:
        # Each key of the store names one region (e.g. AT112).
        region_names = hdf.keys()[region_slice]
        df_datas = [hdf.get(f'/{region_name}') for region_name in tqdm(region_names)]

    # NOTE (from the original author's checks): regions do not share the same
    # columns -- 120 columns in the union, 80 for a single region, 44 in common.
    return df_datas, region_names
        

def _find_closest_non_zero_column(time_index, common_days, dates_list, row):
    """Return the position of the column nearest in time that is not all zeros.

    Used when the column closest to ``common_days[time_index]`` contains only
    zeros for this row and a replacement has to be found.

    Args:
        time_index: position in ``common_days`` of the day being resampled.
        common_days: days of the common resampling grid.
        dates_list: day of each column of ``row``, in column order.
        row: one row of a region data frame, accessed positionally.

    Returns:
        Position (as returned by ``np.argmin``) of the chosen column. When every
        column is all zeros the distances tie and position 0 is returned.
    """
    target_day = common_days[time_index]
    # Columns holding only zeros get the sentinel distance 9999 so that any
    # column with data is preferred over them.
    distances = [
        abs(target_day - day) if np.any(row.iloc[position]) else 9999
        for position, day in enumerate(dates_list)
    ]
    return np.argmin(distances)

    
def _resample_and_concatenate_regions_data(df_datas, resampled_days_interval):
    """Resample every region onto a common day-of-year grid and stack the regions.

    The regions have different acquisition dates, so each one is mapped onto
    the same grid of days (``resampled_days_interval``, twice that, ... below
    365). For every grid day the column with the nearest date is taken; if that
    cell holds only zeros for a row, the nearest column that is not all zeros
    for that row is used instead (see ``_find_closest_non_zero_column``).
    Nearest-date lookup is used instead of interpolation because the data is
    noisy due to clouds (original author's note).

    Args:
        df_datas: iterable of data frames, one per region, whose column labels
            are date strings in ``%Y%m%d`` format.
        resampled_days_interval: spacing of the common grid, in days.

    Returns:
        Tuple ``(df_data_all, common_days)``: one data frame holding the rows of
        all regions with ``common_days`` as columns, and the list of grid days.
    """
    common_days = list(range(resampled_days_interval, 365, resampled_days_interval))
    print(f'len(common_days) = {len(common_days)}')

    date_format = '%Y%m%d'
    region_frames = []

    for df_data_single in tqdm(df_datas):
        timesteps = list(df_data_single.columns)
        # The year is read from the 11th column label as before; regions with
        # fewer than 11 columns fall back to their last label instead of
        # raising IndexError.
        reference_timestep = timesteps[min(10, len(timesteps) - 1)]
        new_year_day = dt.datetime(year=int(reference_timestep[:4]), month=1, day=1)
        # Day of year (1-based) of every column.
        dates_list = [
            (dt.datetime.strptime(date, date_format) - new_year_day).days + 1
            for date in timesteps
        ]

        # Position of the nearest column for every grid day; this does not
        # depend on the row, so it is computed once per region.
        closest_columns = [
            np.argmin([abs(common_day - day) for day in dates_list])
            for common_day in common_days
        ]

        row_labels = []
        resampled_rows = []
        for row_label, row in df_data_single.iterrows():
            resampled_row = []
            for time_index, closest_column in enumerate(closest_columns):
                cell = row.iloc[closest_column]
                if not np.any(cell):
                    # Nearest column is empty for this row: use the nearest
                    # column that actually has data.
                    cell = row.iloc[_find_closest_non_zero_column(
                        time_index=time_index,
                        common_days=common_days,
                        dates_list=dates_list,
                        row=row)]
                resampled_row.append(cell)
            row_labels.append(row_label)
            resampled_rows.append(resampled_row)

        # Build one frame per region rather than one per row; regions without
        # rows contribute nothing.
        if resampled_rows:
            region_frames.append(
                pd.DataFrame(resampled_rows, columns=common_days, index=row_labels))

    # Concatenate once at the end instead of growing the result inside the loop.
    if region_frames:
        df_data_all = pd.concat(region_frames)
    else:
        df_data_all = pd.DataFrame(columns=common_days)

    return df_data_all, common_days


def _load_all_labels(region_names):
    """Read the label CSV of every region and concatenate them.

    Args:
        region_names: region keys; leading/trailing ``/`` characters are
            stripped before the name is inserted into the file name.

    Returns:
        One data frame with the rows of all label files, indexed by the first
        CSV column.

    Raises:
        ValueError: from ``pd.concat`` when ``region_names`` is empty.
    """
    label_frames = []
    for region_name in region_names:
        region_name = region_name.strip('/')
        labels_csv_file_path = os.path.join(ROOT_DATA_PATH, f'csv_labels/train/demo_eurocrops_{region_name}.csv')
        label_frames.append(pd.read_csv(labels_csv_file_path, index_col=0))

    return pd.concat(label_frames)
    

@memory.cache
def load_all_data_from_file_resampled(
        h5_file_path: str, 
        resampled_days_interval: int,
        ):
    """Load, resample and label the data of one HDF5 file (cached on disk).

    Args:
        h5_file_path: path of the HDF5 file to read.
        resampled_days_interval: spacing in days of the common time grid.

    Returns:
        Tuple ``(df_data_all, df_labels_all, common_days, region_names)``.
    """
    # NOTE(review): the raw per-region frames are exposed through the module
    # level name `df_datas` as a side effect. When joblib serves the result
    # from its cache this body does not run, so `df_datas` is NOT assigned in
    # that case -- do not rely on it after a kernel restart.
    global df_datas
    df_datas, region_names = _load_data_from_h5_file(h5_file_path=h5_file_path)
    df_data_all, common_days = _resample_and_concatenate_regions_data(df_datas=df_datas, resampled_days_interval=resampled_days_interval)
    
    df_labels_all = _load_all_labels(region_names=region_names)

    return df_data_all, df_labels_all, common_days, region_names

In [None]:
# Load (or fetch from the joblib cache) the data resampled to a 7-day grid.
df_data_all, df_labels_all, common_days, region_names = load_all_data_from_file_resampled(
    h5_file_path=H5_FILE_PATH, 
    resampled_days_interval=7,
    )

  0%|                                                                                   | 0/2 [00:00<?, ?it/s]

In [None]:
# Preview the resampled data: one row per parcel, one column per common day.
df_data_all.head(3)

In [None]:
# `df_datas` is only assigned as a side effect of load_all_data_from_file_resampled,
# and that side effect is skipped when joblib returns the cached result. Load the
# raw per-region frames explicitly in that case so this cell works on a fresh kernel.
if 'df_datas' not in globals():
    df_datas, _ = _load_data_from_h5_file(h5_file_path=H5_FILE_PATH)
# Preview the first region before resampling (columns are the original dates).
df_datas[0].head(3)

In [None]:
# (number of parcels, number of common days)
df_data_all.shape

In [None]:
# Preview the label table read from the per-region CSV files.
df_labels_all.head(2)

In [None]:
# (number of label rows, number of label columns)
df_labels_all.shape

### Check out the data for one parcel

In [None]:
# Use the first parcel of the resampled data as the example; its index label
# is the parcel id.
example_row = df_data_all.iloc[0]
parcel_ID = example_row.name

# Look the parcel up in the label table and read its crop group code and name.
parcel_labels = df_labels_all.loc[parcel_ID]
label_code = parcel_labels['crpgrpc']
label_name = parcel_labels['crpgrpn']

print(f'{label_name} grows on parcel {parcel_ID}')

In [None]:
# Each cell of the row is stacked along a new first axis, giving one 2-D array
# (time step, channel) for the example parcel.
example_row_np = example_row.to_numpy()
example_row_np = np.stack(example_row_np, axis=0)

# Plot every channel of the example parcel against the day of year.
plt.rcParams['figure.figsize'] = [15, 8]
plt.plot(common_days, example_row_np)
# plt.legend(bands)
# NOTE(review): the style sheet is applied after plt.plot, so presumably it
# mostly affects figures created afterwards -- confirm this is intended.
plt.style.use('_classic_test_patch')
plt.xlabel('day of year')
plt.ylabel('channel value')
plt.title(f'Data for parcel id {parcel_ID}')
plt.grid()

#### Load GeoJSON

In [None]:
def load_geometry_dict_by_parcelid_all(region_names):
    """Collect the geometry of every parcel from the regional GeoJSON files.

    Args:
        region_names: region keys; leading/trailing ``/`` characters are
            stripped before the name is inserted into the file name.

    Returns:
        Dict mapping each feature's ``properties['recno']`` to its ``geometry``.
        A key seen again in a later file overwrites the earlier entry.
    """
    geometries = {}
    for raw_name in tqdm(region_names):
        region = raw_name.strip('/')
        geojson_path = os.path.join(ROOT_DATA_PATH, f'GeoJSONs_regional_split/train/AT/demo_eurocrops_{region}.geojson')

        with open(geojson_path, 'r') as geojson_file:
            features = json.load(geojson_file)['features']

        for feature in features:
            geometries[feature['properties']['recno']] = feature['geometry']
    return geometries


# Geometry lookup for all loaded regions, keyed by parcel id.
geometry_dict_by_parcelid_all = load_geometry_dict_by_parcelid_all(region_names=region_names)


In [None]:
# Geometry of the example parcel selected above.
geometry_dict_by_parcelid_all[parcel_ID]

### Check crop types in the current dataset

In [None]:
def get_crop_types_counts_and_ids(df_data_all, df_labels_all):
    """Count labelled parcels per crop type and list their ids.

    Only label rows whose index value also occurs in ``df_data_all.index`` are
    considered.

    Args:
        df_data_all: data frame whose index holds the ids of parcels with data.
        df_labels_all: label data frame indexed by parcel id, with the crop
            name in column ``crpgrpn``.

    Returns:
        Tuple ``(crop_types_counts, crop_types_ids)``: a dict mapping crop name
        to the number of matching parcels, and a dict mapping crop name to the
        list of matching parcel ids (in label-table order).
    """
    crop_types_counts = {}
    crop_types_ids = {}

    parcel_ids_with_data = set(df_data_all.index)

    # Iterate the index and the crop-name column together; the previous
    # per-row `.iloc[i]['crpgrpn']` lookup built a Series for every label row.
    for parcel_id, crop_name in zip(df_labels_all.index, df_labels_all['crpgrpn']):
        if parcel_id not in parcel_ids_with_data:
            continue

        crop_types_counts[crop_name] = crop_types_counts.get(crop_name, 0) + 1
        crop_types_ids.setdefault(crop_name, []).append(parcel_id)

    return crop_types_counts, crop_types_ids


        
crop_types_counts, crop_types_ids = get_crop_types_counts_and_ids(
    df_data_all=df_data_all,
    df_labels_all=df_labels_all,
)

print(f'Total crop fields: {sum(crop_types_counts.values())}')
# Order the crop types from most to least frequent (ties keep their order).
crop_types_counts = dict(sorted(crop_types_counts.items(), key=lambda item: item[1], reverse=True))
crop_types_counts

In [None]:
# Compare the number of label rows, parcels with data, and parcel geometries.
len(df_labels_all), len(df_data_all), len(geometry_dict_by_parcelid_all)

In [None]:
# Crop types (values of the `crpgrpn` label column) to compare in the plots
# below; commented-out entries are alternatives that are currently disabled.
selected_crop_names = [
    'nuts',  # 'millet'
    'sugar_beet',
    'hemp',
    # 'sunflower_and_yellow_bloomer',
    #'soya',
    #'millet',
    #'grain_maize',
    #'pasture_meadow',
]



def get_data_for_crop_type(crop_name):
    """Stack the resampled data of every parcel of one crop type.

    Reads the notebook globals ``crop_types_ids``, ``common_days``,
    ``NUMBER_OF_CHANNELS`` and ``df_data_all``.

    Args:
        crop_name: key of ``crop_types_ids`` selecting the parcels.

    Returns:
        Float array of shape ``(parcels, len(common_days), NUMBER_OF_CHANNELS)``.
    """
    parcel_ids = crop_types_ids[crop_name]
    stacked = np.zeros((len(parcel_ids), len(common_days), NUMBER_OF_CHANNELS), dtype=float)
    for position, parcel_id in enumerate(parcel_ids):
        # Each cell of the parcel row becomes one time step of the 2-D slice.
        stacked[position] = np.stack(df_data_all.loc[parcel_id].to_numpy())
    return stacked
    
    
# One stacked data array per selected crop, in the order of selected_crop_names.
selected_crops_data = [get_data_for_crop_type(crop_name) for crop_name in selected_crop_names]

In [None]:
# Show the array shape of the first selected crop together with the axis meaning.
print(f'{selected_crops_data[0].shape} = [fields (for the crop type), time (common_days), channels (bands B0-B12)]')

In [None]:
# Index into selected_crop_names / selected_crops_data of the crop to plot.
crop_index = 0

# Mean and standard deviation over the fields axis, per time step and channel.
data_crop_x = selected_crops_data[crop_index]
data_crop_x_mean = np.mean(data_crop_x, axis=0)
data_crop_x_std = np.std(data_crop_x, axis=0)  # computed but not plotted in this cell


# Plot the per-channel mean over all fields of the crop against the day of year.
plt.rcParams['figure.figsize'] = [15, 8]
plt.plot(common_days, data_crop_x_mean)
# plt.legend(bands)
plt.style.use('_classic_test_patch')
plt.xlabel('day of year')
plt.ylabel('channel value')
plt.title(f'Data for crop "{selected_crop_names[crop_index]}"')
plt.grid()

In [None]:
# Inspect raw values: channel index 3 over all time steps for the first field
# of the first selected crop.
d = selected_crops_data[0]
d[0, :, 3]

In [None]:
# All time steps and channels of the first field (uses `d` from the previous cell).
d[0]

In [None]:
@dataclass
class CropNdviData:
    """NDVI statistics over all fields of one crop, per time step."""
    mean: np.ndarray  # mean NDVI for every time step
    std: np.ndarray  # standard deviation of the NDVI for every time step


def get_ndvi_data(data_crop) -> CropNdviData:
    """Compute the mean and standard deviation of the NDVI over all fields.

    NDVI is computed as ``(B8 - B4) / (B8 + B4)``, with bands counted from B1
    to B13, i.e. channel indices 7 and 3.

    Args:
        data_crop: array of shape
            ``[fields (for the crop), time (common_days), channels]``.

    Returns:
        ``CropNdviData`` with the per-time-step mean and std of the NDVI.
        Observations where ``B8 + B4 == 0`` have no defined NDVI and are left
        out of the statistics instead of turning the whole time step into NaN;
        a time step with no valid observation at all stays NaN.
    """
    b4 = data_crop[:, :, 4 - 1]
    b8 = data_crop[:, :, 8 - 1]

    band_sum = b8 + b4
    # Pre-fill with NaN and divide only where the denominator is non-zero, so
    # that 0/0 neither emits a warning nor yields a misleading value.
    data_crop_ndvi = np.full(band_sum.shape, np.nan, dtype=float)
    np.divide(b8 - b4, band_sum, out=data_crop_ndvi, where=band_sum != 0)

    # NaN-aware reductions skip the undefined observations.
    data_crop_mean_ndvi = np.nanmean(data_crop_ndvi, axis=0)
    data_crop_std_ndvi = np.nanstd(data_crop_ndvi, axis=0)

    return CropNdviData(mean=data_crop_mean_ndvi, std=data_crop_std_ndvi)
    

In [None]:
# NDVI statistics per selected crop, in the order of selected_crop_names.
selected_crops_ndvi_data = [get_ndvi_data(crop_data) for crop_data in selected_crops_data]



In [None]:
# Index into selected_crop_names / selected_crops_ndvi_data of the crop to plot.
crop_index = 0

data_crop_x_mean_ndvi = selected_crops_ndvi_data[crop_index].mean
data_crop_x_std_ndvi = selected_crops_ndvi_data[crop_index].std


# Mean NDVI as a solid line, with mean -/+ one standard deviation as thin
# dotted lines.
plt.rcParams['figure.figsize'] = [15, 8]
plt.plot(common_days, data_crop_x_mean_ndvi)
plt.plot(common_days, data_crop_x_mean_ndvi - data_crop_x_std_ndvi, ':', color='b', linewidth=0.7)
plt.plot(common_days, data_crop_x_mean_ndvi + data_crop_x_std_ndvi, ':', color='b', linewidth=0.7)

# plt.legend(bands)
plt.style.use('_classic_test_patch')
plt.xlabel('day of year')
# The plotted quantity is the NDVI, not a raw channel value.
plt.ylabel('NDVI value')
plt.title(f'NDVI data for crop "{selected_crop_names[crop_index]}"')
plt.grid()
# The two labels apply to the first two lines; the second std line shares the
# style of the first one.
plt.legend(['mean NDVI', '+- std NDVI']);

In [None]:
plt.rcParams['figure.figsize'] = [15, 8]

# One mean-NDVI curve per selected crop, in the order of selected_crop_names.
for crop_ndvi_data in selected_crops_ndvi_data:
    plt.plot(common_days, crop_ndvi_data.mean)

# plt.legend(bands)
plt.style.use('_classic_test_patch')
plt.xlabel('day of year')
plt.ylabel('NDVI value')
# The figure shows every selected crop, so the title must not name the single
# crop picked by `crop_index` in an earlier cell.
plt.title('mean NDVI data for selected crops')
plt.grid()
plt.legend(selected_crop_names);