# Generate features from raw data

In [1]:
# Supress Warnings
import warnings
warnings.filterwarnings('ignore')

# Visualization
import ipyleaflet
import matplotlib.pyplot as plt
from IPython.display import Image
import seaborn as sns

# Data Science
import numpy as np
import numpy.ma as ma
import pandas as pd
import xarray as xr

# Feature Engineering
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Machine Learning
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score,classification_report,confusion_matrix

# Planetary Computer Tools
import pystac
import pystac_client
import odc
from pystac_client import Client
from pystac.extensions.eo import EOExtension as eo
from odc.stac import stac_load
import planetary_computer as pc
#pc.settings.set_subscription_key('******')

# Others
import requests
import rich.table
from itertools import cycle
from tqdm import tqdm
tqdm.pandas()
import os

In [2]:
os.chdir('/home/jovyan/EY-Crop-identification')
print(os.getcwd())

/home/jovyan/EY-Crop-identification


## Functions compute various indices using Sentinel 1 and 2 data

In [3]:
def replace_outliers_with_mean(arr):
    """
    Replace outliers in an array with the mean of non-outlier elements.

    Args:
    arr: numpy array
        Array containing the data with potential outliers.

    Returns:
    numpy array
        Array with outliers replaced by the mean of non-outlier elements.
    """
    # Convert arr to numpy array to enable mathematical operations
    arr = np.array(arr)
    # Calculate the quartiles and interquartile range
    q1 = np.percentile(arr, 25)
    q3 = np.percentile(arr, 75)
    iqr = q3 - q1
    # Calculate the lower and upper bounds for outliers
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    # Find the outliers
    outliers = (arr < lower_bound) | (arr > upper_bound)
    # Calculate the mean of the non-outlier elements
    mean_non_outliers = np.mean(arr[~outliers])
    # Replace outliers with the mean of non-outliers
    arr[outliers] = mean_non_outliers

    return arr


def read_vhvv(dir, train=True):
    """
    Read the vhvv data from .nc files in the specified directory.
    """
    if train:
        num_file = 600
    else:
        num_file = 250
    vhs, vvs = [], []
    for i in range(num_file):
        data = xr.open_dataset(os.path.join(dir, f"vhvv_row{i}.nc"))
        # Replace outliers with the mean of non-outliers
        vh = replace_outliers_with_mean(data['vv'].values)
        vv = replace_outliers_with_mean(data['vh'].values)
        vhs.append(vh)
        vvs.append(vv)
    return vhs, vvs


def calc_vhvv_agg(vhs, vvs, agg_func):
    '''
    Aggregate VH and VV by time slices using the aggregate function agg_func
    '''
    vhs_new = []
    for vh in vhs:
        vhs_new.append(agg_func(vh))
    vvs_new = []
    for vv in vvs:
        vvs_new.append(agg_func(vv))
    return np.array(vhs_new), np.array(vvs_new)

def calc_rvi_agg(vhs, vvs, agg_func):
    '''
    Calcualte RVI (Radar Vegetation Index) = sqrt (1- VV / (VV+VH)) * 4 * (VH / (VV + VH))
    then aggregate by time slices using the aggregate function agg_func
    '''
    rvis = []
    for vh, vv in zip(vhs, vvs):
        rvis_1year = np.sqrt(1-vv/(vv+vh))*4*(vh/(vv+vh))
        rvis.append(agg_func(rvis_1year))
    return np.array(rvis)


def read_sen2(dir, train=True):
    if train:
        num_file = 600
    else:
        num_file = 250
    sen2 = []
    for i in range(num_file):
        data = xr.open_dataset(os.path.join(dir, f"vhvv_sen2_row{i}.nc"))
        data['red'].values = replace_outliers_with_mean(data['red'].values)
        data['green'].values = replace_outliers_with_mean(data['green'].values)
        data['blue'].values = replace_outliers_with_mean(data['blue'].values)
        data['nir'].values = replace_outliers_with_mean(data['nir'].values)
        data['SCL'].values = replace_outliers_with_mean(data['SCL'].values)
        sen2.append(data)
    return sen2

def calc_ndvi_agg(sen2, agg_func):
    '''
    Calcualte Normalized Difference Vegetation Index (NDVI). 
    then aggregate by time slices using the aggregate function agg_func
    '''
    ndvis = []
    ndvis_clean = []
    for data in sen2:
        # Create a mask for no data, saturated data, clouds, cloud shadows, and water
        cloud_mask = \
            (data.SCL != 0) & \
            (data.SCL != 1) & \
            (data.SCL != 3) & \
            (data.SCL != 6) & \
            (data.SCL != 8) & \
            (data.SCL != 9) & \
            (data.SCL != 10)

        # Apply cloud mask ... NO Clouds, NO Cloud Shadows and NO Water pixels
        # All masked pixels are converted to "No Data" and stored as 16-bit integers
        cleaned_data = data.where(cloud_mask).astype("uint16")
        # Calculate the mean of the data across the sample region and then NDVI
        # Perform this calculation for the unfiltered and cloud-filtered (clean) datasets
        mean_unfiltered = data.mean(dim=['longitude','latitude']).compute()
        ndvi_mean = (mean_unfiltered.nir-mean_unfiltered.red)/(mean_unfiltered.nir+mean_unfiltered.red)
        mean_clean = cleaned_data.mean(dim=['longitude','latitude']).compute()
        ndvi_mean_clean = (mean_clean.nir-mean_clean.red)/(mean_clean.nir+mean_clean.red)

        ndvis.append(agg_func(ndvi_mean.values))
        ndvis_clean.append(agg_func(ndvi_mean_clean.values))
    return np.array(ndvis), np.array(ndvis_clean)


def calculate_evi_agg(sen2, agg_func):
    '''
    Calculate Enhanced Vegetation Index (EVI): EVI = 2.5 * ((NIR - RED) / (NIR + 6 * RED - 7.5 * BLUE + 1))
    then aggregate by time slices using the aggregate function agg_func
    '''
    L = 1
    C1 = 6
    C2 = 7.5
    G = 2.5
    evi = []

    for data in sen2: 
        mean_unfiltered = data.mean(dim=['longitude','latitude']).compute()
        evi_mean = G*((mean_unfiltered.nir-mean_unfiltered.red)/(mean_unfiltered.nir+C1*mean_unfiltered.red-C2*mean_unfiltered.blue+L))
        
        evi.append(agg_func(evi_mean.values))
    return np.array(evi)

def calculate_savi_agg(sen2, agg_func):
    '''
    Calculate Soil Adjusted Vegetation Index (SAVI): SAVI = ((NIR - RED) / (NIR + RED + L)) x (1 + L)
    then aggregate by time slices using the aggregate function agg_func
    '''
    L = 0.5
    savi = []
    
    for data in sen2: 
        mean_unfiltered = data.mean(dim=['longitude','latitude']).compute()
        savi_mean = (1+L) * (mean_unfiltered.nir - mean_unfiltered.red)/(mean_unfiltered.nir + mean_unfiltered.red + L)
        savi.append(agg_func(savi_mean.values))
    return np.array(savi)

def calculate_ndwi_agg(sen2, agg_func):
    '''
    Calculate Normalized Difference Water Index (NDWI): NDWI = (Green - NIR) / (Green + NIR)
    then aggregate by time slices using the aggregate function agg_func
    '''
    ndwi = []
    for data in sen2: 
        mean_unfiltered = data.mean(dim=['longitude','latitude']).compute()
        ndwi_mean = (mean_unfiltered.green - mean_unfiltered.nir) / (mean_unfiltered.green + mean_unfiltered.nir)
        ndwi.append(agg_func(ndwi_mean.values))
    return np.array(ndwi)

def calculate_gci_agg(sen2, agg_func):
    '''
    Calculate Green Chlorophyll Index (GCI): GCI = (ρGreen - ρRed) / (ρGreen + ρRed)
    then aggregate by time slices using the aggregate function agg_func
    '''
    gci = []
    for data in sen2: 
        mean_unfiltered = data.mean(dim=['longitude','latitude']).compute()
        gci_mean = (mean_unfiltered.green - mean_unfiltered.red) / (mean_unfiltered.green + mean_unfiltered.red)
        gci.append(agg_func(gci_mean.values))
    return np.array(gci)

def calculate_lai_agg(sen2, agg_func):
    '''
    Calculate Leaf Area Index (LAI) by the following steps
    Calculate the Normalized Difference Vegetation Index (NDVI)
    Convert NDVI to LAI using a conversion formula.
    '''
    lais = []
    for data in sen2: 
        mean_unfiltered = data.mean(dim=['longitude','latitude']).compute()
        ndvi_mean = (mean_unfiltered.nir-mean_unfiltered.red)/(mean_unfiltered.nir+mean_unfiltered.red)
        lai = np.exp(3.618 + 0.0037 * ndvi_mean)
        lais.append(agg_func(lai.values))
    return np.array(lais)


## Helper functions to compute indices for all bounding boxes

In [4]:
def read_bbox(dir, box, train):
    '''
    Read Sentinel 1 data for each bounding box 
    '''
    if train:
        num_file = 600
    else:
        num_file = 250
    vhs, vvs = [], []
    for i in range(num_file):
        data = xr.open_dataset(os.path.join(dir, f"vhvv_bbox{box}_row{i}.nc"))
        vh = replace_outliers_with_mean(data['vv'].values)
        vv = replace_outliers_with_mean(data['vh'].values)
        vhs.append(vh)
        vvs.append(vv)
    return vhs, vvs

def read_all_bboxes(train=True):
    '''
    Read Sentinel 1 data for all 3 bounding boxes
    '''
    vhs_boxes = []
    vvs_boxes = []
    for i in range(3):
        if train:
            dir = f"Data/bbox{i}/train"
        else:
            dir = f"Data/bbox{i}/test"
        vhs, vvs = read_bbox(dir, i, train)
        vhs_boxes.append(vhs)
        vvs_boxes.append(vvs)
    return vhs_boxes, vvs_boxes

def read_sen2_bboxes(train=True):
    '''
    Read Sentinel 2 data for all 3 bounding boxes
    '''
    sen2_bboxes = []
    for box in range(3):
        sen2_bbox = []
        if train:
            dir = f"Data/bbox{box}/train"
            num_file = 600
        else:
            dir = f"Data/bbox{box}/test"
            num_file = 250
        for j in range(num_file):
            data = xr.open_dataset(os.path.join(dir, f"vhvv_bbox{box}_sen2_row{j}.nc"))
            data['red'].values = replace_outliers_with_mean(data['red'].values)
            data['green'].values = replace_outliers_with_mean(data['green'].values)
            data['blue'].values = replace_outliers_with_mean(data['blue'].values)
            data['nir'].values = replace_outliers_with_mean(data['nir'].values)
            data['SCL'].values = replace_outliers_with_mean(data['SCL'].values)
            sen2_bbox.append(data)
        sen2_bboxes.append(sen2_bbox)
    return sen2_bboxes

def calc_bbox_agg(vhs_box, vvs_box, agg_func):
    '''
    Aggregate VH and VV by time slices using the aggregate function agg_func for data in all 3 bounding boxes
    '''
    vhs_box_agg, vvs_box_agg = [], []
    for vhs, vvs in zip(vhs_box, vvs_box):
        vhs_agg, vvs_agg = calc_vhvv_agg(vhs, vvs, agg_func)
        vhs_box_agg.append(vhs_agg)
        vvs_box_agg.append(vvs_agg)
    return vhs_box_agg, vvs_box_agg

def calc_rvi_bbox_agg(vhs_box, vvs_box, agg_func):
    '''
    Calculate RVI and aggregate by time slices using the aggregate function agg_func for data in all 3 bounding boxes
    '''
    rvis_box = []
    for vhs, vvs in zip(vhs_box, vvs_box):
        rvis = calc_rvi_agg(vhs, vvs, agg_func)
        rvis_box.append(rvis)
    return rvis_box

def calc_sen2_index_bbox_agg(sen2_bboxes, agg_func):
    '''
    Calculate Sentinel 2 indices and aggregate by time slices using the aggregate function agg_func for data in all 3 bounding boxes
    '''
    ndvi_bboxes_agg, ndvi_clean_bboxes_agg = [], []
    evi_bboxes_agg = []
    savi_bboxes_agg = []
    ndwi_bboxes_agg = []
    gci_bboxes_agg = []
    lai_bboxes_agg = []
    
    for i, sen2_bbox in enumerate(sen2_bboxes):
        ndvis_agg, ndvis_clean_agg = calc_ndvi_agg(sen2_bbox, agg_func)
        evi_agg = calculate_evi_agg(sen2_bbox, agg_func)
        savi_agg = calculate_savi_agg(sen2_bbox, agg_func)
        ndwi_agg = calculate_ndwi_agg(sen2_bbox, agg_func)
        gci_agg = calculate_gci_agg(sen2_bbox, agg_func)
        lai_agg = calculate_lai_agg(sen2_bbox, agg_func)
            
        ndvi_bboxes_agg.append(ndvis_agg)
        ndvi_clean_bboxes_agg.append(ndvis_clean_agg)
        evi_bboxes_agg.append(evi_agg)
        savi_bboxes_agg.append(savi_agg)
        ndwi_bboxes_agg.append(ndwi_agg)
        gci_bboxes_agg.append(gci_agg)
        lai_bboxes_agg.append(lai_agg)
    return ndvi_bboxes_agg, ndvi_clean_bboxes_agg, evi_bboxes_agg, savi_bboxes_agg, ndwi_bboxes_agg, gci_bboxes_agg, lai_bboxes_agg





## Combine all indices and features of coordinates into a big dataframe. 

In [5]:
def make_feature_df(agg_funcs, vhs, vvs, vhs_boxes, vvs_boxes, sen2, sen2_bboxes):
    '''
    Pull all calculated features together into a dataframe where each feature is aggregated by an aggregate function in agg_funcs
    '''
    vhs_vvs_data, vhs_vvs_cols = [], []
    for agg_func in tqdm(agg_funcs):
        # aggregate vh vv by time
        vhs_agg, vvs_agg = calc_vhvv_agg(vhs, vvs, agg_func)
        vhs_vvs_data.extend([vhs_agg, vvs_agg])
        vhs_vvs_cols.extend([f"vhs_{agg_func.__name__}", f"vvs_{agg_func.__name__}"])

        # aggregate rvi by time
        rvi_agg = calc_rvi_agg(vhs, vvs, agg_func)
        vhs_vvs_data.append(rvi_agg)
        vhs_vvs_cols.append(f"rvi_{agg_func.__name__}")

        # aggregate vh vv of each bounding box by time
        vhs_boxes_agg, vvs_boxes_agg = calc_bbox_agg(vhs_boxes, vvs_boxes, agg_func)
        vhs_boxes_cols = [f"vhs_box{i}_{agg_func.__name__}" for i in range(len(vhs_boxes_agg))]
        vvs_boxes_cols = [f"vvs_box{i}_{agg_func.__name__}" for i in range(len(vvs_boxes_agg))]
        vhs_vvs_data.extend(vhs_boxes_agg)
        vhs_vvs_cols.extend(vhs_boxes_cols)
        vhs_vvs_data.extend(vvs_boxes_agg)
        vhs_vvs_cols.extend(vvs_boxes_cols)

        # aggregate rvi of each bounding box by time
        rvi_boxes_agg = calc_rvi_bbox_agg(vhs_boxes, vvs_boxes, agg_func)
        rvi_boxes_cols = [f"rvi_box{i}_{agg_func.__name__}" for i in range(len(rvi_boxes_agg))]
        vhs_vvs_data.extend(rvi_boxes_agg)
        vhs_vvs_cols.extend(rvi_boxes_cols)

         # aggregate ndvi mean and cleaned ndvi mean by time
        ndvis_agg, ndvis_clean_agg = calc_ndvi_agg(sen2, agg_func)
        vhs_vvs_data.extend([ndvis_agg, ndvis_clean_agg])
        vhs_vvs_cols.extend([f"ndvi_{agg_func.__name__}", f"ndvi_clean_{agg_func.__name__}"])

        # aggregate various indices of each bounding box by time
        ndvi_bboxes_agg, ndvi_clean_bboxes_agg, evi_bboxes_agg, savi_bboxes_agg, ndwi_bboxes_agg, gci_bboxes_agg, lai_bboxes_agg = calc_sen2_index_bbox_agg(sen2_bboxes, agg_func)

        # create column names for each bounding box and index
        ndvi_bboxes_cols = [f"ndvi_box{i}_{agg_func.__name__}" for i in range(len(ndvi_bboxes_agg))]
        ndvi_clean_bboxes_cols = [f"ndvi_clean_box{i}_{agg_func.__name__}" for i in range(len(ndvi_clean_bboxes_agg))]
        evi_bboxes_cols = [f"evi_box{i}_{agg_func.__name__}" for i in range(len(evi_bboxes_agg))]
        savi_bboxes_cols = [f"savi_box{i}_{agg_func.__name__}" for i in range(len(savi_bboxes_agg))]
        ndwi_bboxes_cols = [f"ndwi_box{i}_{agg_func.__name__}" for i in range(len(ndwi_bboxes_agg))]
        gci_bboxes_cols = [f"gci_box{i}_{agg_func.__name__}" for i in range(len(gci_bboxes_agg))]
        lai_bboxes_cols = [f"lai_box{i}_{agg_func.__name__}" for i in range(len(lai_bboxes_agg))]

        # extend data and column lists with the new index values
        vhs_vvs_data.extend(ndvi_bboxes_agg)
        vhs_vvs_cols.extend(ndvi_bboxes_cols)
        vhs_vvs_data.extend(ndvi_clean_bboxes_agg)
        vhs_vvs_cols.extend(ndvi_clean_bboxes_cols)
        vhs_vvs_data.extend(evi_bboxes_agg)
        vhs_vvs_cols.extend(evi_bboxes_cols)
        vhs_vvs_data.extend(savi_bboxes_agg)
        vhs_vvs_cols.extend(savi_bboxes_cols)
        vhs_vvs_data.extend(ndwi_bboxes_agg)
        vhs_vvs_cols.extend(ndwi_bboxes_cols)
        vhs_vvs_data.extend(gci_bboxes_agg)
        vhs_vvs_cols.extend(gci_bboxes_cols)
        vhs_vvs_data.extend(lai_bboxes_agg)
        vhs_vvs_cols.extend(lai_bboxes_cols)
    
    vhs_vvs_data = np.array(vhs_vvs_data).T
    vh_vv_df = pd.DataFrame(data=vhs_vvs_data, columns=vhs_vvs_cols)
    return vh_vv_df

## Generate features for training data

### Read all the raw data

In [6]:
vhs, vvs = read_vhvv("Data/vhvv/train")

In [7]:
vhs_boxes, vvs_boxes = read_all_bboxes(train=True)

In [8]:
sen2 = read_sen2("Data/vhvv/train")

In [9]:
sen2_bboxes = read_sen2_bboxes(train=True)

### Generate features

In [10]:
# Define various aggregate functions
agg_funcs = [np.nanmean, np.nanmedian, np.nanvar, np.nanstd, np.nanmax, np.nanmin]
vh_vv_df = make_feature_df(agg_funcs, vhs, vvs, vhs_boxes, vvs_boxes, sen2, sen2_bboxes)


100%|██████████| 6/6 [08:04<00:00, 80.70s/it]


In [11]:
vh_vv_df

Unnamed: 0,vhs_nanmean,vvs_nanmean,rvi_nanmean,vhs_box0_nanmean,vhs_box1_nanmean,vhs_box2_nanmean,vvs_box0_nanmean,vvs_box1_nanmean,vvs_box2_nanmean,rvi_box0_nanmean,...,savi_box2_nanmin,ndwi_box0_nanmin,ndwi_box1_nanmin,ndwi_box2_nanmin,gci_box0_nanmin,gci_box1_nanmin,gci_box2_nanmin,lai_box0_nanmin,lai_box1_nanmin,lai_box2_nanmin
0,0.116084,0.023671,2.890777,0.114230,0.110538,0.107018,0.024592,0.024231,0.023408,2.850685,...,-0.062847,-0.776849,-0.776897,-0.768202,-0.479617,-0.104442,-0.112254,37.257164,37.257717,37.257195
1,0.108883,0.024959,2.720052,0.098655,0.098226,0.098964,0.024312,0.024350,0.024156,2.732355,...,-0.073076,-0.758108,-0.753487,-0.744937,-0.284393,-0.111168,-0.097025,37.256344,37.256172,37.256252
2,0.082586,0.024718,2.584966,0.086315,0.091521,0.093510,0.023292,0.023603,0.023605,2.690331,...,-0.098071,-0.741007,-0.741054,-0.736151,-0.100157,-0.088793,-0.124537,37.254261,37.254028,37.253952
3,0.093211,0.022523,2.830698,0.104241,0.105620,0.106400,0.023353,0.023850,0.023901,2.877911,...,-0.190823,-0.715783,-0.708499,-0.700550,-0.201358,-0.446832,-0.469740,37.245354,37.245304,37.245426
4,0.100303,0.024824,2.777920,0.107460,0.107866,0.107308,0.023705,0.023326,0.023790,2.830453,...,-0.089476,-0.767557,-0.765676,-0.765937,-0.142532,-0.143463,-0.135810,37.253937,37.254360,37.254742
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,0.282844,0.063456,2.878281,0.266151,0.273217,0.277246,0.062586,0.064599,0.066390,2.846788,...,-0.605747,-0.766043,-0.766482,-0.765904,-0.060560,-0.497146,-0.496859,37.209030,37.207111,37.207333
596,0.242690,0.075850,2.598955,0.253094,0.258333,0.268514,0.073574,0.070548,0.069480,2.679361,...,-0.603732,-0.732047,-0.728804,-0.738467,-0.213724,-0.218755,-0.164904,37.207714,37.207645,37.207523
597,0.272460,0.056198,2.988059,0.257981,0.265610,0.272252,0.061351,0.064981,0.067032,2.858875,...,-0.629877,-0.668690,-0.700106,-0.724475,-0.373982,-0.083092,-0.021035,37.203938,37.204540,37.205116
598,0.278292,0.066754,2.862590,0.294886,0.290926,0.286203,0.071718,0.069992,0.068467,2.837691,...,-0.633934,-0.812792,-0.807100,-0.791983,-0.028334,-0.013460,-0.007390,37.203831,37.204239,37.204742


In [12]:
def combine_two_datasets(dataset1, dataset2):
    '''
    Returns a vertically concatenated dataset.
    Attributes:
    dataset1 - Dataset 1 to be combined
    dataset2 - Dataset 2 to be combined
    '''
    data = pd.concat([dataset1,dataset2], axis=1)
    return data

In [13]:
def separate_latlong(crop_presence_data):
    '''
    separate Latitude and Longitude and save them as floats
    '''
    def sep_latlong_lat(latlong):
        latlong = latlong.replace('(','').replace(')','').replace(' ','').split(',')
        return float(latlong[0])
    def sep_latlong_long(latlong):
        latlong = latlong.replace('(','').replace(')','').replace(' ','').split(',')
        return float(latlong[1])
    crop_presence_data['Latitude'] = crop_presence_data["Latitude and Longitude"].apply(sep_latlong_lat)
    crop_presence_data['Longitude'] = crop_presence_data["Latitude and Longitude"].apply(sep_latlong_long)
    return crop_presence_data

### Combine with location data with features dataframe and output for training

In [14]:
crop_presence_data = pd.read_csv("Data/Crop_Location_Data_20221201.csv")
crop_presence_data = separate_latlong(crop_presence_data)
crop_data = combine_two_datasets(crop_presence_data,vh_vv_df)
crop_data.head()

Unnamed: 0,Latitude and Longitude,Class of Land,Latitude,Longitude,vhs_nanmean,vvs_nanmean,rvi_nanmean,vhs_box0_nanmean,vhs_box1_nanmean,vhs_box2_nanmean,...,savi_box2_nanmin,ndwi_box0_nanmin,ndwi_box1_nanmin,ndwi_box2_nanmin,gci_box0_nanmin,gci_box1_nanmin,gci_box2_nanmin,lai_box0_nanmin,lai_box1_nanmin,lai_box2_nanmin
0,"(10.323727047081501, 105.2516346045924)",Rice,10.323727,105.251635,0.116084,0.023671,2.890777,0.11423,0.110538,0.107018,...,-0.062847,-0.776849,-0.776897,-0.768202,-0.479617,-0.104442,-0.112254,37.257164,37.257717,37.257195
1,"(10.322364360592521, 105.27843410554115)",Rice,10.322364,105.278434,0.108883,0.024959,2.720052,0.098655,0.098226,0.098964,...,-0.073076,-0.758108,-0.753487,-0.744937,-0.284393,-0.111168,-0.097025,37.256344,37.256172,37.256252
2,"(10.321455902933202, 105.25254306225168)",Rice,10.321456,105.252543,0.082586,0.024718,2.584966,0.086315,0.091521,0.09351,...,-0.098071,-0.741007,-0.741054,-0.736151,-0.100157,-0.088793,-0.124537,37.254261,37.254028,37.253952
3,"(10.324181275911162, 105.25118037576274)",Rice,10.324181,105.25118,0.093211,0.022523,2.830698,0.104241,0.10562,0.1064,...,-0.190823,-0.715783,-0.708499,-0.70055,-0.201358,-0.446832,-0.46974,37.245354,37.245304,37.245426
4,"(10.324635504740822, 105.27389181724476)",Rice,10.324636,105.273892,0.100303,0.024824,2.77792,0.10746,0.107866,0.107308,...,-0.089476,-0.767557,-0.765676,-0.765937,-0.142532,-0.143463,-0.13581,37.253937,37.25436,37.254742


In [15]:
crop_data.to_parquet("Data/crop_data_features_train.parquet.gzip", compression="gzip")

In [16]:
#Reading the coordinates for the submission
test_file = pd.read_csv('Data/challenge_1_submission_template_correct_columns_fixed.csv')
test_file = separate_latlong(test_file)
test_file.head()

Unnamed: 0,Latitude and Longitude,target,Latitude,Longitude
0,"(10.18019073690894, 105.32022315786804)",,10.180191,105.320223
1,"(10.561107033461816, 105.12772097986661)",,10.561107,105.127721
2,"(10.623790611954897, 105.13771401411867)",,10.623791,105.137714
3,"(10.583364246115156, 105.23946127195805)",,10.583364,105.239461
4,"(10.20744446668854, 105.26844107128906)",,10.207444,105.268441


## Generate features for testing data

In [17]:
vhs_test, vvs_test = read_vhvv("Data/vhvv/test", train=False)
vhs_boxes_test, vvs_boxes_test = read_all_bboxes(train=False)
sen2_test = read_sen2("Data/vhvv/test", train=False)
sen2_bboxes_test = read_sen2_bboxes(train=False)

In [18]:
vh_vv_df_test = make_feature_df(agg_funcs, vhs_test, vvs_test, vhs_boxes_test, vvs_boxes_test, sen2_test, sen2_bboxes_test)

100%|██████████| 6/6 [03:22<00:00, 33.76s/it]


In [19]:
vh_vv_df_test

Unnamed: 0,vhs_nanmean,vvs_nanmean,rvi_nanmean,vhs_box0_nanmean,vhs_box1_nanmean,vhs_box2_nanmean,vvs_box0_nanmean,vvs_box1_nanmean,vvs_box2_nanmean,rvi_box0_nanmean,...,savi_box2_nanmin,ndwi_box0_nanmin,ndwi_box1_nanmin,ndwi_box2_nanmin,gci_box0_nanmin,gci_box1_nanmin,gci_box2_nanmin,lai_box0_nanmin,lai_box1_nanmin,lai_box2_nanmin
0,0.067722,0.018949,2.615462,0.077514,0.075411,0.075428,0.017724,0.017735,0.018113,2.701212,...,-0.582092,-0.770674,-0.770016,-0.770145,-0.141747,-0.141956,-0.141479,37.210316,37.209835,37.209507
1,0.050133,0.014027,2.727983,0.062882,0.060505,0.059333,0.016430,0.016085,0.016062,2.752180,...,-0.236639,-0.760787,-0.756376,-0.749618,-0.105525,-0.106418,-0.105732,37.239414,37.240742,37.241219
2,0.118173,0.020505,3.075562,0.120807,0.125415,0.128808,0.019030,0.019469,0.019895,3.105102,...,-0.404277,-0.735567,-0.735597,-0.732626,-0.128692,-0.180987,-0.149524,37.221874,37.226017,37.225807
3,0.013566,0.004096,2.646386,0.013842,0.013596,0.013645,0.004118,0.004172,0.004167,2.639626,...,-0.963989,-0.452009,-0.451476,-0.453146,-0.081035,-0.080456,-0.079687,37.174389,37.174297,37.174446
4,0.066634,0.016418,2.763066,0.082104,0.081916,0.082733,0.017188,0.017096,0.016641,2.792047,...,-0.098064,-0.815320,-0.815817,-0.816114,-0.150656,-0.150205,-0.148285,37.254379,37.254139,37.253952
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,0.015096,0.004561,2.548298,0.015398,0.015165,0.015038,0.005215,0.005228,0.005112,2.499191,...,-0.845845,-0.333109,-0.059832,-0.132172,-0.091207,-0.090948,-0.106509,37.185650,37.185413,37.185280
246,0.013573,0.003809,2.661986,0.013079,0.013624,0.013647,0.003974,0.004014,0.004088,2.611577,...,-0.965720,-0.468875,-0.467394,-0.470040,-0.079123,-0.079142,-0.079881,37.174091,37.174225,37.174286
247,0.014165,0.004556,2.555335,0.013810,0.013399,0.013203,0.004396,0.004385,0.004386,2.586780,...,-0.965016,-0.488759,-0.487296,-0.488495,-0.075068,-0.074357,-0.075696,37.174339,37.174480,37.174347
248,0.132837,0.019895,3.064059,0.139959,0.134685,0.129951,0.019763,0.019834,0.019779,3.086462,...,-0.224462,-0.729404,-0.729824,-0.736419,-0.089391,-0.085979,-0.088899,37.244141,37.243355,37.242329


In [20]:
crop_data_test = combine_two_datasets(test_file, vh_vv_df_test)
crop_data_test.head()

Unnamed: 0,Latitude and Longitude,target,Latitude,Longitude,vhs_nanmean,vvs_nanmean,rvi_nanmean,vhs_box0_nanmean,vhs_box1_nanmean,vhs_box2_nanmean,...,savi_box2_nanmin,ndwi_box0_nanmin,ndwi_box1_nanmin,ndwi_box2_nanmin,gci_box0_nanmin,gci_box1_nanmin,gci_box2_nanmin,lai_box0_nanmin,lai_box1_nanmin,lai_box2_nanmin
0,"(10.18019073690894, 105.32022315786804)",,10.180191,105.320223,0.067722,0.018949,2.615462,0.077514,0.075411,0.075428,...,-0.582092,-0.770674,-0.770016,-0.770145,-0.141747,-0.141956,-0.141479,37.210316,37.209835,37.209507
1,"(10.561107033461816, 105.12772097986661)",,10.561107,105.127721,0.050133,0.014027,2.727983,0.062882,0.060505,0.059333,...,-0.236639,-0.760787,-0.756376,-0.749618,-0.105525,-0.106418,-0.105732,37.239414,37.240742,37.241219
2,"(10.623790611954897, 105.13771401411867)",,10.623791,105.137714,0.118173,0.020505,3.075562,0.120807,0.125415,0.128808,...,-0.404277,-0.735567,-0.735597,-0.732626,-0.128692,-0.180987,-0.149524,37.221874,37.226017,37.225807
3,"(10.583364246115156, 105.23946127195805)",,10.583364,105.239461,0.013566,0.004096,2.646386,0.013842,0.013596,0.013645,...,-0.963989,-0.452009,-0.451476,-0.453146,-0.081035,-0.080456,-0.079687,37.174389,37.174297,37.174446
4,"(10.20744446668854, 105.26844107128906)",,10.207444,105.268441,0.066634,0.016418,2.763066,0.082104,0.081916,0.082733,...,-0.098064,-0.81532,-0.815817,-0.816114,-0.150656,-0.150205,-0.148285,37.254379,37.254139,37.253952


In [21]:
crop_data_test.to_parquet("Data/crop_data_features_test.parquet.gzip", compression="gzip")