# Africa Biomass Challenge
**Can you predict biomass in cocoa plantations in Côte d'Ivoire?**

# Images Extractions

In [7]:
import h5py
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [5]:
!wget -q  https://share.phys.ethz.ch/~pf/albecker/abc/09072022_1154_train.h5
!wget -q  https://share.phys.ethz.ch/~pf/albecker/abc/09072022_1154_val.h5
!wget -q https://share.phys.ethz.ch/~pf/albecker/abc/09072022_1154_test.h5

'wget' is not recognized as an internal or external command,
operable program or batch file.
'wget' is not recognized as an internal or external command,
operable program or batch file.
'wget' is not recognized as an internal or external command,
operable program or batch file.


In [6]:
trainset = h5py.File("09072022_1154_train.h5", "r")
validateset = h5py.File("09072022_1154_val.h5", "r")
testset = h5py.File("09072022_1154_test.h5", "r")

FileNotFoundError: [Errno 2] Unable to open file (unable to open file: name = '09072022_1154_train.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [None]:
# attributes of trainset
trainset.keys()

In [None]:
# attributes of validset
validateset.keys()

In [None]:
# attributes of testset
testset.keys()

## Explore datasets

In [None]:
# train
train_images = np.array(trainset['images'],dtype=np.float64)
train_images = train_images.transpose(0,3,1,2)

train_biomasses = np.array(trainset['agbd'],dtype=np.float64)

# validate
validate_images = np.array(validateset['images'],dtype=np.float64)
validate_images = validate_images.transpose(0,3,1,2)
validate_biomasses = np.array(validateset['agbd'],dtype=np.float64)

# test 
test_images = np.array(testset['images'],dtype=np.float32)
test_images = test_images.transpose(0,3,1,2)
test_biomasses = np.array(testset['agbd'],dtype=np.float32)

In [None]:
print(f"train dataset size {train_images.shape} train lab size {train_biomasses.shape}")
print()
print(f"validate dataset size {validate_images.shape} validate lab size {validate_biomasses.shape}")
print()
print(f"test dataset size {test_images.shape} test lab size {test_biomasses.shape}")

##  Some visulizations

In [None]:
plt.imshow(train_images[1,1,:,:])

In [None]:
train_biomasses[1]

 **Below we explore the skewness of train dataset in each channel**

In [None]:
band_skewness = []
band_skewness_after_sqrt = []
for c in range(12):
    #calculate skewness
    band_skewness.append(pd.Series(train_images[:,c].flatten()).skew())
    #calculate skewness after applying sqrt
    band_skewness_after_sqrt.append(pd.Series(np.sqrt(train_images[:,c].flatten())).skew())

In [None]:
# skweness
width = 0.25

rng = [i for i in range(0,12)]
rng2 = [i+width for i in range(0,12)]

plt.bar(rng, band_skewness, align='center', width = width, label='Original', color ='b')
plt.bar(rng2, band_skewness_after_sqrt, align='center', width = width, label='After sqrt', color ='g')
plt.gca().set_xticks(rng)
plt.legend()
plt.xlabel('Band ID')
plt.ylabel('Skewness')
plt.show()

# First Baseline With Sklearn

In [None]:
!pip install -qq catboost --quiet

In [None]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn import model_selection
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats import kurtosis, skew
from xgboost.sklearn import XGBRegressor
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

In [None]:
from sklearn import set_config
set_config(display='diagram')

## Modelisation

### Pipeline of training

In [None]:
# constante for standardization
MEAN = train_images.mean((0,2,3))
STD = train_images.std((0,2,3))

In [None]:
class CustomScaler(BaseEstimator, TransformerMixin):
  def __init__(self,mean,std):
    self.mean = mean
    self.std = std
  
  def fit(self, X, y=None):
    self.mean = X.mean((0,2,3)) 
    self.std = X.std((0,2,3))
    return self

  def transform(self, X, y=None):
    return (X-self.mean[None,:,None,None])/self.std[None,:,None,None] 

    
class FlattenTransformer(BaseEstimator, TransformerMixin):

  def fit(self, X, y=None):
    return self

  def transform(self, X, y=None):
    return X.reshape((X.shape[0], -1))


pipe = Pipeline(steps=[("scaler", CustomScaler(MEAN,STD)),  
                       ("flatten", FlattenTransformer()),
                       ("classifier",Lasso())   
])

In [None]:
pipe.fit(train_images,train_biomasses)

In [None]:
# predict
pred_train = pipe.predict(train_images)

# train mse error
mse = mean_squared_error(train_biomasses, pred_train) 
mse

In [None]:
# predict
pred_validate = pipe.predict(validate_images)

#  validate mse error
mse = mean_squared_error(validate_biomasses, pred_validate) 
mse

### Gridsearch

**Initiaze the hyperparameters for each dictionary:**

In [None]:
##### LinearRegression ######
param1 = {}
param1['classifier'] = [LinearRegression()]


##### Ridge ######
param2 = {}
param2['classifier__alpha'] = np.logspace(-5, 2, 8)   # [0.1,0.5, 1]  
param2['classifier'] = [Ridge()]


##### Lasso ######
param3 = {}
param3['classifier__alpha'] = np.logspace(-6, 2, 8)  # [0.08,0.09, 0.1]
param3['classifier'] = [Lasso()]


##### KNeighborsRegressor ######
param4 = {}
param4['classifier__n_neighbors'] = [2,5,10,25,50]
param4['classifier__leaf_size']= [12, 11, 13]
param4['classifier__p'] = [1] 
param4['classifier'] = [KNeighborsRegressor()]


##### XGBRegressor ###### 
param5 = {}
param5['classifier__learning_rate']= [0.025]
param5['classifier__max_depth']= [12,13]    
param5['classifier__nthread']= [4]
param5['classifier__objective']= ['reg:squarederror']
param5['classifier__tree_method']= ['gpu_hist']
param5['classifier__min_child_weight'] = [2,3] 
param5['classifier__subsample'] = [0.9] # 0.8,
param5['classifier__colsample_bytree'] = [0.4,0.5] 
param5['classifier__n_estimators'] = [480,500] 
param5['classifier'] = [XGBRegressor()]

##### ElasticNet ######
param6 = {}
param6['classifier__alpha'] = np.logspace(-5, 2, 8)    # [2,5,10,25,50]
param6['classifier__l1_ratio'] = [.2,.4,.6,.8] 
param6['classifier'] = [ElasticNet()]


##### RandomForestRegressor ######
param7 = {}
param7['classifier__n_estimators'] = [400,500]
param7['classifier'] = [RandomForestRegressor()]


##### GradientBoostingRegressor ######
param8 = {}
param8['classifier__n_estimators'] = [400,500] 
param8['classifier__min_samples_split'] = range(400,600,100)
param8['classifier__min_samples_leaf'] = range(30,71,10)
param8['classifier__learning_rate'] = [0.05]
param8['classifier__max_depth'] = [4,5]
param8['classifier__max_features'] = ['sqrt']
param8['classifier__subsample'] = [0.7,0.8]
param8['classifier'] = [GradientBoostingRegressor()]


##### LinearSVR ######
# param9 = {}
# param9['classifier__C'] = [10**-2, 10**-1, 10**0, 10**1, 10**2] 
# param9['classifier__epsilon'] = [0.0,0.2,0.5,1]
# param9['classifier'] = [LinearSVR()]

In [None]:
pipeline = pipe = Pipeline(steps=[("scaler", CustomScaler(MEAN,STD)),
                       ("flatten", FlattenTransformer()),
                       ("classifier", LinearRegression())])
# params = [param1, param2, param3, param4, param5, param6, param7, param8] 
params = [param5] # param1, param3, 

In [None]:
%%time
# Train the grid search model
grid_search = GridSearchCV(pipeline, params, cv=3, scoring='neg_mean_squared_error').fit(train_images,train_biomasses)

In [None]:
# Best performing model and its corresponding hyperparameters
grid_search.best_params_

In [None]:
model = grid_search.best_estimator_
# predict
pred = model.predict(train_images)

# validate mse error
mse = mean_squared_error(train_biomasses, pred) 
rmse = np.sqrt(mse)

print(mse, rmse)

In [None]:
# 2802.6255438618155 52.939829465741724

In [None]:
model = grid_search.best_estimator_
# predict
pred = model.predict(validate_images)

# validate mse error
mse = mean_squared_error(validate_biomasses, pred) 
rmse = np.sqrt(mse)

print(mse, rmse)

In [None]:
# 3790.385611052348 61.56610764903323

### Predict GIZ Biomass

In [None]:
s2_images_h5 = h5py.File("/kaggle/input/biomass-data/images_test.h5", "r")
s2_images_h5

In [None]:
#prepare test set sentinel 2 images 
s2_images = np.array(s2_images_h5["images"])
s2_images = s2_images.transpose(0,3,1,2)

In [None]:
s2_images

In [None]:
# predict on giz test data
# pred_giz = pipe.predict(s2_images)
pred_giz = model.predict(s2_images)

In [None]:
ID_S2_pair = pd.read_csv('/kaggle/input/biomass-data/UniqueID-SentinelPair.csv')

preds = pd.DataFrame({'Target':pred_giz}).rename_axis('S2_idx').reset_index()
preds = ID_S2_pair.merge(preds, on='S2_idx').drop(columns=['S2_idx'])
preds['Target'] = preds['Target'].round().astype(int)

In [None]:
preds.to_csv('GIZ_Biomass_predictions.csv', index=False)

In [None]:
preds.head()

In [None]:
import os
os.chdir(r'./')

preds.to_csv(r'xgbregressor_v2.csv', index=False)

from IPython.display import FileLink
FileLink(r'xgbregressor_v2.csv') 

In [None]:
# # Number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# # Number of features to consider at every split
# max_features = ['auto', 'sqrt']
# # Maximum number of levels in tree
# max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
# max_depth.append(None)
# # Minimum number of samples required to split a node
# min_samples_split = [2, 5, 10]
# # Minimum number of samples required at each leaf node
# min_samples_leaf = [1, 2, 4]