In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [3]:
# Load the simulation CSV; its first column is used as the row index.
df = pd.read_csv(
    '../data/RtmSimulation_kickstart.csv',
    index_col=0,
)

In [4]:
# (rows, columns) of the frame as loaded, before any cleaning.
df.shape

(1000, 2114)

In [5]:
# Preview the first five rows to check the column layout.
df.head()

Unnamed: 0_level_0,lai,wetness,treeSpecies,Sentinel_2A_492.4,Sentinel_2A_559.8,Sentinel_2A_664.6,Sentinel_2A_704.1,Sentinel_2A_740.5,Sentinel_2A_782.8,Sentinel_2A_832.8,...,w2491,w2492,w2493,w2494,w2495,w2496,w2497,w2498,w2499,w2500
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.1,0.36,predominantly_pine,0.062092,0.131702,0.043197,0.177134,0.40175,0.458003,0.463287,...,0.037392,0.0261,0.03348,0.027219,0.033905,0.017797,0.038259,0.020111,0.027921,0.02752
2,5.34,0.47,predominantly_beech,0.052807,0.129987,0.043061,0.153641,0.407523,0.466853,0.477236,...,0.028002,0.030901,0.03965,0.029524,0.032461,0.030735,0.023527,0.029523,0.021542,0.02678
3,4.53,0.68,predominantly_pine,0.047937,0.139421,0.04578,0.157121,0.395428,0.44162,0.448626,...,0.015169,0.020155,0.025696,0.026504,0.028994,0.031195,0.025666,0.032225,0.016128,0.026495
4,2.1,0.8,predominantly_beech,0.045907,0.107761,0.033984,0.128237,0.341315,0.385277,0.382241,...,0.029487,0.031408,0.032888,0.029878,0.027617,0.034964,0.031996,0.032882,0.019543,0.121085
5,5.34,0.48,mixed,0.051712,0.136293,0.041502,0.167564,0.40746,0.454137,0.464966,...,0.02451,0.023754,0.026276,0.031367,0.039625,0.036997,0.022577,0.039619,0.031636,0.02223


## _Leaf area index_ [`lai`] prediction

### Baseline model

#### No feature engineering

In [6]:
# Discard every row that has a missing value in any column.
# NOTE(review): this checks all columns, including the `w...` columns that the
# modelling cells below never select -- confirm rows are not dropped needlessly.
df = df.dropna()

In [7]:
# (rows, columns) after removing rows with missing values.
df.shape

(935, 2114)

In [8]:
# Remove the string-valued `treeSpecies` column so only numeric columns remain
# for the regressors below.
# Rebinding instead of `inplace=True`, together with errors='ignore', keeps
# this cell safe to re-run: a second execution is a no-op rather than a
# KeyError on the already-removed column.
df = df.drop(columns='treeSpecies', errors='ignore')

In [9]:
# Preview again to confirm `treeSpecies` is no longer among the columns.
df.head()

Unnamed: 0_level_0,lai,wetness,Sentinel_2A_492.4,Sentinel_2A_559.8,Sentinel_2A_664.6,Sentinel_2A_704.1,Sentinel_2A_740.5,Sentinel_2A_782.8,Sentinel_2A_832.8,Sentinel_2A_864.7,...,w2491,w2492,w2493,w2494,w2495,w2496,w2497,w2498,w2499,w2500
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.1,0.36,0.062092,0.131702,0.043197,0.177134,0.40175,0.458003,0.463287,0.465697,...,0.037392,0.0261,0.03348,0.027219,0.033905,0.017797,0.038259,0.020111,0.027921,0.02752
2,5.34,0.47,0.052807,0.129987,0.043061,0.153641,0.407523,0.466853,0.477236,0.469157,...,0.028002,0.030901,0.03965,0.029524,0.032461,0.030735,0.023527,0.029523,0.021542,0.02678
3,4.53,0.68,0.047937,0.139421,0.04578,0.157121,0.395428,0.44162,0.448626,0.448503,...,0.015169,0.020155,0.025696,0.026504,0.028994,0.031195,0.025666,0.032225,0.016128,0.026495
4,2.1,0.8,0.045907,0.107761,0.033984,0.128237,0.341315,0.385277,0.382241,0.380013,...,0.029487,0.031408,0.032888,0.029878,0.027617,0.034964,0.031996,0.032882,0.019543,0.121085
5,5.34,0.48,0.051712,0.136293,0.041502,0.167564,0.40746,0.454137,0.464966,0.459594,...,0.02451,0.023754,0.026276,0.031367,0.039625,0.036997,0.022577,0.039619,0.031636,0.02223


#### Use only `wetness` and `Sentinel_2A_....` columns

In [10]:
# Names of the columns at positions 1-11 (position 0, `lai`, is skipped).
df.columns[1:12]

Index(['wetness', 'Sentinel_2A_492.4', 'Sentinel_2A_559.8',
       'Sentinel_2A_664.6', 'Sentinel_2A_704.1', 'Sentinel_2A_740.5',
       'Sentinel_2A_782.8', 'Sentinel_2A_832.8', 'Sentinel_2A_864.7',
       'Sentinel_2A_1613.7', 'Sentinel_2A_2202.4'],
      dtype='object')

In [11]:
# Regression target: the `lai` column.
y = df['lai']

In [12]:
# Feature matrix: the columns at positions 1-11, selected by position.
# NOTE(review): the positional slice only yields `wetness` plus the
# `Sentinel_2A_*` columns because `treeSpecies` was dropped earlier; running
# this cell before that drop would shift the selection.
X = df.iloc[:,1:12]

In [13]:
# Confirm which columns ended up in the feature matrix.
X.columns

Index(['wetness', 'Sentinel_2A_492.4', 'Sentinel_2A_559.8',
       'Sentinel_2A_664.6', 'Sentinel_2A_704.1', 'Sentinel_2A_740.5',
       'Sentinel_2A_782.8', 'Sentinel_2A_832.8', 'Sentinel_2A_864.7',
       'Sentinel_2A_1613.7', 'Sentinel_2A_2202.4'],
      dtype='object')

#### Simple regression models, no hyperparameter optimization

In [14]:
# First carve the held-out test partition off the full data set ...
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# ... then divide the remainder into the fitting and validation partitions.
# NOTE(review): X_val / y_val are not referenced by any later cell visible in
# this notebook; every model below is scored on the test partition instead.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, random_state=42
)

##### Linear Regression

In [15]:
# Ordinary least-squares baseline with scikit-learn's default settings.
m = LinearRegression()

In [16]:
# Fit the linear baseline on the training partition only.
m.fit(X_train, y_train)

In [17]:
# Score on the data the model was fitted on (scikit-learn regressors report R^2).
m.score(X_train, y_train)

0.5089986581328299

In [18]:
# Score on the held-out test partition (R^2).
# NOTE(review): the validation partition (X_val / y_val) created above is not
# used here -- confirm that scoring on the test set is intended at this stage.
m.score(X_test, y_test)

0.5154667730598539

##### Random Forest Regressor

In [19]:
# Random-forest baseline with default hyperparameters.
# The seed pins the bootstrap sampling and feature sub-sampling so the scores
# below are reproducible on Restart & Run All, matching the seeded
# train/test split and the seeded XGBoost model.
rf = RandomForestRegressor(random_state=42)

In [20]:
# Fit the random forest on the training partition only.
rf.fit(X_train, y_train)

In [21]:
# Training-partition R^2; compare with the test score below to gauge overfitting.
rf.score(X_train, y_train)

0.9799373378800561

In [22]:
# R^2 of the random forest on the held-out test partition.
rf.score(X_test, y_test)

0.8409934622926775

##### XGBoost Regressor


In [23]:
import xgboost as xgb

# Gradient-boosted trees with a squared-error regression objective.
# "reg:squarederror" is the current name of this objective; the former alias
# "reg:linear" is deprecated and triggers a warning in recent XGBoost releases.
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)

# Fit on the training partition only.
xgb_model.fit(X_train, y_train)




In [24]:

# Training-partition R^2 of the boosted model.
xgb_model.score(X_train, y_train)

0.999992870591667

In [25]:
# R^2 of the boosted model on the held-out test partition.
xgb_model.score(X_test, y_test)

0.8497400653582362

### Polynomial features

In [26]:
# polynomial regression
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

# Polynomial regression: expand the predictors into all polynomial terms up to
# `degree`, then fit a ridge-regularised linear model on the expanded features.
degree = 3
poly_expansion = PolynomialFeatures(degree)
ridge = Ridge()
model = make_pipeline(poly_expansion, ridge)
model.fit(X_train, y_train)


In [27]:
# Training-partition R^2 of the polynomial ridge pipeline.
model.score(X_train, y_train)


0.6545725612890169

In [28]:
# R^2 of the polynomial ridge pipeline on the held-out test partition.
model.score(X_test, y_test)

0.629446435228515

### Hyperparameter tuning

In [29]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid: 3 x 3 x 3 x 3 = 81 candidate settings.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 3]
}

# Create the model.
# A fixed seed makes every candidate's forest deterministic, so the selected
# parameters and the reported CV score are reproducible across runs.
# NOTE: this rebinds `rf`; the fitted baseline forest from the earlier cell is
# no longer reachable under that name after this cell runs.
rf = RandomForestRegressor(random_state=42)

# Perform grid search with 5-fold cross-validation on the training partition.
# The 81 x 5 = 405 fits are independent of each other, so n_jobs=-1 spreads
# them over all available cores without changing the results.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)


Best parameters: {'max_depth': 7, 'min_samples_leaf': 3, 'min_samples_split': 6, 'n_estimators': 200}
Best score: 0.8392671140200132


In [None]:
# Training-partition score of the tuned search object.
# (presumably scored with the refitted best estimator -- GridSearchCV's default
# `refit` behaviour; confirm against the scikit-learn docs)
grid_search.score(X_train, y_train)

In [None]:
# Score of the tuned search object on the held-out test partition; comparable
# with the untuned model scores reported above.
grid_search.score(X_test, y_test)