# **Exploratory Data Analysis** of Independent Features in **PlantTraits2024**: MODIS vs. VOD

## 1. Wczytanie danych i wstępna eksploracja

### Kroki:
* Załadowanie zbiorów treningowego i testowego.
* Sprawdzenie liczby kolumn i wierszy w obu zbiorach.
* Sprawdzenie duplikatów → czy mamy powielone wiersze?
* Podstawowe statystyki (średnia, min, max, percentyle) dla pierwszego rozeznania w wartościach.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
pd.set_option('display.max_columns', 200)

In [12]:
train_csv = "I:/Jacob/Documents/PlantTraits2024/data/train.csv"
test_csv = "I:/Jacob/Documents/PlantTraits2024/data/test.csv"

train_data = pd.read_csv(train_csv)
test_data = pd.read_csv(test_csv)

In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55489 entries, 0 to 55488
Columns: 176 entries, id to X3112_sd
dtypes: float64(54), int64(122)
memory usage: 74.5 MB


***
Mamy 55489 wierszy (próbek) oraz 176 kolumn
***

In [5]:
all_without_id = [col for col in train_data.columns if col != 'id']
num_duplicates = train_data[all_without_id].duplicated().sum()
print("Liczba zduplikowanych wierszy:", num_duplicates)

Liczba zduplikowanych wierszy: 4200


***
W celu sprawdzenia duplikatów odrzucamy kolumnę id. Liczba duplikatów zawarta w całym zbiorze wynosi 4200. Poniżej zaprezentowano również podstawowe statystki zbioru dla obu zbiorów - treningowego i testowego.
***

In [7]:
train_data.describe()

Unnamed: 0,id,WORLDCLIM_BIO1_annual_mean_temperature,WORLDCLIM_BIO12_annual_precipitation,WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month,WORLDCLIM_BIO15_precipitation_seasonality,WORLDCLIM_BIO4_temperature_seasonality,WORLDCLIM_BIO7_temperature_annual_range,SOIL_bdod_0.5cm_mean_0.01_deg,SOIL_bdod_100.200cm_mean_0.01_deg,SOIL_bdod_15.30cm_mean_0.01_deg,SOIL_bdod_30.60cm_mean_0.01_deg,SOIL_bdod_5.15cm_mean_0.01_deg,SOIL_bdod_60.100cm_mean_0.01_deg,SOIL_cec_0.5cm_mean_0.01_deg,SOIL_cec_100.200cm_mean_0.01_deg,SOIL_cec_15.30cm_mean_0.01_deg,SOIL_cec_30.60cm_mean_0.01_deg,SOIL_cec_5.15cm_mean_0.01_deg,SOIL_cec_60.100cm_mean_0.01_deg,SOIL_cfvo_0.5cm_mean_0.01_deg,SOIL_cfvo_100.200cm_mean_0.01_deg,SOIL_cfvo_15.30cm_mean_0.01_deg,SOIL_cfvo_30.60cm_mean_0.01_deg,SOIL_cfvo_5.15cm_mean_0.01_deg,SOIL_cfvo_60.100cm_mean_0.01_deg,SOIL_clay_0.5cm_mean_0.01_deg,SOIL_clay_100.200cm_mean_0.01_deg,SOIL_clay_15.30cm_mean_0.01_deg,SOIL_clay_30.60cm_mean_0.01_deg,SOIL_clay_5.15cm_mean_0.01_deg,SOIL_clay_60.100cm_mean_0.01_deg,SOIL_nitrogen_0.5cm_mean_0.01_deg,SOIL_nitrogen_100.200cm_mean_0.01_deg,SOIL_nitrogen_15.30cm_mean_0.01_deg,SOIL_nitrogen_30.60cm_mean_0.01_deg,SOIL_nitrogen_5.15cm_mean_0.01_deg,SOIL_nitrogen_60.100cm_mean_0.01_deg,SOIL_ocd_0.5cm_mean_0.01_deg,SOIL_ocd_100.200cm_mean_0.01_deg,SOIL_ocd_15.30cm_mean_0.01_deg,SOIL_ocd_30.60cm_mean_0.01_deg,SOIL_ocd_5.15cm_mean_0.01_deg,SOIL_ocd_60.100cm_mean_0.01_deg,SOIL_ocs_0.30cm_mean_0.01_deg,SOIL_phh2o_0.5cm_mean_0.01_deg,SOIL_phh2o_100.200cm_mean_0.01_deg,SOIL_phh2o_15.30cm_mean_0.01_deg,SOIL_phh2o_30.60cm_mean_0.01_deg,SOIL_phh2o_5.15cm_mean_0.01_deg,SOIL_phh2o_60.100cm_mean_0.01_deg,SOIL_sand_0.5cm_mean_0.01_deg,SOIL_sand_100.200cm_mean_0.01_deg,SOIL_sand_15.30cm_mean_0.01_deg,SOIL_sand_30.60cm_mean_0.01_deg,SOIL_sand_5.15cm_mean_0.01_deg,SOIL_sand_60.100cm_mean_0.01_deg,SOIL_silt_0.5cm_mean_0.01_deg,SOIL_silt_100.200cm_mean_0.01_deg,SOIL_silt_15.30cm_mean_0.01_deg,SOIL_silt_30.60cm_mean_0.01_deg,SOIL_silt_5.15cm_mean_0.01_deg,SOIL_silt_60.100cm_mean_0.01_deg,SOIL_soc_0.5cm_mean_0.01_deg,SOIL_soc_100.200cm_mean_0.01_deg,SOIL_soc_15.30cm_mean_0.01_deg,SOIL_soc_30.60cm_mean_0.01_deg,SOIL_soc_5.15cm_mean_0.01_deg,SOIL_soc_60.100cm_mean_0.01_deg,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m1,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m1,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m1,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m1,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m1,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m10,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m10,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m10,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m10,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m10,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m11,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m11,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m11,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m11,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m11,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m12,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m12,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m12,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m12,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m12,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m2,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m2,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m2,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m2,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m2,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m3,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m3,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m3,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m3,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m3,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m4,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m4,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m4,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m4,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m4,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m5,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m5,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m5,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m5,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m5,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m6,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m6,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m6,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m6,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m6,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m7,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m7,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m7,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m7,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m7,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m8,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m8,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m8,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m8,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m8,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m9,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m9,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m9,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m9,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m9,VOD_C_2002_2018_multiyear_mean_m01,VOD_C_2002_2018_multiyear_mean_m02,VOD_C_2002_2018_multiyear_mean_m03,VOD_C_2002_2018_multiyear_mean_m04,VOD_C_2002_2018_multiyear_mean_m05,VOD_C_2002_2018_multiyear_mean_m06,VOD_C_2002_2018_multiyear_mean_m07,VOD_C_2002_2018_multiyear_mean_m08,VOD_C_2002_2018_multiyear_mean_m09,VOD_C_2002_2018_multiyear_mean_m10,VOD_C_2002_2018_multiyear_mean_m11,VOD_C_2002_2018_multiyear_mean_m12,VOD_Ku_1987_2017_multiyear_mean_m01,VOD_Ku_1987_2017_multiyear_mean_m02,VOD_Ku_1987_2017_multiyear_mean_m03,VOD_Ku_1987_2017_multiyear_mean_m04,VOD_Ku_1987_2017_multiyear_mean_m05,VOD_Ku_1987_2017_multiyear_mean_m06,VOD_Ku_1987_2017_multiyear_mean_m07,VOD_Ku_1987_2017_multiyear_mean_m08,VOD_Ku_1987_2017_multiyear_mean_m09,VOD_Ku_1987_2017_multiyear_mean_m10,VOD_Ku_1987_2017_multiyear_mean_m11,VOD_Ku_1987_2017_multiyear_mean_m12,VOD_X_1997_2018_multiyear_mean_m01,VOD_X_1997_2018_multiyear_mean_m02,VOD_X_1997_2018_multiyear_mean_m03,VOD_X_1997_2018_multiyear_mean_m04,VOD_X_1997_2018_multiyear_mean_m05,VOD_X_1997_2018_multiyear_mean_m06,VOD_X_1997_2018_multiyear_mean_m07,VOD_X_1997_2018_multiyear_mean_m08,VOD_X_1997_2018_multiyear_mean_m09,VOD_X_1997_2018_multiyear_mean_m10,VOD_X_1997_2018_multiyear_mean_m11,VOD_X_1997_2018_multiyear_mean_m12,X4_mean,X11_mean,X18_mean,X26_mean,X50_mean,X3112_mean,X4_sd,X11_sd,X18_sd,X26_sd,X50_sd,X3112_sd
count,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,55489.0,39148.0,39148.0,39148.0,39148.0,39148.0,39148.0
mean,171455300.0,14.577142,1058.710939,138.164359,56.727213,518.711405,25.227185,116.895673,140.259205,128.187695,135.103624,123.118222,138.84788,234.641569,179.556417,193.516607,187.413415,204.166357,182.421723,119.398259,178.743282,133.947701,151.662221,127.794248,164.169277,242.660131,298.189623,273.477626,297.658022,250.800753,301.043973,418.234371,107.576619,189.781741,144.151508,260.256249,116.335292,411.531961,112.225414,240.62479,161.029916,307.470219,129.465732,56.794266,61.526158,64.562922,62.244787,63.26234,61.884392,64.054912,462.391357,429.861072,441.757916,425.70587,455.900178,425.37047,294.94143,271.952783,284.767215,276.636144,293.299122,273.587107,590.070951,155.326209,265.789219,192.896826,382.845933,160.682676,1498.789111,2886.715926,1182.061129,1446.03689,2701.092865,863.161618,2431.552722,491.057723,785.840761,2666.023176,1047.842852,2497.702644,690.838887,970.02004,2628.104417,1334.863901,2724.712429,1014.658671,1276.689975,2652.682496,1558.239687,2956.744706,1221.727982,1499.898538,2735.675359,1395.552452,2822.588279,1034.286453,1330.809079,2690.984447,1110.36209,2633.803925,738.720882,1049.81144,2650.867595,908.86974,2617.747085,548.491395,869.674674,2689.725657,801.319793,2640.615455,452.346735,772.497955,2736.229992,785.624106,2636.734308,437.225,752.666961,2763.610193,793.842167,2588.687992,439.502118,749.031826,2760.634991,800.925409,2503.592424,437.728288,742.005533,2725.615762,0.342655,0.340245,0.346531,0.3544,0.358838,0.359311,0.360271,0.359685,0.358741,0.353126,0.347826,0.344916,0.666786,0.666085,0.675314,0.68875,0.700093,0.704702,0.706513,0.705679,0.704904,0.698213,0.687075,0.674864,0.370689,0.369228,0.374976,0.385622,0.391511,0.394641,0.396362,0.393833,0.391273,0.38542,0.378187,0.373244,0.522575,132.5267,19699.02,3459.399,13.528574,397555.2,0.007833,1.073573,16.003476,110.73315,0.075108,453.017146
std,40878680.0,7.58429,768.992166,136.409312,31.692061,316.61844,9.151237,17.607857,15.075214,16.030602,15.609636,16.641816,15.170108,78.356729,58.204716,62.496032,60.264736,66.13075,58.680856,58.850612,78.27394,66.461816,74.288738,62.156819,77.617876,85.989567,94.769409,87.776912,96.055958,84.489059,96.930149,275.469705,108.18189,146.914379,127.046936,179.656298,111.750122,154.3776,126.707287,112.624767,102.066359,125.310642,120.232767,23.466471,9.067507,9.630548,9.329811,9.560445,9.223978,9.679373,164.571713,143.109459,155.197473,149.162198,161.650116,145.974699,116.337423,102.899908,113.290686,110.757625,116.263803,106.522373,420.820136,210.491309,243.78433,206.664374,308.163242,205.900773,1712.02496,1277.2232,1705.980379,1682.498083,689.774946,643.095896,592.679085,470.6155,516.460843,557.09939,990.704856,793.359099,910.933634,918.672102,613.204267,1476.805143,1128.667249,1461.036631,1441.215307,682.924974,1795.517377,1309.28317,1778.908409,1763.010216,649.285099,1597.845929,1141.615786,1552.77959,1557.013367,592.280108,1244.473664,890.700813,1159.417965,1187.536826,570.203176,878.828402,693.429624,741.064546,786.627998,564.980885,621.987339,659.153786,419.021835,483.322961,582.607054,606.506308,667.949203,409.356868,468.179497,583.319256,611.921308,636.138819,418.079667,475.576637,561.853029,579.632813,590.233311,367.940851,432.716082,546.927119,0.120126,0.121291,0.120244,0.123727,0.126774,0.131089,0.131181,0.131062,0.12997,0.127999,0.12416,0.121571,0.204154,0.205466,0.204209,0.207259,0.213377,0.218739,0.218574,0.219105,0.221237,0.218185,0.210712,0.204457,0.125348,0.126725,0.126546,0.130438,0.134787,0.140129,0.139662,0.138971,0.138265,0.134961,0.128688,0.126095,0.174853,12782.2,2309747.0,247191.9,1356.480785,91524630.0,0.012046,4.098486,881.605417,8361.85575,0.15952,4196.228211
min,26375.0,-13.73113,0.0,0.0,0.0,9.877081,7.658674,29.0,46.0,42.0,45.0,40.0,45.0,45.0,22.0,25.0,23.0,26.0,22.0,0.0,0.0,0.0,-1.0,0.0,0.0,15.0,21.0,10.0,18.0,14.0,23.0,-176.0,2.0,13.0,4.0,20.0,3.0,51.0,6.0,44.0,36.0,48.0,26.0,6.0,39.0,43.0,39.0,42.0,38.0,43.0,9.0,5.0,7.0,6.0,9.0,6.0,20.0,15.0,18.0,19.0,19.0,18.0,12.0,3.0,11.0,5.0,-189.0,4.0,67.0,85.0,1.0,97.0,45.0,94.0,233.0,4.0,158.0,361.0,107.0,284.0,8.0,154.0,201.0,4.0,12.0,2.0,1.0,19.0,81.0,322.0,8.0,149.0,384.0,79.0,346.0,-32.0,133.0,314.0,87.0,198.0,-13.0,106.0,181.0,81.0,182.0,33.0,129.0,167.0,79.0,230.0,32.0,156.0,132.0,73.0,229.0,-35.0,160.0,147.0,101.0,158.0,26.0,159.0,142.0,76.0,223.0,16.0,98.0,172.0,0.004967,0.005374,0.004125,0.004878,0.000691,0.001806,0.001029,0.001756,0.003105,0.005769,0.004972,0.004271,0.133861,0.111988,0.133168,0.130426,0.120867,0.121964,0.133418,0.136463,0.127941,0.125441,0.129849,0.132471,0.013437,0.008991,0.010382,0.008422,0.003341,0.005615,0.005608,0.008641,0.00732,0.007639,0.010804,0.010404,-2.431157,6.78e-05,2.33e-08,5.5e-07,9.7e-05,7.69e-08,0.0,0.0,0.0,0.0,0.0,0.0
25%,165861400.0,9.247916,528.315552,53.0,29.685472,300.47876,18.800001,106.0,131.0,118.0,126.0,112.0,129.0,181.0,142.0,153.0,149.0,160.0,145.0,77.0,124.0,86.0,99.0,83.0,109.0,186.0,237.0,218.0,236.0,196.0,239.0,221.0,46.0,103.0,73.0,147.0,56.0,303.0,44.0,169.0,100.0,226.0,64.0,41.0,54.0,56.0,54.0,55.0,54.0,56.0,341.0,329.0,327.0,318.0,337.0,321.0,207.0,193.0,198.0,190.0,205.0,190.0,294.0,47.0,128.0,81.0,186.0,55.0,513.0,2123.0,296.0,569.0,2262.0,457.0,2037.0,268.0,515.0,2327.0,490.0,2024.0,284.0,529.0,2239.0,506.0,2088.0,294.0,556.0,2229.0,516.0,2164.0,299.0,575.0,2303.0,508.0,2153.0,293.0,567.0,2290.0,484.0,2134.0,278.0,548.0,2278.0,443.0,2177.0,257.0,533.0,2339.0,415.0,2176.0,246.0,521.0,2384.0,411.0,2155.0,247.0,512.0,2400.0,416.0,2129.0,251.0,508.0,2398.0,430.0,2086.0,257.0,508.0,2375.0,0.259723,0.25571,0.262847,0.26949,0.270218,0.266189,0.267547,0.267842,0.268322,0.264345,0.261236,0.26367,0.517538,0.513065,0.523269,0.534159,0.535273,0.535938,0.537901,0.533803,0.533828,0.534211,0.530811,0.527064,0.282443,0.278508,0.285188,0.292933,0.29315,0.29182,0.294253,0.290927,0.290144,0.287874,0.289432,0.287467,0.410995,10.63987,0.3102831,0.5613414,1.173489,255.2815,0.003292,0.174866,0.029985,0.042226,0.017207,17.259899
50%,191510300.0,15.536692,853.776184,95.5,50.196312,446.522308,23.681778,118.0,142.0,129.0,137.0,123.0,140.0,228.0,179.0,192.0,186.0,201.0,182.0,114.0,171.0,126.0,142.0,122.0,153.0,239.0,298.0,270.0,296.0,246.0,301.0,357.0,74.0,152.0,109.0,217.0,82.0,411.0,64.0,216.0,130.0,287.0,86.0,55.0,61.0,63.0,61.0,62.0,61.0,63.0,438.0,416.0,425.0,410.0,435.0,411.0,298.0,275.0,288.0,280.0,297.0,277.0,504.0,87.0,200.0,132.0,305.0,95.0,822.0,2616.0,452.0,770.0,2685.0,682.0,2361.0,374.0,661.0,2666.0,735.0,2384.0,409.0,700.0,2638.0,803.0,2530.0,439.0,752.0,2657.0,826.0,2647.0,449.0,770.0,2717.0,798.0,2577.0,432.0,744.0,2685.0,722.0,2491.0,393.0,699.0,2667.0,646.0,2537.0,355.0,668.0,2717.0,611.0,2584.0,339.0,648.0,2758.0,613.0,2586.0,340.0,646.0,2783.0,622.0,2537.0,346.0,645.0,2777.0,639.0,2433.0,354.0,647.0,2730.0,0.328645,0.326466,0.331716,0.342002,0.345296,0.342815,0.343316,0.343399,0.34288,0.338517,0.334033,0.331147,0.641926,0.638621,0.648714,0.663489,0.675773,0.675785,0.677852,0.675323,0.676279,0.666866,0.659124,0.649121,0.356922,0.357106,0.359904,0.372863,0.376943,0.376403,0.377365,0.37507,0.372533,0.368636,0.363573,0.359759,0.509009,15.11232,0.7156651,2.519985,1.48006,724.4318,0.004996,0.357821,0.095136,0.238664,0.033162,63.322952
75%,195100100.0,20.238457,1352.0,181.485718,80.811249,690.003845,30.399998,130.0,150.0,139.0,145.0,135.0,149.0,279.0,213.0,230.0,222.0,242.0,217.0,152.0,227.0,175.0,196.0,168.0,212.0,298.0,359.0,328.0,359.0,304.0,363.0,557.0,129.0,228.0,171.0,323.0,137.0,513.0,120.0,282.0,185.0,367.0,139.0,69.0,68.0,72.0,69.0,70.0,68.0,71.0,579.0,528.0,549.0,527.0,571.0,524.0,385.0,350.0,373.0,363.0,383.0,356.0,768.0,180.0,308.0,223.0,471.0,184.0,1643.0,3228.0,997.0,1355.0,3128.0,1038.0,2747.0,539.0,862.0,3000.0,1192.0,2818.0,638.0,976.0,3001.0,1468.0,3033.0,853.0,1202.0,3060.0,1684.0,3254.0,1033.0,1406.0,3130.0,1457.0,3101.0,822.0,1195.0,3056.0,1164.0,2904.0,624.0,977.0,3005.0,988.0,2990.0,525.0,871.0,3043.0,936.0,3074.0,493.0,841.0,3109.0,934.0,3075.0,486.0,831.0,3142.0,951.0,2989.0,492.0,827.0,3116.0,969.0,2875.0,498.0,824.0,3059.0,0.420693,0.42033,0.429095,0.438893,0.447669,0.452771,0.44718,0.444662,0.444326,0.439987,0.431046,0.424898,0.793944,0.798718,0.810269,0.828689,0.845978,0.860092,0.863993,0.861264,0.856697,0.84564,0.824773,0.800284,0.453025,0.454224,0.461714,0.478076,0.48828,0.496335,0.496437,0.490374,0.487336,0.47887,0.465322,0.455181,0.622383,19.6816,3.586311,14.91886,1.926343,2148.63,0.007774,1.092862,0.330147,1.516574,0.082851,226.9904
max,196766800.0,30.294445,8392.463867,2448.964355,169.582291,2025.894653,63.657616,194.0,192.0,201.0,185.0,204.0,189.0,795.0,692.0,736.0,706.0,692.0,700.0,449.0,516.0,462.0,460.0,448.0,478.0,643.0,708.0,642.0,695.0,649.0,703.0,2465.0,1602.0,2323.0,2158.0,2320.0,1889.0,1032.0,990.0,990.0,812.0,1007.0,920.0,173.0,91.0,95.0,91.0,92.0,91.0,95.0,963.0,960.0,969.0,961.0,963.0,954.0,775.0,708.0,740.0,719.0,777.0,712.0,3798.0,4350.0,3298.0,4114.0,3460.0,4271.0,13283.0,12900.0,12127.0,12981.0,7285.0,9469.0,9310.0,8792.0,9367.0,6288.0,11216.0,11021.0,9890.0,10818.0,6332.0,13804.0,13246.0,12009.0,13456.0,7399.0,12522.0,11889.0,11591.0,12271.0,6519.0,11236.0,10705.0,10527.0,11078.0,6349.0,10001.0,9409.0,9722.0,10016.0,6312.0,9148.0,8126.0,8955.0,9211.0,6300.0,8853.0,8329.0,8345.0,8792.0,6274.0,10236.0,9481.0,9776.0,10225.0,6282.0,10050.0,9211.0,9691.0,10063.0,6274.0,8496.0,7613.0,8173.0,8504.0,6301.0,0.736396,0.739559,0.768923,0.777298,0.767647,0.802514,0.785759,0.791396,0.785267,0.800478,0.928287,0.965172,1.851174,1.828875,1.828262,1.849966,1.809891,1.816861,1.837026,1.859175,1.864058,1.835494,1.861244,1.868899,0.784032,0.784718,0.784878,0.803896,0.798269,0.820608,0.869486,0.851053,0.83662,0.832753,0.805387,0.780904,4.475172,1504254.0,272049400.0,31065550.0,159759.8977,21559110000.0,0.284052,515.672017,63535.386846,739701.798818,9.729029,387491.201058


In [9]:
test_data.describe()

Unnamed: 0,id,WORLDCLIM_BIO1_annual_mean_temperature,WORLDCLIM_BIO12_annual_precipitation,WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month,WORLDCLIM_BIO15_precipitation_seasonality,WORLDCLIM_BIO4_temperature_seasonality,WORLDCLIM_BIO7_temperature_annual_range,SOIL_bdod_0.5cm_mean_0.01_deg,SOIL_bdod_100.200cm_mean_0.01_deg,SOIL_bdod_15.30cm_mean_0.01_deg,SOIL_bdod_30.60cm_mean_0.01_deg,SOIL_bdod_5.15cm_mean_0.01_deg,SOIL_bdod_60.100cm_mean_0.01_deg,SOIL_cec_0.5cm_mean_0.01_deg,SOIL_cec_100.200cm_mean_0.01_deg,SOIL_cec_15.30cm_mean_0.01_deg,SOIL_cec_30.60cm_mean_0.01_deg,SOIL_cec_5.15cm_mean_0.01_deg,SOIL_cec_60.100cm_mean_0.01_deg,SOIL_cfvo_0.5cm_mean_0.01_deg,SOIL_cfvo_100.200cm_mean_0.01_deg,SOIL_cfvo_15.30cm_mean_0.01_deg,SOIL_cfvo_30.60cm_mean_0.01_deg,SOIL_cfvo_5.15cm_mean_0.01_deg,SOIL_cfvo_60.100cm_mean_0.01_deg,SOIL_clay_0.5cm_mean_0.01_deg,SOIL_clay_100.200cm_mean_0.01_deg,SOIL_clay_15.30cm_mean_0.01_deg,SOIL_clay_30.60cm_mean_0.01_deg,SOIL_clay_5.15cm_mean_0.01_deg,SOIL_clay_60.100cm_mean_0.01_deg,SOIL_nitrogen_0.5cm_mean_0.01_deg,SOIL_nitrogen_100.200cm_mean_0.01_deg,SOIL_nitrogen_15.30cm_mean_0.01_deg,SOIL_nitrogen_30.60cm_mean_0.01_deg,SOIL_nitrogen_5.15cm_mean_0.01_deg,SOIL_nitrogen_60.100cm_mean_0.01_deg,SOIL_ocd_0.5cm_mean_0.01_deg,SOIL_ocd_100.200cm_mean_0.01_deg,SOIL_ocd_15.30cm_mean_0.01_deg,SOIL_ocd_30.60cm_mean_0.01_deg,SOIL_ocd_5.15cm_mean_0.01_deg,SOIL_ocd_60.100cm_mean_0.01_deg,SOIL_ocs_0.30cm_mean_0.01_deg,SOIL_phh2o_0.5cm_mean_0.01_deg,SOIL_phh2o_100.200cm_mean_0.01_deg,SOIL_phh2o_15.30cm_mean_0.01_deg,SOIL_phh2o_30.60cm_mean_0.01_deg,SOIL_phh2o_5.15cm_mean_0.01_deg,SOIL_phh2o_60.100cm_mean_0.01_deg,SOIL_sand_0.5cm_mean_0.01_deg,SOIL_sand_100.200cm_mean_0.01_deg,SOIL_sand_15.30cm_mean_0.01_deg,SOIL_sand_30.60cm_mean_0.01_deg,SOIL_sand_5.15cm_mean_0.01_deg,SOIL_sand_60.100cm_mean_0.01_deg,SOIL_silt_0.5cm_mean_0.01_deg,SOIL_silt_100.200cm_mean_0.01_deg,SOIL_silt_15.30cm_mean_0.01_deg,SOIL_silt_30.60cm_mean_0.01_deg,SOIL_silt_5.15cm_mean_0.01_deg,SOIL_silt_60.100cm_mean_0.01_deg,SOIL_soc_0.5cm_mean_0.01_deg,SOIL_soc_100.200cm_mean_0.01_deg,SOIL_soc_15.30cm_mean_0.01_deg,SOIL_soc_30.60cm_mean_0.01_deg,SOIL_soc_5.15cm_mean_0.01_deg,SOIL_soc_60.100cm_mean_0.01_deg,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m1,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m1,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m1,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m1,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m1,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m10,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m10,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m10,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m10,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m10,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m11,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m11,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m11,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m11,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m11,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m12,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m12,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m12,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m12,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m12,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m2,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m2,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m2,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m2,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m2,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m3,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m3,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m3,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m3,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m3,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m4,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m4,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m4,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m4,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m4,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m5,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m5,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m5,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m5,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m5,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m6,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m6,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m6,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m6,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m6,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m7,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m7,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m7,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m7,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m7,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m8,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m8,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m8,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m8,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m8,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m9,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m9,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m9,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m9,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m9,VOD_C_2002_2018_multiyear_mean_m01,VOD_C_2002_2018_multiyear_mean_m02,VOD_C_2002_2018_multiyear_mean_m03,VOD_C_2002_2018_multiyear_mean_m04,VOD_C_2002_2018_multiyear_mean_m05,VOD_C_2002_2018_multiyear_mean_m06,VOD_C_2002_2018_multiyear_mean_m07,VOD_C_2002_2018_multiyear_mean_m08,VOD_C_2002_2018_multiyear_mean_m09,VOD_C_2002_2018_multiyear_mean_m10,VOD_C_2002_2018_multiyear_mean_m11,VOD_C_2002_2018_multiyear_mean_m12,VOD_Ku_1987_2017_multiyear_mean_m01,VOD_Ku_1987_2017_multiyear_mean_m02,VOD_Ku_1987_2017_multiyear_mean_m03,VOD_Ku_1987_2017_multiyear_mean_m04,VOD_Ku_1987_2017_multiyear_mean_m05,VOD_Ku_1987_2017_multiyear_mean_m06,VOD_Ku_1987_2017_multiyear_mean_m07,VOD_Ku_1987_2017_multiyear_mean_m08,VOD_Ku_1987_2017_multiyear_mean_m09,VOD_Ku_1987_2017_multiyear_mean_m10,VOD_Ku_1987_2017_multiyear_mean_m11,VOD_Ku_1987_2017_multiyear_mean_m12,VOD_X_1997_2018_multiyear_mean_m01,VOD_X_1997_2018_multiyear_mean_m02,VOD_X_1997_2018_multiyear_mean_m03,VOD_X_1997_2018_multiyear_mean_m04,VOD_X_1997_2018_multiyear_mean_m05,VOD_X_1997_2018_multiyear_mean_m06,VOD_X_1997_2018_multiyear_mean_m07,VOD_X_1997_2018_multiyear_mean_m08,VOD_X_1997_2018_multiyear_mean_m09,VOD_X_1997_2018_multiyear_mean_m10,VOD_X_1997_2018_multiyear_mean_m11,VOD_X_1997_2018_multiyear_mean_m12
count,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0,6545.0
mean,193024700.0,13.565744,1058.339104,125.92957,51.638085,542.317973,25.467087,115.637892,141.002903,127.907105,135.385943,122.638808,139.517341,245.199236,180.308021,196.979068,189.765775,209.829335,183.887548,116.269824,173.734759,130.557219,146.640031,125.532773,159.042475,240.099465,291.94301,268.689534,291.229488,247.470741,294.722689,459.702521,117.711077,203.683117,155.849045,276.318411,126.266005,432.070588,118.944691,253.367914,168.048892,320.190374,137.004889,59.295951,60.75615,63.934148,61.470283,62.487089,61.135065,63.333537,453.034072,426.387777,436.079756,421.379985,447.186402,421.41864,306.867227,281.65699,295.221085,287.39893,305.330787,283.867532,652.422154,176.735065,295.345149,214.745149,422.797708,182.399236,1515.910466,2865.258365,1231.067838,1479.862796,2641.078839,847.029335,2405.992819,498.677922,786.122231,2619.010848,1030.18793,2456.464324,700.036822,969.204278,2566.904354,1345.130023,2701.611765,1057.181054,1305.546677,2590.941176,1578.713827,2936.809931,1270.396333,1534.582582,2681.714133,1418.310008,2807.324675,1080.766539,1365.930787,2642.6356,1121.012681,2628.614973,771.289228,1075.066769,2609.132162,902.32987,2644.420015,562.643239,881.054087,2673.532315,800.643392,2674.879297,470.097173,786.062032,2726.447823,791.951566,2662.201833,461.09809,771.177846,2750.078533,804.586555,2606.0,467.993277,772.27838,2740.702063,800.571887,2501.173873,458.181818,755.808098,2693.651642,0.341399,0.338827,0.346702,0.356099,0.361309,0.362676,0.362929,0.361882,0.36111,0.355195,0.348207,0.344661,0.661572,0.659749,0.670366,0.687567,0.700887,0.704848,0.707542,0.706592,0.705976,0.699885,0.686512,0.67122,0.368734,0.367606,0.374341,0.386791,0.39341,0.397236,0.399315,0.395898,0.393781,0.387597,0.378937,0.372905
std,26013770.0,7.295813,750.088379,139.210627,30.663688,296.284472,8.424315,18.500107,15.43276,16.717081,16.244423,17.315779,15.51167,81.215023,56.018958,61.95852,59.063559,66.50504,56.879813,56.044237,75.297328,64.032353,71.035256,60.253584,74.278319,81.759672,92.925922,85.400927,94.494134,81.210889,95.046821,291.918397,119.364241,158.445567,137.36798,192.033552,120.955134,156.012757,145.857843,127.783683,118.684766,136.626079,138.101944,24.641078,8.70126,9.285694,8.957439,9.163694,8.867837,9.299132,161.243608,141.173631,152.86054,147.119091,159.119245,143.959696,116.740758,103.08176,113.150214,111.318245,116.491473,106.714543,475.725107,252.61995,298.537978,247.887277,367.255684,247.128243,1753.603277,1310.764297,1757.628959,1732.573057,687.446332,652.09628,588.540876,517.659541,555.609416,541.777228,1004.804167,807.668041,941.385827,948.447404,603.822911,1529.795442,1174.833779,1524.789651,1504.548263,683.556825,1845.137493,1349.558325,1839.770883,1821.973153,644.79656,1659.468576,1179.547617,1627.743304,1629.536918,576.443817,1313.417167,919.045547,1243.144968,1268.488537,555.231572,924.399891,717.077524,802.625518,846.041069,555.165452,659.264197,675.782563,492.771416,547.732089,580.592209,670.424318,689.835613,522.579122,569.053148,580.302212,701.830599,670.155719,564.007179,607.955244,558.216387,631.849132,600.169482,482.407767,528.642367,535.186677,0.112385,0.113352,0.112231,0.115868,0.119903,0.12802,0.128919,0.128548,0.126668,0.122802,0.11731,0.113272,0.190099,0.192704,0.192122,0.196124,0.203658,0.210567,0.210859,0.211506,0.214424,0.210243,0.199825,0.190501,0.117561,0.119423,0.119471,0.123467,0.128634,0.137264,0.137448,0.137287,0.135884,0.129999,0.121263,0.117669
min,1040495.0,-10.908333,8.0,3.566667,5.651237,18.719866,7.725,44.0,56.0,46.0,54.0,43.0,54.0,44.0,20.0,23.0,21.0,30.0,20.0,0.0,1.0,1.0,0.0,0.0,0.0,17.0,26.0,13.0,18.0,16.0,26.0,24.0,8.0,24.0,15.0,25.0,8.0,56.0,17.0,46.0,40.0,47.0,26.0,7.0,40.0,46.0,41.0,43.0,39.0,45.0,34.0,31.0,35.0,29.0,34.0,30.0,14.0,15.0,13.0,13.0,14.0,14.0,27.0,7.0,18.0,13.0,22.0,10.0,66.0,71.0,1.0,69.0,33.0,116.0,314.0,43.0,204.0,419.0,80.0,67.0,47.0,97.0,36.0,7.0,6.0,17.0,8.0,19.0,148.0,426.0,41.0,238.0,438.0,153.0,346.0,39.0,239.0,395.0,163.0,325.0,46.0,193.0,378.0,126.0,344.0,42.0,179.0,424.0,132.0,371.0,41.0,168.0,427.0,123.0,380.0,54.0,207.0,439.0,73.0,352.0,33.0,178.0,398.0,94.0,358.0,49.0,207.0,414.0,0.012404,0.013685,0.0128,0.006347,0.006429,0.000902,0.000684,0.001209,0.001351,0.002707,0.006905,0.007407,0.141941,0.102428,0.152738,0.149905,0.136452,0.136854,0.147167,0.145134,0.141948,0.143026,0.143536,0.14396,0.015426,0.021764,0.024555,0.022465,0.015992,0.005243,0.008057,0.006664,0.004333,0.012781,0.01312,0.009986
25%,197169000.0,8.374963,564.633362,51.92857,26.002605,346.196808,19.654222,104.0,131.0,118.0,126.0,111.0,130.0,188.0,145.0,156.0,151.0,166.0,147.0,78.0,121.0,85.0,96.0,83.0,107.0,188.0,231.0,215.0,232.0,197.0,233.0,249.0,48.0,113.0,79.0,159.0,58.0,327.0,44.0,174.0,99.0,231.0,63.0,43.0,54.0,56.0,54.0,55.0,54.0,55.0,340.0,330.0,326.0,317.0,336.0,321.0,217.0,204.0,211.0,201.0,217.0,202.0,332.0,47.0,136.0,84.0,203.0,56.0,534.0,2115.0,303.0,576.0,2193.0,468.0,2028.0,270.0,516.0,2299.0,504.0,1985.0,292.0,531.0,2194.0,528.0,2061.0,302.0,566.0,2153.0,543.0,2145.0,308.0,583.0,2251.0,537.0,2132.0,299.0,577.0,2253.0,503.0,2138.0,282.0,557.0,2250.0,449.0,2189.0,256.0,540.0,2321.0,414.0,2188.0,245.0,522.0,2372.0,410.0,2163.0,245.0,510.0,2378.0,413.0,2134.0,247.0,506.0,2368.0,431.0,2086.0,256.0,503.0,2349.0,0.26417,0.257541,0.264377,0.27461,0.278231,0.271642,0.269695,0.268371,0.270216,0.266947,0.265838,0.270181,0.519196,0.51537,0.525303,0.537248,0.541346,0.539683,0.539249,0.534868,0.531964,0.535293,0.534256,0.531138,0.28439,0.278782,0.286581,0.296758,0.300443,0.295987,0.297844,0.293414,0.291561,0.291953,0.292901,0.290589
50%,200827600.0,14.376865,839.700012,89.60714,44.043266,476.105438,24.289333,117.0,143.0,129.0,138.0,123.0,142.0,240.0,180.0,196.0,190.0,208.0,184.0,112.0,167.0,124.0,140.0,121.0,149.0,238.0,294.0,265.0,289.0,244.0,295.0,400.0,78.0,161.0,116.0,226.0,88.0,424.0,61.0,221.0,128.0,289.0,84.0,55.0,60.0,63.0,60.0,61.0,60.0,62.0,427.0,418.0,419.0,409.0,424.0,410.0,315.0,290.0,304.0,297.0,315.0,292.0,542.0,87.0,203.0,135.0,313.0,96.0,833.0,2590.0,464.0,778.0,2611.0,665.0,2340.0,365.0,655.0,2618.0,720.0,2339.0,399.0,686.0,2570.0,801.0,2492.0,445.0,757.0,2588.0,832.0,2608.0,466.0,780.0,2646.0,795.0,2530.0,442.0,744.0,2623.0,717.0,2464.0,394.0,699.0,2620.0,637.0,2558.0,353.0,669.0,2708.0,605.0,2621.0,342.0,649.0,2750.0,607.0,2613.0,342.0,644.0,2762.0,621.0,2544.0,348.0,648.0,2751.0,630.0,2442.0,352.0,644.0,2696.0,0.326893,0.324936,0.330976,0.343262,0.346496,0.341095,0.343905,0.342396,0.342488,0.33899,0.334774,0.33147,0.637221,0.632924,0.640823,0.664189,0.6762,0.67257,0.679958,0.677875,0.679432,0.670729,0.660385,0.646126,0.356757,0.353742,0.35926,0.374658,0.378144,0.375051,0.378714,0.375422,0.375623,0.370606,0.364082,0.359433
75%,202267200.0,18.858631,1327.666626,154.409531,74.72126,723.948975,29.916,129.0,152.0,140.0,146.0,136.0,150.0,291.0,215.0,232.0,227.0,248.0,220.0,149.0,221.0,170.0,189.0,164.0,205.0,294.0,352.0,323.0,352.0,300.0,356.0,614.0,141.0,243.0,187.0,337.0,149.0,530.0,117.0,289.0,185.0,377.0,141.0,71.0,66.0,71.0,67.0,69.0,67.0,70.0,557.0,511.0,531.0,508.0,551.0,506.0,398.0,358.0,382.0,372.0,395.0,366.0,822.0,193.0,326.0,235.0,502.0,199.0,1657.0,3161.0,1203.0,1464.0,3061.0,979.0,2700.0,519.0,831.0,2930.0,1101.0,2747.0,619.0,937.0,2921.0,1394.0,2988.0,872.0,1178.0,3003.0,1635.0,3183.0,1130.0,1412.0,3080.0,1386.0,3038.0,811.0,1167.0,2992.0,1091.0,2848.0,611.0,953.0,2946.0,927.0,3007.0,504.0,850.0,3025.0,914.0,3130.0,492.0,831.0,3101.0,926.0,3111.0,490.0,835.0,3131.0,950.0,3001.0,495.0,829.0,3098.0,956.0,2845.0,496.0,818.0,3028.0,0.415997,0.417162,0.429696,0.44111,0.447187,0.454829,0.442399,0.442492,0.442556,0.436384,0.427533,0.420018,0.791309,0.790568,0.807855,0.832122,0.850222,0.854681,0.853029,0.850969,0.846555,0.837616,0.820365,0.800046,0.448961,0.449481,0.460342,0.479379,0.491111,0.497005,0.493472,0.486382,0.483449,0.475766,0.463359,0.452274
max,203065400.0,28.243513,6760.715332,2401.733398,174.304184,1743.707031,58.549999,160.0,188.0,174.0,180.0,169.0,187.0,809.0,528.0,577.0,532.0,601.0,523.0,417.0,480.0,441.0,457.0,423.0,468.0,566.0,654.0,583.0,621.0,582.0,644.0,2422.0,1231.0,2007.0,1727.0,2349.0,1338.0,1017.0,914.0,980.0,837.0,1066.0,874.0,173.0,90.0,90.0,90.0,90.0,90.0,90.0,966.0,956.0,969.0,964.0,966.0,951.0,661.0,658.0,658.0,658.0,657.0,653.0,3798.0,2422.0,2670.0,2323.0,3347.0,2157.0,12546.0,12285.0,10947.0,12157.0,6502.0,7918.0,7817.0,7241.0,7706.0,5276.0,10045.0,9727.0,9318.0,9838.0,5735.0,12288.0,11961.0,10681.0,11849.0,6425.0,11383.0,10976.0,10486.0,11135.0,5868.0,10746.0,9932.0,10170.0,10691.0,5408.0,9684.0,8752.0,9263.0,9626.0,5452.0,8515.0,7625.0,8738.0,8688.0,5538.0,6937.0,6988.0,6219.0,6807.0,5615.0,9433.0,8800.0,8555.0,9286.0,5618.0,9296.0,8809.0,8579.0,9218.0,5533.0,7734.0,7131.0,7310.0,7716.0,5441.0,0.702531,0.703547,0.715175,0.731704,0.728001,0.73125,0.781623,0.791396,0.76018,0.796628,0.881747,0.853731,1.701391,1.682245,1.66042,1.667239,1.667029,1.677357,1.671102,1.682955,1.680495,1.658774,1.693824,1.712369,0.75814,0.755134,0.752027,0.774559,0.780773,0.801355,0.839684,0.839229,0.825483,0.803632,0.775951,0.75935


### Porównanie liczb kolumn train i test
Zauważamy, że dane treningowe mają więcej kolumn niż testowe

In [15]:
train_columns = set(train_data.columns)
test_columns = set(test_csv.columns)

only_in_train = train_columns - test_columns

print("Kolumny tylko w train: ", only_in_train)

AttributeError: 'str' object has no attribute 'columns'

Te kolumny to są nasze zmienne zależne, które musimy przewidzieć.

## 2. Wyodrębnienie danych MODIS i VOD

### Kroki:

* Filtrowanie kolumn związanych z MODIS i VOD.
* Podstawowe statystyki dla obu grup cech (min, max, rozkład wartości).
* Sprawdzenie liczby miesięcy w każdej grupie danych (czy mamy kompletne miesiące?).


In [16]:
modis_vod_cols = [col for col in train_data.columns if col.startswith("MODIS_") or col.startswith("VOD_")]

df_modis_vod = train_data[modis_vod_cols].copy()

***
Szybki podgląd
***

In [17]:
df_modis_vod.head()

Unnamed: 0,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m1,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m1,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m1,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m1,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m1,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m10,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m10,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m10,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m10,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m10,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m11,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m11,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m11,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m11,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m11,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m12,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m12,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m12,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m12,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m12,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m2,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m2,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m2,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m2,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m2,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m3,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m3,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m3,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m3,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m3,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m4,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m4,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m4,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m4,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m4,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m5,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m5,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m5,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m5,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m5,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m6,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m6,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m6,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m6,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m6,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m7,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m7,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m7,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m7,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m7,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m8,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m8,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m8,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m8,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m8,MODIS_2000.2020_monthly_mean_surface_reflectance_band_01_._month_m9,MODIS_2000.2020_monthly_mean_surface_reflectance_band_02_._month_m9,MODIS_2000.2020_monthly_mean_surface_reflectance_band_03_._month_m9,MODIS_2000.2020_monthly_mean_surface_reflectance_band_04_._month_m9,MODIS_2000.2020_monthly_mean_surface_reflectance_band_05_._month_m9,VOD_C_2002_2018_multiyear_mean_m01,VOD_C_2002_2018_multiyear_mean_m02,VOD_C_2002_2018_multiyear_mean_m03,VOD_C_2002_2018_multiyear_mean_m04,VOD_C_2002_2018_multiyear_mean_m05,VOD_C_2002_2018_multiyear_mean_m06,VOD_C_2002_2018_multiyear_mean_m07,VOD_C_2002_2018_multiyear_mean_m08,VOD_C_2002_2018_multiyear_mean_m09,VOD_C_2002_2018_multiyear_mean_m10,VOD_C_2002_2018_multiyear_mean_m11,VOD_C_2002_2018_multiyear_mean_m12,VOD_Ku_1987_2017_multiyear_mean_m01,VOD_Ku_1987_2017_multiyear_mean_m02,VOD_Ku_1987_2017_multiyear_mean_m03,VOD_Ku_1987_2017_multiyear_mean_m04,VOD_Ku_1987_2017_multiyear_mean_m05,VOD_Ku_1987_2017_multiyear_mean_m06,VOD_Ku_1987_2017_multiyear_mean_m07,VOD_Ku_1987_2017_multiyear_mean_m08,VOD_Ku_1987_2017_multiyear_mean_m09,VOD_Ku_1987_2017_multiyear_mean_m10,VOD_Ku_1987_2017_multiyear_mean_m11,VOD_Ku_1987_2017_multiyear_mean_m12,VOD_X_1997_2018_multiyear_mean_m01,VOD_X_1997_2018_multiyear_mean_m02,VOD_X_1997_2018_multiyear_mean_m03,VOD_X_1997_2018_multiyear_mean_m04,VOD_X_1997_2018_multiyear_mean_m05,VOD_X_1997_2018_multiyear_mean_m06,VOD_X_1997_2018_multiyear_mean_m07,VOD_X_1997_2018_multiyear_mean_m08,VOD_X_1997_2018_multiyear_mean_m09,VOD_X_1997_2018_multiyear_mean_m10,VOD_X_1997_2018_multiyear_mean_m11,VOD_X_1997_2018_multiyear_mean_m12
0,543,943,429,568,1199,657,1088,438,618,1474,554,1002,387,558,1352,527,973,399,547,1279,532,978,409,559,1291,472,1074,308,505,1413,511,1195,339,554,1457,574,1240,380,576,1491,672,1202,438,631,1508,718,1177,470,664,1514,753,1191,498,690,1552,710,1136,460,646,1514,0.338971,0.34859,0.390214,0.419935,0.4123,0.399303,0.38195,0.373861,0.377692,0.407391,0.408676,0.36345,0.692482,0.737323,0.795827,0.87774,0.917897,0.8982,0.816776,0.786747,0.804997,0.830753,0.810278,0.742695,0.385135,0.415175,0.455806,0.506616,0.531718,0.509142,0.444842,0.423471,0.444432,0.472432,0.448272,0.403038
1,837,920,527,710,977,1111,1218,708,941,1261,987,1080,621,837,1134,837,920,532,711,976,954,1056,605,808,1109,1077,1196,693,920,1232,1178,1321,760,1007,1342,1222,1356,787,1037,1376,1246,1366,804,1055,1386,1245,1359,804,1054,1377,1193,1299,771,1011,1317,1173,1281,753,993,1311,0.265089,0.248952,0.228648,0.219438,0.212008,0.200188,0.2065,0.204708,0.206359,0.217578,0.238399,0.270674,0.574958,0.570322,0.565816,0.566738,0.567744,0.554762,0.554799,0.555919,0.56238,0.566057,0.571791,0.576877,0.305927,0.298524,0.277699,0.270621,0.264534,0.244796,0.2536,0.251986,0.261273,0.279112,0.294724,0.311158
2,953,3236,519,885,3682,501,4240,294,736,3677,622,3983,371,779,3624,797,3651,443,848,3721,917,3022,482,851,3534,832,2941,445,795,3435,674,3127,375,751,3406,547,3487,305,739,3370,506,3620,273,716,3324,492,3718,264,705,3397,475,3838,267,685,3462,492,4051,296,723,3562,0.441699,0.428869,0.42579,0.414703,0.412291,0.409913,0.406059,0.413054,0.424531,0.439442,0.441828,0.437605,0.7671,0.758742,0.744641,0.74178,0.741885,0.745001,0.735608,0.734416,0.744643,0.761693,0.772351,0.769418,0.454093,0.447695,0.436853,0.439573,0.44615,0.451612,0.444537,0.434794,0.444734,0.455066,0.458448,0.45544
3,565,3225,296,661,3379,617,2934,321,648,3299,594,3078,315,663,3347,568,3196,298,665,3380,512,3194,273,627,3245,496,3172,288,619,3155,439,3050,234,547,3051,424,2897,215,502,2967,452,2749,226,497,2879,515,2682,256,519,2943,583,2684,289,552,3068,601,2759,299,590,3184,0.34065,0.33108,0.343284,0.334147,0.338084,0.336621,0.345327,0.338007,0.334226,0.335818,0.344647,0.336601,0.568767,0.566518,0.547143,0.534266,0.521404,0.525121,0.52299,0.535659,0.529422,0.545735,0.545098,0.552487,0.347082,0.35685,0.334322,0.328369,0.316033,0.305631,0.321647,0.329818,0.318389,0.350208,0.349995,0.348838
4,3137,3253,3727,3386,2286,838,1678,598,771,1973,2177,2550,2464,2280,1963,3060,3090,3768,3374,2200,3499,3787,3829,3631,2634,3956,4266,3982,3991,2608,2454,2806,2229,2378,1838,658,1756,362,629,2049,337,3249,218,555,2816,324,3462,229,537,2994,372,2878,253,524,2703,503,2375,283,573,2504,0.458608,0.30974,0.354649,0.393623,0.421163,0.537507,0.581277,0.574287,0.521677,0.454662,0.457658,0.474325,0.565585,0.507617,0.590297,0.688077,0.815177,0.907414,0.951749,0.968711,0.942761,0.874929,0.777615,0.678492,0.406564,0.3118,0.358661,0.419538,0.466089,0.562763,0.608216,0.610752,0.56529,0.488996,0.457526,0.448166


In [None]:
df_modis_vod.shape

In [None]:
df_modis_vod.describe()

### Sprawdzenie wartości NaN i null w podzbiorze modis_vod

In [None]:
df_modis_vod.isnull().sum().sum()

### Duplikaty

In [None]:
num_duplicates = df_modis_vod.duplicated().sum()
print("Liczba zduplikowanych wierszy:", num_duplicates)

W podzbiorze MODIS zidentyfikowano 18512 duplikatów, jednak ich usunięcie nie jest możliwe bez analizy pełnego zbioru danych. Ich obecność może być wynikiem kontekstu lub relacji z innymi danymi.

## 3. Analiza wartości VOD
### Kroki:
* Wartości max i min występujące w podzbiorze kolumn VOD.
* Histogramy.
* Analiza korelacji.
* Wykres liniowy na przestrzenie miesięcy dla każdego z pasm.

In [None]:
df_vod   = df_modis_vod[[c for c in df_modis_vod.columns if c.startswith('VOD_')]]

print("Wartość minimalna:", df_vod.min().min())
print("Wartość maksymalna:", df_vod.max().max())

Sprawdzenie czy występują wartości ujemne

In [None]:
negative_mask = (df_vod < 0)
any_negatives = negative_mask.any().any()
print("Czy istnieją wartości ujemne w VOD?", any_negatives)

Podstawowe statystki oraz podgląd na kolumny df_modis_vod pozwoliły na wyróżnienie trzech pasm - C, X i Ku oraz 12 miesięcy, co łączeni daje nam 36 kolumn.


In [None]:
df_vod.hist(figsize=(15, 10), bins=30)
plt.tight_layout()
plt.show()

Dla lepszego zobrazowania zastosowano wykres "boxplot" z biblioteki seaborn.

In [None]:
plt.figure(figsize=(15, 8))
sns.boxplot(data=df_vod, orient='h')
plt.show()

W trakcie eksploracyjnej analizy danych (EDA) zdecydowano o pozostawieniu wszystkich obserwacji w kolumnach VOD, mimo identyfikacji wartości skrajnych, takich jak minimalna (approx. 0.00069) i maksymalna (approx. 1.8689). Zamiast usuwania wartości skrajnych, podjęto decyzję o ich standaryzacji. Poniżej przedstawiono uzasadnienie tej decyzji:

1. Wartości ujemne w danych VOD byłyby błędem technicznym lub artefaktem obliczeń, co mogłoby uzasadniać ich usunięcie. Jednakże w analizowanym zbiorze danych nie występują wartości ujemne, co potwierdza poprawność danych w tym zakresie.
2. Wartości bliskie 0 w VOD mogą być reprezentatywne dla obszarów pustynnych lub innych środowisk o niskiej zawartości biomasy.
3. Wartości maksymalne, takie jak 1.86, mogą odzwierciedlać obszary o bardzo wysokiej zawartości wody w roślinności, np. wilgotne lasy tropikalne. Ich obecność może być rzeczywista i wartościowa.
4. Usuwanie wartości skrajnych powinno być uzasadnione wiedzą domenową lub techniczną wskazującą, że są one błędne. W tym przypadku nie ma dowodów sugerujących, że wartości skrajne w VOD są efektem błędu pomiarowego czy obliczeniowego.
5. Zamiast usuwania wartości skrajnych, w poźniejszym etapie prawdopodobnie zastosuje standaryzację, co pozwali zachować informacje zawarte w danych, jednocześnie ograniczając ich wpływ na modele predykcyjne i dalszą analizę.


### Analiza Korelacji

### Na początek korelacja między różnymi pasmami i miesiącami

In [None]:
corr = df_vod.corr()
plt.figure(figsize=(8, 6))
sns.heatmap(corr, cmap='viridis')
plt.title("Macierz korelacji VOD (pasma i miesiące)")
plt.show()

Na tym etapie widać już jakąś korelację dla pasm C i X.

### Porównanie ogólnego rozkładu pasm

In [None]:
# Porównanie ogólnego rozkładu pasm
vod_c_cols = [c for c in df_vod.columns if c.startswith('VOD_C')]
df_vod_c = df_vod[vod_c_cols]  # DataFrame samych kolumn C
df_vod_c_melt = df_vod_c.melt(var_name="col_name", value_name="vod_value")
df_vod_c_melt["band"] = "C"

vod_ku_cols = [c for c in df_vod.columns if c.startswith('VOD_Ku')]
df_vod_ku = df_vod[vod_ku_cols]
df_vod_ku_melt = df_vod_ku.melt(var_name="col_name", value_name="vod_value")
df_vod_ku_melt["band"] = "Ku"

vod_x_cols = [c for c in df_vod.columns if c.startswith('VOD_X')]
df_vod_x = df_vod[vod_x_cols]
df_vod_x_melt = df_vod_x.melt(var_name="col_name", value_name="vod_value")
df_vod_x_melt["band"] = "X"

df_vod_bands = pd.concat([df_vod_c_melt, df_vod_ku_melt, df_vod_x_melt], ignore_index=True)

plt.figure(figsize=(8, 6))
sns.boxplot(data=df_vod_bands, x="band", y="vod_value")
plt.title("Porównanie rozkładów VOD w pasmach C, Ku i X (wszystkie miesiące razem)")
plt.show()

Dla pasma C i X widzimy bardzo zbliżony rozkład wartości VOD, co może sugerować podobne charakterystyki rejestrowanych danych w tych zakresach częstotliwości. Natomiast pasmo Ku (12–18 GHz) znacząco różni się swoim rozkładem, co można zauważyć przez większą rozpiętość wartości oraz większą liczbę wartości wychodzących poza tzw. zakres międzykwartylowy (IQR). W szczególności w paśmie Ku obserwujemy znacznie więcej wartości odstających, które mogą reprezentować ekstremalne warunki środowiskowe (np. tropikalne lasy o bardzo wysokiej wilgotności lub pustynne obszary o minimalnym poziomie wilgotności).

### Wykres liniowy dla średniej wartości VOD w kolejnych miesiącach dla każdego pasma

In [None]:
bands = ["VOD_C", "VOD_Ku", "VOD_X"]
months = range(1, 13)

plt.figure(figsize=(10, 6))

for band in bands:
    avg_month = []

    for m in months:
        col_suffix = f"_m{m:02d}"
        month_cols = [c for c in df_vod.columns if band in c and col_suffix in c]

        if len(month_cols) == 0:
            avg_month.append(None)
            continue

        col_data = df_vod[month_cols[0]]
        mean_val = col_data.mean(skipna=True)
        avg_month.append(mean_val)

    plt.plot(months, avg_month, marker='o', label=band)

plt.xticks(months)  # miesiące na osi X
plt.xlabel("Miesiąc")
plt.ylabel("Średnia wartość VOD")
plt.title("Sezonowe wahania VOD w poszczególnych pasmach")
plt.legend()
plt.show()

***
Widać, że obserwowany jest wzrost średniej VOD dla każdego pasma w cieplejszych miesiącach oraz podobieństwo średnich cech dla C i X. Można się zastanowić nad scaleniem pasma C i X i sprawdzić jaki wpływ ma to na model.
***

### Heatmapa oraz macierz korelacji dla pasma C i X w celu lepszego zobrazowania ich powiązań

In [None]:
vod_c_cols = [col for col in df_modis_vod.columns if col.startswith('VOD_C')]
vod_x_cols = [col for col in df_modis_vod.columns if col.startswith('VOD_X')]

corr_matrix_c_to_x = df_modis_vod[vod_c_cols + vod_x_cols].corr()

print(corr_matrix_c_to_x)

In [None]:
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix_c_to_x, cmap='viridis')
plt.title("Macierz korelacji VOD_X i VOD_C")
plt.show()

## 4. Scalanie pasm C i X w VOD, aby zobaczyć jak może to wyglądac w preprocessingu

Dlaczego warto to zrobić?
Silna korelacja (approx. 0.96) między C i X oznacza, że wartości mają niemal te same wartości.
Scalenie i uśrednienie ich wartości pozwoli na redukcję wymiarów bez utrarty informacji.
Dla każdego miesiąca tworzymy nową skumulowaną kolumne "CX".

In [1]:
df_modis_vod_cx = df_modis_vod

months = [f"m{str(i).zfill(2)}" for i in range(1, 13)]

for month in months:
    vod_c_col = next((col for col in df_modis_vod_cx.columns if f"VOD_C" in col and month in col), None)
    vod_x_col = next((col for col in df_modis_vod_cx.columns if f"VOD_X" in col and month in col), None)

    if vod_c_col and vod_x_col:
        df_modis_vod_cx[f"VOD_CX_{month}"] = df_modis_vod_cx[[vod_c_col, vod_x_col]].mean(axis=1)

        df_modis_vod_cx.drop(columns=[vod_c_col, vod_x_col], inplace=True)

df_modis_vod_cx.head()

NameError: name 'df_modis_vod' is not defined

***
Powyżej podgląd po scaleniu i drop na X i C.
***

### Zawężenie zakresu (osi czasu) do pór roku.

In [None]:
df_modis_vod_seasons = df_modis_vod_cx

seasons = {
    "Winter": ["m12", "m01", "m02"],
    "Spring": ["m03", "m04", "m05"],
    "Summer": ["m06", "m07", "m08"],
    "Autumn": ["m09", "m10", "m11"]
}

for season, months in seasons.items():
    df_modis_vod_seasons[f"VOD_CX_{season}"] = df_modis_vod_seasons[[f"VOD_CX_{m}" for m in months]].mean(axis=1)
    df_modis_vod_seasons[f"VOD_Ku_{season}"] = df_modis_vod_seasons[[f"VOD_Ku_1987_2017_multiyear_mean_{m}" for m in months]].mean(axis=1)

df_modis_vod_seasons.drop(columns=[col for col in df_modis_vod_seasons.columns if any(m in col for m in sum(seasons.values(), []))], inplace=True)

df_modis_vod_seasons.head()

### Wizualizacja

In [None]:
seasonal_means = df_modis_vod_seasons[["VOD_CX_Winter", "VOD_CX_Spring", "VOD_CX_Summer", "VOD_CX_Autumn",
                      "VOD_Ku_Winter", "VOD_Ku_Spring", "VOD_Ku_Summer", "VOD_Ku_Autumn"]].mean()

# Wykres
plt.figure(figsize=(10, 5))
sns.barplot(x=seasonal_means.index, y=seasonal_means.values, palette="viridis")
plt.title("Średnie wartości VOD dla pór roku")
plt.ylabel("Średnia wartość VOD")
plt.xticks(rotation=45)
plt.show()

### Wykres liniowy tym razem dla pór roku i dwóch pasm VOD

In [None]:
bands = ["VOD_CX", "VOD_Ku"]
seasons = ["Winter", "Spring", "Summer", "Autumn"]

plt.figure(figsize=(10, 6))

for band in bands:
    avg_season = []

    for s in seasons:
        season_cols = [c for c in df_modis_vod_seasons.columns if band in c and s in c]

        if len(season_cols) == 0:
            avg_season.append(None)
            continue

        col_data = df_modis_vod_seasons[season_cols[0]]
        mean_val = col_data.mean(skipna=True)
        avg_season.append(mean_val)

    plt.plot(seasons, avg_season, marker='o', label=band)

print(avg_season)
plt.xticks(seasons)  # miesiące na osi X
plt.xlabel("Pora roku")
plt.ylabel("Średnia wartość VOD")
plt.title("Sezonowe wahania VOD w poszczególnych pasmach")
plt.legend()
plt.show()

## 5. Analiza wartości MODIS
### Kroki:
* Sprawdzenie wartości ujemnnych oraz zapoznanie się ze strukturą.
* Struktura pasm.
* Wykres liniowy.
* Macierz korelacji.

Sprawdzenie czy występują wartości NaN, null oraz wartości ujemne, oraz informacja o makysymalnej i minimalnej wartości w podzbiorze MODIS.

In [None]:
df_modis = df_modis_vod[[c for c in df_modis_vod.columns if c.startswith('MODIS_')]]

negative_mask = (df_modis < 0)
any_negatives = negative_mask.any().any()
print("Czy istnieją wartości ujemne w VOD?", any_negatives)
print(f"Wartość maksymalna dla kolumn VOD: {df_modis.max().max()}")
print(f"Wartość minimalna dla kolumn VOD: {df_modis.min().min()}")

In [None]:
modis_cols = [c for c in df_modis_vod.columns if c.startswith('MODIS_')]
neg_mask = (df_modis_vod[modis_cols] < 0)

df_modis[modis_cols] = df_modis_vod[modis_cols].mask(neg_mask, np.nan)

df_modis.isnull().sum().sum()

W tym przypadku, zgodnie z researchem, zdecydowałem się na usunięcie (zastąpienie ich wartościami 0) wartości ujemnych, ponieważ:
1. W normalnych warunkach odbicie nie powinno być ujemne, ponieważ fizycznie oznaczałoby to, że powierzchnia emituje więcej energii, niż otrzymuje, co jest niemożliwe w typowych warunkach
2. Ujemne wartości w praktyce są często traktowane jako artefakty przetwarzania danych lub błędy pomiarowe
3. Ujemne wartości większe od -0.001 są nieistotne i mogą wynikać z błędów numerycznych

In [None]:
for col in modis_cols:
    df_modis[col].fillna(0, inplace=True)

Mamy bardzo dużo kolumn, dlatego pokazywanie histogramów w formie jak dla VOD było by nieczytelne. Poniżej boxplot

In [None]:
plt.figure(figsize=(15, 15))
sns.boxplot(data=df_modis, orient='h')
plt.show()

Za dużo to nam nie mówi na razie, po za tym, że mamy niesymetreczny rozkład wartości.

## Heatmapa korelacji MODIS

In [None]:
modis_corr = df_modis.corr()
plt.figure(figsize=(8, 6))
sns.heatmap(modis_corr, cmap='viridis')
plt.title("Macierz korelacji MODIS (pasma i miesiące)")
plt.show()

Nadal ciężko z tak wielu kolumn wyciągnąć jakieś sensowne wnioski, natomiast z pomocą przychodzi wykres liniowy dla każdego pasma i avg miesiąca.

In [None]:
modis_bands = sorted(set([col.split("_month_")[0] for col in df_modis.columns if "MODIS" in col]))

months = [f"m{i}" for i in range(1, 13)]

plt.figure(figsize=(12, 6))

for band in modis_bands:
    avg_month = []

    for m in months:
        # Tworzenie poprawnej nazwy kolumny dla danego bandu i miesiąca
        month_cols = [c for c in df_modis.columns if band in c and f"_month_{m}" in c]

        if len(month_cols) == 0:
            avg_month.append(None)
            continue

        col_data = df_modis[month_cols[0]]
        mean_val = col_data.mean(skipna=True)
        avg_month.append(mean_val)

    # Rysowanie wykresu dla danego bandu
    plt.plot(range(1, 13), avg_month, marker='o', label=band)

# Konfiguracja wykresu
plt.xticks(range(1, 13))  # miesiące na osi X
plt.xlabel("Miesiąc")
plt.ylabel("Średnia wartość MODIS")
plt.title("Sezonowe wahania MODIS dla różnych pasm")
plt.legend()
plt.show()

Można zauważyć praktycznie nakładające się na siebie dwie linie - band_01 i band_04. Decyduje się więc na scalenie tych dwóch w jeden band_14. Najpierw jednak sprawdzenie dokładne korelacji między tymi bandami.

In [None]:
band_01_cols = [col for col in df_modis.columns if "band_01" in col]
band_04_cols = [col for col in df_modis.columns if "band_04" in col]

# Sprawdzamy, czy mamy tyle samo miesięcy w obu bandach
band_01_cols.sort()
band_04_cols.sort()

# Tworzymy DataFrame tylko z danymi tych dwóch bandów
df_band_01_04 = df_modis[band_01_cols].copy()
df_band_01_04.columns = [f"band_01_{i+1}" for i in range(len(band_01_cols))]  # Nazwy kolumn dla band_01
df_band_01_04 = df_band_01_04.join(df_modis[band_04_cols].set_axis([f"band_04_{i+1}" for i in range(len(band_04_cols))], axis=1))

# Obliczamy macierz korelacji
corr_matrix = df_band_01_04.corr()

print(corr_matrix)

# Wizualizacja macierzy korelacji
plt.figure(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Macierz korelacji między band_01 i band_04 dla wszystkich miesięcy")
plt.show()

To potwierdza zasadność scalenia tych bandów.

In [None]:
months = [f"m{i}" for i in range(1, 13)]

for month in months:
    band_01_col = next((col for col in df_modis.columns if "band_01" in col and f"_month_{month}" in col), None)
    band_04_col = next((col for col in df_modis.columns if "band_04" in col and f"_month_{month}" in col), None)

    if band_01_col and band_04_col:
        # Tworzymy nową kolumnę jako średnią band_03 i band_04
        df_modis[f"MODIS_2000.2020_monthly_mean_surface_reflectance_band_14_._month_{month}"] = df_modis_vod[[band_01_col, band_04_col]].mean(axis=1)

        df_modis.drop(columns=[band_01_col, band_04_col], inplace=True)

df_modis.info()

### Wykres liniowy po scaleniu band01 i band04

In [None]:
modis_bands = sorted(set([col.split("_month_")[0] for col in df_modis.columns if "MODIS" in col]))
months = [f"m{i}" for i in range(1, 13)]

plt.figure(figsize=(12, 6))

for band in modis_bands:
    avg_month = []

    for m in months:
        month_cols = [c for c in df_modis.columns if band in c and f"_month_{m}" in c]

        if len(month_cols) == 0:
            avg_month.append(None)
            continue

        col_data = df_modis[month_cols[0]]
        mean_val = col_data.mean(skipna=True)
        avg_month.append(mean_val)

    plt.plot(range(1, 13), avg_month, marker='o', label=band)

plt.xticks(range(1, 13))
plt.xlabel("Miesiąc")
plt.ylabel("Średnia wartość MODIS")
plt.title("Sezonowe wahania MODIS dla różnych pasm")
plt.legend()
plt.show()

## 6. Standaryzacja VOD
MinMaxScaler (0-1)

In [None]:
vod_cols = [col for col in df_modis_vod_seasons.columns if "VOD" in col]
scaler_vod = MinMaxScaler()
df_modis_vod_seasons[vod_cols] = scaler_vod.fit_transform(df_modis_vod_seasons[vod_cols])

In [None]:
for col in vod_cols:
    upper_limit = df_modis_vod_seasons[col].quantile(0.99)  # 99 percentyl
    lower_limit = df_modis_vod_seasons[col].quantile(0.01)  # 1 percentyl
    df_modis_vod[col] = df_modis_vod_seasons[col].clip(lower=lower_limit, upper=upper_limit)

### Wizualizacja VOD

In [None]:
df_vod_last = df_modis_vod_seasons[[c for c in df_modis_vod_seasons.columns if c.startswith('VOD_')]]
plt.figure(figsize=(15, 8))
sns.boxplot(data=df_vod_last, orient='h')
plt.show()

## 7. Standaryzacja MODIS
Log Transform (log(x + 1))

In [None]:
modis_cols = [col for col in df_modis.columns if "MODIS" in col]
df_modis[modis_cols] = np.log1p(df_modis[modis_cols])

In [None]:
for col in modis_cols:
    upper_limit = df_modis[col].quantile(0.99)  # 99 percentyl
    lower_limit = df_modis[col].quantile(0.01)  # 1 percentyl
    df_modis[col] = df_modis[col].clip(lower=lower_limit, upper=upper_limit)

### Wizualizacja MODIS

In [None]:
plt.figure(figsize=(15, 15))
sns.boxplot(data=df_modis, orient='h')
plt.show()