In [1]:
#from prepare_dataset import CropTypesDataset
from data.russia import Russia
import datetime
import numpy as np
import os
import pandas as pd
import random
import torch



In [2]:
seed = 42
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)

In [3]:
dataroot = os.path.join(os.environ["HOME"], "elects_data", "russia")

In [4]:
rd = Russia(root=dataroot,
           partition="train",
           sequencelength=5,
           year=2018,
           use_cache=True,
           return_id=True,
           broadcast_y=False)

Cache is activated and will be used if possible
Data: train, year: 2018
Trying to use cache
Loading X and y from cache
Russia dataset for 2018 year (train part) is loaded. It contains 8264 fields


In [12]:
X, y = rd["X"], rd["y"]

In [13]:
X.shape, y.shape

((8264, 50), (8264,))

In [4]:
BANDS = [
    "B02",
    "B03",
    "B04",
    "B05",
    "B06",
    "B07",
    "B08",
    "B8A",
    "B11",
    "B12",
]  # , 'NDVI'
TIME_INTERVAL = [(4, 1), (9, 30)]
region = "russia"
year = 2018
train_features_filepath = os.path.join(
    dataroot, f"{region}-{year}/train_features.csv.zip"
)
test_features_filepath = os.path.join(
    dataroot, f"{region}-{year}/test_features.csv.zip"
)
parcelsmapping_path = os.path.join(
    dataroot, f"{region}-{year}/parcelsmapping.csv"
)
classmapping_path = os.path.join(dataroot, "classmapping.csv")


def default_transform(x: np.ndarray, sequencelength: int) -> np.ndarray:
    # choose with replacement if sequencelength smaller als choose_t
    replace = False if x.shape[0] >= sequencelength else True
    idxs = np.random.choice(x.shape[0], sequencelength, replace=replace)
    idxs.sort()
    x = x[idxs]
    return x


cd = CropTypesDataset(
    features_filepath=train_features_filepath,
    classmapping_path=classmapping_path,
    fieldsmapping_path=parcelsmapping_path,
    bands=BANDS,
    time_interval=TIME_INTERVAL,
    sequencelength=5,
    transform=default_transform,
)

# Part 1: Classical Machine Learning

## Data preparation

In [13]:
train = pd.read_csv("/home/pavel/elects_data/russia/russia-2021/train_features.csv.zip")
test = pd.read_csv("/home/pavel/elects_data/russia/russia-2021/test_features.csv.zip")

In [40]:
train.shape, test.shape

((837200, 18), (217682, 18))

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 837200 entries, 0 to 837199
Data columns (total 18 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   index                   837200 non-null  int64  
 1   timestamp               837200 non-null  object 
 2   field_id                837200 non-null  int64  
 3   eopatch                 837200 non-null  object 
 4   parcel_pixels           837200 non-null  int64  
 5   B02                     837200 non-null  float64
 6   B03                     837200 non-null  float64
 7   B04                     837200 non-null  float64
 8   B05                     837200 non-null  float64
 9   B06                     837200 non-null  float64
 10  B07                     837200 non-null  float64
 11  B08                     837200 non-null  float64
 12  B8A                     837200 non-null  float64
 13  B11                     837200 non-null  float64
 14  B12                 

In [9]:
train.head(10)

Unnamed: 0_level_0,index,field_id,eopatch,parcel_pixels,B02,B03,B04,B05,B06,B07,B08,B8A,B11,B12,parcell_cloud_coverage,parcell_data_coverage,class_id
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2021-04-01,0,2591,37UCS_2_1,856,440.0,389.0,437.5,470.0,475.0,509.0,561.0,565.0,988.0,793.5,0.450025,0.522777,0
2021-04-06,1,2591,37UCS_2_1,6151,211.0,277.0,402.0,474.0,511.0,567.0,656.0,689.0,1766.0,1633.0,0.0,0.522777,0
2021-04-11,2,2591,37UCS_2_1,6151,297.0,323.0,457.0,517.0,563.0,630.0,749.0,767.0,1991.0,1913.0,0.0,0.522777,0
2021-05-01,3,2591,37UCS_2_1,6151,471.0,484.0,600.0,694.0,774.0,865.0,983.0,1030.0,2411.0,2295.0,0.0,0.522777,0
2021-05-16,4,2591,37UCS_2_1,5545,317.0,337.0,256.0,371.0,645.0,674.0,678.0,702.0,711.0,533.0,0.051504,0.522777,0
2021-06-10,5,2591,37UCS_2_1,5479,441.0,472.0,446.0,572.0,867.0,991.0,1040.0,1072.0,1511.0,1277.0,0.057114,0.522777,0
2021-06-15,6,2591,37UCS_2_1,6151,411.0,549.0,492.0,759.0,1429.0,1622.0,1654.0,1769.0,2433.0,1914.0,0.0,0.522777,0
2021-06-20,7,2591,37UCS_2_1,6151,472.0,646.0,462.0,795.0,2233.0,2769.0,2812.0,2887.0,2232.0,1508.0,0.0,0.522777,0
2021-06-25,8,2591,37UCS_2_1,6151,392.0,559.0,386.0,703.0,2304.0,3026.0,2952.0,3167.0,1998.0,1158.0,0.0,0.522777,0
2021-06-30,9,2591,37UCS_2_1,1330,1335.0,1412.0,1137.0,1549.0,3884.0,5324.0,5152.0,5431.0,2590.0,1557.0,0.40974,0.522777,0


In [39]:
X_train = train\
    .pivot(
        index="field_id",
        columns="timestamp",
        values=["B02", "B03", "B04", "B05", "B06", "B07", "B08", "B8A", "B11", "B12"])\
    .reset_index(drop=True)
X_train.columns = [f"{a}_{b}" for a, b in X_train.columns]
X_train.interpolate(method="linear", axis=1, inplace=True)
X_train = X_train.fillna(method="ffill", axis=1).fillna(method="bfill", axis=1)

In [40]:
y_train = train.groupby("field_id").agg({"class_id": max})["class_id"]
y_train.shape, y.iloc[0:5]

((30417,),
 field_id
 2591    0
 2592    0
 2593    0
 2594    0
 2595    0
 Name: class_id, dtype: int64)

In [41]:
X_train.head()

Unnamed: 0,B02_2021-04-01,B02_2021-04-02,B02_2021-04-03,B02_2021-04-04,B02_2021-04-05,B02_2021-04-06,B02_2021-04-07,B02_2021-04-08,B02_2021-04-09,B02_2021-04-10,...,B12_2021-09-21,B12_2021-09-22,B12_2021-09-23,B12_2021-09-24,B12_2021-09-25,B12_2021-09-26,B12_2021-09-27,B12_2021-09-28,B12_2021-09-29,B12_2021-09-30
0,440.0,394.2,348.4,302.6,256.8,211.0,228.2,245.4,262.6,279.8,...,2707.0,2707.0,2707.0,2707.0,2707.0,2707.0,2707.0,2707.0,2707.0,2707.0
1,576.0,507.2,438.4,369.6,300.8,232.0,248.6,265.2,281.8,298.4,...,2676.0,2676.0,2676.0,2676.0,2676.0,2676.0,2676.0,2676.0,2676.0,2676.0
2,382.0,354.2,326.4,298.6,270.8,243.0,269.4,295.8,322.2,348.6,...,2883.0,2883.0,2883.0,2883.0,2883.0,2883.0,2883.0,2883.0,2883.0,2883.0
3,1670.0,1385.969331,1101.938661,817.907992,533.877323,249.846653,263.136106,276.425558,289.71501,303.004462,...,2798.38986,2798.38986,2798.38986,2798.38986,2798.38986,2798.38986,2798.38986,2798.38986,2798.38986,2798.38986
4,195.0,195.0,195.0,195.0,195.0,195.0,211.8,228.6,245.4,262.2,...,2854.0,2854.0,2854.0,2854.0,2854.0,2854.0,2854.0,2854.0,2854.0,2854.0


In [42]:
y_train

field_id
2591     0
2592     0
2593     0
2594     0
2595     0
        ..
82578    3
88485    3
88493    3
89284    3
89301    3
Name: class_id, Length: 30417, dtype: int64

In [53]:
X_test = test\
    .pivot(
        index="field_id",
        columns="timestamp",
        values=["B02", "B03", "B04", "B05", "B06", "B07", "B08", "B8A", "B11", "B12"])\
    .reset_index(drop=True)
X_test.columns = [f"{a}_{b}" for a, b in X_test.columns]
X_test.interpolate(method="linear", axis=1, inplace=True)
X_test = X_test.fillna(method="ffill", axis=1).fillna(method="bfill", axis=1)

In [54]:
y_test = train.groupby("field_id").agg({"class_id": max})["class_id"]
y_test.shape, y.iloc[0:5]

((30417,),
 field_id
 2591    0
 2592    0
 2593    0
 2594    0
 2595    0
 Name: class_id, dtype: int64)

## Model Training and Evaluation

In [None]:
from sklearn.metrics import accuracy_score, f1_score

### Random Forest Classifier

In [43]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [46]:
rfc = RandomForestClassifier()

gs_params = {
    "n_estimators": range(10, 110, 30),
    "max_depth": range(3, 11, 3),
    "min_samples_split": range(5, 26, 10)
}

gs1 = GridSearchCV(rfc, gs_params, scoring="accuracy", n_jobs=1, verbose=2, cv=3)
gs1.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV] END ..max_depth=3, min_samples_split=5, n_estimators=10; total time=   3.1s
[CV] END ..max_depth=3, min_samples_split=5, n_estimators=10; total time=   3.1s
[CV] END ..max_depth=3, min_samples_split=5, n_estimators=10; total time=   3.2s
[CV] END ..max_depth=3, min_samples_split=5, n_estimators=40; total time=  12.0s
[CV] END ..max_depth=3, min_samples_split=5, n_estimators=40; total time=  12.0s
[CV] END ..max_depth=3, min_samples_split=5, n_estimators=40; total time=  12.3s
[CV] END ..max_depth=3, min_samples_split=5, n_estimators=70; total time=  21.2s
[CV] END ..max_depth=3, min_samples_split=5, n_estimators=70; total time=  20.5s
[CV] END ..max_depth=3, min_samples_split=5, n_estimators=70; total time=  21.5s
[CV] END .max_depth=3, min_samples_split=5, n_estimators=100; total time=  30.0s
[CV] END .max_depth=3, min_samples_split=5, n_estimators=100; total time=  29.6s
[CV] END .max_depth=3, min_samples_split=5, n_e

[CV] END .max_depth=9, min_samples_split=25, n_estimators=40; total time=  32.0s
[CV] END .max_depth=9, min_samples_split=25, n_estimators=70; total time=  54.1s
[CV] END .max_depth=9, min_samples_split=25, n_estimators=70; total time=  54.2s
[CV] END .max_depth=9, min_samples_split=25, n_estimators=70; total time=  54.9s
[CV] END max_depth=9, min_samples_split=25, n_estimators=100; total time= 1.3min
[CV] END max_depth=9, min_samples_split=25, n_estimators=100; total time= 1.3min
[CV] END max_depth=9, min_samples_split=25, n_estimators=100; total time= 1.3min


In [51]:
gs1.best_score_, gs1.best_estimator_

(0.7124963014103954, RandomForestClassifier(max_depth=9, min_samples_split=25))

In [55]:
y_pred = gs1.predict(X_test)
accuracy_score(y_test,y_pred), f1_score(y_test,y_pred)

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- B02_2021-09-24
- B03_2021-09-24
- B04_2021-09-24
- B05_2021-09-24
- B06_2021-09-24
- ...


In [13]:
from dataclasses import dataclass
import os
import torch
from train import train

In [3]:
@dataclass
class TrainConfg:
    dataset: str = "russia"
    alpha: float = 0.5
    epsilon: float = 10
    learning_rate: float = 10e-3
    weight_decay: float = 0
    patience: int = 30
    device: str = "cuda" if torch.cuda.is_available() else "cpu"               
    epochs: int = 50
    sequencelength: int = 183
    batchsize: int = 128
    dataroot: str = os.path.join(os.environ["HOME"],"elects_data")
    snapshot: str = "snapshots/model.pth"
    resume: bool = False
    year: int = 2021
    use_cache: bool = True
    model: str = "earlyrnn"
    n_months: int = 6
    visualize: bool = False
    hyperparameters: dict = None    

  return torch._C._cuda_getDeviceCount() > 0


In [8]:
args_lgbm = Args(model="lightgbm",
               epochs=5,
               sequencelength=10,
               year=2018,
               hyperparameters={"n_estimators": range(10, 100, 10)})

In [9]:
best_model, train_stats = train(args_lgbm)

Cache is activated and will be used if possible
Data: train, year: 2018
Trying to use cache
Loading X and y from cache
Russia dataset for 2018 year (train part) is loaded. It contains 8264 fields
Cache is activated and will be used if possible
Data: test, year: 2018
Trying to use cache
Loading X and y from cache
Russia dataset for 2018 year (test part) is loaded. It contains 2057 fields
X shape: (8264, 100) y shape: (8264,)
Fitting 5 folds for each of 9 candidates, totalling 45 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'mean_fit_time': array([ 3.85318074,  7.70746722, 12.3065865 , 17.68002954, 20.39265714,
       23.63070712, 27.95570722, 32.86610832, 33.92862926]), 'std_fit_time': array([0.49845427, 0.8264151 , 0.34027949, 1.30468588, 1.91543144,
       3.74669063, 2.92577176, 3.39850096, 4.61809028]), 'mean_score_time': array([0.03542018, 0.06509209, 0.09107647, 0.12735238, 0.15124722,
       0.18001976, 0.2114521 , 0.2502439 , 0.28559928]), 'std_score_time': array([0.00458114, 0.00623242, 0.01198859, 0.0141358 , 0.01376145,
       0.02716286, 0.03350791, 0.02983337, 0.06818945]), 'param_n_estimators': masked_array(data=[10, 20, 30, 40, 50, 60, 70, 80, 90],
             mask=[False, False, False, False, False, False, False, False,
                   False],
       fill_value='?',
            dtype=object), 'params': [{'n_estimators': 10}, {'n_estimators': 20}, {'n_estimators': 30}, {'n_estimators': 40}, {'n_estimators': 50}, {'n_estimators': 60}, {'n_estimators': 70}, {'n_estimators': 80}, {'n_est

In [10]:
import pandas as pd

In [12]:
pd.DataFrame(train_stats)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,split3_test_accuracy,...,std_test_fscore,rank_test_fscore,split0_test_kappa,split1_test_kappa,split2_test_kappa,split3_test_kappa,split4_test_kappa,mean_test_kappa,std_test_kappa,rank_test_kappa
0,3.853181,0.498454,0.03542,0.004581,10,{'n_estimators': 10},0.566848,0.588627,0.568663,0.569873,...,0.019943,9,0.466057,0.498669,0.476524,0.475601,0.430573,0.469485,0.022202,9
1,7.707467,0.826415,0.065092,0.006232,20,{'n_estimators': 20},0.603751,0.61585,0.606171,0.595886,...,0.021415,8,0.517045,0.537395,0.525698,0.512519,0.462104,0.510952,0.025855,8
2,12.306587,0.340279,0.091076,0.011989,30,{'n_estimators': 30},0.628554,0.638838,0.612825,0.609195,...,0.025883,7,0.549056,0.566686,0.534019,0.529068,0.472744,0.530315,0.031635,7
3,17.68003,1.304686,0.127352,0.014136,40,{'n_estimators': 40},0.636419,0.638234,0.61343,0.610405,...,0.025092,6,0.559101,0.566637,0.535022,0.53004,0.477836,0.533727,0.031196,6
4,20.392657,1.915431,0.151247,0.013761,50,{'n_estimators': 50},0.643073,0.642468,0.612825,0.61585,...,0.024535,5,0.567474,0.571205,0.534098,0.536829,0.484024,0.538726,0.031301,5
5,23.630707,3.746691,0.18002,0.027163,60,{'n_estimators': 60},0.650938,0.648518,0.609195,0.61343,...,0.028137,3,0.577025,0.578207,0.529409,0.533875,0.482352,0.540174,0.035506,3
6,27.955707,2.925772,0.211452,0.033508,70,{'n_estimators': 70},0.644283,0.648518,0.607985,0.620085,...,0.027514,4,0.56907,0.578262,0.527946,0.541744,0.480683,0.539541,0.034568,4
7,32.866108,3.398501,0.250244,0.029833,80,{'n_estimators': 80},0.649728,0.650938,0.611615,0.623109,...,0.029531,2,0.57564,0.581581,0.531844,0.545246,0.477528,0.542368,0.037331,2
8,33.928629,4.61809,0.285599,0.068189,90,{'n_estimators': 90},0.654567,0.655777,0.6098,0.623109,...,0.030719,1,0.581777,0.587517,0.529634,0.545419,0.482231,0.545316,0.038304,1
