## Data Exploration and Splitting
#### Prepared by Isaiah Taylor

In [1]:
import os
import sys
import pathlib
import numpy as np
import pandas as pd

# Put the project's helper directory at the front of the import search path so
# the local `utils` module can be imported. The path is relative to the
# kernel's working directory and is made absolute before use.
module_path = os.path.abspath('../src/utils')
sys.path[:0] = [module_path]
import utils


In [2]:
# Split the croplands sample table into train / validate / test frames with the
# project helper `utils.split_by_rscore` (60 / 20 / 20 by the fractions below).
# The recorded output of this cell reports that split files are written to ../data.
# NOTE(review): the recorded summary shows mean Rscore falling from train (0.785)
# to validate (0.670) to test (0.560), so the split looks ordered by score rather
# than uniformly random — confirm against the helper that this is intended.
train_df, validate_df, test_df = utils.split_by_rscore(
    "../data/croplands3-2.0.0.csv",
    out_dir="../data/",
    class_col="Class",
    score_col="Rscore",
    train_frac=0.6,
    val_frac=0.2,
    test_frac=0.2,
    seed=47   # presumably fixes the helper's RNG for reproducibility — verify in utils
)

Split complete → files in ../data
  ▸ train    n=20,241 • mean= 0.785 • median= 0.804
  ▸ validate n= 6,746 • mean= 0.670 • median= 0.662
  ▸ test     n= 6,748 • mean= 0.560 • median= 0.579


In [3]:
# Inspect the validation split; a bare last expression is rendered by the notebook.
# NOTE(review): the image paths in the recorded output end in ".tif.tif"; the
# duplicate suffix is only stripped later, on the combined table.
validate_df

Unnamed: 0,name,Class,Rscore,label,image,usage
3179,NG1830786,2,0.662500,3cl/v2-0-0/labels/NG1830786_28273_2023-03.tif,3cl/v2-0-0/images/NG1830786_2023-03.tif.tif,validate
5997,BF0384736,2,0.630584,3cl/v2-0-0/labels/BF0384736_13753_2022-04.tif,3cl/v2-0-0/images/BF0384736_2022-04.tif.tif,validate
1963,NG0130294,2,0.691748,3cl/v2-0-0/labels/NG0130294_33866_2017-08.tif,3cl/v2-0-0/images/NG0130294_2017-08.tif.tif,validate
3653,BF0689432,2,0.662500,3cl/v2-0-0/labels/BF0689432_8285_2021-04.tif,3cl/v2-0-0/images/BF0689432_2021-04.tif.tif,validate
2310,MW0058209,2,0.662500,3cl/v2-0-0/labels/MW0058209_3777_2019-08.tif,3cl/v2-0-0/images/MW0058209_2019-08.tif.tif,validate
...,...,...,...,...,...,...
3759,ET1704478,2,0.662500,3cl/v2-0-0/labels/ET1704478_22340_2020-02.tif,3cl/v2-0-0/images/ET1704478_2020-02.tif.tif,validate
1799,ET0591115,2,0.691748,3cl/v2-0-0/labels/ET0591115_12839_2017-02.tif,3cl/v2-0-0/images/ET0591115_2017-02.tif.tif,validate
926,ZM1469069,2,0.691748,3cl/v2-0-0/labels/ZM1469069_16723_2017-08.tif,3cl/v2-0-0/images/ZM1469069_2017-08.tif.tif,validate
4317,ET2479164,2,0.648045,3cl/v2-0-0/labels/ET2479164_22417_2017-02.tif,3cl/v2-0-0/images/ET2479164_2017-02.tif.tif,validate


In [4]:
# Directory that receives the combined split table (written in a later cell).
out_dir = pathlib.Path("../data/")

# Tag every row with the name of the split it belongs to. `assign` returns a
# tagged copy, so train_df / validate_df / test_df are not modified behind the
# back of the cells that created and displayed them, and no
# SettingWithCopyWarning can be raised if the helper returned slices.
dfs = []
for split, df in zip(["train", "validate", "test"], [train_df, validate_df, test_df]):
    dfs.append(df.assign(usage=split))

# Stack the three splits into one table; ignore_index discards the per-split
# row labels and renumbers the rows 0..n-1.
combined = pd.concat(dfs, ignore_index=True)


In [5]:
# Show the list of tagged split frames (train, validate, test — in that order).
# NOTE(review): a list of DataFrames is rendered as wrapped plain text and is
# very long; consider showing a per-split `.shape` or `.head()` instead.
dfs

[            name Class    Rscore  \
 19405  BW1574957    1d  0.825000   
 16112  ZM1740694     2  0.706386   
 18087  ET1631878     2  0.694175   
 13272  NG0706735     2  0.728205   
 18126  NG0699898     2  0.694175   
 ...          ...   ...       ...   
 9371   BJ0346146     2  0.804348   
 19019  UG0008604    1b  0.789374   
 17482  NG1643195     2  0.706386   
 735    TZ1814911     2  0.870748   
 18177  NG2348291     2  0.694175   
 
                                                 label  \
 19405   3cl/v2-0-0/labels/BW1574957_41040_2019-08.tif   
 16112    3cl/v2-0-0/labels/ZM1740694_7112_2022-06.tif   
 18087    3cl/v2-0-0/labels/ET1631878_8452_2019-02.tif   
 13272   3cl/v2-0-0/labels/NG0706735_36657_2019-02.tif   
 18126    3cl/v2-0-0/labels/NG0699898_8784_2019-02.tif   
 ...                                               ...   
 9371    3cl/v2-0-0/labels/BJ0346146_18787_2022-04.tif   
 19019  3cl/v2-0-0/labels/UG0008604_101173_2022-04.tif   
 17482   3cl/v2-0-0/labels/NG164

In [6]:
# Collapse the doubled ".tif.tif" extension at the end of each image path into
# a single ".tif", then discard every row that contains a missing value.
combined = combined.assign(
    image=combined["image"].str.replace(r"\.tif\.tif$", ".tif", regex=True)
).dropna()

In [7]:
# Persist the combined, split-tagged table next to the source data; the row
# index is not written to the file.
combined.to_csv(out_dir.joinpath("croplands3-2.0.0_splits.csv"), index=False)

In [8]:
# Class balance over all splits together.
combined["Class"].value_counts()

Class
2     30772
4       990
1d      905
1a      797
1b      271
Name: count, dtype: int64

In [9]:
# Class balance within the training split only, for comparison with the
# overall counts.
train_df["Class"].value_counts()

Class
2     18463
4       594
1d      543
1a      478
1b      163
Name: count, dtype: int64

## Visualize Data Distribution