In [1]:
import pandas as pd
import numpy as np
from sktime.datatypes import check_is_scitype
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import (
    MinMaxScaler,
    StandardScaler,
    RobustScaler
)
from sktime.transformations.panel.catch22 import Catch22
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
def transform_data(df: pd.DataFrame, df_label: pd.DataFrame) -> pd.DataFrame:
    if "Unnamed: 0" in df.columns:
        df.drop(columns=["Unnamed: 0"], inplace=True)

    # Labeled data does not contain two cases
    missing_uuid = list(set(df["uuid"].unique()) - set(df_label["uuid"].unique()))

    df = df[~df["uuid"].isin(missing_uuid)]

    # Transform data into sktime scitype and check
    df_transformed = df.set_index(["uuid", "date"])[["ndvi"]]
    return df_transformed

def train_clf(
        df_transformed: pd.DataFrame,
        df_label: pd.DataFrame,
        scaler: TransformerMixin,
        clf: BaseEstimator
):
    if "Unnamed: 0" in df_label.columns:
        df_label = df_label.drop(columns=["Unnamed: 0"])

    # Create integer categories
    df_label["class_encoded"] = df_label["class"].astype("category").cat.codes 
    uuids = df_label["uuid"].values
    labels = df_label["class_encoded"].values

    """ 
    Here, k-fold cross-validation is performed on the time-series data features
    extracted using Catch22. The transformations are chained using a `Pipeline`
    object to ensure that data leakage does not occur due to incorrect fits and
    transforms.
    """
    train_scores = []
    val_scores = []

    SKF = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for fold, (train_idx, val_idx) in enumerate(SKF.split(uuids, labels)):
        print(f"Stratified k-fold cross-validation, fold {fold + 1}")

        train_ids = uuids[train_idx]
        val_ids = uuids[val_idx]

        # Filtering the multi-index dataframe
        X_train = df_transformed.loc[
            df_transformed.index.get_level_values("uuid").isin(train_ids)
        ]

        X_val = df_transformed.loc[
            df_transformed.index.get_level_values("uuid").isin(val_ids)
        ]

        y_train = df_label.loc[df_label["uuid"].isin(train_ids), "class_encoded"]
        y_val = df_label.loc[df_label["uuid"].isin(val_ids), "class_encoded"]

        # Create a pipeline for performing Catch22 feature extraction and feature scaling
        p = Pipeline(
            [
                ("catch22", Catch22(catch24=True)),
                ("scaler", scaler())
            ]
        )

        # Only apply transform() on validation data to prevent data leakage
        X_train = p.fit_transform(X_train)
        X_val = p.transform(X_val)

        # There may be NaN columns; we remove any such column where all values as NaN

        nan_cols = np.where(np.all(np.isnan(X_train), axis=0))[0]

        if len(nan_cols) > 0:
            print(f"There are {len(nan_cols)} columns with all NaN; removing them")
            X_train = np.delete(X_train, nan_cols, axis=1)
            X_val = np.delete(X_val, nan_cols, axis=1)

        impute = SimpleImputer(strategy="mean")
        X_train_imputed = impute.fit_transform(X_train)
        X_val_imputed = impute.transform(X_val)

        # Fitting the classifier to the training data and labels
        clf.fit(X_train_imputed, y_train)

        y_train_preds = clf.predict(X_train_imputed)
        y_val_preds = clf.predict(X_val_imputed)

        train_accuracy = accuracy_score(y_train, y_train_preds)
        print(f"Training accuracy in fold {fold+1}: {train_accuracy}")

        val_accuracy = accuracy_score(y_val, y_val_preds)
        print(f"Validation accuracy in fold {fold+1}: {val_accuracy}")

        train_scores.append(train_accuracy)
        val_scores.append(val_accuracy)

    train_scores_mean = np.mean(train_scores)
    val_scores_mean = np.mean(val_scores)

    return train_scores_mean, val_scores_mean

In [30]:
DATA_PATH = "/Users/rafidmahbub/Desktop/DataKind_Geospatial/crop_classification/time_series_analyses/tests/df_clean.csv"
LABEL_PATH = "/Users/rafidmahbub/Desktop/DataKind_Geospatial/crop_classification/time_series_analyses/ndvi_series_labeled/Trans_Nzoia_1_tile_0_NDVI_labels.csv"

df = pd.read_csv(DATA_PATH)
df["date"] = pd.to_datetime(df["date"])
df_label = pd.read_csv(LABEL_PATH)

df_transformed = transform_data(df, df_label)

scaler = RobustScaler

clf = RandomForestClassifier(
    n_estimators=150, 
    criterion="log_loss",
    max_depth=15, 
    min_samples_leaf=5,
    max_features=None
)

train_scores_mean, val_scores_mean = train_clf(
        df_transformed,
        df_label,
        scaler,
        clf
    )

Stratified k-fold cross-validation, fold 1


  result = np.apply_along_axis(_nanmedian1d, axis, a, overwrite_input)
  return function_base._ureduce(a,


There are 1 columns with all NaN; removing them
Training accuracy in fold 1: 0.9324866310160428
Validation accuracy in fold 1: 0.8074866310160428
Stratified k-fold cross-validation, fold 2


  result = np.apply_along_axis(_nanmedian1d, axis, a, overwrite_input)
  return function_base._ureduce(a,


There are 1 columns with all NaN; removing them
Training accuracy in fold 2: 0.93048128342246
Validation accuracy in fold 2: 0.7379679144385026
Stratified k-fold cross-validation, fold 3


  result = np.apply_along_axis(_nanmedian1d, axis, a, overwrite_input)
  return function_base._ureduce(a,


There are 1 columns with all NaN; removing them
Training accuracy in fold 3: 0.9318181818181818
Validation accuracy in fold 3: 0.8048128342245989
Stratified k-fold cross-validation, fold 4


  result = np.apply_along_axis(_nanmedian1d, axis, a, overwrite_input)
  return function_base._ureduce(a,


There are 1 columns with all NaN; removing them
Training accuracy in fold 4: 0.9318181818181818
Validation accuracy in fold 4: 0.7620320855614974
Stratified k-fold cross-validation, fold 5


  result = np.apply_along_axis(_nanmedian1d, axis, a, overwrite_input)
  return function_base._ureduce(a,


There are 1 columns with all NaN; removing them
Training accuracy in fold 5: 0.9344919786096256
Validation accuracy in fold 5: 0.8048128342245989


In [31]:
val_scores_mean

0.7834224598930482