In [6]:
import pandas as pd
import numpy as np
from sktime.datatypes import check_is_scitype
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler
from sktime.transformations.panel.catch22 import Catch22
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier 
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, cohen_kappa_score

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [7]:
def transform_data(df: pd.DataFrame, df_label: pd.DataFrame) -> pd.DataFrame:
    if "Unnamed: 0" in df.columns:
        df.drop(columns=["Unnamed: 0"], inplace=True)

    # If labeled data does not contain all polygons
    missing_uuid = list(set(df["uuid"].unique()) - set(df_label["uuid"].unique()))

    if len(missing_uuid) > 0:
        df = df[~df["uuid"].isin(missing_uuid)]

    # Transform data into sktime scitype and check
    df_transformed = df.set_index(["uuid", "date"])[["ndvi"]]
    type_check = check_is_scitype(
        df_transformed,
        scitype="Panel",
        return_metadata=True
    )

    if type_check[0]:
        print(f"Dataframe has correct 'scitype': {type_check[2]['scitype']}")
        return df_transformed
    else:
        print(f"Dataframe does not have the correct 'scitype': {type_check[1]}")

In [8]:
DATA_PATH = "/Users/rafidmahbub/Desktop/DataKind_Geospatial/crop_classification/time_series_analyses/ndvi_series_labeled/Trans_Nzoia_1_ndvi_train.csv"
LABEL_PATH = "/Users/rafidmahbub/Desktop/DataKind_Geospatial/crop_classification/time_series_analyses/ndvi_series_labeled/Trans_Nzoia_1_label_train.csv"
    
df = pd.read_csv(DATA_PATH)
df["date"] = pd.to_datetime(df["date"])
df_label = pd.read_csv(LABEL_PATH)

df_label["class_encoded"] = df_label["class"].astype("category").cat.codes

df_transformed = transform_data(df, df_label)

Dataframe has correct 'scitype': Panel


In [9]:
class RemoveNaNColumns(TransformerMixin, BaseEstimator):
    def __init__(self):
        self.columns_to_drop = None
        
    def fit(self, X, y=None):
        # Validate data type
        X = self._validate_input(X)
        # Identify columns where all elements are NaN
        nan_cols = np.where(np.all(np.isnan(X), axis=0))[0]
        self.columns_to_drop = nan_cols 

        return self 
    def transform(self, X):
        # Drop columns
        X_transformed = np.delete(X, self.columns_to_drop, axis=1)

        return X_transformed
    
    def _validate_input(self, X):
        if isinstance(X, np.ndarray):
            return X 
        else:
            raise TypeError("Object X does not have the required Numpy array format.")

In [17]:
from sklearn.model_selection import StratifiedKFold


warnings.filterwarnings("ignore", category=RuntimeWarning)

uuids = df_label["uuid"].values
labels = df_label["class_encoded"].values

SKF = StratifiedKFold(n_splits=5, shuffle=True)

val_accuracies = []
val_f1_scores = []

for fold, (train_idx, val_idx) in enumerate(SKF.split(uuids, labels)):
    train_ids = uuids[train_idx]
    val_ids = uuids[val_idx]

    # Filtering the multi-index dataframe
    X_train = df_transformed.loc[
        df_transformed.index.get_level_values("uuid").isin(train_ids)
    ]

    X_test = df_transformed.loc[
        df_transformed.index.get_level_values("uuid").isin(val_ids)
    ]

    y_train = df_label.loc[df_label["uuid"].isin(train_ids), "class_encoded"]
    y_test = df_label.loc[df_label["uuid"].isin(val_ids), "class_encoded"]

    p = Pipeline(
        [
            ("catch22", Catch22()),
            ("minmax", MinMaxScaler()),
            ("remove_nan", RemoveNaNColumns()), 
            ("imputer", SimpleImputer(strategy="mean"))
        ]
    )

    X_train_transformed = p.fit_transform(X_train)
    X_test_transformed = p.transform(X_test)

    clf_svc = SVC(
        gamma="auto",
        class_weight=None,
        C=968.6191736411615,
        kernel="poly",
        probability=True
    )
    clf_lgb = LGBMClassifier(
        verbosity=-1,
        lambda_l1=6.534846895156005,
        max_depth=9,
        min_child_weight=0.3426063538482019,
        learning_rate=0.05615369822555346,
        bagging_freq=8,
        min_child_samples=73,
        lambda_l2=0.11610557699921975,
        feature_fraction=0.5001440442609598,
        num_leaves=209,
        min_split_gain=0.10462928402523397,
        bagging_fraction=0.9824426936456173
    )
    clf_xgb = XGBClassifier(
        reg_lambda=3.5161512825735066,
        max_delta_step=1.9115702476973047,
        colsample_bylevel=0.9266123974934999,
        max_depth=10,
        learning_rate=0.18184576490542836,
        objective="multi:softmax",
        min_split_loss=2.291361972153634,
        n_estimators=105,
        colsample_bytree=0.8851986960927963,
        reg_alpha=2.411345757878557,
        subsample=0.8161397597180512
    )
    clf_mlp = MLPClassifier(
        learning_rate="adaptive",
        activation="relu",
        solver="adam",
        max_iter=692,
        alpha=0.004081719188194644,
        hidden_layer_sizes=46
    )

    clf_voting = VotingClassifier(
        estimators=[
            ("SVC", clf_svc),
            ("MLP", clf_mlp),
            ("LGB", clf_lgb),
            ("XGB", clf_xgb)
        ],
        weights=[3, 2, 1, 1],
        voting="hard"
    )

    clf_voting.fit(X_train_transformed, y_train)
    y_test_preds = clf_voting.predict(X_test_transformed)

    val_accuracy = accuracy_score(y_test, y_test_preds)
    print(f"validation accuracy for fold {fold+1}: {val_accuracy}")

    val_accuracies.append(val_accuracy)

    val_f1_score = f1_score(y_test, y_test_preds, labels=[0], average="weighted")
    print(f"validation class 0 f1 for fold {fold+1}: {val_f1_score}")

    val_kappa = cohen_kappa_score(y_test, y_test_preds)
    print(f"validation kappa score for fold {fold+1}: {val_kappa}")

    val_f1_scores.append(val_f1_score)

print(f"Mean validation accuracy: {np.mean(val_accuracies)}")
print(f"Mean validation class 0 f1 score: {np.mean(val_f1_scores)}")



validation accuracy for fold 1: 0.8303655107778819
validation class 0 f1 for fold 1: 0.908881199538639
validation kappa score for fold 1: 0.7497155372393505




validation accuracy for fold 2: 0.8153701968134958
validation class 0 f1 for fold 2: 0.9082672706681767
validation kappa score for fold 2: 0.7263810532947679




validation accuracy for fold 3: 0.8359887535145267
validation class 0 f1 for fold 3: 0.9205607476635514
validation kappa score for fold 3: 0.7585732516359094




validation accuracy for fold 4: 0.8227016885553471
validation class 0 f1 for fold 4: 0.9072164948453608
validation kappa score for fold 4: 0.7379653341275592
validation accuracy for fold 5: 0.8339587242026266
validation class 0 f1 for fold 5: 0.9195402298850575
validation kappa score for fold 5: 0.7559204951910328
Mean validation accuracy: 0.8276769747727757
Mean validation class 0 f1 score: 0.912893188520157


