In [11]:
import pandas as pd
import numpy as np
from sktime.datatypes import check_is_scitype
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler
from sktime.transformations.panel.catch22 import Catch22
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, f1_score

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [3]:
def transform_data(df: pd.DataFrame, df_label: pd.DataFrame) -> pd.DataFrame:
    if "Unnamed: 0" in df.columns:
        df.drop(columns=["Unnamed: 0"], inplace=True)

    # If labeled data does not contain all polygons
    missing_uuid = list(set(df["uuid"].unique()) - set(df_label["uuid"].unique()))

    if len(missing_uuid) > 0:
        df = df[~df["uuid"].isin(missing_uuid)]

    # Transform data into sktime scitype and check
    df_transformed = df.set_index(["uuid", "date"])[["ndvi"]]
    type_check = check_is_scitype(
        df_transformed,
        scitype="Panel",
        return_metadata=True
    )

    if type_check[0]:
        print(f"Dataframe has correct 'scitype': {type_check[2]['scitype']}")
        return df_transformed
    else:
        print(f"Dataframe does not have the correct 'scitype': {type_check[1]}")

In [4]:
DATA_PATH = "/Users/rafidmahbub/Desktop/DataKind_Geospatial/crop_classification/time_series_analyses/ndvi_series_labeled/ndvi_series_Trans_Nzoia_1_clean.csv"
LABEL_PATH = "/Users/rafidmahbub/Desktop/DataKind_Geospatial/crop_classification/time_series_analyses/ndvi_series_labeled/ndvi_Trans_Nzoia_1_labels.csv"
    
df = pd.read_csv(DATA_PATH)
df["date"] = pd.to_datetime(df["date"])
df_label = pd.read_csv(LABEL_PATH)

df_label["class_encoded"] = df_label["class"].astype("category").cat.codes

df_transformed = transform_data(df, df_label)

Dataframe has correct 'scitype': Panel


In [5]:
class RemoveNaNColumns(TransformerMixin, BaseEstimator):
    def __init__(self):
        self.columns_to_drop = None
        
    def fit(self, X, y=None):
        # Validate data type
        X = self._validate_input(X)
        # Identify columns where all elements are NaN
        nan_cols = np.where(np.all(np.isnan(X), axis=0))[0]
        self.columns_to_drop = nan_cols 

        return self 
    def transform(self, X):
        # Drop columns
        X_transformed = np.delete(X, self.columns_to_drop, axis=1)

        return X_transformed
    
    def _validate_input(self, X):
        if isinstance(X, np.ndarray):
            return X 
        else:
            raise TypeError("Object X does not have the required Numpy array format.")

In [12]:
from sklearn.model_selection import StratifiedKFold


warnings.filterwarnings("ignore", category=RuntimeWarning)

n_classes=4
precision = {i: [] for i in range(n_classes)}
recall = {i: [] for i in range(n_classes)}
threshold = {i: [] for i in range(n_classes)}

uuids = df_label["uuid"].values
labels = df_label["class_encoded"].values

SKF = StratifiedKFold(n_splits=5, shuffle=True)

val_accuracies = []
val_f1_scores = []

for fold, (train_idx, val_idx) in enumerate(SKF.split(uuids, labels)):
    train_ids = uuids[train_idx]
    val_ids = uuids[val_idx]

    # Filtering the multi-index dataframe
    X_train = df_transformed.loc[
        df_transformed.index.get_level_values("uuid").isin(train_ids)
    ]

    X_test = df_transformed.loc[
        df_transformed.index.get_level_values("uuid").isin(val_ids)
    ]

    y_train = df_label.loc[df_label["uuid"].isin(train_ids), "class_encoded"]
    y_test = df_label.loc[df_label["uuid"].isin(val_ids), "class_encoded"]

    p = Pipeline(
        [
            ("catch22", Catch22()),
            ("minmax", MinMaxScaler()),
            ("remove_nan", RemoveNaNColumns()), 
            ("imputer", SimpleImputer(strategy="mean"))
        ]
    )

    X_train_transformed = p.fit_transform(X_train)
    X_test_transformed = p.transform(X_test)

    # select_k = SelectKBest(k=10, score_func=mutual_info_classif)
    # X_train_k = select_k.fit_transform(X_train_transformed, y_train)
    # X_test_k = select_k.transform(X_test_transformed)

    clf = SVC().fit(X_train_transformed, y_train)
    y_test_preds = clf.predict(X_test_transformed)

    val_accuracy = accuracy_score(y_test, y_test_preds)
    print(f"validation accuracy for fold {fold+1}: {val_accuracy}")

    val_accuracies.append(val_accuracy)

    val_f1_score = f1_score(y_test, y_test_preds, labels=[0], average="weighted")
    print(f"validation class 0 f1 for fold {fold+1}: {val_f1_score}")

    val_f1_scores.append(val_f1_score)

print(f"Mean validation accuracy: {np.mean(val_accuracies)}")
print(f"Mean validation class 0 f1 score: {np.mean(val_f1_scores)}")

validation accuracy for fold 1: 0.8088426527958388
validation class 0 f1 for fold 1: 0.9060955518945635
validation accuracy for fold 2: 0.8153446033810143
validation class 0 f1 for fold 2: 0.9015025041736227
validation accuracy for fold 3: 0.811443433029909
validation class 0 f1 for fold 3: 0.8943894389438944
validation accuracy for fold 4: 0.8166449934980494
validation class 0 f1 for fold 4: 0.9198717948717947
validation accuracy for fold 5: 0.8046875
validation class 0 f1 for fold 5: 0.9078498293515358
Mean validation accuracy: 0.8113926365409622
Mean validation class 0 f1 score: 0.9059418238470822
