In [373]:
#!c1.32
# %pip install geojson shapely seaborn

In [681]:
#!c1.32
import geojson
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from shapely.geometry import shape
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import make_scorer, recall_score, precision_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from xgboost import XGBClassifier

%matplotlib inline

# Загрузка данных

In [682]:
#!c1.32
df = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [None]:
df.head(3)

In [None]:
df.info()

# 0. Baseline (простой Random Forest)

In [None]:
X = df.drop(["id",".geo", "crop"], axis = 1)
y = df["crop"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
rfc = RandomForestClassifier(random_state=0, n_estimators = 3)
rfc.fit(X_train, y_train)

In [None]:
y_pred_0 = rfc.predict(X_test)
recall_score(y_test, y_pred_0, average="macro", zero_division=0)

In [None]:
X_test = test.drop([".geo", "id"], axis=1)
ids = test["id"].to_list()
y_pred_test_0 = rfc.predict(X_test)
submission = pd.DataFrame({"id": ids, "crop": y_pred_test_0})
submission.to_csv("submission_0.csv", index=None)

# 1. Разные классификаторы без работы с признаками

Везде подбираются оптимальные параметры с помощью grid search.

In [683]:
#!c1.32
recall_scorer = make_scorer(recall_score, average="macro", zero_division=0)

## 1.1. Random forest

In [None]:
gs_params = {
    "n_estimators": range(3, 30),
    "criterion": ["gini", "entropy"],
    "max_depth": range(3, 31, 3),
    "min_samples_split": range(5, 31, 5)
}

gs = GridSearchCV(RandomForestClassifier(), gs_params, scoring=recall_scorer, cv=3, verbose=2)
gs.fit(X_train, y_train)

In [None]:
gs.best_score_, gs.best_estimator_

In [None]:
X_test = test.drop([".geo", "id"], axis=1)
ids = test["id"].to_list()
y_pred_test_1 = gs.best_estimator_.predict(X_test)
submission1 = pd.DataFrame({"id": ids, "crop": y_pred_test_1})
submission1.to_csv("submission_1.csv", index=None)

## 2. Ridge classifier

In [None]:
gs2_params = {
    "alpha": [x / 10 for x in range(1, 101, 2)]
}

gs2 = GridSearchCV(RidgeClassifier(), gs2_params, scoring=recall_scorer, cv=3, verbose=2)
gs2.fit(X_train, y_train)

In [None]:
gs2.best_score_, gs2.best_estimator_

## 1.3. Logistic regression

In [None]:
gs3_params = {
    "penalty": ["l1", "l2", "none", "elasticnet"],
    "C": [0.01, 0.1, 0.5, 1, 10, 100]
}

gs3 = GridSearchCV(LogisticRegression(), gs3_params, scoring=recall_scorer, cv=3, verbose=2)
gs3.fit(X_train, y_train)

In [None]:
gs3.best_score_, gs3.best_estimator_

## 1.4. SVC

In [None]:
gs4_params = {
    "C": [0.01, 0.1, 0.5, 1, 5, 10, 100],
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
    "degree": range(1, 5),
    "decision_function_shape": ["ovr", "ovo"]    
}

gs4 = GridSearchCV(SVC(), gs4_params, scoring=recall_scorer, cv=3, verbose=2)
gs4.fit(X_train, y_train)

In [None]:
gs4.best_score_, gs4.best_estimator_

## 1.5. GradientBoosting Classifier

In [None]:
gs5_params = {
    "learning_rate": [0.01, 0.1, 1, 10, 100],
    "n_estimators": range(6, 16, 3),
    "max_depth": range(3, 10, 3),
    #"min_samples_split": range(5, 31, 5)
}

gs5 = GridSearchCV(GradientBoostingClassifier(), gs5_params, scoring=recall_scorer, cv=3, verbose=2)
gs5.fit(X_train, y_train)

In [None]:
gs5.best_score_, gs5.best_estimator_

## 1.6. CatBoost Classifier

In [None]:
gs6_params = {
    "learning_rate": [0.01, 0.1, 0.25, 0.5, 1],
    "depth": range(2, 10, 2),
    #"l2_leaf_reg": [1, 3, 5, 7, 9]
}

cbc = CatBoostClassifier(iterations=100, verbose=100, loss_function="MultiClass")
gs6 = GridSearchCV(cbc, gs6_params, scoring=recall_scorer, cv=3, verbose=2)
gs6.fit(X, y)

In [None]:
gs6.best_score_, gs6.best_estimator_

In [None]:
gs6_results = pd.DataFrame(gs6.cv_results_)
gs6_results.columns

In [None]:
gs6_results.pivot_table(index="param_depth",
                       columns="param_learning_rate",
                       values="mean_test_score",
                       aggfunc="mean").plot()

In [None]:
X_test = test.drop([".geo", "id"], axis=1)
ids = test["id"].to_list()
y_pred_test_6 = gs6.best_estimator_.predict(X_test)
submission6 = pd.DataFrame({"id": ids, "crop": y_pred_test_6.flatten()})
submission6.to_csv("submission_6.csv", index=None)

# 1.7 Catboost (v2)

In [None]:
gs7_params = {
    "learning_rate": [0.1, 0.25, 0.5],
    "depth": range(3, 7),
    "l2_leaf_reg": [3, 7, 11]
}

cbc = CatBoostClassifier(iterations=100, verbose=100, loss_function="MultiClass")
gs7 = GridSearchCV(cbc, gs7_params, scoring=recall_scorer, cv=3, verbose=2)
gs7.fit(X, y)

In [None]:
gs7.best_score_, gs7.best_estimator_

## 1.8. KNN

In [None]:
gs18_params = {
    "n_neighbors": range(3, 10),
    "weights": ["uniform", "distance"],
    "algorithm": ["auto", "ball_tree", "kd_tree"],
    "leaf_size": [20, 30, 40, 50]
}

gs18 = GridSearchCV(KNeighborsClassifier(), gs18_params, scoring=recall_scorer, cv=3, verbose=2)
gs18.fit(X, y)

In [None]:
gs18.best_score_, gs18.best_estimator_

## 1.9. Perceptron

In [None]:
gs19_params = {
    "hidden_layer_sizes": [(100,), (50,)],
    "activation": ["relu", "tahn", "logistic"],
    "alpha": [0.001, 0.01, 0.1],
    "learning_rate_init": [0.0001, 0.001, 0.01,],
   # "solver": ["lbfgs", "sgd", "adam"]
}

gs19 = GridSearchCV(MLPClassifier(max_iter=500), gs19_params, scoring=recall_scorer, cv=3, verbose=2)
gs19.fit(X, y)

In [None]:
gs19.best_score_, gs19.best_estimator_

In [None]:
ids = test["id"].to_list()
y_pred_test_19 = gs19.best_estimator_.predict(X_test)
submission19 = pd.DataFrame({"id": ids, "crop": y_pred_test_19})
submission19.to_csv("submission_19.csv", index=None)

## 1.10. XGBoost

In [None]:
gs110_params = {
    "n_estimators": range(10, 211, 25),
    "max_depth": range(3, 8),
    "learning_rate": [0.001, 0.01, 0.1, 1],
}

gs110 = GridSearchCV(XGBClassifier(objective="multi:softmax"), gs110_params, scoring=recall_scorer, cv=3, verbose=2)
gs110.fit(X, y)

In [None]:
gs110.best_score_, gs110.best_estimator_

In [None]:
X_test = test.drop([".geo", "id"], axis=1)
ids = test["id"].to_list()
y_pred_test_110 = gs110.predict(X_test)
submission110 = pd.DataFrame({"id": ids, "crop": y_pred_test_110})
submission110.to_csv("submission_110.csv", index=None)

## 1.11. LightGBM

In [None]:
gs111_params = {
    "num_leaves": range(10, 101, 20),
    "max_depth": range(3, 8),
    "learning_rate": [0.001, 0.01, 0.1, 0.5, 1],
}

gs111 = GridSearchCV(LGBMClassifier(), gs111_params, scoring=recall_scorer, cv=3, verbose=2)
gs111.fit(X, y)

In [None]:
gs111.best_score_, gs111.best_estimator_

In [None]:
X_test = test.drop([".geo", "id"], axis=1)
ids = test["id"].to_list()
y_pred_test_111 = gs111.predict(X_test)
submission111 = pd.DataFrame({"id": ids, "crop": y_pred_test_111})
submission111.to_csv("submission_111.csv", index=None)

# 2. С обработкой признаков

## 2.1. Добавим широту и долготу

In [None]:
import geojson
from shapely.geometry import shape

In [None]:
df["shape"] = df[".geo"].apply(lambda x: shape(geojson.loads(x)))

In [None]:
df["shape"][500].length

In [None]:
df["shape_type"] = df["shape"].apply(lambda x: x.type)

In [None]:
df["shape_type"].value_counts()

In [None]:
df.loc[df["shape_type"] == "MultiPolygon", "shape"].iloc[0].area

In [None]:
df["lon"] = df["shape"].apply(lambda x: x.representative_point().coords[0][0])
df["lat"] = df["shape"].apply(lambda x: x.representative_point().coords[0][1])

In [None]:
df["lon"].hist(bins=int(df["lon"].max() - df["lon"].min()))

In [None]:
df["lat"].hist(bins=int(df["lat"].max() - df["lat"].min()))

In [None]:
X = df.drop(["id",".geo", "crop", "shape", "shape_type"], axis=1)
y = df["crop"]

### 2.1.1 CatBoost

In [None]:
gs21_params = {
    "learning_rate": [0.1, 0.25, 0.5],
    "depth": range(3, 7)
}

cbc = CatBoostClassifier(iterations=100, verbose=100, loss_function="MultiClass")
gs21 = GridSearchCV(cbc, gs21_params, scoring=recall_scorer, cv=3, verbose=2)
gs21.fit(X, y)

In [None]:
gs21.best_score_, gs21.best_estimator_

In [None]:
test["shape"] = test[".geo"].apply(lambda x: shapely.geometry.shape(geojson.loads(x)))
test["lon"] = test["shape"].apply(lambda x: x.representative_point().coords[0][0])
test["lat"] = test["shape"].apply(lambda x: x.representative_point().coords[0][1])
X_test = test.drop([".geo", "id", "shape"], axis=1)

In [None]:
ids = test["id"].to_list()
y_pred_test_21 = gs21.best_estimator_.predict(X_test)
submission21 = pd.DataFrame({"id": ids, "crop": y_pred_test_21.flatten()})
submission21.to_csv("submission_21.csv", index=None)

### 2.1.2 RandomForest

In [None]:
gs212_params = {
    "n_estimators": range(10, 60, 2),
    "max_depth": range(3, 16, 3),
    "min_samples_split": range(15, 41, 5)
}

gs212 = GridSearchCV(RandomForestClassifier(criterion="entropy"), gs212_params, scoring=recall_scorer, cv=3, verbose=2, n_jobs=3)
gs212.fit(X_train, y_train)

In [None]:
gs212.best_score_, gs212.best_estimator_

In [None]:
y_pred_test_212 = gs21.best_estimator_.predict(X_test)
submission212 = pd.DataFrame({"id": ids, "crop": y_pred_test_212.flatten()})
submission212.to_csv("submission_212.csv", index=None)

### 2.1.3. XGBoost

In [None]:
gs213_params = {
    "n_estimators": range(50, 151, 25),
    "max_depth": range(3, 6),
    "learning_rate": [0.01, 0.05, 0.1, 0.5],
}

gs213 = GridSearchCV(XGBClassifier(objective="multi:softmax"), gs213_params, scoring=recall_scorer, cv=3, verbose=2)
gs213.fit(X, y)

In [None]:
gs213.best_score_, gs213.best_estimator_

In [None]:
y_pred_test_213 = gs213.best_estimator_.predict(X_test)
submission213 = pd.DataFrame({"id": ids, "crop": y_pred_test_213})
submission213.to_csv("submission_213.csv", index=None)

## 2.2. Добавим количество углов у контура поля

In [None]:
def count_edges(shape):
    shape_type = shape.type
    if shape_type == "Polygon":
        n_edges = len(shape.exterior.coords)
    elif shape_type == "MultiPolygon":
        n_edges = sum([len(contour.exterior.coords) for contour in shape.geoms])
    elif shape_type == "GeometryCollection":
        n_edges = 0
        for element in shape.geoms:
            if element.type == "Polygon":
                n_edges += len(element.exterior.coords)
    else:
        n_edges = np.nan                
        
    return n_edges

In [None]:
df["edges"] = df["shape"].apply(count_edges)

In [None]:
df.groupby("shape_type")["edges"].mean()

In [None]:
df["edges"].hist()

In [None]:
X = df.drop(["id",".geo", "crop", "shape", "shape_type"], axis=1)
y = df["crop"]

### 2.2.1. CatBoost

In [None]:
gs221_params = {
    "learning_rate": [0.1, 0.2, 0.25, 0.3, 0.5],
    "depth": range(3, 7)
}

cbc = CatBoostClassifier(iterations=100, verbose=100, loss_function="MultiClass")
gs221 = GridSearchCV(cbc, gs221_params, scoring=recall_scorer, cv=3, verbose=2)
gs221.fit(X, y)

In [None]:
gs221.best_score_, gs221.best_estimator_

In [None]:
gs221.best_estimator_.get_params()

In [None]:
gs221.best_estimator_.get_feature_importance()

In [None]:
test["shape"] = test[".geo"].apply(lambda x: shapely.geometry.shape(geojson.loads(x)))
test["edges"] = test["shape"].apply(count_edges)
test["lon"] = test["shape"].apply(lambda x: x.representative_point().coords[0][0])
test["lat"] = test["shape"].apply(lambda x: x.representative_point().coords[0][1])
X_test = test.drop([".geo", "id", "shape"], axis=1)

In [None]:
ids = test["id"].to_list()
y_pred_test_221 = gs221.best_estimator_.predict(X_test)
submission221 = pd.DataFrame({"id": ids, "crop": y_pred_test_221.flatten()})
submission221.to_csv("submission_221.csv", index=None)

### 2.2.2. XGBoost

In [None]:
gs222_params = {
    "n_estimators": range(50, 151, 50),
    "max_depth": range(3, 6),
    "learning_rate": [x / 10 for x in range(1, 6, 2)],
    #"gamma": [0, 1],
}

gs222 = GridSearchCV(XGBClassifier(booster="dart", objective="multi:softmax", nthread=4), gs222_params, scoring=recall_scorer, cv=3, verbose=2)
gs222.fit(X, y)

In [None]:
gs222.best_score_, gs222.best_estimator_

In [None]:
test["shape"] = test[".geo"].apply(lambda x: shapely.geometry.shape(geojson.loads(x)))
test["lon"] = test["shape"].apply(lambda x: x.representative_point().coords[0][0])
test["lat"] = test["shape"].apply(lambda x: x.representative_point().coords[0][1])
test["edges"] = test["shape"].apply(count_edges)
X_test = test.drop([".geo", "id", "shape"], axis=1)

In [None]:
ids = test["id"].to_list()
y_pred_test_222 = gs222.best_estimator_.predict(X_test)
submission222 = pd.DataFrame({"id": ids, "crop": y_pred_test_222})
submission222.to_csv("submission_222_2.csv", index=None)

### 2.2.3. LightGBM

In [None]:
gs223_params = {
    "num_leaves": range(10, 101, 20),
    "max_depth": range(3, 8),
    "learning_rate": [0.001, 0.01, 0.1, 0.5, 1],
}

gs223 = GridSearchCV(LGBMClassifier(), gs223_params, scoring=recall_scorer, cv=3, verbose=2)
gs223.fit(X, y)

In [None]:
gs223.best_score_, gs223.best_estimator_

In [None]:
test["shape"] = test[".geo"].apply(lambda x: shapely.geometry.shape(geojson.loads(x)))
test["edges"] = test["shape"].apply(count_edges)
test["lon"] = test["shape"].apply(lambda x: x.representative_point().coords[0][0])
test["lat"] = test["shape"].apply(lambda x: x.representative_point().coords[0][1])
X_test = test.drop([".geo", "id", "shape"], axis=1)

In [None]:
ids = test["id"].to_list()
y_pred_test_223 = gs223.best_estimator_.predict(X_test)
submission223 = pd.DataFrame({"id": ids, "crop": y_pred_test_223})
submission223.to_csv("submission_223.csv", index=None)

## 2.3. Заполняем нули средним значением соседей

In [None]:
arr = df.iloc[:, 2:-6].drop(columns=".geo").to_numpy()
arr_left = np.roll(arr, 1, axis=1)
arr_left[:, 0] = 0
arr_right = np.roll(arr, -1, axis=1)
arr_right[:, -1] = 0
arr_avg = (arr_left + arr_right) / 2
arr_filled = np.where(arr == 0, arr_avg, arr)

In [None]:
pd.DataFrame(arr_filled)

In [None]:
df_filled = pd.concat([df["area"], pd.DataFrame(arr_filled), df[["crop", "shape"]]], axis=1)
df_filled.columns = df.drop(columns=["id", ".geo"]).columns[:-4]

In [None]:
df_filled["edges"] = df_filled["shape"].apply(count_edges)
df_filled["lon"] = df_filled["shape"].apply(lambda x: x.representative_point().coords[0][0])
df_filled["lat"] = df_filled["shape"].apply(lambda x: x.representative_point().coords[0][1])

In [None]:
df_filled.info()

In [None]:
X = df_filled.drop(columns=["crop", "shape"])
y = df_filled["crop"]

In [None]:
gs231_params = {
    "learning_rate": [0.1, 0.2, 0.25, 0.3, 0.5],
    "depth": range(3, 7),
    "l2_leaf_reg": [0.1, 1, 10, 100]
}

cbc = CatBoostClassifier(iterations=100, verbose=100, loss_function="MultiClass")
gs231 = GridSearchCV(cbc, gs231_params, scoring=recall_scorer, cv=3, verbose=2)
gs231.fit(X, y)

In [None]:
gs231.best_score_, gs231.best_estimator_

In [None]:
y_pred_test_231 = gs231.best_estimator_.predict(X_test)
submission231 = pd.DataFrame({"id": ids, "crop": y_pred_test_231.flatten()})
submission231.to_csv("submission_231.csv", index=None)

## 2.4. Линейная интерполяция нулей

В отличие от п. 2.3, здесь всё делается по-умному:
 - с учётом того, что колонки идут в абы каком порядке, а не хронологически;
 - с учётом того, что значения идут не через равные интервалы;
 - с учётом того, что пропусков может быть несколько подряд.

In [None]:
class LinearTSImputer(BaseEstimator, TransformerMixin):
    def __init__(self, col_prefix="nd_mean_"):
        self.col_prefix = col_prefix        
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        self._series = []
        X_processed = X.loc[:, X.columns.str.contains(self.col_prefix)]
        X_rest = X.loc[:, ~X.columns.str.contains(self.col_prefix)]
        
        for _, row in X_processed.iterrows():
            row[row == 0] = np.nan
            index = pd.to_datetime(row.index.str.replace(self.col_prefix, ""))
            row = row.set_axis(index)
            row = row.interpolate(method="time", limit_direction="both").sort_index()
            self._series.append(row)
        
        result = pd.DataFrame(self._series)
        result.rename(columns=lambda x: self.col_prefix + x.strftime("%Y-%m-%d"), inplace=True)
        
        result = pd.concat([result, X_rest], axis=1)
        
        return result

In [None]:
gs241_params = {
    "learning_rate": [0.01, 0.1, 0.25, 0.5, 1],
    "depth": range(2, 10, 2),
    #"l2_leaf_reg": [1, 3, 5, 7, 9]
}

cbc = CatBoostClassifier(iterations=100, verbose=100, loss_function="MultiClass")
gs241 = GridSearchCV(cbc, gs241_params, scoring=recall_scorer, cv=3, verbose=2)
gs241.fit(LinearTSImputer().fit_transform(X), y)

In [None]:
gs241.best_score_, gs241.best_estimator_

In [None]:
gs242_params = {
    "n_estimators": range(60, 151, 25),
    "max_depth": range(3, 5),
    "learning_rate": [0.01, 0.1, 1],
}

gs242 = GridSearchCV(XGBClassifier(objective="multi:softmax"), gs242_params, scoring=recall_scorer, cv=3, verbose=2, error_score="raise")
gs242.fit(LinearTSImputer().fit_transform(X), y)

In [None]:
gs242.best_score_, gs242.best_estimator_

In [None]:
X_test = test.drop([".geo", "id"], axis=1)
X_test = LinearTSImputer().fit_transform(X_test)
ids = test["id"].to_list()
y_pred_test_242 = gs242.best_estimator_.predict(X_test)
submission242 = pd.DataFrame({"id": ids, "crop": y_pred_test_242})
submission242.to_csv("submission_242.csv", index=None)

In [None]:
def count_edges(shape):
    shape_type = shape.type
    if shape_type == "Polygon":
        n_edges = len(shape.exterior.coords)
    elif shape_type == "MultiPolygon":
        n_edges = sum([len(contour.exterior.coords) for contour in shape.geoms])
    elif shape_type == "GeometryCollection":
        n_edges = 0
        for element in shape.geoms:
            if element.type == "Polygon":
                n_edges += len(element.exterior.coords)
    else:
        n_edges = np.nan                
        
    return n_edges

class GeoTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, length=False, derived=False, computed_area=False):
        self.length = length
        self.derived = derived
        self.computed_area = computed_area
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        result = X.copy()
        result["shape"] = result[".geo"].apply(lambda x: shape(geojson.loads(x)))
        result["edges"] = result["shape"].apply(count_edges)
        result["lon"] = result["shape"].apply(lambda x: x.representative_point().coords[0][0])
        result["lat"] = result["shape"].apply(lambda x: x.representative_point().coords[0][1])
        
        if self.length:
            result["length"] = result["shape"].apply(lambda x: x.length)
        if self.derived:
            result["edges_per_length"] = result["edges"] / result["length"]
            result["area_per_length"] = result["area"] / result["length"]
            result["area_per_edge"] = result["area"] / result["edges"]
        if self.computed_area:
            result["computed_area"] = result["shape"].apply(lambda x: x.area)
            
        result.drop(columns=["shape", ".geo"], inplace=True)
        
        return result

In [None]:
transformer_pipe = Pipeline([
    ("geo", GeoTransformer()),
    ("impute", LinearTSImputer())
])

In [None]:
GeoTransformer(length=True, derived=True, computed_area=True).transform(df)

In [None]:
res1 = transformer_pipe.fit_transform(df)
res2 = transformer_pipe.fit_transform(X)

In [None]:
gs252_params = {
    "n_estimators": range(60, 151, 25),
    "max_depth": range(3, 5),
    "learning_rate": [0.01, 0.1, 1],
}

gs252 = GridSearchCV(XGBClassifier(objective="multi:softmax"), gs252_params, scoring=recall_scorer, cv=3, verbose=2, error_score="raise")
gs252.fit(transformer_pipe.fit_transform(X), y)

In [None]:
gs252.best_score_, gs252.best_estimator_

In [None]:
X_test = test.drop(["id"], axis=1)
X_test = transformer_pipe.fit_transform(X_test)
ids = test["id"].to_list()
y_pred_test_252 = gs252.best_estimator_.predict(X_test)
submission252 = pd.DataFrame({"id": ids, "crop": y_pred_test_252})
submission252.to_csv("submission_252.csv", index=None)

In [None]:
class DescStatFeatureGenerator(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        result = X.copy()
        result["nd_mean"] = result.loc[:, result.columns.str.contains("nd")].mean(axis=1)
        result["nd_min"] = result.loc[:, result.columns.str.contains("nd")].min(axis=1)
        result["nd_max"] = result.loc[:, result.columns.str.contains("nd")].max(axis=1)
        result["nd_std"] = result.loc[:, result.columns.str.contains("nd")].std(axis=1)
        result["nd_sum"] = result.loc[:, result.columns.str.contains("nd")].sum(axis=1)
        for q in range(1, 20):
            colname = f"nd_q{q}"
            result[colname] = result.loc[:, result.columns.str.contains("nd")].quantile(q / 20, axis=1)
        
        return result

In [None]:
transformer_pipe_with_features = Pipeline([
    ("geo", GeoTransformer()),
    ("stats", DescStatFeatureGenerator()),
    ("impute", LinearTSImputer())
])

In [None]:
transformer_pipe_with_features.transform(X)

In [None]:
gs262_params = {
    "n_estimators": range(60, 151, 25),
    "max_depth": range(3, 5),
    "learning_rate": [0.01, 0.1, 1],
}

gs262 = GridSearchCV(XGBClassifier(objective="multi:softmax"), gs262_params, scoring=recall_scorer, cv=3, verbose=2, error_score="raise")
gs262.fit(transformer_pipe_with_features.fit_transform(X), y)

In [None]:
gs262.best_score_, gs262.best_estimator_

In [None]:
from sklearn.feature_selection import RFECV

gs262_rfecv = RFECV(gs262.best_estimator_, cv=3, scoring=recall_scorer, verbose=2)
gs262_rfecv.fit(transformer_pipe_with_features.fit_transform(X), y)

In [None]:
gs262.feature_names_in_

In [None]:
gs262_rfecv.support_

In [None]:
gs262_rfecv.n_features_

In [None]:
transformer_pipe_without_imputing = Pipeline([
    ("geo", GeoTransformer()),
    ("stats", DescStatFeatureGenerator()),
])

In [None]:
gs272_params = {
    "n_estimators": range(60, 151, 25),
    "max_depth": range(3, 5),
    "learning_rate": [0.01, 0.1, 1],
}

gs272 = GridSearchCV(XGBClassifier(objective="multi:softmax"), gs272_params, scoring=recall_scorer, cv=3, verbose=2, error_score="raise")
gs272.fit(transformer_pipe_without_imputing.fit_transform(X), y)

In [None]:
gs272.best_score_, gs272.best_estimator_

In [None]:
gs272_params = {
    "n_estimators": range(60, 151, 25),
    "max_depth": [4],
    "learning_rate": [0.1],
}

gs272 = GridSearchCV(XGBClassifier(objective="multi:softmax"), gs272_params, scoring=recall_scorer, cv=3, verbose=2, error_score="raise")
gs272.fit(transformer_pipe_without_imputing.fit_transform(X), y)

In [None]:
gs272.best_score_, gs272.best_estimator_

In [None]:
X_test = test.drop(["id"], axis=1)
X_test = transformer_pipe_without_imputing.fit_transform(X_test)
ids = test["id"].to_list()
y_pred_test_272 = gs272.best_estimator_.predict(X_test)
submission272 = pd.DataFrame({"id": ids, "crop": y_pred_test_272})
submission272.to_csv("submission_272.csv", index=None)

In [None]:
st = pd.DataFrame({"242": y_pred_test_242, "252": y_pred_test_252, "272": y_pred_test_272})

In [None]:
st.head(50)

In [None]:
st_preds = st.mode(axis=1)[0]

In [None]:
st_preds

In [None]:
submission_st = pd.DataFrame({"id": ids, "crop": st_preds})
submission_st.to_csv("submission_st.csv", index=None)

In [None]:
gs272_rfecv = RFECV(gs272.best_estimator_, cv=3, min_features_to_select=20, step=2, scoring=recall_scorer, verbose=2)
gs272_rfecv.fit(transformer_pipe_without_imputing.fit_transform(X), y)

In [None]:
gs272_rfecv.n_features_

In [None]:
gs272_rfecv.cv_results_

In [None]:
df.iloc[14, df.columns.str.contains("nd")].sort_index().plot()

In [None]:
res.iloc[14, res.columns.str.contains("nd")].sort_index().plot()

In [None]:
res.iloc[14, res.columns.str.contains("nd")].sort_index()

In [None]:
class LinearTSSummator(BaseEstimator, TransformerMixin):
    def __init__(self, col_prefix="nd_mean_"):
        self.col_prefix = col_prefix        
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        result = X.copy()
        result["missing_count"] = result.loc[:, result.columns.str.contains(self.col_prefix)].apply(lambda x: len(x[x == 0]), axis=1)
        
        return result

In [None]:
transform_pipe_with_sums = Pipeline([
    ("geo", GeoTransformer()),
    ("stats", DescStatFeatureGenerator()),
    ("cnt_missing", LinearTSSummator())
])

In [None]:
X = df.drop(["id", "crop"], axis=1)
y = df["crop"]

In [None]:
gs281_params = {
    "learning_rate": [0.01, 0.1, 0.25, 0.5, 1],
    "depth": range(2, 10, 2),
    #"l2_leaf_reg": [1, 3, 5, 7, 9]
}

cbc = CatBoostClassifier(iterations=200, verbose=100, loss_function="MultiClass")
gs281 = GridSearchCV(cbc, gs281_params, scoring=recall_scorer, cv=3, verbose=2)
gs281.fit(transform_pipe_with_sums.fit_transform(X), y)

In [None]:
gs281.best_score_, gs281.best_estimator_

In [None]:
X_test = test.drop(["id"], axis=1)
X_test = transform_pipe_with_sums.fit_transform(X_test)
ids = test["id"].to_list()
y_pred_test_281 = gs281.best_estimator_.predict(X_test)
submission281 = pd.DataFrame({"id": ids, "crop": y_pred_test_281.flatten()})
submission281.to_csv("submission_281_2.csv", index=None)

In [None]:
gs282_params = {
    "n_estimators": range(50, 201, 25),
    "max_depth": [3, 4, 5, 6],
    "learning_rate": [0.1, 0.2, 0.3],
}

gs282 = GridSearchCV(XGBClassifier(objective="multi:softmax"), gs282_params, scoring=recall_scorer, cv=3, verbose=2, error_score="raise")
gs282.fit(transform_pipe_with_sums.fit_transform(X), y)

In [None]:
gs282.best_score_, gs282.best_estimator_

In [None]:
X_test = test.drop(["id"], axis=1)
X_test = transform_pipe_with_sums.fit_transform(X_test)
ids = test["id"].to_list()
y_pred_test_282 = gs282.best_estimator_.predict(X_test)
submission282 = pd.DataFrame({"id": ids, "crop": y_pred_test_282})
submission282.to_csv("submission_282_2.csv", index=None)

In [None]:
gs283_params = {
    "num_leaves": range(10, 31, 10),
    "max_depth": range(4, 8),
    "learning_rate": [0.001, 0.01, 0.1, 1],
    "reg_lambda": [0, 1]
}

gs283 = GridSearchCV(LGBMClassifier(boosting_type="dart", n_estimators=200), gs283_params, scoring=recall_scorer, cv=3, verbose=2)
gs283.fit(transform_pipe_with_sums.fit_transform(X), y)

In [None]:
gs283.best_score_, gs283.best_estimator_

In [None]:
X_test = test.drop(["id"], axis=1)
X_test = transform_pipe_with_sums.fit_transform(X_test)
ids = test["id"].to_list()
y_pred_test_283 = gs283.best_estimator_.predict(X_test)
submission283 = pd.DataFrame({"id": ids, "crop": y_pred_test_283})
submission283.to_csv("submission_283_2.csv", index=None)

In [None]:
gs284_params = {
    "alpha": [0.001, 0.01, 0.1],
    "learning_rate_init": [0.0001, 0.001, 0.01, 0.1, 1],
}

gs284 = GridSearchCV(MLPClassifier(max_iter=10000, batch_size=50), gs284_params, scoring=recall_scorer, cv=3, verbose=2)
gs284.fit(transform_pipe_with_sums.fit_transform(X), y)

In [None]:
gs284.best_score_, gs284.best_estimator_

In [None]:
X_test = test.drop(["id"], axis=1)
X_test = transform_pipe_with_sums.fit_transform(X_test)
ids = test["id"].to_list()
y_pred_test_284 = gs284.best_estimator_.predict(X_test)
submission284 = pd.DataFrame({"id": ids, "crop": y_pred_test_284})
submission284.to_csv("submission_284.csv", index=None)

In [None]:
transform_pipe_full = Pipeline([
    ("geo", GeoTransformer()),
    ("stats", DescStatFeatureGenerator()),
    ("cnt_missing", LinearTSSummator()),
    ("impute", LinearTSImputer())
])

In [None]:
X = df.drop(["id", "crop"], axis=1)
y = df["crop"]

In [None]:
gs291_params = {
    "learning_rate": [0.01, 0.1, 0.25, 0.5, 1],
    "depth": range(2, 10, 2),
    #"l2_leaf_reg": [1, 3, 5, 7, 9]
}

cbc = CatBoostClassifier(iterations=200, verbose=100, loss_function="MultiClass")
gs291 = GridSearchCV(cbc, gs291_params, scoring=recall_scorer, cv=3, verbose=2)
gs291.fit(transform_pipe_full.fit_transform(X), y)

In [None]:
gs291.best_score_, gs291.best_estimator_

In [None]:
X_test = test.drop(["id"], axis=1)
X_test = transform_pipe_full.fit_transform(X_test)
ids = test["id"].to_list()
y_pred_test_291 = gs291.best_estimator_.predict(X_test)
submission291 = pd.DataFrame({"id": ids, "crop": y_pred_test_291.flatten()})
submission291.to_csv("submission_291.csv", index=None)

In [None]:
gs292_params = {
    "num_leaves": range(10, 31, 10),
    "max_depth": range(4, 8),
    "learning_rate": [0.001, 0.01, 0.1, 1],
    "reg_lambda": [0, 1]
}

gs292 = GridSearchCV(LGBMClassifier(boosting_type="dart", n_estimators=200), gs292_params, scoring=recall_scorer, cv=3, verbose=2)
gs292.fit(transform_pipe_full.fit_transform(X), y)

In [None]:
gs292.best_score_, gs292.best_estimator_

In [None]:
transform_pipe_full.fit_transform(X)

In [None]:
class NDVIMeanImputer(BaseEstimator, TransformerMixin):
    def __init__(self, col_prefix="nd_mean_"):
        self.col_prefix = col_prefix        
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        result = X.copy()
        cols = result.columns[result.columns.str.contains(self.col_prefix)].to_list()
        result["means"] = result.loc[:, cols].mean(axis=1)        
        cols.append("means")
        result = result.loc[:, cols].replace(0, np.nan)
        result.fillna(method="backfill", axis=1, inplace=True)
        result.drop(columns="means", inplace=True)
        
        return result

In [None]:
transform_pipe_na_means = Pipeline([
    ("geo", GeoTransformer()),
    ("stats", DescStatFeatureGenerator()),
    ("cnt_missing", LinearTSSummator()),
    ("impute", NDVIMeanImputer())
])

In [None]:
X = df.drop(["id", "crop"], axis=1)
y = df["crop"]

In [None]:
gs2101_params = {
    "learning_rate": [0.01, 0.1, 0.25, 0.5, 1],
    "depth": range(2, 10, 2),
    #"l2_leaf_reg": [1, 3, 5, 7, 9]
}

cbc = CatBoostClassifier(iterations=200, verbose=100, loss_function="MultiClass")
gs2101 = GridSearchCV(cbc, gs2101_params, scoring=recall_scorer, cv=3, verbose=2)
gs2101.fit(transform_pipe_na_means.fit_transform(X), y)

In [None]:
gs2101.best_score_, gs2101.best_estimator_

In [None]:
transform_pipe_adv_geo = Pipeline([
    ("geo", GeoTransformer(length=True, derived=True, computed_area=True)),
    ("stats", DescStatFeatureGenerator()),
    ("cnt_missing", LinearTSSummator())    
])

In [None]:
X = df.drop(["id", "crop"], axis=1)
y = df["crop"]

In [None]:
gs2111_params = {
    "learning_rate": [0.01, 0.1, 0.25, 0.5, 1],
    "depth": range(2, 10, 2),
    #"l2_leaf_reg": [1, 3, 5, 7, 9]
}

cbc = CatBoostClassifier(iterations=200, verbose=100, loss_function="MultiClass")
gs2111 = GridSearchCV(cbc, gs2111_params, scoring=recall_scorer, cv=3, verbose=2)
gs2111.fit(transform_pipe_adv_geo.fit_transform(X), y)

In [None]:
gs2111.best_score_, gs2111.best_estimator_

In [None]:
X_test = test.drop(["id"], axis=1)
X_test = transform_pipe_adv_geo.fit_transform(X_test)
ids = test["id"].to_list()
y_pred_test_2111 = gs2111.best_estimator_.predict(X_test)
submission2111 = pd.DataFrame({"id": ids, "crop": y_pred_test_2111.flatten()})
submission2111.to_csv("submission_2111.csv", index=None)

In [None]:
gs2112_params = {
    "num_leaves": range(10, 31, 10),
    "max_depth": range(4, 8),
    "learning_rate": [0.001, 0.01, 0.1, 1],
    "reg_lambda": [0, 1]    
}

gs2112 = GridSearchCV(LGBMClassifier(boosting_type="dart", n_estimators=200), gs2112_params, scoring=recall_scorer, cv=3, verbose=2)
gs2112.fit(transform_pipe_adv_geo.fit_transform(X), y)

In [None]:
gs2112.best_score_, gs2112.best_estimator_

In [None]:
X_test = test.drop(["id"], axis=1)
X_test = transform_pipe_adv_geo.fit_transform(X_test)
ids = test["id"].to_list()
y_pred_test_2112 = gs2112.best_estimator_.predict(X_test)
submission2112 = pd.DataFrame({"id": ids, "crop": y_pred_test_2112})
submission2112.to_csv("submission_2112.csv", index=None)

In [None]:
def draw_hist(x):
    vals = x.to_numpy().flatten()
    plot = plt.hist(vals[vals < 1], bins=10)
    return plot

In [None]:
plots = df.iloc[:, 2:].drop(columns=".geo").groupby("crop").apply(draw_hist)

In [None]:
plots[0]

# 3. Решения, близкие к финальным

Оптимально:
 - не заполнять нули;
 - посчитать количество нулей (по сути некорректных значений) и вынести в отдельную колонку;
 - извлечь долготу, широту, количество углов, длину;
 - посчитать некоторые описательные статистики.

In [601]:
#!c1.32
def count_edges(shape):
    shape_type = shape.type
    if shape_type == "Polygon":
        n_edges = len(shape.exterior.coords)
    elif shape_type == "MultiPolygon":
        n_edges = sum([len(contour.exterior.coords) for contour in shape.geoms])
    elif shape_type == "GeometryCollection":
        n_edges = 0
        for element in shape.geoms:
            if element.type == "Polygon":
                n_edges += len(element.exterior.coords)
    else:
        n_edges = np.nan                
        
    return n_edges

class GeoTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, edges=False, length=False, derived=False, computed_area=False):
        self.edges = edges
        self.length = length
        self.derived = derived
        self.computed_area = computed_area
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        result = X.copy()
        result["shape"] = result[".geo"].apply(lambda x: shape(geojson.loads(x)))
        result["lon"] = result["shape"].apply(lambda x: x.representative_point().coords[0][0])
        result["lat"] = result["shape"].apply(lambda x: x.representative_point().coords[0][1])
        
        if self.edges:
            result["edges"] = result["shape"].apply(count_edges)
        if self.length:
            result["length"] = result["shape"].apply(lambda x: x.length)
        if self.derived:
            result["edges_per_length"] = result["edges"] / result["length"]
            result["area_per_length"] = result["area"] / result["length"]
            result["area_per_edge"] = result["area"] / result["edges"]
        if self.computed_area:
            result["computed_area"] = result["shape"].apply(lambda x: x.area)
            
        result.drop(columns=["shape", ".geo"], inplace=True)
        
        return result

In [378]:
#!c1.32
class DescStatFeatureGenerator(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        result = X.copy()
        result["nd_mean"] = result.loc[:, result.columns.str.contains("nd")].mean(axis=1)
        result["nd_min"] = result.loc[:, result.columns.str.contains("nd")].min(axis=1)
        result["nd_max"] = result.loc[:, result.columns.str.contains("nd")].max(axis=1)
        result["nd_std"] = result.loc[:, result.columns.str.contains("nd")].std(axis=1)
        result["nd_sum"] = result.loc[:, result.columns.str.contains("nd")].sum(axis=1)
        for q in range(1, 20):
            colname = f"nd_q{q}"
            result[colname] = result.loc[:, result.columns.str.contains("nd")].quantile(q / 20, axis=1)
        
        return result

In [602]:
#!c1.32
class MissingTSSummator(BaseEstimator, TransformerMixin):
    def __init__(self, col_prefix="nd_mean_"):
        self.col_prefix = col_prefix        
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        result = X.copy()
        result["missing_count"] = result.loc[:, result.columns.str.contains(self.col_prefix)].apply(lambda x: len(x[x == 0]), axis=1)
        
        return result

In [449]:
#!c1.32
transformer = Pipeline([
    ("geo", GeoTransformer(length=True)),
    ("count_missing", MissingTSSummator()),
    ("stats", DescStatFeatureGenerator())
])

In [450]:
#!c1.32
X = df.drop(columns=["id", "crop"])
y = df["crop"]

In [430]:
#!c1.32
gs31_params = {
    "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 1],
    "depth": range(4, 8, 1),
    #"l2_leaf_reg": [],
    "grow_policy": ["SymmetricTree", "Depthwise", "Lossguide"],
    #"min_data_in_leaf": range(5, 50, 10)
}

cbc = CatBoostClassifier(iterations=200, verbose=False, loss_function="MultiClass",
                         random_seed=42)
gs31 = GridSearchCV(cbc, gs31_params, scoring=recall_scorer,
                    cv=3, n_jobs=32, verbose=1, error_score="raise")
gs31.fit(transformer.fit_transform(X), y)

Fitting 3 folds for each of 84 candidates, totalling 252 fits


[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:  3.8min
[Parallel(n_jobs=32)]: Done 252 out of 252 | elapsed:  9.3min finished


GridSearchCV(cv=3, error_score='raise',
             estimator=<catboost.core.CatBoostClassifier object at 0x7f12db5805e0>,
             iid='deprecated', n_jobs=32,
             param_grid={'depth': range(4, 8),
                         'grow_policy': ['SymmetricTree', 'Depthwise',
                                         'Lossguide'],
                         'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=make_scorer(recall_score, average=macro, zero_division=0),
             verbose=1)

In [431]:
#!c1.32
gs31.best_score_, gs31.best_estimator_.get_all_params()

(0.9669035099719944,
 {'nan_mode': 'Min',
  'eval_metric': 'MultiClass',
  'iterations': 200,
  'sampling_frequency': 'PerTree',
  'leaf_estimation_method': 'Newton',
  'grow_policy': 'Lossguide',
  'penalties_coefficient': 1,
  'boosting_type': 'Plain',
  'model_shrink_mode': 'Constant',
  'feature_border_type': 'GreedyLogSum',
  'bayesian_matrix_reg': 0.10000000149011612,
  'l2_leaf_reg': 3,
  'random_strength': 1,
  'rsm': 1,
  'boost_from_average': False,
  'model_size_reg': 0.5,
  'use_best_model': False,
  'class_names': [0, 1, 2, 3, 4, 5, 6],
  'random_seed': 42,
  'depth': 5,
  'border_count': 254,
  'bagging_temperature': 1,
  'classes_count': 0,
  'auto_class_weights': 'None',
  'sparse_features_conflict_fraction': 0,
  'leaf_estimation_backtracking': 'AnyImprovement',
  'best_model_min_trees': 1,
  'model_shrink_rate': 0,
  'min_data_in_leaf': 1,
  'loss_function': 'MultiClass',
  'learning_rate': 0.20000000298023224,
  'score_function': 'Cosine',
  'task_type': 'CPU',
  'le

In [None]:
#!c1.32
CatBoostClassifier(iterations=200, depth=5, verbose=False,
                   loss_function="MultiClass", random_seed=42)

In [382]:
#!c1.32
gs32_params = {
    "booster": ["gbtree", "gblinear", "dart"],
    #"n_estimators": range(50, 201, 25),
    "max_depth": range(3, 11),
    "learning_rate": [0.001, 0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 0.75, 1],
    "gamma": [0, 1, 5, 10, 100],
    "grow_policy": ["depthwise", "lossguide"]
}

gs32 = GridSearchCV(XGBClassifier(objective="multi:softmax", n_estimators=100), gs32_params,
                    scoring=recall_scorer, cv=3, verbose=2, error_score="raise",
                    n_jobs=32)
gs32.fit(transformer.fit_transform(X), y)

ster=gblinear, gamma=5, grow_policy=lossguide, learning_rate=0.05, max_depth=10, total=   3.8s
[CV] booster=gblinear, gamma=5, grow_policy=lossguide, learning_rate=0.2, max_depth=4 
[CV]  booster=gblinear, gamma=5, grow_policy=lossguide, learning_rate=0.2, max_depth=4, total=   3.8s
[CV] booster=gblinear, gamma=5, grow_policy=lossguide, learning_rate=0.3, max_depth=6 
[CV]  booster=gblinear, gamma=5, grow_policy=lossguide, learning_rate=0.3, max_depth=6, total=   3.9s
[CV] booster=gblinear, gamma=5, grow_policy=lossguide, learning_rate=0.5, max_depth=9 
[CV]  booster=gblinear, gamma=5, grow_policy=lossguide, learning_rate=0.5, max_depth=9, total=   3.9s
[CV] booster=gblinear, gamma=5, grow_policy=lossguide, learning_rate=1, max_depth=3 
[CV]  booster=gblinear, gamma=5, grow_policy=lossguide, learning_rate=1, max_depth=3, total=   3.9s
[CV] booster=gblinear, gamma=10, grow_policy=depthwise, learning_rate=0.001, max_depth=6 
[CV]  booster=gblinear, gamma=10, grow_policy=depthwise, learni

[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done  98 tasks      | elapsed:  1.6min
[Parallel(n_jobs=32)]: Done 301 tasks      | elapsed:  3.7min
[Parallel(n_jobs=32)]: Done 584 tasks      | elapsed:  6.9min
[Parallel(n_jobs=32)]: Done 949 tasks      | elapsed: 12.0min
[Parallel(n_jobs=32)]: Done 1394 tasks      | elapsed: 18.7min
[Parallel(n_jobs=32)]: Done 1921 tasks      | elapsed: 26.7min
[Parallel(n_jobs=32)]: Done 2528 tasks      | elapsed: 30.9min
[Parallel(n_jobs=32)]: Done 3217 tasks      | elapsed: 32.3min
[Parallel(n_jobs=32)]: Done 3986 tasks      | elapsed: 33.9min
[Parallel(n_jobs=32)]: Done 4837 tasks      | elapsed: 41.9min
[Parallel(n_jobs=32)]: Done 5768 tasks      | elapsed: 57.0min
[Parallel(n_jobs=32)]: Done 6480 out of 6480 | elapsed: 68.9min finished


GridSearchCV(cv=3, error_score='raise',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='multi:softmax',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_...
             iid='deprecated', n_jobs=32,
             param_grid={'booster': ['gbtree', 'gblinear', 'dart'],
                         'gamma': [0, 1, 5, 10, 100],
                         'grow_policy': ['depthwise', 'lossguide'],
                         'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.2, 0.3,
        

In [383]:
#!c1.32
gs32.best_score_, gs32.best_estimator_

(0.9691481655542757,
 XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bynode=1, colsample_bytree=1, gamma=0,
               grow_policy='depthwise', learning_rate=0.3, max_delta_step=0,
               max_depth=4, min_child_weight=1, missing=None, n_estimators=100,
               n_jobs=1, nthread=None, objective='multi:softprob',
               random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
               seed=None, silent=None, subsample=1, verbosity=1))

In [384]:
#!c1.32
def save_sumbission(test_data, transformer, model, suffix):
    ids = test_data["id"].to_list()
    X_test = test_data.drop(["id"], axis=1)
    X_test = transformer.fit_transform(X_test)
    y_pred = model.predict(X_test)
    submission = pd.DataFrame({"id": ids, "crop": y_pred})
    submission.to_csv(f"submission_{suffix}.csv", index=None)

In [386]:
#!c1.32
save_sumbission(test, transformer, gs32.best_estimator_, "32")

In [409]:
#!c1.32
X_train, X_val, y_train, y_val = train_test_split(transformer.fit_transform(X), y, test_size=0.75, random_state=42)

In [412]:
#!c1.32
xgb_best = gs32.best_estimator_
xgb_best.set_params(eval_metric="mlogloss", n_estimators=500)
xgb_best.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=10)

[0]	validation_0-mlogloss:1.36093
Will train until validation_0-mlogloss hasn't improved in 10 rounds.
[1]	validation_0-mlogloss:1.08031
[2]	validation_0-mlogloss:0.894126
[3]	validation_0-mlogloss:0.75725
[4]	validation_0-mlogloss:0.64955
[5]	validation_0-mlogloss:0.567716
[6]	validation_0-mlogloss:0.498612
[7]	validation_0-mlogloss:0.442068
[8]	validation_0-mlogloss:0.39654
[9]	validation_0-mlogloss:0.361551
[10]	validation_0-mlogloss:0.33058
[11]	validation_0-mlogloss:0.305997
[12]	validation_0-mlogloss:0.284722
[13]	validation_0-mlogloss:0.267003
[14]	validation_0-mlogloss:0.255232
[15]	validation_0-mlogloss:0.243205
[16]	validation_0-mlogloss:0.230221
[17]	validation_0-mlogloss:0.221683
[18]	validation_0-mlogloss:0.211638
[19]	validation_0-mlogloss:0.205751
[20]	validation_0-mlogloss:0.199959
[21]	validation_0-mlogloss:0.19534
[22]	validation_0-mlogloss:0.190239
[23]	validation_0-mlogloss:0.186693
[24]	validation_0-mlogloss:0.183332
[25]	validation_0-mlogloss:0.180053
[26]	validat

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, early_stopping_rounds=50,
              eval_metric='mlogloss', gamma=0, grow_policy='depthwise',
              learning_rate=0.3, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=None, n_estimators=500, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [413]:
#!c1.32
recall_score(y_val, xgb_best.predict(X_val), average="macro", zero_division=0)

0.9500664756740308

In [415]:
#!c1.32
save_sumbission(test, transformer, xgb_best, "32_1")

In [433]:
#!c1.32
gs33_params = {
    "num_leaves": range(10, 31, 10),
    "max_depth": range(4, 11),
    "learning_rate": [0.001, 0.01, 0.1, 0.2, 0.5, 0.7, 1],
    "reg_lambda": [0, 1, 10]
}

gs33 = GridSearchCV(LGBMClassifier(boosting_type="dart", n_estimators=200),
                    gs33_params, random_state=42,
                    scoring=recall_scorer, cv=3, verbose=1, n_jobs=32)
gs33.fit(transformer.fit_transform(X), y)

Fitting 3 folds for each of 441 candidates, totalling 1323 fits


[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:  2.0min
[Parallel(n_jobs=32)]: Done 386 tasks      | elapsed:  5.6min
[Parallel(n_jobs=32)]: Done 736 tasks      | elapsed: 10.7min
[Parallel(n_jobs=32)]: Done 1186 tasks      | elapsed: 15.9min
[Parallel(n_jobs=32)]: Done 1323 out of 1323 | elapsed: 17.3min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=LGBMClassifier(boosting_type='dart', class_weight=None,
                                      colsample_bytree=1.0,
                                      importance_type='split',
                                      learning_rate=0.1, max_depth=-1,
                                      min_child_samples=20,
                                      min_child_weight=0.001,
                                      min_split_gain=0.0, n_estimators=200,
                                      n_jobs=-1, num_leaves=31, objective=None,
                                      random_state=None, reg_alpha=0.0,
                                      reg_lambda=0.0, silent=True,
                                      subsample=1.0, subsample_for_bin=200000,
                                      subsample_freq=0),
             iid='deprecated', n_jobs=32,
             param_grid={'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.5, 0.7, 1],
                

In [434]:
#!c1.32
gs33.best_score_, gs33.best_estimator_

(0.9731523592058261,
 LGBMClassifier(boosting_type='dart', class_weight=None, colsample_bytree=1.0,
                importance_type='split', learning_rate=0.5, max_depth=8,
                min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
                n_estimators=200, n_jobs=-1, num_leaves=20, objective=None,
                random_state=None, reg_alpha=0.0, reg_lambda=0, silent=True,
                subsample=1.0, subsample_for_bin=200000, subsample_freq=0))

In [435]:
#!c1.32
save_sumbission(test, transformer, gs33.best_estimator_, "33")

In [452]:
#!c1.32
lgbm_best = LGBMClassifier(boosting_type='dart', class_weight=None, colsample_bytree=1.0,
                importance_type='split', learning_rate=0.5, max_depth=8,
                min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
                n_estimators=200, n_jobs=-1, num_leaves=20, objective=None,
                random_state=None, reg_alpha=0.0, reg_lambda=0, silent=True,
                subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
lgbm_best.set_params(random_state=42)
gs_33_2 = GridSearchCV(lgbm_best, {"n_estimators": range(10, 501)},
                       scoring=recall_scorer, cv=3,
                       verbose=1, n_jobs=32)
gs_33_2.fit(transformer.fit_transform(X), y)

Fitting 3 folds for each of 491 candidates, totalling 1473 fits


[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:   33.5s
[Parallel(n_jobs=32)]: Done 386 tasks      | elapsed:  3.2min
[Parallel(n_jobs=32)]: Done 736 tasks      | elapsed: 10.0min
[Parallel(n_jobs=32)]: Done 1186 tasks      | elapsed: 22.3min
[Parallel(n_jobs=32)]: Done 1473 out of 1473 | elapsed: 32.0min finished


NameError: name 'gs33_2' is not defined

In [453]:
#!c1.32
gs_33_2.best_score_, gs_33_2.best_estimator_

(0.9730620213494628,
 LGBMClassifier(boosting_type='dart', class_weight=None, colsample_bytree=1.0,
                importance_type='split', learning_rate=0.5, max_depth=8,
                min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
                n_estimators=287, n_jobs=-1, num_leaves=20, objective=None,
                random_state=42, reg_alpha=0.0, reg_lambda=0, silent=True,
                subsample=1.0, subsample_for_bin=200000, subsample_freq=0))

In [454]:
#!c1.32
save_sumbission(test, transformer, gs_33_2.best_estimator_, "33_2")

In [514]:
#!c1.32
gs34_params = {
    "n_estimators": range(3, 100, 3),
    "criterion": ["gini", "entropy"],
    "max_depth": range(3, 61, 3),
    "min_samples_split": range(5, 61, 5)
}

gs34 = GridSearchCV(RandomForestClassifier(random_state=42), gs34_params,
                    scoring=recall_scorer, cv=3, verbose=1, n_jobs=32)
gs34.fit(transformer.fit_transform(X), y)

Fitting 3 folds for each of 15840 candidates, totalling 47520 fits


[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    3.5s
[Parallel(n_jobs=32)]: Done 386 tasks      | elapsed:    8.1s
[Parallel(n_jobs=32)]: Done 736 tasks      | elapsed:   14.1s
[Parallel(n_jobs=32)]: Done 1186 tasks      | elapsed:   22.1s
[Parallel(n_jobs=32)]: Done 1736 tasks      | elapsed:   37.5s
[Parallel(n_jobs=32)]: Done 2386 tasks      | elapsed:   56.2s
[Parallel(n_jobs=32)]: Done 3136 tasks      | elapsed:  1.4min
[Parallel(n_jobs=32)]: Done 3986 tasks      | elapsed:  1.9min
[Parallel(n_jobs=32)]: Done 4936 tasks      | elapsed:  2.5min
[Parallel(n_jobs=32)]: Done 5986 tasks      | elapsed:  3.3min
[Parallel(n_jobs=32)]: Done 7136 tasks      | elapsed:  4.1min
[Parallel(n_jobs=32)]: Done 8386 tasks      | elapsed:  4.9min
[Parallel(n_jobs=32)]: Done 9736 tasks      | elapsed:  5.9min
[Parallel(n_jobs=32)]: Done 11186 tasks      | elapsed:  6.9min
[Parallel(n_jobs=32)]: Done 12736 tasks    

GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False, random_state=42,
                                  

In [515]:
#!c1.32
gs34.best_score_, gs34.best_estimator_

(0.9651392260237444,
 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=36, max_features='auto',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=20,
                        min_weight_fraction_leaf=0.0, n_estimators=93,
                        n_jobs=None, oob_score=False, random_state=42, verbose=0,
                        warm_start=False))

In [517]:
#!c1.32
save_sumbission(test, transformer, gs34.best_estimator_, "34")

# 4. `sktime`

In [524]:
#!c1.32
from sktime.classification.feature_based import Catch22Classifier

In [520]:
#!c1.32
X_ts = df.loc[:, df.columns.str.contains("nd_mean_")]
y_ts = df["crop"]

In [521]:
#!c1.32
X_ts

Unnamed: 0,nd_mean_2021-04-16,nd_mean_2021-04-19,nd_mean_2021-04-22,nd_mean_2021-04-26,nd_mean_2021-04-28,nd_mean_2021-05-02,nd_mean_2021-05-04,nd_mean_2021-05-07,nd_mean_2021-05-16,nd_mean_2021-05-17,...,nd_mean_2021-05-19,nd_mean_2021-05-20,nd_mean_2021-06-04,nd_mean_2021-06-05,nd_mean_2021-06-10,nd_mean_2021-07-05,nd_mean_2021-08-13,nd_mean_2021-08-27,nd_mean_2021-05-08,nd_mean_2021-05-24
0,0.072846,0.261778,0.062981,0.104442,0.021096,0.052202,0.158723,0.000000,0.484102,0.201013,...,0.425571,0.108999,0.118854,0.000000,0.000000,0.026784,0.126832,0.614770,0.008857,0.081498
1,0.332928,0.287182,0.097138,0.152467,0.075484,0.000000,0.140326,0.163340,0.519456,0.501486,...,0.248935,0.134469,0.208268,0.000000,0.000000,0.111148,0.174914,0.179612,0.113071,0.046997
2,0.013679,0.007055,0.120804,0.037839,0.259125,0.000000,0.037446,0.506516,0.000000,0.561541,...,0.127633,0.084467,0.000000,0.210496,0.130467,0.155225,0.090607,0.054127,0.007437,0.219614
3,0.105976,0.052500,0.001917,0.011531,0.042316,0.000000,0.000000,0.043604,0.510271,0.511552,...,0.527583,0.000000,0.642226,0.132886,0.000000,0.570928,0.073492,0.378900,0.327677,0.586523
4,0.130885,0.063492,0.024416,0.000000,0.150671,0.046717,0.254284,0.101479,0.000000,0.403379,...,0.000000,0.206158,0.099058,0.152003,0.073357,0.147842,0.157676,0.012048,0.054223,0.017539
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4825,0.028770,0.038282,0.255329,0.148166,0.225421,0.479483,0.071684,0.016856,0.407955,0.367435,...,0.169733,0.001875,0.031995,0.066882,0.139222,0.009041,0.042483,0.022464,0.017705,0.148440
4826,0.035939,0.030722,0.033869,0.029646,0.318387,0.247905,0.265180,0.286298,0.578936,0.420588,...,0.041057,0.192014,0.255964,0.131357,0.000000,0.187236,0.055721,0.044182,0.048316,0.041440
4827,0.191255,0.063581,0.140111,0.323984,0.000000,0.005056,0.092693,0.399439,0.006256,0.477546,...,0.022025,0.085006,0.187352,0.142930,0.000000,0.047572,0.007225,0.145505,0.051215,0.079730
4828,0.291399,0.221428,0.007247,0.055074,0.057781,0.187749,0.112403,0.183555,0.504538,0.721011,...,0.490711,0.536825,0.644357,0.085861,0.631304,0.694524,0.031836,0.453151,0.000000,0.006495


In [527]:
#!c1.32
from sklearn.model_selection import cross_validate

In [538]:
#!c1.32
X_ts_train, X_ts_val, y_ts_train, y_ts_val = train_test_split(X_ts, y_ts, train_size=0.75, random_state=42)

In [539]:
#!c1.32
c22 = Catch22Classifier(
    estimator=RandomForestClassifier(n_estimators=100, max_depth=40, random_state=42),
    outlier_norm=True,
)
c22.fit(X_ts_train.to_numpy(), y_ts_train)

Catch22Classifier(estimator=RandomForestClassifier(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   class_weight=None,
                                                   criterion='gini',
                                                   max_depth=40,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
   

In [541]:
#!c1.32
recall_score(y_ts_val, c22.predict(X_ts_val.to_numpy()), average="macro", zero_division=0)

0.697730047799496

# 5. Combinations

In [546]:
#!c1.32
classifires = {
    "rfc": gs34.best_estimator_,
    "cbc": CatBoostClassifier(iterations=200, depth=5, verbose=False,
                   loss_function="MultiClass", random_seed=42),
    "lgbm": LGBMClassifier(boosting_type='dart', class_weight=None, colsample_bytree=1.0,
                importance_type='split', learning_rate=0.5, max_depth=8,
                min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
                n_estimators=200, n_jobs=-1, num_leaves=20, objective=None,
                random_state=42, reg_alpha=0.0, reg_lambda=0, silent=True,
                subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
    "xgb": XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bynode=1, colsample_bytree=1, gamma=0,
               grow_policy='depthwise', learning_rate=0.3, max_delta_step=0,
               max_depth=4, min_child_weight=1, missing=None, n_estimators=100,
               n_jobs=1, nthread=None, objective='multi:softprob',
               random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
               seed=None, silent=None, subsample=1, verbosity=1)
}

In [548]:
#!c1.32
X_tr, X_val, y_tr, y_val = train_test_split(transformer.transform(X), y, train_size=0.75, random_state=42)

In [557]:
#!c1.32
X_transformed = transformer.transform(X)
preds = {}
for name, clf in classifires.items():
    clf.fit(X_tr, y_tr)
    pred = clf.predict(X_val)
    preds[name] = pred

In [562]:
#!c1.32
recall_score(y_val, preds["cbc"], average="macro", zero_division=0)

0.9564919187192731

In [565]:
#!c1.32
preds["rfc"]

array([0, 3, 3, ..., 5, 5, 2])

In [567]:
#!c1.32
preds["cbc"] = preds["cbc"].flatten()

In [568]:
#!c1.32
preds_df = pd.DataFrame(preds)

In [573]:
#!c1.32
preds_df["true"] = y_val.values

Unnamed: 0,rfc,cbc,lgbm,xgb,true
0,0,0,0,0,0
1,3,3,3,3,3
2,3,3,3,3,3
3,1,1,1,1,1
4,1,1,1,1,1
...,...,...,...,...,...
1203,3,3,3,3,3
1204,0,0,0,0,0
1205,5,5,5,5,5
1206,5,5,5,5,5


In [584]:
#!c1.32
recall_score(y_val, preds_df.iloc[:, :-1].mode(axis=1)[0], average="macro", zero_division=0)

0.9656240040872005

In [None]:
#!c1.32


In [579]:
#!c1.32
gs_preds_params = {
    "learning_rate": [0.1, 1],
    "max_depth": [4, 5, 6, 7],
    "num_leaves": [10, 20, 30]
}

gs_preds = GridSearchCV(LGBMClassifier(n_estimators=30, random_state=42),
                        gs_preds_params, scoring=recall_scorer, cv=5)
gs_preds.fit(preds_df.iloc[:, :-1], preds_df["true"])

GridSearchCV(cv=5, error_score=nan,
             estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                      colsample_bytree=1.0,
                                      importance_type='split',
                                      learning_rate=0.1, max_depth=-1,
                                      min_child_samples=20,
                                      min_child_weight=0.001,
                                      min_split_gain=0.0, n_estimators=30,
                                      n_jobs=-1, num_leaves=31, objective=None,
                                      random_state=42, reg_alpha=0.0,
                                      reg_lambda=0.0, silent=True,
                                      subsample=1.0, subsample_for_bin=200000,
                                      subsample_freq=0),
             iid='deprecated', n_jobs=None,
             param_grid={'learning_rate': [0.1, 1], 'max_depth': [4, 5, 6, 7],
                  

In [580]:
#!c1.32
gs_preds.best_score_

0.9679268181870512

In [603]:
#!c1.32
transformer2 = Pipeline([
    ("geo", GeoTransformer()),
    ("count_missing", MissingTSSummator()),
    #("stats", DescStatFeatureGenerator())
])

In [606]:
#!c1.32
gs35_params = {
    "n_estimators": range(70, 100, 3),
    "criterion": ["gini"], #, "entropy"],
    "max_depth": range(20, 61, 3),
    "min_samples_split": range(5, 41, 5)
}

gs35 = GridSearchCV(RandomForestClassifier(random_state=42), gs35_params,
                    scoring=recall_scorer, cv=3, verbose=1, n_jobs=32)
gs35.fit(transformer2.fit_transform(X), y)

Fitting 3 folds for each of 1120 candidates, totalling 3360 fits


In [607]:
#!c1.32
gs35.best_score_, gs35.best_estimator_

(0.9671030514967781,
 RandomForestClassifier(max_depth=29, min_samples_split=5, n_estimators=70,
                        random_state=42))

In [611]:
#!c1.32
gs36_params = {
    "num_leaves": range(10, 31, 10),
    "max_depth": range(4, 11),
    "learning_rate": [0.001, 0.01, 0.1, 0.2, 0.5, 0.7, 1],
    "reg_lambda": [0, 1, 10],
    "n_estimators": [10, 25, 50, 100, 200]
}

gs36 = GridSearchCV(LGBMClassifier(boosting_type="dart", random_state=42),
                    gs36_params,
                    scoring=recall_scorer, cv=3, verbose=1, n_jobs=32)
gs36.fit(transformer2.fit_transform(X), y)

Fitting 3 folds for each of 2205 candidates, totalling 6615 fits


In [612]:
#!c1.32
gs36.best_score_, gs36.best_estimator_

(0.9726520805412916,
 LGBMClassifier(boosting_type='dart', learning_rate=0.7, max_depth=9,
                n_estimators=200, num_leaves=20, random_state=42, reg_lambda=0))

In [614]:
#!c1.32
save_sumbission(test, transformer2, gs36.best_estimator_, '36')

# 6. With `catch22` features

In [615]:
#!c1.32
from sktime.transformations.panel.catch22 import Catch22

In [643]:
#!c1.32
rows = []
for i, row in df.iterrows():
    row_t = Catch22().fit_transform(row[row.index.str.contains("nd")].astype("float32").to_numpy())
    rows.append(row_t)
catch22_df = pd.DataFrame(rows)

ValueError: Must pass 2-d input. shape=(4830, 1, 22)

In [649]:
#!c1.32
catch22_df = pd.concat(rows, axis=0)
catch22_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,0.075486,0.037743,6.0,-0.2,0.0,1.0,2.0,0.021755,1.079922,0.251422,...,1.0,0.73913,4.0,2.153809,0.090909,0.747868,0.758621,0.655172,0.055556,4.0
0,0.070904,0.035452,9.0,-0.285714,0.0,2.0,2.0,0.032605,0.589049,0.254183,...,1.0,0.797101,4.0,2.08177,0.166667,0.891584,0.758621,0.793103,0.019284,9.0
0,0.074117,0.037059,14.0,-0.371429,0.0,7.0,1.0,0.036986,0.19635,0.200494,...,2.0,0.811594,4.0,1.993455,0.076923,0.72467,0.586207,0.689655,0.04,19.0
0,0.075328,0.037664,9.0,-0.264286,0.0,1.0,4.0,0.026207,0.785398,0.256016,...,3.0,0.768116,3.0,2.13488,0.125,0.771501,0.793103,0.793103,0.015625,0.0
0,0.083369,0.041685,7.0,-0.214286,0.0,4.0,4.0,0.046433,0.245437,0.214467,...,3.0,0.768116,5.0,2.062417,0.066667,0.842103,0.758621,0.689655,0.0625,21.0


In [657]:
#!c1.32
catch22_df.reset_index(drop=True)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,0.075486,0.037743,6.0,-0.200000,0.0,1.0,2.0,0.021755,1.079922,0.251422,...,1.0,0.739130,4.0,2.153809,0.090909,0.747868,0.758621,0.655172,0.055556,4.0
1,0.070904,0.035452,9.0,-0.285714,0.0,2.0,2.0,0.032605,0.589049,0.254183,...,1.0,0.797101,4.0,2.081770,0.166667,0.891584,0.758621,0.793103,0.019284,9.0
2,0.074117,0.037059,14.0,-0.371429,0.0,7.0,1.0,0.036986,0.196350,0.200494,...,2.0,0.811594,4.0,1.993455,0.076923,0.724670,0.586207,0.689655,0.040000,19.0
3,0.075328,0.037664,9.0,-0.264286,0.0,1.0,4.0,0.026207,0.785398,0.256016,...,3.0,0.768116,3.0,2.134880,0.125000,0.771501,0.793103,0.793103,0.015625,0.0
4,0.083369,0.041685,7.0,-0.214286,0.0,4.0,4.0,0.046433,0.245437,0.214467,...,3.0,0.768116,5.0,2.062417,0.066667,0.842103,0.758621,0.689655,0.062500,21.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4825,0.080159,0.040080,15.0,-0.214286,0.0,7.0,4.0,0.049762,0.196350,0.215046,...,3.0,0.724638,4.0,2.030616,0.076923,0.980926,0.689655,0.689655,0.040000,21.0
4826,0.084526,0.042263,20.0,-0.228571,0.0,6.0,8.0,0.078439,0.147262,0.206271,...,7.0,0.521739,5.0,1.951321,0.062500,0.866993,0.758621,0.758621,0.062500,0.0
4827,0.084462,0.042231,14.0,-0.250000,0.0,4.0,18.0,0.057400,0.245437,0.250103,...,13.0,0.797101,5.0,2.091965,0.066667,0.821984,0.586207,0.689655,0.062500,20.0
4828,0.076714,0.038357,12.0,-0.257143,0.0,1.0,1.0,0.031052,0.687223,0.247107,...,7.0,0.869565,4.0,2.150615,0.066667,0.763008,0.758621,0.655172,0.062500,8.0


In [661]:
#!c1.32
from sktime.transformations.panel.catch22 import Catch22

class Catch22FeatureGenerator(BaseEstimator, TransformerMixin):
    def __init__(self, col_prefix="nd_mean_"):
        self.col_prefix = col_prefix
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        result = X.copy()
        features = []
        
        for _, row in result.iterrows():
            series = row[row.index.str.contains(self.col_prefix)].astype("float32").to_numpy()
            feature_row = Catch22().fit_transform(series)
            features.append(feature_row)
        catch22_df = pd.concat(features)
        catch22_df.rename(columns=lambda x: f"catch22_f{x}", inplace=True)
        catch22_df.reset_index(drop=True, inplace=True)
        
        assert catch22_df.shape[0] == result.shape[0]
        result = pd.concat([result, catch22_df], axis=1)       
        
        return result

In [662]:
#!c1.32
Catch22FeatureGenerator().fit_transform(df)

Unnamed: 0,id,area,nd_mean_2021-04-16,nd_mean_2021-04-19,nd_mean_2021-04-22,nd_mean_2021-04-26,nd_mean_2021-04-28,nd_mean_2021-05-02,nd_mean_2021-05-04,nd_mean_2021-05-07,...,catch22_f12,catch22_f13,catch22_f14,catch22_f15,catch22_f16,catch22_f17,catch22_f18,catch22_f19,catch22_f20,catch22_f21
0,3536,20,0.072846,0.261778,0.062981,0.104442,0.021096,0.052202,0.158723,0.000000,...,1.0,0.739130,4.0,2.153809,0.090909,0.747868,0.758621,0.655172,0.055556,4.0
1,3739,45,0.332928,0.287182,0.097138,0.152467,0.075484,0.000000,0.140326,0.163340,...,1.0,0.797101,4.0,2.081770,0.166667,0.891584,0.758621,0.793103,0.019284,9.0
2,1294,28,0.013679,0.007055,0.120804,0.037839,0.259125,0.000000,0.037446,0.506516,...,2.0,0.811594,4.0,1.993455,0.076923,0.724670,0.586207,0.689655,0.040000,19.0
3,2859,19,0.105976,0.052500,0.001917,0.011531,0.042316,0.000000,0.000000,0.043604,...,3.0,0.768116,3.0,2.134880,0.125000,0.771501,0.793103,0.793103,0.015625,0.0
4,3685,33,0.130885,0.063492,0.024416,0.000000,0.150671,0.046717,0.254284,0.101479,...,3.0,0.768116,5.0,2.062417,0.066667,0.842103,0.758621,0.689655,0.062500,21.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4825,3772,74,0.028770,0.038282,0.255329,0.148166,0.225421,0.479483,0.071684,0.016856,...,3.0,0.724638,4.0,2.030616,0.076923,0.980926,0.689655,0.689655,0.040000,21.0
4826,5191,109,0.035939,0.030722,0.033869,0.029646,0.318387,0.247905,0.265180,0.286298,...,7.0,0.521739,5.0,1.951321,0.062500,0.866993,0.758621,0.758621,0.062500,0.0
4827,5226,58,0.191255,0.063581,0.140111,0.323984,0.000000,0.005056,0.092693,0.399439,...,13.0,0.797101,5.0,2.091965,0.066667,0.821984,0.586207,0.689655,0.062500,20.0
4828,5390,14,0.291399,0.221428,0.007247,0.055074,0.057781,0.187749,0.112403,0.183555,...,7.0,0.869565,4.0,2.150615,0.066667,0.763008,0.758621,0.655172,0.062500,8.0


In [684]:
#!c1.32
X = df.drop(columns=["id", "crop"])
y = df["crop"]

In [699]:
#!c1.32
transformer3 = Pipeline([
    ("geo", GeoTransformer(edges=True)),
    ("count_missing", MissingTSSummator()),
    ("catch22", Catch22FeatureGenerator()),
    ("stats", DescStatFeatureGenerator())
])

In [679]:
#!c1.32
gs61_params = {
    "n_estimators": range(70, 100, 3),
    "criterion": ["gini"],
    "max_depth": range(20, 61, 3),
    "min_samples_split": range(5, 41, 5)
}

gs61 = GridSearchCV(RandomForestClassifier(random_state=42), gs61_params,
                    scoring=recall_scorer, cv=3, verbose=1, n_jobs=32)
gs61.fit(transformer3.fit_transform(X), y)

Fitting 3 folds for each of 1120 candidates, totalling 3360 fits


In [680]:
#!c1.32
gs61.best_score_, gs61.best_estimator_

(0.9654941778120004,
 RandomForestClassifier(max_depth=23, min_samples_split=5, n_estimators=97,
                        random_state=42))

In [686]:
#!c1.32
gs36_params = {
    "num_leaves": range(10, 31, 10),
    "max_depth": range(4, 11),
    "learning_rate": [0.001, 0.01, 0.1, 0.2, 0.5, 0.7, 1],
    "reg_lambda": [0, 1, 10],
    "n_estimators": [10, 25, 50, 100, 200, 500]
}

gs36 = GridSearchCV(LGBMClassifier(boosting_type="dart", random_state=42),
                    gs36_params,
                    scoring=recall_scorer, cv=3, verbose=1, n_jobs=32)
gs36.fit(transformer3.fit_transform(X), y)

Fitting 3 folds for each of 2646 candidates, totalling 7938 fits


In [687]:
#!c1.32
gs62.best_score_, gs62.best_estimator_

(0.9733999725034846,
 LGBMClassifier(boosting_type='dart', learning_rate=0.5, max_depth=7,
                n_estimators=500, num_leaves=20, random_state=42, reg_lambda=0))

In [695]:
#!c1.32
gs62_2_params = {
    "random_state": range(10, 31),
}

gs62_2 = GridSearchCV(LGBMClassifier(boosting_type="dart", n_estimators=500,
                                     learning_rate=0.5, max_depth=7, num_leaves=20, reg_lambda=0),
                    gs62_2_params,
                    scoring=recall_scorer, cv=3, verbose=1, n_jobs=32)
gs62_2.fit(transformer3.fit_transform(X), y)

Fitting 3 folds for each of 21 candidates, totalling 63 fits


In [696]:
#!c1.32
gs62_2.best_score_, gs62_2.best_estimator_

(0.9731838325146175,
 LGBMClassifier(boosting_type='dart', learning_rate=0.5, max_depth=7,
                n_estimators=500, num_leaves=20, random_state=10, reg_lambda=0))

In [697]:
#!c1.32
save_sumbission(test, transformer3, gs62_2.best_estimator_, "62_3")

In [701]:
#!c1.32
gs37_params = {
    "num_leaves": range(15, 26, 5),
    "max_depth": range(6, 7),
    "learning_rate": [0.25, 0.5, 0.75],
    "reg_lambda": [0],
    "n_estimators": [500]
}

gs37 = GridSearchCV(LGBMClassifier(boosting_type="dart", random_state=42),
                    gs37_params,
                    scoring=recall_scorer, cv=3, verbose=1, n_jobs=32)
gs37.fit(transformer3.fit_transform(X), y)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


In [702]:
#!c1.32
gs37.best_score_, gs37.best_estimator_

(0.9715540202322158,
 LGBMClassifier(boosting_type='dart', learning_rate=0.25, max_depth=6,
                n_estimators=500, num_leaves=15, random_state=42, reg_lambda=0))

In [703]:
#!c1.32
save_sumbission(test, transformer3, gs37.best_estimator_, "37")

In [None]:
#!c1.32
