In [None]:
#!c1.32
import geojson
import numpy as np
import pandas as pd
from shapely.geometry import shape
from lightgbm import LGBMClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import make_scorer, recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sktime.transformations.panel.catch22 import Catch22


%matplotlib inline

# Загрузка данных

In [None]:
#!c1.32
# Load the training and test sets from CSV files in the working directory.
df = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [None]:
#!c1.32
# Preview the first three rows of the training frame.
df.head(3)

In [None]:
#!c1.32
# Feature frame: every column except the row identifier and the target.
X = df.drop(columns=["id", "crop"])
# Target labels (the "crop" column).
y = df["crop"]

# Функции и трансформеры

In [None]:
#!c1.32
# Macro-averaged recall scorer; zero_division=0 is forwarded to recall_score.
recall_scorer = make_scorer(recall_score, average="macro", zero_division=0) # needed for the grid search

In [None]:
#!c1.32
def save_submission(test_data, transformer, model, suffix):
    """Predict labels for the test set and write them to ``submission_{suffix}.csv``.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Raw test frame. Must contain an ``id`` column; all remaining columns
        are passed to ``transformer``.
    transformer : object with a ``transform`` method
        Feature transformer that has ALREADY been fitted (on the training
        data). It is applied with ``transform`` only, so nothing is ever
        re-fitted on the test set.
    model : object with a ``predict`` method
        Fitted estimator used to produce the predictions.
    suffix : str
        Suffix inserted into the output file name.

    Returns
    -------
    None
        The result is written to ``submission_{suffix}.csv`` in the current
        working directory with the columns ``id`` and ``crop``.
    """
    ids = test_data["id"].to_list()
    raw_features = test_data.drop(columns=["id"])
    # transform(), not fit_transform(): the test set must go through exactly
    # the same fitted transformation as the training data.
    X_test = transformer.transform(raw_features)
    y_pred = model.predict(X_test)
    submission = pd.DataFrame({"id": ids, "crop": y_pred})
    submission.to_csv(f"submission_{suffix}.csv", index=False)

In [None]:
#!c1.32
def count_edges(shape):
    """Count the exterior-ring coordinates of a field contour.

    Parameters
    ----------
    shape : shapely geometry
        Geometry of the field. Only its ``geom_type``, ``exterior.coords``
        and ``geoms`` attributes are used.

    Returns
    -------
    int or float
        * ``Polygon``: number of coordinates in the exterior ring.
        * ``MultiPolygon``: sum of that number over all member polygons.
        * ``GeometryCollection``: sum over the direct ``Polygon`` members only;
          any other member is ignored.
        * any other geometry type: ``numpy.nan``.

    Notes
    -----
    ``geom_type`` is used instead of ``type`` because the latter is deprecated
    in Shapely 2.0. Interior rings (holes) are not counted.
    NOTE(review): shapely exterior rings are closed, so the count presumably
    includes the repeated closing coordinate (one more than the number of
    distinct corners) - confirm before changing, since the feature values
    feed the model.
    """
    geom_type = shape.geom_type

    if geom_type == "Polygon":
        return len(shape.exterior.coords)

    if geom_type == "MultiPolygon":
        return sum(len(part.exterior.coords) for part in shape.geoms)

    if geom_type == "GeometryCollection":
        return sum(
            len(part.exterior.coords)
            for part in shape.geoms
            if part.geom_type == "Polygon"
        )

    return np.nan

class GeoTransformer(BaseEstimator, TransformerMixin):
    """Extract features from the field contour stored in the ``.geo`` column.

    The ``.geo`` column must hold GeoJSON strings. The transformer always adds
    ``lon`` and ``lat`` (coordinates of the geometry's representative point)
    and drops ``.geo``; the remaining features are optional.

    Parameters
    ----------
    edges : bool, default=False
        Add ``edges``: the contour coordinate count from ``count_edges``.
    length : bool, default=False
        Add ``length``: the ``length`` attribute of the geometry.
    derived : bool, default=False
        Add the ratios ``edges_per_length``, ``area_per_length`` and
        ``area_per_edge``. Requires an ``area`` column in the input. The edge
        count and length are computed on demand, so this flag works on its own
        and does not require ``edges=True`` / ``length=True``.
    computed_area : bool, default=False
        Add ``computed_area``: the ``area`` attribute of the geometry.
    """

    def __init__(self, edges=False, length=False, derived=False, computed_area=False):
        self.edges = edges
        self.length = length
        self.derived = derived
        self.computed_area = computed_area

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with geometry features added and ``.geo`` removed."""
        result = X.copy()
        # Geometries are kept in a local Series rather than a temporary column.
        shapes = result[".geo"].apply(lambda raw: shape(geojson.loads(raw)))

        # Compute the representative point once per row and reuse it for both axes.
        points = shapes.apply(lambda geom: geom.representative_point().coords[0])
        result["lon"] = points.apply(lambda xy: xy[0])
        result["lat"] = points.apply(lambda xy: xy[1])

        if self.edges:
            result["edges"] = shapes.apply(count_edges)
        if self.length:
            result["length"] = shapes.apply(lambda geom: geom.length)
        if self.derived:
            # Reuse the columns when they are available; otherwise compute the
            # values without adding columns that were not requested.
            if "edges" in result.columns:
                edge_count = result["edges"]
            else:
                edge_count = shapes.apply(count_edges)
            if "length" in result.columns:
                contour_length = result["length"]
            else:
                contour_length = shapes.apply(lambda geom: geom.length)

            result["edges_per_length"] = edge_count / contour_length
            result["area_per_length"] = result["area"] / contour_length
            result["area_per_edge"] = result["area"] / edge_count
        if self.computed_area:
            result["computed_area"] = shapes.apply(lambda geom: geom.area)

        return result.drop(columns=[".geo"])

In [None]:
#!c1.32
class DescStatFeatureGenerator(BaseEstimator, TransformerMixin):
    """Add row-wise descriptive statistics of the time-series columns.

    Parameters
    ----------
    col_prefix : str, default="nd"
        Pattern identifying the source columns. It is matched with
        ``str.contains`` (i.e. anywhere in the column name, as a regular
        expression), not strictly as a prefix.

    Notes
    -----
    The source columns are selected ONCE, before any statistic is added.
    The generated names (``nd_mean``, ``nd_min``, ...) themselves contain
    ``"nd"``, so re-evaluating the mask after each insertion would feed the
    already computed statistics into the later ones (std, sum, quantiles).
    """

    def __init__(self, col_prefix="nd"):
        self.col_prefix = col_prefix

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``nd_mean/min/max/std/sum`` and ``nd_q1..nd_q19`` added."""
        result = X.copy()
        # Boolean-mask selection yields an independent frame, so columns added
        # to `result` below never leak into the statistics.
        series = result.loc[:, result.columns.str.contains(self.col_prefix)]

        result["nd_mean"] = series.mean(axis=1)
        result["nd_min"] = series.min(axis=1)
        result["nd_max"] = series.max(axis=1)
        result["nd_std"] = series.std(axis=1)
        result["nd_sum"] = series.sum(axis=1)
        # Quantiles at 5% steps: nd_q1 = 0.05, ..., nd_q19 = 0.95.
        for q in range(1, 20):
            result[f"nd_q{q}"] = series.quantile(q / 20, axis=1)

        return result

In [None]:
#!c1.32
class MissingTSSummator(BaseEstimator, TransformerMixin):
    """Count the missing observations in each row's NDVI time series.

    A value equal to ``0`` is treated as a missing observation.

    Parameters
    ----------
    col_prefix : str, default="nd_mean_"
        Pattern identifying the time-series columns. It is matched with
        ``str.contains`` (anywhere in the column name, as a regular
        expression), not strictly as a prefix.
    """

    def __init__(self, col_prefix="nd_mean_"):
        self.col_prefix = col_prefix

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with an integer ``missing_count`` column added."""
        result = X.copy()
        ts_columns = result.columns.str.contains(self.col_prefix)
        # Vectorised count of zeros per row (replaces a Python-level apply per row).
        result["missing_count"] = result.loc[:, ts_columns].eq(0).sum(axis=1)

        return result

In [None]:
#!c1.32
class Catch22FeatureGenerator(BaseEstimator, TransformerMixin):
    """Append catch22 time-series features (computed with sktime) to each row.

    Parameters
    ----------
    col_prefix : str, default="nd_mean_"
        Pattern identifying the time-series columns. It is matched with
        ``str.contains`` (anywhere in the column name, as a regular
        expression), not strictly as a prefix.
    """

    def __init__(self, col_prefix="nd_mean_"):
        self.col_prefix = col_prefix

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``catch22_f*`` columns appended.

        Each row's series is cast to ``float32`` and passed to a fresh
        ``Catch22().fit_transform`` call, exactly one series at a time.
        """
        result = X.copy()

        # Select and cast the series columns once instead of once per row.
        ts_mask = result.columns.str.contains(self.col_prefix)
        ts_values = np.ascontiguousarray(
            result.loc[:, ts_mask].astype("float32").to_numpy()
        )

        # NOTE(review): each call is expected to return a one-row DataFrame
        # (the row-count check below relies on it) - confirm for the installed
        # sktime version.
        features = [Catch22().fit_transform(series) for series in ts_values]
        catch22_df = pd.concat(features).rename(columns=lambda x: f"catch22_f{x}")

        assert catch22_df.shape[0] == result.shape[0], (
            "catch22 produced a different number of rows than the input"
        )
        # Reuse the input index: a column-wise concat aligns on index labels,
        # so a fresh RangeIndex would misalign rows (and introduce NaNs) for
        # any X that is not indexed 0..n-1, e.g. a cross-validation subset.
        catch22_df.index = result.index

        return pd.concat([result, catch22_df], axis=1)

In [None]:
#!c1.32
# Shared feature-engineering pipeline built from the transformers defined above.
# Steps run in order: geometry features (with the edge count enabled),
# missing-observation count, catch22 features, descriptive statistics.
transformer = Pipeline([
    ("geo", GeoTransformer(edges=True)),
    ("count_missing", MissingTSSummator()),
    ("catch22", Catch22FeatureGenerator()),
    ("stats", DescStatFeatureGenerator())
])

# LightGBM Classifier и поиск гиперпараметров

In [None]:
#!c1.32
# Search for the best hyperparameters.
# This cell runs a large grid search over many combinations.
gs_params = {
    "num_leaves": range(10, 31, 10),
    "max_depth": range(4, 11),
    "learning_rate": [0.001, 0.01, 0.1, 0.2, 0.5, 0.7, 1],
    "reg_lambda": [0, 1, 10],
    "n_estimators": [10, 25, 50, 100, 200, 500]
}

# 3-fold cross-validation scored with macro recall, using 32 parallel jobs.
gs = GridSearchCV(LGBMClassifier(boosting_type="dart", random_state=42),
                  gs_params, scoring=recall_scorer,
                  cv=3, verbose=1, n_jobs=32)
# Features are built once on the full training frame, outside the CV loop.
gs.fit(transformer.fit_transform(X), y)
# Display the best cross-validation score and the corresponding estimator.
gs.best_score_, gs.best_estimator_

In [None]:
#!c1.32
# For reference: the search grid that produced the best of my solutions.
# Note that this dict is not passed to any search in this cell.
gs_params_2 = {
    "num_leaves": range(15, 26, 5),
    "max_depth": range(6, 7),
    "learning_rate": [0.25, 0.5, 0.75],
    "reg_lambda": [0],
    "n_estimators": [500]
}
# Best solution by leaderboard score
# (metric on the test set, classifier parameters)
# (0.9715540202322158,
# LGBMClassifier(boosting_type='dart', learning_rate=0.25, max_depth=6,
#                n_estimators=500, num_leaves=15, random_state=42, reg_lambda=0))

In [None]:
#!c1.32
# Predict on the test set with the best estimator from the grid search
# and save the result (written to "submission_final.csv").
save_submission(test, transformer, gs.best_estimator_, "final")

In [None]:
#!c1.32