In [None]:
from ml_fertilizers.utils import load_data

# DATA LOAD

In [None]:
from IPython.display import display
# Load both splits and key each frame by its "id" column in a single pass.
train, test = (split.set_index("id") for split in load_data())
display(train)

# FEATURE ENGINEERING

In [None]:
import pickle as pkl
from typing import Dict, List, Optional, Tuple, Union, cast
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from autofeat import AutoFeatClassifier

def engineer_features(X: pd.DataFrame, autofeat_cls: Union[bool, Optional[AutoFeatClassifier]] = False) -> Tuple[pd.DataFrame, Dict[str, List[str]], Union[AutoFeatClassifier, bool]]:
    """Add hand-crafted (and optionally autofeat-generated) features to ``X``.

    Parameters
    ----------
    X:
        Raw frame containing the numeric columns 'Temparature', 'Humidity',
        'Moisture', 'Nitrogen', 'Phosphorous', 'Potassium' and the categorical
        columns 'Soil Type' / 'Crop Type' (renamed to 'Soil' / 'Crop').
        The target column 'Fertilizer Name' is optional unless a new autofeat
        model has to be fitted (``autofeat_cls=None``).
    autofeat_cls:
        * any ``bool`` (default ``False``) - skip autofeat entirely;
        * ``None`` - fit a fresh ``AutoFeatClassifier`` on ``X`` and its target;
        * a fitted ``AutoFeatClassifier`` - only ``transform`` with it.

    Returns
    -------
    (X_final, final_dict, autofeat_cls)
        ``X_final`` is a copy of ``X`` with the new columns appended (duplicate
        column names removed, first occurrence kept). ``final_dict`` lists the
        numeric, categorical (target excluded) and autofeat column names.
        ``autofeat_cls`` is the classifier that was used/fitted, or the
        original bool when autofeat was skipped.
    """
    raw_num_features = ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Phosphorous', 'Potassium']
    raw_cat_features = ['Crop', 'Soil']
    target_col = 'Fertilizer Name'

    X = X.copy()
    X = X.rename(columns={
        "Soil Type": "Soil",
        "Crop Type": "Crop",
    })

    # X['Crop_x_Soil'] = X['Crop'] + '_' + X['Soil']
    # Weighted linear blends of the environmental and nutrient readings.
    X['Env_Stress_Index'] = X['Temparature'] * 0.4 + X['Humidity'] * 0.3 + X['Moisture'] * 0.3
    X['NPK_Index'] = X['Nitrogen'] * 0.5 + X['Phosphorous'] * 0.3 + X['Potassium'] * 0.2

    # Coarse 4-level bins; the outer edges are +-inf so no value falls outside.
    bin_labels = ['low', 'medium', 'high', 'very_high']
    X['Temp_bin'] = pd.cut(X['Temparature'], bins=[-float('inf'), 15, 25, 35, float('inf')], labels=bin_labels)
    X['Humidity_bin'] = pd.cut(X['Humidity'], bins=[-float('inf'), 30, 50, 70, float('inf')], labels=bin_labels)
    X['Moisture_bin'] = pd.cut(X['Moisture'], bins=[-float('inf'), 20, 40, 60, float('inf')], labels=bin_labels)

    # Only the first principal component is kept. The PCA is re-fitted on every
    # call, so values are not comparable between separately engineered frames.
    X['PCA_Temparature'] = PCA(n_components=2).fit_transform(X[['Temparature', 'Humidity', 'Moisture']])[:, 0]

    print("Autofeating features...")

    if isinstance(autofeat_cls, bool):
        print("Skipping autofeat feature engineering.")
        X_autofeat = pd.DataFrame()
    elif autofeat_cls is None:
        autofeat_cls = AutoFeatClassifier(verbose=0, n_jobs=-1, feateng_steps=2, categorical_cols=raw_cat_features)
        X_autofeat = cast(pd.DataFrame, autofeat_cls.fit_transform(X[raw_num_features + raw_cat_features], X[target_col])) # type: ignore
        print("Autofeat columns:", X_autofeat.columns.tolist())
    else:
        X_autofeat = cast(pd.DataFrame, autofeat_cls.transform(X[raw_num_features + raw_cat_features]))
        print("Autofeat columns:", X_autofeat.columns.tolist())

    X_final = pd.concat([X, X_autofeat], axis=1)
    X_final = X_final.loc[:, ~X_final.columns.duplicated()]

    final_dict = {
        "num_features": X_final.select_dtypes(include=['number']).columns.tolist(),
        # errors='ignore': unlabeled frames (e.g. the test split) have no target
        # column, and an unconditional drop would raise KeyError for them.
        "cat_features": X_final.drop(columns=[target_col], errors='ignore').select_dtypes(include=['object', 'category']).columns.tolist(),
        "autofeat_features": X_autofeat.columns.tolist() if not isinstance(autofeat_cls, bool) else [],
    }
    return X_final, final_dict, autofeat_cls


# Engineer the training frame; passing a bool for autofeat_cls takes the
# "skip autofeat" branch, so only the hand-crafted features are added.
eng_train, feat_dict, auto_cls = engineer_features(train, autofeat_cls=False)

display(eng_train)

# SEQUENTIAL FEATURE SELECTION

In [None]:
import numpy as np


def mapk_scorer(estimator, X, y_true, k=3):
    """
    Compute MAP@k from ``estimator.predict_proba`` (sklearn scorer signature).

    Each sample has exactly one true label, so its average precision is
    ``1 / rank`` when the label is among the k most probable classes and
    ``0`` otherwise; the mean over all samples is returned.

    Parameters
    ----------
    estimator : fitted classifier exposing ``predict_proba``.
    X : feature matrix passed straight to ``predict_proba``.
    y_true : 1-D sequence of true labels.
    k : number of top predictions considered (default 3).

    Notes
    -----
    ``predict_proba`` columns are positions, not labels. When the estimator
    exposes a ``classes_`` array with one entry per column, the positions are
    translated to labels before comparison, so non-contiguous or string
    labels are scored correctly. Without it, ``y_true`` must already be the
    column positions (integer codes 0..n_classes-1), as before.
    """
    probas = np.asarray(estimator.predict_proba(X))
    # Column positions of the k most probable classes, best first: (n_samples, k).
    topk = np.argsort(probas, axis=1)[:, -k:][:, ::-1]

    classes = getattr(estimator, "classes_", None)
    if classes is not None and len(classes) == probas.shape[1]:
        topk = np.asarray(classes)[topk]

    # Each row of topk holds distinct classes, so at most one position matches.
    hits = topk == np.asarray(y_true).reshape(-1, 1)
    rank_weights = 1.0 / np.arange(1, topk.shape[1] + 1)
    per_sample = (hits * rank_weights).sum(axis=1)
    return np.mean(per_sample)


In [None]:
from typing import Literal
from catboost import CatBoostClassifier 

# NOTE: Doesn't really work with cat_features
class CatBoostWithCatFeatures(CatBoostClassifier):
    """CatBoostClassifier that resolves ``cat_features`` from the data at fit time.

    ``extended_cat_features`` may be a list (column names for DataFrames,
    integer positions for numpy arrays), the string ``'auto'`` (detect
    non-numeric columns) or ``None`` (no categorical features).

    NOTE(review): the setting lives in a private attribute next to CatBoost's
    own parameters; presumably ``sklearn.base.clone`` (used by cross-validation
    and SFS) will not carry it over - verify before relying on it there.
    """

    def __init__(self, extended_cat_features: Optional[Union[List, Literal['auto']]] = None, **kwargs):
        super().__init__(**kwargs)
        self._extended_cat_features = extended_cat_features

    def fit(self, X, y, **kwargs): # type: ignore
        """Fit the model, passing the categorical features resolved for ``X``."""
        requested = self._extended_cat_features
        auto_detect = requested == 'auto'
        explicit = requested if isinstance(requested, list) else None

        # Stays None for unsupported containers or when nothing was requested.
        resolved = None

        if hasattr(X, 'columns'):
            if explicit is not None:
                # Keep only the requested names that are present in this frame.
                resolved = [name for name in explicit if name in X.columns]
            elif auto_detect:
                # Object / category dtype columns are treated as categorical.
                resolved = [
                    name
                    for name, dtype in zip(X.columns, X.dtypes)
                    if dtype.name in ['object', 'category']
                ]
        elif isinstance(X, np.ndarray):
            if explicit is not None:
                # Arrays have no names: only integer positions are usable.
                resolved = [position for position in explicit if isinstance(position, int)]
            elif auto_detect:
                # A column that cannot be cast to float is considered categorical.
                resolved = []
                for position in range(X.shape[1]):
                    column = X[:, position]
                    try:
                        column.astype(float)
                    except (ValueError, TypeError):
                        resolved.append(position)

        return super().fit(X, y, cat_features=resolved, **kwargs)

# Show which engineered columns will be handed to CatBoost as categorical.
print(feat_dict["cat_features"])

# Shared CatBoost configuration used by the feature-selection cells that follow
# (passed to SFS and used for select_features).
catboost_fast = CatBoostWithCatFeatures(
    iterations=800,
    learning_rate=0.1,
    depth=5,
    loss_function='MultiClass',
    eval_metric='MultiClass',
    random_seed=42,
    verbose=100,  # log every 100 iterations
    allow_writing_files=False,
    thread_count=-1,
    extended_cat_features=feat_dict["cat_features"]
)


In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
# Forward sequential selection of 10 features, scored with mapk_scorer
# (MAP@3 with its default k) under 3-fold cross-validation.
sfs = SFS(
    catboost_fast, # type: ignore
    k_features=10,
    forward=True,
    scoring=mapk_scorer,
    cv=3,
)

# 10% reproducible sample of the training rows; labels are aligned by index.
X_train_sub = eng_train.drop(columns=['Fertilizer Name']).sample(frac=0.1, random_state=42)
y_train_sub = eng_train.loc[X_train_sub.index, 'Fertilizer Name']

# The fit itself is disabled; the next cell loads a previously pickled selector.
# sfs.fit(X_train_sub, y_train_sub.astype('category').cat.codes)  # type: ignore


In [None]:
# pkl.dump(sfs, open("sfs_catboost.pkl", "wb"))
from ml_fertilizers.utils import PathManager


# NOTE: unpickling executes code stored in the file - only load pickles that
# were produced by this project. The context manager closes the file handle.
with open(PathManager.cwd.value/'sfs_catboost_final.pkl', "rb") as sfs_file:
    sfs_loaded = pkl.load(sfs_file)  # Load the SFS object if you have it saved

# Reconstruct the order in which features were added: each forward step's
# subset differs from the previous one by exactly one feature index.
selected_order = []
prev_set = set()
for k in sorted(sfs_loaded.subsets_.keys()):
    cur_set = set(sfs_loaded.subsets_[k]["feature_idx"])
    added   = cur_set - prev_set
    if len(added) == 1:
        feat_idx = added.pop()
        # Indices refer to column positions of the frame the selector was fitted on;
        # assumes that frame had the same column order as X_train_sub - TODO confirm.
        feat_name = X_train_sub.columns[feat_idx]
        avg_score = sfs_loaded.subsets_[k]["avg_score"]
        selected_order.append((k, feat_name, avg_score))
    prev_set = cur_set

# 3.7 Display the selection order and MAP@3
print("Feature | Order → Feature Name         | MAP@3")
print("--------|-------------------------------|--------")
for k, feat, score in selected_order:
    print(f"{k:>2d}      → {feat:<30s}   {score:.5f}")

# CATBOOST RFE

In [None]:
from catboost import Pool

# Training sample: 10% of the rows, labels aligned by index.
X_cat_rfe_train = eng_train.drop(columns=['Fertilizer Name']).sample(frac=0.1, random_state=42)
y_cat_rfe_train = eng_train.loc[X_cat_rfe_train.index, 'Fertilizer Name']

# Evaluation sample: drawn only from rows NOT used for training so the
# validation loss is not contaminated by rows the model has already seen.
X_cat_rfe_test = (
    eng_train
    .drop(columns=['Fertilizer Name'])
    .drop(index=X_cat_rfe_train.index)
    .sample(frac=0.1, random_state=43)
)
# Labels must follow the TEST index (previously the train index was used,
# which paired every evaluation row with an unrelated label).
y_cat_rfe_test = eng_train.loc[X_cat_rfe_test.index, 'Fertilizer Name']

train_pool = Pool(X_cat_rfe_train, y_cat_rfe_train, cat_features=feat_dict['cat_features'])
test_pool = Pool(X_cat_rfe_test, y_cat_rfe_test, cat_features=feat_dict['cat_features'])


# Recursive elimination by SHAP values down to 8 features in 5 steps.
summary = catboost_fast.select_features(
    train_pool,
    eval_set=test_pool,
    num_features_to_select=8,
    features_for_select=X_cat_rfe_train.columns.tolist(),
    steps=5,
    algorithm='RecursiveByShapValues',
    train_final_model=True,
    logging_level='Silent',
    plot=False,
)

summary

In [None]:
import matplotlib.pyplot as plt

# Validation loss as a function of how many features have been eliminated.
loss_graph = summary['loss_graph']
removed_features = loss_graph['removed_features_count']
loss_values = loss_graph['loss_values']

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(removed_features, loss_values, marker='o')
ax.set_title('Feature Selection Loss Curve')
ax.set_xlabel('Number of Features Removed')
ax.set_ylabel('Validation Loss')
ax.grid(True)
plt.show()

# RFE

In [None]:
import json
from lightgbm import LGBMClassifier
from sklearn.feature_selection import RFE
from xgboost import XGBClassifier

# One-hot encoded 25% sample used for sklearn RFE (needs purely numeric input).
rfe_data = pd.get_dummies(eng_train.drop(columns=['Fertilizer Name']).copy().sample(frac=0.25, random_state=42), drop_first=True)
y = eng_train.loc[rfe_data.index, 'Fertilizer Name'].astype('category').cat.codes  # Convert to integer codes for RFE

catboost_rfe = CatBoostClassifier(
    iterations=800,
    learning_rate=0.1,
    depth=5,
    loss_function='MultiClass',
    eval_metric='MultiClass',
    random_seed=42,
    verbose=0,
    allow_writing_files=False,
    thread_count=-1,
)


xgb_rfe = XGBClassifier(
    n_estimators=800,
    learning_rate=0.1,
    max_depth=5,
    objective='multi:softmax',
    eval_metric='mlogloss',
    random_state=42,
    verbosity=0,
    tree_method='hist',
    n_jobs=-1,
)

lgbm_rfe = LGBMClassifier(
    n_estimators=800,
    learning_rate=0.1,
    max_depth=5,
    objective='multiclass',
    eval_metric='multi_logloss',
    random_state=42,
    verbosity=0,
    n_jobs=-1,
)

rfe_models = [
    catboost_rfe,
    xgb_rfe,
    lgbm_rfe,
]

scoring_features_n = [30, 25, 20, 15, 10, 5]
ans : Dict[str,List[str]] = dict()

for rfe_model in rfe_models:
    # Every model starts from the full column set; each smaller target is then
    # selected from the survivors of the previous (larger) target.
    base_features = rfe_data.columns.tolist()

    for n_features in sorted(scoring_features_n, reverse=True):
        print(
            f"Training RFE {rfe_model.__class__.__name__} {n_features} features"
        )
        candidates = rfe_data[base_features]
        selector = RFE(rfe_model, n_features_to_select=n_features, step=2)  # type: ignore
        selector.fit(candidates, y)
        current_features = candidates.columns[selector.get_support()].tolist()

        ans[f"{rfe_model.__class__.__name__}_{n_features}"] = current_features
        print(f"Selected features ({len(current_features)}): {current_features}")

        # Copy so the stored result is never aliased by the next round's input.
        base_features = list(current_features)


print(json.dumps(ans, indent=2))

# The context manager guarantees the file is flushed and closed.
with open(PathManager.cwd.value/'rfe_features.json', "w") as rfe_features_file:
    json.dump(
        ans,
        rfe_features_file,
        indent=2,
        ensure_ascii=False,
    )

# FEATURE SETS

In [None]:
# Eight-feature set using the raw (non one-hot) categorical columns.
# It is only referenced by the commented-out "CatBoostClassifier_cat_features"
# entry of rfe_dict; presumably the result of the CatBoost select_features
# run above - verify before reuse.
cat_set = [
  'Temparature',
  'Soil',
  'Potassium',
  'Phosphorous',
  'NPK_Index',
  'Temp_bin',
  'Humidity_bin',
  'Moisture_bin'
]

# Candidate feature sets keyed by "<ModelClassName>_<suffix>". The prefix up to
# the first underscore is used further down to pick the matching model.
# Feature names follow the columns produced by pd.get_dummies on the engineered
# frame, i.e. "Soil_<level>" / "Crop_<level>" (the raw "Soil Type" / "Crop Type"
# columns are renamed by engineer_features). Commented-out entries are sets that
# were produced but are currently excluded from evaluation.
rfe_dict = {
  # "CatBoostClassifier_cat_features": cat_set,
  # "CatBoostClassifier_30": [
  #   "Temparature",
  #   "Humidity",
  #   "Moisture",
  #   "Nitrogen",
  #   "Potassium",
  #   "Phosphorous",
  #   "Env_Stress_Index",
  #   "NPK_Index",
  #   "PCA_Temparature",
  #   "Soil_Clayey",
  #   "Soil_Loamy",
  #   "Soil_Red",
  #   "Soil_Sandy",
  #   "Crop_Cotton",
  #   "Crop_Ground Nuts",
  #   "Crop_Maize",
  #   "Crop_Millets",
  #   "Crop_Oil seeds",
  #   "Crop_Paddy",
  #   "Crop_Pulses",
  #   "Crop_Sugarcane",
  #   "Crop_Tobacco",
  #   "Crop_Wheat",
  #   "Temp_bin_high",
  #   "Temp_bin_very_high",
  #   "Humidity_bin_medium",
  #   "Humidity_bin_high",
  #   "Humidity_bin_very_high",
  #   "Moisture_bin_high",
  #   "Moisture_bin_very_high"
  # ],
  "CatBoostClassifier_25": [
    "Temparature",
    "Humidity",
    "Moisture",
    "Nitrogen",
    "Potassium",
    "Phosphorous",
    "Env_Stress_Index",
    "NPK_Index",
    "PCA_Temparature",
    "Soil_Clayey",
    "Soil_Loamy",
    "Soil_Red",
    "Soil_Sandy",
    "Crop_Cotton",
    "Crop_Ground Nuts",
    "Crop_Maize",
    "Crop_Millets",
    "Crop_Oil seeds",
    "Crop_Paddy",
    "Crop_Pulses",
    "Crop_Sugarcane",
    "Crop_Tobacco",
    "Crop_Wheat",
    "Temp_bin_high",
    "Humidity_bin_high"
  ],
  "CatBoostClassifier_20": [
    "Temparature",
    "Humidity",
    "Moisture",
    "Nitrogen",
    "Potassium",
    "Phosphorous",
    "Env_Stress_Index",
    "NPK_Index",
    "PCA_Temparature",
    "Soil_Clayey",
    "Soil_Loamy",
    "Soil_Red",
    "Soil_Sandy",
    "Crop_Ground Nuts",
    "Crop_Oil seeds",
    "Crop_Paddy",
    "Crop_Pulses",
    "Crop_Sugarcane",
    "Crop_Tobacco",
    "Temp_bin_high"
  ],
  "CatBoostClassifier_15": [
    "Temparature",
    "Humidity",
    "Moisture",
    "Nitrogen",
    "Potassium",
    "Phosphorous",
    "Env_Stress_Index",
    "NPK_Index",
    "PCA_Temparature",
    "Soil_Loamy",
    "Soil_Red",
    "Soil_Sandy",
    "Crop_Paddy",
    "Crop_Pulses",
    "Crop_Sugarcane"
  ],
  "CatBoostClassifier_10": [
    "Temparature",
    "Humidity",
    "Moisture",
    "Nitrogen",
    "Potassium",
    "Phosphorous",
    "Env_Stress_Index",
    "NPK_Index",
    "PCA_Temparature",
    "Soil_Sandy"
  ],
  # "CatBoostClassifier_5": [
  #   "Humidity",
  #   "Nitrogen",
  #   "Potassium",
  #   "Phosphorous",
  #   "PCA_Temparature"
  # ],
  # "XGBClassifier_30": [
  #   "Temparature",
  #   "Humidity",
  #   "Moisture",
  #   "Nitrogen",
  #   "Potassium",
  #   "Phosphorous",
  #   "Env_Stress_Index",
  #   "NPK_Index",
  #   "PCA_Temparature",
  #   "Soil_Clayey",
  #   "Soil_Loamy",
  #   "Soil_Red",
  #   "Soil_Sandy",
  #   "Crop_Cotton",
  #   "Crop_Ground Nuts",
  #   "Crop_Maize",
  #   "Crop_Millets",
  #   "Crop_Oil seeds",
  #   "Crop_Paddy",
  #   "Crop_Pulses",
  #   "Crop_Sugarcane",
  #   "Crop_Tobacco",
  #   "Crop_Wheat",
  #   "Temp_bin_high",
  #   "Temp_bin_very_high",
  #   "Humidity_bin_medium",
  #   "Humidity_bin_high",
  #   "Humidity_bin_very_high",
  #   "Moisture_bin_medium",
  #   "Moisture_bin_high"
  # ],
  "XGBClassifier_25": [
    "Temparature",
    "Humidity",
    "Moisture",
    "Nitrogen",
    "Potassium",
    "Phosphorous",
    "NPK_Index",
    "PCA_Temparature",
    "Soil_Clayey",
    "Soil_Loamy",
    "Soil_Red",
    "Soil_Sandy",
    "Crop_Cotton",
    "Crop_Ground Nuts",
    "Crop_Maize",
    "Crop_Millets",
    "Crop_Oil seeds",
    "Crop_Paddy",
    "Crop_Pulses",
    "Crop_Sugarcane",
    "Crop_Tobacco",
    "Crop_Wheat",
    "Temp_bin_high",
    "Humidity_bin_high",
    "Moisture_bin_high"
  ],
  "XGBClassifier_20": [
    "Temparature",
    "Moisture",
    "Nitrogen",
    "Potassium",
    "Phosphorous",
    "Soil_Clayey",
    "Soil_Red",
    "Soil_Sandy",
    "Crop_Cotton",
    "Crop_Ground Nuts",
    "Crop_Maize",
    "Crop_Millets",
    "Crop_Oil seeds",
    "Crop_Paddy",
    "Crop_Pulses",
    "Crop_Sugarcane",
    "Crop_Wheat",
    "Temp_bin_high",
    "Humidity_bin_high",
    "Moisture_bin_high"
  ],
  "XGBClassifier_15": [
    "Moisture",
    "Potassium",
    "Phosphorous",
    "Soil_Clayey",
    "Soil_Red",
    "Soil_Sandy",
    "Crop_Ground Nuts",
    "Crop_Oil seeds",
    "Crop_Paddy",
    "Crop_Pulses",
    "Crop_Sugarcane",
    "Crop_Wheat",
    "Temp_bin_high",
    "Humidity_bin_high",
    "Moisture_bin_high"
  ],
  "XGBClassifier_10": [
    "Moisture",
    "Phosphorous",
    "Soil_Clayey",
    "Soil_Red",
    "Soil_Sandy",
    "Crop_Ground Nuts",
    "Crop_Paddy",
    "Crop_Pulses",
    "Crop_Sugarcane",
    "Temp_bin_high"
  ],
  # "XGBClassifier_5": [
  #   "Moisture",
  #   "Phosphorous",
  #   "Soil_Sandy",
  #   "Crop_Pulses",
  #   "Crop_Sugarcane"
  # ],
  # "LGBMClassifier_30": [
  #   "Temparature",
  #   "Humidity",
  #   "Moisture",
  #   "Nitrogen",
  #   "Potassium",
  #   "Phosphorous",
  #   "Env_Stress_Index",
  #   "NPK_Index",
  #   "PCA_Temparature",
  #   "Soil_Clayey",
  #   "Soil_Loamy",
  #   "Soil_Red",
  #   "Soil_Sandy",
  #   "Crop_Cotton",
  #   "Crop_Ground Nuts",
  #   "Crop_Maize",
  #   "Crop_Millets",
  #   "Crop_Oil seeds",
  #   "Crop_Paddy",
  #   "Crop_Pulses",
  #   "Crop_Sugarcane",
  #   "Crop_Tobacco",
  #   "Crop_Wheat",
  #   "Temp_bin_medium",
  #   "Temp_bin_high",
  #   "Temp_bin_very_high",
  #   "Humidity_bin_medium",
  #   "Humidity_bin_high",
  #   "Moisture_bin_medium",
  #   "Moisture_bin_high"
  # ],
  "LGBMClassifier_25": [
    "Temparature",
    "Humidity",
    "Moisture",
    "Nitrogen",
    "Potassium",
    "Phosphorous",
    "Env_Stress_Index",
    "NPK_Index",
    "PCA_Temparature",
    "Soil_Clayey",
    "Soil_Loamy",
    "Soil_Red",
    "Soil_Sandy",
    "Crop_Cotton",
    "Crop_Ground Nuts",
    "Crop_Maize",
    "Crop_Millets",
    "Crop_Oil seeds",
    "Crop_Paddy",
    "Crop_Pulses",
    "Crop_Sugarcane",
    "Crop_Tobacco",
    "Crop_Wheat",
    "Temp_bin_high",
    "Humidity_bin_high"
  ],
  "LGBMClassifier_20": [
    "Temparature",
    "Humidity",
    "Moisture",
    "Nitrogen",
    "Potassium",
    "Phosphorous",
    "Env_Stress_Index",
    "NPK_Index",
    "PCA_Temparature",
    "Soil_Clayey",
    "Soil_Loamy",
    "Soil_Red",
    "Soil_Sandy",
    "Crop_Maize",
    "Crop_Millets",
    "Crop_Oil seeds",
    "Crop_Paddy",
    "Crop_Pulses",
    "Crop_Tobacco",
    "Crop_Wheat"
  ],
  "LGBMClassifier_15": [
    "Temparature",
    "Humidity",
    "Moisture",
    "Nitrogen",
    "Potassium",
    "Phosphorous",
    "Env_Stress_Index",
    "NPK_Index",
    "PCA_Temparature",
    "Soil_Clayey",
    "Soil_Loamy",
    "Soil_Red",
    "Soil_Sandy",
    "Crop_Oil seeds",
    "Crop_Paddy"
  ],
  "LGBMClassifier_10": [
    "Temparature",
    "Humidity",
    "Moisture",
    "Nitrogen",
    "Potassium",
    "Phosphorous",
    "Env_Stress_Index",
    "NPK_Index",
    "PCA_Temparature",
    "Soil_Loamy"
  ],
  # "LGBMClassifier_5": [
  #   "Nitrogen",
  #   "Phosphorous",
  #   "Env_Stress_Index",
  #   "NPK_Index",
  #   "PCA_Temparature"
  # ],
  # Baseline set taken from a Kaggle notebook. Its original names used the raw
  # "Soil Type_" / "Crop Type_" prefixes, which never exist here because the
  # columns are renamed before one-hot encoding; they are spelled with the
  # notebook's "Soil_" / "Crop_" prefixes instead. "Soil_Black" is only present
  # when the dummies are built with drop_first=False (presumably it is the
  # first soil level - verify against the data).
  "XGBClassifier_SFS_Baseline_Kaggle": [
    "Moisture",
    "Phosphorous",
    "Potassium",
    "Soil_Black",
    "Nitrogen",
    "Soil_Sandy",
    "Crop_Sugarcane",
    "Temparature",
    "Crop_Oil seeds",
    "Crop_Cotton"
  ]
}

# TESTING

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
def evaluate(estimator, X, y, cv=3) -> float:
    """Return the mean cross-validated MAP@k (``mapk_scorer``) of ``estimator``.

    ``y`` is converted to integer category codes before scoring, so it has to
    be a pandas Series (or anything else offering ``astype('category')``).
    """
    encoded_target = y.astype('category').cat.codes
    fold_scores = cross_val_score(
        estimator,
        X,
        encoded_target,
        cv=cv,
        scoring=mapk_scorer,
    )
    return fold_scores.mean()



In [None]:
ans = {}

X_raw = eng_train.drop(columns=['Fertilizer Name']).sample(frac=0.6, random_state=42)
# drop_first=False (same as create_objective) keeps every dummy level available.
# Feature sets are selected by explicit column name below, so the extra columns
# do not alter any existing set - they only make first-level dummies selectable.
X_eval = pd.get_dummies(X_raw, drop_first=False)
y_eval = eng_train.loc[X_raw.index, 'Fertilizer Name']


catboost_test = CatBoostClassifier(
    iterations=800,
    learning_rate=0.1,
    depth=5,
    loss_function='MultiClass',
    eval_metric='MultiClass',
    random_seed=42,
    verbose=0,
    allow_writing_files=False,
    thread_count=-1,
)


xgb_test = XGBClassifier(
    n_estimators=800,
    learning_rate=0.1,
    max_depth=5,
    objective='multi:softmax',
    eval_metric='mlogloss',
    random_state=42,
    verbosity=0,
    tree_method='hist',
    n_jobs=-1,
)

lgbm_test = LGBMClassifier(
    n_estimators=800,
    learning_rate=0.1,
    max_depth=5,
    objective='multiclass',
    eval_metric='multi_logloss',
    random_state=42,
    verbosity=-1,
    n_jobs=-1,
)

test_models = [
    catboost_test,
    xgb_test,
    lgbm_test,
]


for key, features in rfe_dict.items():

    # The model is chosen by matching its class name against the key prefix.
    model = next(
        (candidate for candidate in test_models if key.startswith(candidate.__class__.__name__)),
        None,
    )

    if model is None:
        print(f"Model for {key} not found!")
        continue

    # A single unknown column would otherwise abort the whole run with KeyError.
    missing = [name for name in features if name not in X_eval.columns]
    if missing:
        print(f"Skipping {key}: columns not found in X_eval: {missing}")
        continue

    print(f"Evaluating {key} with features: {features}")
    score = evaluate(
        model,
        X_eval[features],
        y_eval,
        cv=3
    )
    ans[key] = float(score)  # plain float keeps the JSON dump independent of numpy types
    print(f"Score for {key}: {score:.5f}")


for key, score in ans.items():
    print(f"{key}: {score:.5f}")


# The context manager guarantees the file is flushed and closed.
with open(PathManager.cwd.value/'rfe_scores.json', "w") as rfe_scores_file:
    json.dump(
        ans,
        rfe_scores_file,
        indent=2,
        ensure_ascii=False,
    )

# cat_rfe: 0.28679

# TESTING RESULTS

Evaluating CatBoostClassifier_30 with features: ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous', 'Env_Stress_Index', 'NPK_Index', 'PCA_Temparature', 'Soil_Clayey', 'Soil_Loamy', 'Soil_Red', 'Soil_Sandy', 'Crop_Cotton', 'Crop_Ground Nuts', 'Crop_Maize', 'Crop_Millets', 'Crop_Oil seeds', 'Crop_Paddy', 'Crop_Pulses', 'Crop_Sugarcane', 'Crop_Tobacco', 'Crop_Wheat', 'Temp_bin_high', 'Temp_bin_very_high', 'Humidity_bin_medium', 'Humidity_bin_high', 'Humidity_bin_very_high', 'Moisture_bin_high', 'Moisture_bin_very_high']
Score for CatBoostClassifier_30: 0.31385
Evaluating CatBoostClassifier_25 with features: ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous', 'Env_Stress_Index', 'NPK_Index', 'PCA_Temparature', 'Soil_Clayey', 'Soil_Loamy', 'Soil_Red', 'Soil_Sandy', 'Crop_Cotton', 'Crop_Ground Nuts', 'Crop_Maize', 'Crop_Millets', 'Crop_Oil seeds', 'Crop_Paddy', 'Crop_Pulses', 'Crop_Sugarcane', 'Crop_Tobacco', 'Crop_Wheat', 'Temp_bin_high', 'Humidity_bin_high']
Score for CatBoostClassifier_25: 0.31389
Evaluating CatBoostClassifier_20 with features: ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous', 'Env_Stress_Index', 'NPK_Index', 'PCA_Temparature', 'Soil_Clayey', 'Soil_Loamy', 'Soil_Red', 'Soil_Sandy', 'Crop_Ground Nuts', 'Crop_Oil seeds', 'Crop_Paddy', 'Crop_Pulses', 'Crop_Sugarcane', 'Crop_Tobacco', 'Temp_bin_high']
Score for CatBoostClassifier_20: 0.31273
Evaluating CatBoostClassifier_15 with features: ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous', 'Env_Stress_Index', 'NPK_Index', 'PCA_Temparature', 'Soil_Loamy', 'Soil_Red', 'Soil_Sandy', 'Crop_Paddy', 'Crop_Pulses', 'Crop_Sugarcane']
Score for CatBoostClassifier_15: 0.31018
Evaluating CatBoostClassifier_10 with features: ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous', 'Env_Stress_Index', 'NPK_Index', 'PCA_Temparature', 'Soil_Sandy']
Score for CatBoostClassifier_10: 0.30740
Evaluating CatBoostClassifier_5 with features: ['Humidity', 'Nitrogen', 'Potassium', 'Phosphorous', 'PCA_Temparature']
Score for CatBoostClassifier_5: 0.30691
Evaluating XGBClassifier_30 with features: ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous', 'Env_Stress_Index', 'NPK_Index', 'PCA_Temparature', 'Soil_Clayey', 'Soil_Loamy', 'Soil_Red', 'Soil_Sandy', 'Crop_Cotton', 'Crop_Ground Nuts', 'Crop_Maize', 'Crop_Millets', 'Crop_Oil seeds', 'Crop_Paddy', 'Crop_Pulses', 'Crop_Sugarcane', 'Crop_Tobacco', 'Crop_Wheat', 'Temp_bin_high', 'Temp_bin_very_high', 'Humidity_bin_medium', 'Humidity_bin_high', 'Humidity_bin_very_high', 'Moisture_bin_medium', 'Moisture_bin_high']
Score for XGBClassifier_30: 0.31977
Evaluating XGBClassifier_25 with features: ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous', 'NPK_Index', 'PCA_Temparature', 'Soil_Clayey', 'Soil_Loamy', 'Soil_Red', 'Soil_Sandy', 'Crop_Cotton', 'Crop_Ground Nuts', 'Crop_Maize', 'Crop_Millets', 'Crop_Oil seeds', 'Crop_Paddy', 'Crop_Pulses', 'Crop_Sugarcane', 'Crop_Tobacco', 'Crop_Wheat', 'Temp_bin_high', 'Humidity_bin_high', 'Moisture_bin_high']
Score for XGBClassifier_25: 0.32179
Evaluating XGBClassifier_20 with features: ['Temparature', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous', 'Soil_Clayey', 'Soil_Red', 'Soil_Sandy', 'Crop_Cotton', 'Crop_Ground Nuts', 'Crop_Maize', 'Crop_Millets', 'Crop_Oil seeds', 'Crop_Paddy', 'Crop_Pulses', 'Crop_Sugarcane', 'Crop_Wheat', 'Temp_bin_high', 'Humidity_bin_high', 'Moisture_bin_high']
Score for XGBClassifier_20: 0.32401
Evaluating XGBClassifier_15 with features: ['Moisture', 'Potassium', 'Phosphorous', 'Soil_Clayey', 'Soil_Red', 'Soil_Sandy', 'Crop_Ground Nuts', 'Crop_Oil seeds', 'Crop_Paddy', 'Crop_Pulses', 'Crop_Sugarcane', 'Crop_Wheat', 'Temp_bin_high', 'Humidity_bin_high', 'Moisture_bin_high']
Score for XGBClassifier_15: 0.31314
Evaluating XGBClassifier_10 with features: ['Moisture', 'Phosphorous', 'Soil_Clayey', 'Soil_Red', 'Soil_Sandy', 'Crop_Ground Nuts', 'Crop_Paddy', 'Crop_Pulses', 'Crop_Sugarcane', 'Temp_bin_high']
Score for XGBClassifier_10: 0.30568
Evaluating XGBClassifier_5 with features: ['Moisture', 'Phosphorous', 'Soil_Sandy', 'Crop_Pulses', 'Crop_Sugarcane']
Score for XGBClassifier_5: 0.30241
Evaluating LGBMClassifier_30 with features: ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous', 'Env_Stress_Index', 'NPK_Index', 'PCA_Temparature', 'Soil_Clayey', 'Soil_Loamy', 'Soil_Red', 'Soil_Sandy', 'Crop_Cotton', 'Crop_Ground Nuts', 'Crop_Maize', 'Crop_Millets', 'Crop_Oil seeds', 'Crop_Paddy', 'Crop_Pulses', 'Crop_Sugarcane', 'Crop_Tobacco', 'Crop_Wheat', 'Temp_bin_medium', 'Temp_bin_high', 'Temp_bin_very_high', 'Humidity_bin_medium', 'Humidity_bin_high', 'Moisture_bin_medium', 'Moisture_bin_high']
Score for LGBMClassifier_30: 0.31825
Evaluating LGBMClassifier_25 with features: ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous', 'Env_Stress_Index', 'NPK_Index', 'PCA_Temparature', 'Soil_Clayey', 'Soil_Loamy', 'Soil_Red', 'Soil_Sandy', 'Crop_Cotton', 'Crop_Ground Nuts', 'Crop_Maize', 'Crop_Millets', 'Crop_Oil seeds', 'Crop_Paddy', 'Crop_Pulses', 'Crop_Sugarcane', 'Crop_Tobacco', 'Crop_Wheat', 'Temp_bin_high', 'Humidity_bin_high']
Score for LGBMClassifier_25: 0.31816
Evaluating LGBMClassifier_20 with features: ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous', 'Env_Stress_Index', 'NPK_Index', 'PCA_Temparature', 'Soil_Clayey', 'Soil_Loamy', 'Soil_Red', 'Soil_Sandy', 'Crop_Maize', 'Crop_Millets', 'Crop_Oil seeds', 'Crop_Paddy', 'Crop_Pulses', 'Crop_Tobacco', 'Crop_Wheat']
Score for LGBMClassifier_20: 0.31616
Evaluating LGBMClassifier_15 with features: ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous', 'Env_Stress_Index', 'NPK_Index', 'PCA_Temparature', 'Soil_Clayey', 'Soil_Loamy', 'Soil_Red', 'Soil_Sandy', 'Crop_Oil seeds', 'Crop_Paddy']
Score for LGBMClassifier_15: 0.31306
Evaluating LGBMClassifier_10 with features: ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous', 'Env_Stress_Index', 'NPK_Index', 'PCA_Temparature', 'Soil_Loamy']
Score for LGBMClassifier_10: 0.30871
Evaluating LGBMClassifier_5 with features: ['Nitrogen', 'Phosphorous', 'Env_Stress_Index', 'NPK_Index', 'PCA_Temparature']
Score for LGBMClassifier_5: 0.29873
CatBoostClassifier_30: 0.31385
CatBoostClassifier_25: 0.31389
CatBoostClassifier_20: 0.31273
CatBoostClassifier_15: 0.31018
CatBoostClassifier_10: 0.30740
CatBoostClassifier_5: 0.30691
XGBClassifier_30: 0.31977
XGBClassifier_25: 0.32179
XGBClassifier_20: 0.32401
XGBClassifier_15: 0.31314
XGBClassifier_10: 0.30568
XGBClassifier_5: 0.30241
LGBMClassifier_30: 0.31825
LGBMClassifier_25: 0.31816
LGBMClassifier_20: 0.31616
LGBMClassifier_15: 0.31306
LGBMClassifier_10: 0.30871
LGBMClassifier_5: 0.29873

# ENSEMBLES

In [None]:
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# 40% reproducible sample, one-hot encoded; note that this rebinds the
# X_raw / X_eval / y_eval names used by the testing cell.
X_raw = eng_train.drop(columns=['Fertilizer Name']).sample(frac=0.4, random_state=42)
X_eval = pd.get_dummies(X_raw, drop_first=True)
y_eval = eng_train.loc[X_raw.index, 'Fertilizer Name']


# Base learners: same hyper-parameters as the models in the testing cell.
catboost_stack = CatBoostClassifier(
    iterations=800,
    learning_rate=0.1,
    depth=5,
    loss_function='MultiClass',
    eval_metric='MultiClass',
    random_seed=42,
    verbose=0,
    allow_writing_files=False,
    thread_count=-1,
)


xgb_stack = XGBClassifier(
    n_estimators=800,
    learning_rate=0.1,
    max_depth=5,
    objective='multi:softmax',
    eval_metric='mlogloss',
    random_state=42,
    verbosity=0,
    tree_method='hist',
    n_jobs=-1,
)

lgbm_stack = LGBMClassifier(
    n_estimators=800,
    learning_rate=0.1,
    max_depth=5,
    objective='multiclass',
    eval_metric='multi_logloss',
    random_state=42,
    verbosity=-1,
    n_jobs=-1,
)

# Meta-learner that combines the base learners' outputs.
random_forest_stack = RandomForestClassifier(
    n_estimators=800,
    max_depth=5,
    random_state=42,
    n_jobs=-1,
)

# passthrough=True: the final estimator also receives the original features.
stacking_model = StackingClassifier(
    estimators=[
        ('catboost', catboost_stack),
        ('xgboost', xgb_stack),
        ('lgbm', lgbm_stack),
    ],
    final_estimator=random_forest_stack,
    cv=3,
    n_jobs=-1,
    passthrough=True,
)

# Score the stack on the XGBClassifier_20 feature set with 3-fold CV.
stacking_score = evaluate(
    stacking_model,
    X_eval[rfe_dict['XGBClassifier_20']],
    y_eval,
    cv=3
)
print(f"Stacking model score: {stacking_score:.5f}")



# HYPEROPT

In [None]:
import os
from typing import List
import multiprocessing as mp

import pandas as pd
from catboost import CatBoostClassifier 
from lightgbm import LGBMClassifier
import optuna
from sklearn import clone

from ml_fertilizers.lib.features.FeatureCombination import FeatureCombination
from ml_fertilizers.lib.models.HyperOptCombination import HyperOptCombination
from ml_fertilizers.lib.models.XgbGPU import XGBClassifierGPU
from ml_fertilizers.lib.optymization.TrialParamWrapper import TrialParamWrapper
from ml_fertilizers.lib.optymization.optimization_study import OBJECTIVE_RETURN_TYPE
from ml_fertilizers.lib.utils.garbage_collector import garbage_manager


# processes=None means "use every CPU reported by multiprocessing".
processes = None
my_combinations: List[HyperOptCombination] = []
job_count = processes if processes is not None else mp.cpu_count()
# NOTE(review): these variables are set after numpy/pandas were imported;
# presumably already-initialised thread pools ignore them - verify.
os.environ["OMP_NUM_THREADS"] = str(job_count)
os.environ["MKL_NUM_THREADS"] = str(job_count)
RANDOM_STATE = 42
gpu=True
# Un-tuned base models; hyper-parameters are supplied per trial later on.
models = [
    CatBoostClassifier(thread_count=job_count, random_state=RANDOM_STATE, verbose=False, allow_writing_files=False, loss_function='MultiClass', eval_metric='MultiClass'),
    LGBMClassifier(n_jobs=job_count, verbosity=-1, random_state=RANDOM_STATE, objective='multiclass', eval_metric='multi_logloss'),  # type: ignore
    XGBClassifierGPU(random_state=RANDOM_STATE, n_jobs=job_count, verbosity=0, objective='multi:softmax', eval_metric='mlogloss')._set_gpu(use_gpu=gpu),
]

# Pair each feature set with every model whose class name starts with the
# key's prefix (the text before the first underscore); the remainder of the
# key becomes the feature-combination name.
for key, features in rfe_dict.items():
    for model in models:
        if model.__class__.__name__.startswith(key.split('_')[0]):
            my_combinations.append(
                HyperOptCombination(
                    name=key,
                    model=model,
                    feature_combination=FeatureCombination(
                        name="_".join(key.split('_')[1:]),
                        features=features
                    )
                )
            )



display(my_combinations)

def create_objective(data:pd.DataFrame, model_combination:HyperOptCombination):
    """Build the Optuna objective for one model / feature-set combination.

    The data is one-hot encoded (all levels kept) and the target is turned
    into integer codes once, up front. The returned closure samples
    hyper-parameters for the trial, applies them to a fresh clone of the
    combination's model and returns the score computed by ``evaluate`` with
    3-fold cross-validation on the combination's feature columns.
    """
    X = pd.get_dummies(data.drop(columns=['Fertilizer Name']), drop_first=False).copy()
    y = data['Fertilizer Name'].astype('category').cat.codes  # integer class codes
    model = model_combination.model
    model_name = model_combination.name

    def objective(trial: optuna.Trial) -> OBJECTIVE_RETURN_TYPE:
        sampled_params = TrialParamWrapper().get_params(
            model_name=model_name,
            trial=trial,
        )
        # Clone so every trial starts from an unfitted copy of the base model.
        candidate = clone(model).set_params(**sampled_params)

        try:
            return evaluate(
                candidate,
                X[model_combination.feature_combination.features],
                y,
                cv=3,
            )
        except optuna.exceptions.TrialPruned as pruned:
            print(f"Trial {trial.number} was pruned: {pruned}")
            raise
        except Exception as error:
            print(f"Error during evaluation of trial {trial.number}: {error}")
            raise
        finally:
            # Runs on success and on failure alike.
            garbage_manager.clean()

    return objective


In [None]:
from ml_fertilizers.lib.optymization.hyper_setup import setup_hyper
from ml_fertilizers.lib.optymization.parrarel_optimization import (
    HyperFunctionDto,
    HyperSetupDto,
)
from ml_fertilizers.utils import PrefixManager, PathManager

# Identifier of this optimisation run.
# NOTE(review): "intial_run" looks like a typo for "initial_run"; presumably the
# value is used in output names, so it is left untouched - confirm before renaming.
model_run = "intial_run"

# Everything the optimisation driver needs: budget, data, combinations, outputs.
setup_dto = HyperSetupDto(
    n_optimization_trials=60,
    optimization_timeout=None,
    n_patience=20,
    min_percentage_improvement=0.007,
    model_run=model_run,
    limit_data_percentage=0.75,
    processes=processes,
    max_concurrent_jobs=None,
    output_dir_path=PathManager.output.value,
    hyper_opt_prefix=PrefixManager.hyper.value, 
    study_prefix=PrefixManager.study.value,
    data=eng_train,
    combinations=my_combinations,
    hyper_direction="maximize",  # the objective returns a score where higher is better
    metadata={},
    force_all_sequential=False,
    omit_names=None,
)


# Only the objective factory is provided; no post-optimisation evaluation hook.
function_dto = HyperFunctionDto(
    create_objective_func=create_objective,
    evaluate_hyperopted_model_func=None,
)

setup_hyper(
    setup_dto=setup_dto,
    function_dto=function_dto
)
