Sesto test ---> Un passo indietro, il migliore è stata N_P_ratio, quindi approfondisco questa combinazione

In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.1-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.5/242.5 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.1 colorlog-6.9.0 optuna-4.3.0


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
import optuna

# 1) Caricamento dati
train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/fertilizer/data/train.csv')
test  = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/fertilizer/data/test.csv')

# 2) Aggiunta della feature N_P_ratio
X = train.drop(['id', 'Fertilizer Name'], axis=1).copy()
X['N_P_ratio'] = X['Nitrogen'] / (X['Phosphorous'] + 1e-9)
X_test = test.drop('id', axis=1).copy()
X_test['N_P_ratio'] = X_test['Nitrogen'] / (X_test['Phosphorous'] + 1e-9)

# 3) Encoding del target
le = LabelEncoder()
y_enc = le.fit_transform(train['Fertilizer Name'])

# 4) Preprocessing
categorical = ['Soil Type', 'Crop Type']
numeric     = ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous', 'N_P_ratio']
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical),
    ('num', StandardScaler(), numeric),
])

# 5) MAP@3 helper
def map3_score(y_true, proba, classes, k=3):
    topk = np.argsort(proba, axis=1)[:, -k:][:, ::-1]
    score = 0.0
    for i, t in enumerate(y_true):
        preds = classes[topk[i]]
        if t in preds:
            score += 1.0 / (list(preds).index(t) + 1)
    return score / len(y_true)

# 6) Objective per Optuna (CV stratificata)
def objective(trial):
    params = {
        'n_estimators':      trial.suggest_int('n_estimators', 100, 1000),
        'max_depth':         trial.suggest_int('max_depth', 4, 16),
        'learning_rate':     trial.suggest_float('learning_rate', 0.005, 0.2, log=True),
        'subsample':         trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree':  trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma':             trial.suggest_float('gamma', 1e-8, 10.0, log=True),
        'min_child_weight':  trial.suggest_int('min_child_weight', 1, 20),
        'reg_alpha':         trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda':        trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'use_label_encoder': False,
        'eval_metric':       'mlogloss',
        'random_state':      42
    }
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = []
    for tr_idx, val_idx in kf.split(X, y_enc):
        X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
        y_tr, y_val = y_enc[tr_idx], y_enc[val_idx]
        pipe = Pipeline([
            ('pre', preprocessor),
            ('clf', XGBClassifier(**params))
        ])
        pipe.fit(X_tr, y_tr)
        proba = pipe.predict_proba(X_val)
        scores.append(map3_score(y_val, proba, pipe.named_steps['clf'].classes_))
    return np.mean(scores)

# 7) Avvio dello studio Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, timeout=1800)

# 8) Fit finale con i migliori iperparametri
best_params = study.best_params
pipeline = Pipeline([
    ('pre', preprocessor),
    ('clf', XGBClassifier(**best_params, use_label_encoder=False, eval_metric='mlogloss', random_state=42))
])
pipeline.fit(X, y_enc)

# 9) Generazione della submission
proba_test   = pipeline.predict_proba(X_test)
classes_str  = le.inverse_transform(pipeline.named_steps['clf'].classes_)
top3         = np.argsort(proba_test, axis=1)[:, -3:][:, ::-1]
preds        = [' '.join(classes_str[row]) for row in top3]
submission   = pd.DataFrame({'id': test['id'], 'Fertilizer Name': preds})
submission.to_csv('/content/drive/MyDrive/Colab Notebooks/fertilizer/data/submission_v6_optuna_NP.csv', index=False)


[I 2025-06-12 08:11:35,236] A new study created in memory with name: no-name-1cc2e12c-2781-4b63-aab3-406454da91fa
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 2025-06-12 08:33:37,974] Trial 0 finished with value: 0.33201822222229954 and parameters: {'n_estimators': 144, 'max_depth': 15, 'learning_rate': 0.06373076115493108, 'subsample': 0.6670859136301126, 'colsample_bytree': 0.9503307718874713, 'gamma': 6.223292447072867e-05, 'min_child_weight': 1, 'reg_alpha': 8.135461952564972, 'reg_lambda': 2.5989665876592284e-06}. Best is trial 0 with value: 0.33201822222229954.
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameter

Risultato ---> 0.33397