<h3>Terzo test: Senza FE e XGB con ricerca iperparametri e cross-validation</h3>

In [4]:
# Install Optuna e XGBoost in Colab
!pip install optuna xgboost



In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import optuna

# 1) Caricamento dati
train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/fertilizer/data/train.csv')
test  = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/fertilizer/data/test.csv')

# 2) Feature / target
X = train.drop(['id', 'Fertilizer Name'], axis=1)
y = train['Fertilizer Name']

# 3) Label-encode del target
le = LabelEncoder()
y_enc = le.fit_transform(y)

# 4) Split per validazione
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y_enc, test_size=0.2, random_state=42, stratify=y_enc
)

# 5) Preprocessor
categorical_features = ['Soil Type', 'Crop Type']
numeric_features     = ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous']

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_features),
    ('num', StandardScaler(), numeric_features),
])

# 6) Funzione MAP@3
def map3_score(y_true, proba, classes, k=3):
    topk = np.argsort(proba, axis=1)[:, -k:][:, ::-1]
    score = 0.0
    for i, true in enumerate(y_true):
        preds = classes[topk[i]]
        if true in preds:
            rank = list(preds).index(true)
            score += 1.0 / (rank + 1)
    return score / len(y_true)

# 7) Objective Optuna
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'use_label_encoder': False,
        'eval_metric': 'mlogloss',
        'random_state': 42
    }
    model = Pipeline([
        ('pre', preprocessor),
        ('clf', XGBClassifier(**params))
    ])
    model.fit(X_tr, y_tr)
    proba = model.predict_proba(X_val)
    classes_int = model.named_steps['clf'].classes_
    return map3_score(y_val, proba, classes_int)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, timeout=600)

# 8) Training finale su tutto il train
best_params = study.best_params
pipeline_xgb = Pipeline([
    ('pre', preprocessor),
    ('clf', XGBClassifier(**best_params, use_label_encoder=False, eval_metric='mlogloss', random_state=42))
])
pipeline_xgb.fit(X, y_enc)

# 9) Predizione, decodifica e submission
X_test = test.drop('id', axis=1)
proba_test = pipeline_xgb.predict_proba(X_test)
classes_int = pipeline_xgb.named_steps['clf'].classes_
classes_str = le.inverse_transform(classes_int)
top3 = np.argsort(proba_test, axis=1)[:, -3:][:, ::-1]
preds = [' '.join(classes_str[r]) for r in top3]

submission = pd.DataFrame({
    'id': test['id'],
    'Fertilizer Name': preds
})
submission.to_csv('/content/drive/MyDrive/Colab Notebooks/fertilizer/data/submission_v3_optuna_xgb.csv', index=False)


[I 2025-06-11 07:06:59,333] A new study created in memory with name: no-name-1cc66b3f-2bea-4128-9327-d69a97447625
Parameters: { "use_label_encoder" } are not used.

[I 2025-06-11 07:11:58,312] Trial 0 finished with value: 0.3356922222223038 and parameters: {'n_estimators': 393, 'max_depth': 6, 'learning_rate': 0.1217096926347908, 'subsample': 0.7278545872367442, 'colsample_bytree': 0.9188789265141362, 'gamma': 3.8933236295804757e-07, 'min_child_weight': 1}. Best is trial 0 with value: 0.3356922222223038.
Parameters: { "use_label_encoder" } are not used.

[I 2025-06-11 07:17:22,739] Trial 1 finished with value: 0.3381011111111947 and parameters: {'n_estimators': 350, 'max_depth': 8, 'learning_rate': 0.09123783430612681, 'subsample': 0.8993642633254113, 'colsample_bytree': 0.8152356194125626, 'gamma': 0.00042681073936473726, 'min_child_weight': 3}. Best is trial 1 with value: 0.3381011111111947.
Parameters: { "use_label_encoder" } are not used.



Risultato ---> 0.33989