Quinto test ---> FE avanzata con PolynomialFeatures, XGB (CV e optuna)

In [1]:
!pip install optuna --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/386.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/242.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.5/242.5 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
import optuna

# 1) Caricamento dati
train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/fertilizer/data/train.csv')
test  = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/fertilizer/data/test.csv')

# 2) Feature / target
X = train.drop(['id', 'Fertilizer Name'], axis=1)
y = train['Fertilizer Name']
le = LabelEncoder()
y_enc = le.fit_transform(y)
X_test = test.drop('id', axis=1)

# 3) Definizione feature set
categorical = ['Soil Type', 'Crop Type']
numeric     = ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous']

# 4) Preprocessor con interazioni-only sulle numeriche
numeric_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False))
])
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical),
    ('num', numeric_pipe, numeric),
])

# 5) MAP@3 helper
def map3_score(y_true, proba, classes, k=3):
    topk = np.argsort(proba, axis=1)[:, -k:][:, ::-1]
    score = 0.0
    for i, t in enumerate(y_true):
        preds = classes[topk[i]]
        if t in preds:
            score += 1.0 / (list(preds).index(t) + 1)
    return score / len(y_true)

# 6) Objective Optuna con CV
def objective(trial):
    params = {
        'n_estimators':      trial.suggest_int('n_estimators', 100, 500),
        'max_depth':         trial.suggest_int('max_depth', 3, 10),
        'learning_rate':     trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample':         trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree':  trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma':             trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'min_child_weight':  trial.suggest_int('min_child_weight', 1, 10),
        'use_label_encoder': False,
        'eval_metric':       'mlogloss',
        'random_state':      42
    }
    kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    scores = []
    for tr_idx, val_idx in kf.split(X, y_enc):
        X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
        y_tr, y_val = y_enc[tr_idx], y_enc[val_idx]
        pipe = Pipeline([
            ('pre', preprocessor),
            ('clf', XGBClassifier(**params))
        ])
        pipe.fit(X_tr, y_tr)
        proba = pipe.predict_proba(X_val)
        classes_int = pipe.named_steps['clf'].classes_
        scores.append(map3_score(y_val, proba, classes_int))
    return np.mean(scores)

# 7) Esecuzione dello studio Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30, timeout=600)

print("Best MAP@3 (CV):", study.best_value)
print("Best params:", study.best_params)

# 8) Training finale su tutto il train
best_params = study.best_params
final_pipe = Pipeline([
    ('pre', preprocessor),
    ('clf', XGBClassifier(
        **best_params,
        use_label_encoder=False,
        eval_metric='mlogloss',
        random_state=42
    ))
])
final_pipe.fit(X, y_enc)

# 9) Predizioni sul test e creazione submission
proba_test = final_pipe.predict_proba(X_test)
classes_str = le.inverse_transform(final_pipe.named_steps['clf'].classes_)
top3 = np.argsort(proba_test, axis=1)[:, -3:][:, ::-1]
preds = [' '.join(classes_str[r]) for r in top3]

submission = pd.DataFrame({
    'id':    test['id'],
    'Fertilizer Name': preds
})
submission.to_csv('/content/drive/MyDrive/Colab Notebooks/fertilizer/data/submission_v5_optuna_poly.csv', index=False)
print("Submission salvata in submission_optuna_poly.csv")


[I 2025-06-12 07:03:35,326] A new study created in memory with name: no-name-8cc78b58-e216-4b1c-9831-128b481ec12c
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 2025-06-12 07:09:49,465] Trial 0 finished with value: 0.31753044444443823 and parameters: {'n_estimators': 199, 'max_depth': 5, 'learning_rate': 0.09631550706304026, 'subsample': 0.9327728142592706, 'colsample_bytree': 0.6310326963429074, 'gamma': 1.6387188468428802e-06, 'min_child_weight': 1}. Best is trial 0 with value: 0.31753044444443823.
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

[I 2025-06-12 07:27:43,197] Trial 1 finished with value: 0.3190059999999905 and parameters: {'n_estimators': 276, 'max_depth': 10, 'learning_rate': 0.08427136438710105, 'subsample': 0.8520175377072985, 'colsample_bytree': 0.62176156

Best MAP@3 (CV): 0.3190059999999905
Best params: {'n_estimators': 276, 'max_depth': 10, 'learning_rate': 0.08427136438710105, 'subsample': 0.8520175377072985, 'colsample_bytree': 0.6217615668020853, 'gamma': 4.4946183309650833e-08, 'min_child_weight': 3}


Parameters: { "use_label_encoder" } are not used.



Submission salvata in submission_optuna_poly.csv


Risultato ---> 0.32421