Settimo test ---> Valuto un approfontita ricerca su N_K_ratio e forward selection

In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.1-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.5/242.5 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.1 colorlog-6.9.0 optuna-4.3.0


In [2]:
!nvidia-smi

Thu Jun 12 10:44:49 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   42C    P8             11W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
import optuna

# 1) Caricamento dati e target encoding
train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/fertilizer/data/train.csv')
test  = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/fertilizer/data/test.csv')

X_base = train.drop(['id', 'Fertilizer Name'], axis=1).copy()
y = train['Fertilizer Name']
X_test_base = test.drop('id', axis=1).copy()

le = LabelEncoder()
y_enc = le.fit_transform(y)

# 2) Feature originali
categorical = ['Soil Type','Crop Type']
numeric     = ['Temparature','Humidity','Moisture','Nitrogen','Potassium','Phosphorous']

# 3) Nuove feature da testare
feature_funcs = {
    'N_P_ratio':       lambda df: df['Nitrogen']   / (df['Phosphorous']+1e-9),
    'N_K_ratio':       lambda df: df['Nitrogen']   / (df['Potassium']+1e-9),
    'P_K_ratio':       lambda df: df['Phosphorous']/ (df['Potassium']+1e-9),
    'N_plus_P_plus_K': lambda df: df['Nitrogen']+df['Phosphorous']+df['Potassium'],
}

# 4) MAP@3 helper
def map3_score(y_true, proba, classes, k=3):
    idx   = np.argsort(proba, axis=1)[:, -k:][:, ::-1]
    score = 0.0
    for i, t in enumerate(y_true):
        preds = classes[idx[i]]
        if t in preds:
            score += 1.0 / (list(preds).index(t) + 1)
    return score / len(y_true)

results = []

for feat_name, func in feature_funcs.items():
    # 5) Costruisci X con la nuova feature
    X      = X_base.copy()
    X_test = X_test_base.copy()
    X[feat_name]      = func(X)
    X_test[feat_name] = func(X_test)

    # 6) Preprocessor aggiornato
    num_cols = numeric + [feat_name]
    preproc = ColumnTransformer([
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical),
        ('num', StandardScaler(), num_cols),
    ])

    # 7) Objective Optuna con GPU e CV stratificata
    def objective(trial):
        params = {
            'tree_method':       'gpu_hist',
            'predictor':         'gpu_predictor',
            'gpu_id':            0,
            'n_estimators':      trial.suggest_int('n_estimators', 200, 1500),
            'max_depth':         trial.suggest_int('max_depth', 4, 20),
            'learning_rate':     trial.suggest_float('learning_rate', 0.001, 0.3, log=True),
            'subsample':         trial.suggest_float('subsample', 0.4, 1.0),
            'colsample_bytree':  trial.suggest_float('colsample_bytree', 0.4, 1.0),
            'gamma':             trial.suggest_float('gamma', 1e-8, 10.0, log=True),
            'min_child_weight':  trial.suggest_int('min_child_weight', 1, 50),
            'reg_alpha':         trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
            'reg_lambda':        trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
            'max_bin':           trial.suggest_int('max_bin', 128, 1024),
            'grow_policy':       trial.suggest_categorical('grow_policy', ['depthwise','lossguide']),
            'eval_metric':       'mlogloss',
            'use_label_encoder': False,
            'random_state':      42
        }
        kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        scores = []
        for tr_idx, val_idx in kf.split(X, y_enc):
            X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
            y_tr, y_val = y_enc[tr_idx], y_enc[val_idx]
            pipe = Pipeline([
                ('pre', preproc),
                ('clf', XGBClassifier(**params))
            ])
            pipe.fit(X_tr, y_tr)
            proba = pipe.predict_proba(X_val)
            scores.append(map3_score(y_val, proba, pipe.named_steps['clf'].classes_))
        return np.mean(scores)

    # 8) Tuning Optuna
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=50, timeout=1800)
    best_score  = study.best_value
    best_params = study.best_params

    # 9) Training finale e submission
    final_pipe = Pipeline([
        ('pre', preproc),
        ('clf', XGBClassifier(**best_params, use_label_encoder=False, eval_metric='mlogloss', random_state=42))
    ])
    final_pipe.fit(X, y_enc)
    proba_test   = final_pipe.predict_proba(X_test)
    classes_int  = final_pipe.named_steps['clf'].classes_
    classes_str  = le.inverse_transform(classes_int)
    top3         = np.argsort(proba_test, axis=1)[:, -3:][:, ::-1]
    preds        = [' '.join(classes_str[r]) for r in top3]
    pd.DataFrame({'id': test['id'], 'Fertilizer Name': preds}) \
      .to_csv(f'/content/drive/MyDrive/Colab Notebooks/fertilizer/data/submission/submission_v7_{feat_name}.csv', index=False)

    # 10) Registra risultato
    results.append((feat_name, best_score))

# 11) I 3 migliori aggiustamenti
results.sort(key=lambda x: x[1], reverse=True)
print("Top 3 features aggiunte:")
for name, score in results[:3]:
    print(f"{name}: MAP@3 = {score:.4f}")


[I 2025-06-12 10:46:21,813] A new study created in memory with name: no-name-3c0fd2bc-2f1e-4883-b204-407c5638a35b

    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor", "use_label_encoder" } are not used.


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor", "use_label_encoder" } are not used.


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor", "use_label_encoder" } are not used.


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor", "use_label_encoder" } are not used.


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor", "use_la

In [None]:
from google.colab import runtime
import time

print("Il runtime verrà terminato automaticamente dopo 30 secondi...")
time.sleep(30)
runtime.unassign()