<h3>Secondo test con LightGBM</h3>

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier

# 2) Load the data
# The dataset folder is declared once so that moving the data means editing a
# single line instead of every read_csv call. The resulting file paths are
# exactly the ones that were previously spelled out inline.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/fertilizer/data'
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

# 3) Separate the predictors from the label
# The 'id' column is dropped from both frames; the label column is
# additionally removed from the training predictors.
target_column = 'Fertilizer Name'
y_train = train[target_column]
X_train = train.drop(columns=['id', target_column])
X_test = test.drop(columns=['id'])

# 4) Column groups, one per preprocessing branch
categorical_features = ['Soil Type', 'Crop Type']
# NOTE(review): 'Temparature' presumably mirrors the CSV header spelling —
# confirm against the data before "correcting" it.
numeric_features = [
    'Temparature',
    'Humidity',
    'Moisture',
    'Nitrogen',
    'Potassium',
    'Phosphorous',
]

# 5) Preprocessing: one-hot encode the categoricals, standardise the numerics
# Dense output; categories not seen during fit are ignored instead of raising.
one_hot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
scaler = StandardScaler()
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', one_hot, categorical_features),
        ('num', scaler, numeric_features),
    ]
)

# 6) LightGBM classifier definition
# Hyper-parameters are collected in one mapping so they can be read (or
# logged) in a single place before being handed to the estimator.
lgbm_params = dict(
    objective='multiclass',
    n_estimators=200,
    learning_rate=0.05,
    num_leaves=31,
    random_state=42,  # fixed seed passed to LightGBM
)
lgbm = LGBMClassifier(**lgbm_params)

# 7) Assemble the full pipeline: preprocessing first, then the classifier
pipeline_steps = [
    ('pre', preprocessor),
    ('clf', lgbm),
]
pipeline_lgbm = Pipeline(steps=pipeline_steps)

# 8) Training
# Fits the preprocessing and the classifier together on the training frame.
pipeline_lgbm.fit(X_train, y_train)

# 9) Class probabilities and top-3 labels per row (for MAP@3)
proba = pipeline_lgbm.predict_proba(X_test)
# Column indices of `proba` are used to index `classes` below, so the code
# relies on both sharing the same class order.
classes = pipeline_lgbm.named_steps['clf'].classes_
# argsort is ascending: keep the last three columns of each row, then flip
# them left-to-right so the most probable class comes first.
top3_idx = np.fliplr(np.argsort(proba, axis=1)[:, -3:])
# One fancy-index maps every index to its label; each row of three labels is
# then joined into a single space-separated string.
preds = [' '.join(row_labels) for row_labels in classes[top3_idx]]

# 10) Build and write the submission file
# Two columns: the test ids and the space-separated top-3 predictions.
submission_path = '/content/drive/MyDrive/Colab Notebooks/fertilizer/data/submission_v2_LGBM.csv'
submission = pd.DataFrame({'id': test['id']})
submission['Fertilizer Name'] = preds
# index=False keeps the DataFrame index out of the CSV.
submission.to_csv(submission_path, index=False)




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.034331 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 218
[LightGBM] [Info] Number of data points in the train set: 750000, number of used features: 22
[LightGBM] [Info] Start training from score -1.884866
[LightGBM] [Info] Start training from score -1.880057
[LightGBM] [Info] Start training from score -1.897538
[LightGBM] [Info] Start training from score -1.911544
[LightGBM] [Info] Start training from score -1.909121
[LightGBM] [Info] Start training from score -2.067671
[LightGBM] [Info] Start training from score -2.094845




Risultato (MAP@3 della submission) ---> 0.32520