In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import log_loss

In [2]:
# Load data
train = pd.read_csv('playground-series-s5e6/train.csv', index_col='id')
test = pd.read_csv('playground-series-s5e6/test.csv')
origin = pd.read_csv('playground-series-s5e6/Fertilizer Prediction.csv')

In [3]:
# Columns to convert to categorical using binning
num_cols = ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Phosphorous', 'Potassium']
cat_base = ['Soil Type', 'Crop Type']

# Bin numerical features using pd.qcut and cast to category
for col in num_cols:
    # Use train bins to maintain consistency
    qcut_bins = pd.qcut(train[col], q=5, duplicates='drop')
    bin_edges = qcut_bins.cat.categories

    for df in [train, test, origin]:
        df[col + '_cat'] = pd.cut(df[col], bins=[-np.inf] + [b.right for b in bin_edges], include_lowest=True)
        df[col + '_cat'] = df[col + '_cat'].astype("category")

# Convert base categorical columns to category dtype
for df in [train, test, origin]:
    for col in cat_base:
        df[col] = df[col].astype("category")

# Add indicator for original dataset
train['original'] = False
origin['original'] = True

# Combine categorical feature list
features = cat_base + [col + '_cat' for col in num_cols]

# Encode target
target_le = LabelEncoder()
train['Fertilizer Name'] = target_le.fit_transform(train['Fertilizer Name'])
origin['Fertilizer Name'] = target_le.transform(origin['Fertilizer Name'])

# Final train/val/test setup
X = train[features + ['original']]
y = train['Fertilizer Name']
X_test = test[features]
X_test['original'] = False

# Just for reference if needed later
X_origin = origin[features + ['original']]
y_origin = origin['Fertilizer Name']

print("✅ Data successfully preprocessed.")

✅ Data successfully preprocessed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['original'] = False


In [4]:
# Custom MAP@3 scoring function
def mapk(actual, predicted, k=3):
    def apk(a, p, k):
        p = p[:k]
        score = 0.0
        hits = 0
        seen = set()
        for i, pred in enumerate(p):
            if pred in a and pred not in seen:
                hits += 1
                score += hits / (i + 1.0)
                seen.add(pred)
        return score / min(len(a), k)
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])

In [5]:
FOLDS = 5
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# XGB models
xgb_models = [
    XGBClassifier(
        max_depth=12, learning_rate=0.03, n_estimators=4000,
        colsample_bytree=0.467, subsample=0.86, gamma=0.26,
        reg_alpha=2.7, reg_lambda=1.4, max_delta_step=4,
        early_stopping_rounds=100, objective='multi:softprob',
        eval_metric='mlogloss', random_state=42, enable_categorical=True),
    
    XGBClassifier(
        max_depth=13, learning_rate=0.0179, n_estimators=1500,
        colsample_bytree=0.4309, subsample=0.7281, gamma=0.0895,
        reg_alpha=0.4187, reg_lambda=2.4665, max_delta_step=4,
        early_stopping_rounds=100, objective='multi:softprob',
        eval_metric='mlogloss', random_state=13, enable_categorical=True),
    
    XGBClassifier(
        max_depth=9, learning_rate=0.0158, n_estimators=2353,
        colsample_bytree=0.4930, subsample=0.9753, gamma=0.0035,
        reg_alpha=0.5994, reg_lambda=4.5972, max_delta_step=1,
        early_stopping_rounds=100, objective='multi:softprob',
        eval_metric='mlogloss', random_state=25, enable_categorical=True)
]


In [7]:

oofs = [np.zeros((len(train), len(target_le.classes_))) for _ in range(3)]
preds = [np.zeros((len(test), len(target_le.classes_))) for _ in range(3)]

# Stratified K-Fold Training with Origin Augmentation Inside CV Loop
for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    print(f"\n🔁 Fold {fold + 1}")

    X_tr, y_tr = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[valid_idx], y.iloc[valid_idx]

    # Augment with origin inside fold
    X_aug = pd.concat([X_tr, X_origin])
    y_aug = pd.concat([y_tr, y_origin])

    for i, model in enumerate(xgb_models):
        print(f"  🚀 Training model {i + 1}")
        model.fit(X_aug, y_aug, eval_set=[(X_val, y_val)], verbose=100)
        
        oofs[i][valid_idx] = model.predict_proba(X_val)
        preds[i] += model.predict_proba(X_test)

        # Show fold score
        top_3 = np.argsort(oofs[i][valid_idx], axis=1)[:, -3:][:, ::-1]
        actual = [[label] for label in y_val]  # Wrap each label in a list
        score = mapk(actual, top_3)
        print(f"    ✅ Fold {fold + 1} | XGB-{i+1} MAP@3: {score:.5f}")



🔁 Fold 1
  🚀 Training model 1
[0]	validation_0-mlogloss:1.94572
[100]	validation_0-mlogloss:1.93666
[200]	validation_0-mlogloss:1.93410
[300]	validation_0-mlogloss:1.93312
[400]	validation_0-mlogloss:1.93276
[500]	validation_0-mlogloss:1.93266
[600]	validation_0-mlogloss:1.93272
[605]	validation_0-mlogloss:1.93274
    ✅ Fold 1 | XGB-1 MAP@3: 0.30677
  🚀 Training model 2
[0]	validation_0-mlogloss:1.94580
[100]	validation_0-mlogloss:1.93990
[200]	validation_0-mlogloss:1.93740
[300]	validation_0-mlogloss:1.93600
[400]	validation_0-mlogloss:1.93506
[500]	validation_0-mlogloss:1.93434
[600]	validation_0-mlogloss:1.93381
[700]	validation_0-mlogloss:1.93342
[800]	validation_0-mlogloss:1.93314
[900]	validation_0-mlogloss:1.93292
[1000]	validation_0-mlogloss:1.93275
[1100]	validation_0-mlogloss:1.93262
[1200]	validation_0-mlogloss:1.93254
[1300]	validation_0-mlogloss:1.93247
[1400]	validation_0-mlogloss:1.93243
[1499]	validation_0-mlogloss:1.93241
    ✅ Fold 1 | XGB-2 MAP@3: 0.30728
  🚀 Traini

KeyboardInterrupt: 

In [None]:
# Normalize predictions
for i in range(3):
    preds[i] /= FOLDS

# Final scores
print("\n📊 Final Out-of-Fold MAP@3 Scores:")
for i, oof in enumerate(oofs):
    top_3 = np.argsort(oof, axis=1)[:, -3:][:, ::-1]
    score = mapk(y, top_3)
    print(f"✅ XGB-{i+1}: {score:.5f}")

# Blending weights
w1, w2, w3 = 0.2667, 0.2619, 0.4714
blend_pred = w1 * preds[0] + w2 * preds[1] + w3 * preds[2]

# Top 3 predictions
top_3_preds = np.argsort(blend_pred, axis=1)[:, -3:][:, ::-1]
top_3_labels = target_le.inverse_transform(top_3_preds.ravel()).reshape(top_3_preds.shape)

In [None]:
# Create submission
submission = pd.DataFrame({
    'id': test['id'],
    'Fertilizer Name': [' '.join(row) for row in top_3_labels]
})
submission.to_csv("Fet_3xgboost_submission.csv", index=False)
print("✅ Submission saved as Fet_3xgboost_submission.csv")