In [65]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import top_k_accuracy_score

# XGBoost Classifier
from xgboost import XGBClassifier

from copy import deepcopy

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

import joblib

In [66]:
# Load data
train = pd.read_csv('playground-series-s5e6/train.csv',index_col='id')
test = pd.read_csv('playground-series-s5e6/test.csv',index_col = 'id')
origin = pd.read_csv('playground-series-s5e6/Fertilizer Prediction.csv')

In [67]:
train = pd.concat([train,origin], axis= 0 )

In [70]:
crop_nutrient_req_map = {
    'Sugarcane': (50, 32.5, 37.5),
    'Millets': (20, 10, 0),
    'Barley': (110, 0, 0),
    'Paddy': (0, 0, 0),  # unknowns = 0 or np.nan
    'Pulses': (0, 0, 0),
    'Tobacco': (0, 0, 0),
    'Ground Nuts': (12.5, 17.5, 11.25),
    'Maize': (0, 27.5, 110),
    'Cotton': (68.75, 30, 37.5),
    'Wheat': (140, 0, 0),
    'Oil Seeds': (67.5, 0, 0)
}

for i, nutrient in enumerate(['crop_N_req', 'crop_P_req', 'crop_K_req']):
    train[nutrient] = train['Crop Type'].map(lambda x: crop_nutrient_req_map.get(x, (0, 0, 0))[i])
    test[nutrient] = test['Crop Type'].map(lambda x: crop_nutrient_req_map.get(x, (0, 0, 0))[i])


In [71]:
soil_avg_npk_map = {
    'Sandy': (620, 14.2, 98),
    'Black': (88.5, 27, 199),
    'Clayey': (82.6, 20.75, 284.8),
    'Red': (72, 28, 123.5),
    'Loamy': (1500, 8.6, 104)
}

for i, nutrient in enumerate(['soil_N_avg', 'soil_P_avg', 'soil_K_avg']):
    train[nutrient] = train['Soil Type'].map(lambda x: soil_avg_npk_map[x][i])
    test[nutrient] = test['Soil Type'].map(lambda x: soil_avg_npk_map[x][i])


In [72]:
for nutrient in ['N', 'P', 'K']:
    train[f'{nutrient}_gap'] = train[f'crop_{nutrient}_req'] - train[f'soil_{nutrient}_avg']
    test[f'{nutrient}_gap'] = test[f'crop_{nutrient}_req'] - test[f'soil_{nutrient}_avg']


In [73]:
nutrient_key_map = {
    'Nitrogen': 'N',
    'Phosphorous': 'P',
    'Potassium': 'K'
}

for nutrient, key in nutrient_key_map.items():
    train[f'{key}_abnormality'] = train[nutrient] - train[f'soil_{key}_avg']
    test[f'{key}_abnormality'] = test[nutrient] - test[f'soil_{key}_avg']


In [74]:
train['Crop_Soil'] = train['Crop Type'] + "_" + train['Soil Type']
test['Crop_Soil'] = test['Crop Type'] + "_" + test['Soil Type']

train['NPK_gap_product'] = train['N_gap'] * train['P_gap'] * train['K_gap']
test['NPK_gap_product'] = test['N_gap'] * test['P_gap'] * test['K_gap']

train['Moisture_Temp'] = train['Moisture'] * train['Temparature']
test['Moisture_Temp'] = test['Moisture'] * test['Temparature']

In [75]:
train['stress'] = abs(train['Humidity'] - 60) + abs(train['Moisture'] - 45)
test['stress'] = abs(test['Humidity'] - 60) + abs(test['Moisture'] - 45)


In [46]:
train.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99990,99991,99992,99993,99994,99995,99996,99997,99998,99999
Temparature,37,27,29,35,35,30,27,36,36,28,...,36,26,27,36,32,32,35,28,29,25
Humidity,70,69,63,62,58,59,62,62,51,50,...,59,54,50,68,51,71,72,50,57,72
Moisture,36,65,32,54,43,29,53,44,32,35,...,32,30,47,38,45,61,47,61,63,42
Soil Type,Clayey,Sandy,Sandy,Sandy,Red,Red,Sandy,Red,Loamy,Red,...,Sandy,Black,Black,Sandy,Clayey,Black,Loamy,Sandy,Loamy,Sandy
Crop Type,Sugarcane,Millets,Millets,Barley,Paddy,Pulses,Paddy,Pulses,Tobacco,Tobacco,...,Wheat,Millets,Paddy,Paddy,Paddy,Tobacco,Millets,Maize,Ground Nuts,Wheat
Nitrogen,36,30,24,39,37,10,26,30,19,25,...,34,35,22,19,27,23,38,10,7,38
Potassium,4,6,12,12,2,0,15,12,17,12,...,17,14,7,1,8,1,1,11,10,2
Phosphorous,5,18,16,4,16,9,22,35,29,16,...,17,0,28,4,2,25,32,14,4,6
Fertilizer Name,28-28,28-28,17-17-17,10-26-26,DAP,20-20,28-28,14-35-14,17-17-17,20-20,...,28-28,20-20,20-20,20-20,20-20,20-20,17-17-17,14-35-14,DAP,17-17-17
crop_N_req,50.0,20.0,20.0,110.0,0.0,0.0,0.0,0.0,0.0,0.0,...,140.0,20.0,0.0,0.0,0.0,0.0,20.0,0.0,12.5,140.0


In [76]:
# Encode categorical features
cat_features = ['Soil Type', 'Crop Type', 'Crop_Soil']

num_features = [col for col in train.columns if col not in cat_features + ['Fertilizer Name']]

encoders = {}

for col in cat_features:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    test[col] = le.transform(test[col])
    encoders[col] = le

# Encode target
target_le = LabelEncoder()
train['Fertilizer Name'] = target_le.fit_transform(train['Fertilizer Name'])
class_names = list(target_le.classes_)

# Prepare features and target
feature_cols = num_features + cat_features
X = train[feature_cols]
y = train['Fertilizer Name']
X_test = test[feature_cols]

In [53]:
# MAP@3 scorer
def mapk(actual, predicted, k=3):
    def apk(a, p, k):
        p = p[:k]
        score = 0.0
        hits = 0
        seen = set()
        for i, pred in enumerate(p):
            if pred in a and pred not in seen:
                hits += 1
                score += hits / (i + 1.0)
                seen.add(pred)
        return score / min(len(a), k)
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])

In [56]:
# XGBoost config
xgb_model = XGBClassifier(
    max_depth=12,
    colsample_bytree=0.467,
    subsample=0.86,
    n_estimators=4000,
    learning_rate=0.03,
    gamma=0.26,
    max_delta_step=4,
    reg_alpha=2.7,
    reg_lambda=1.4,
    early_stopping_rounds=100,
    objective='multi:softprob',
    eval_metric='mlogloss',
    random_state=42,
    enable_categorical=False
)

# Stratified K-Fold setup
FOLDS = 5
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
oof = np.zeros((len(X), len(class_names)))
fold_preds = []
fold_scores = []

for i, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    print('#' * 15, f"FOLD {i+1}", '#' * 15)

    x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    model = deepcopy(xgb_model)
    model.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=100
    )

    oof[valid_idx] = model.predict_proba(x_valid)
    
    fold_pred = model.predict_proba(X_test)
    fold_preds.append(fold_pred)

    # Evaluate MAP@3 for the fold
    top_3_preds = np.argsort(oof[valid_idx], axis=1)[:, -3:][:, ::-1]
    actual = [[label] for label in y_valid]
    fold_map3 = mapk(actual, top_3_preds)
    fold_scores.append(fold_map3)
    
    print(f"✅ FOLD {i+1}: MAP@3 Score: {fold_map3:.5f}")

############### FOLD 1 ###############
[0]	validation_0-mlogloss:1.94552
[100]	validation_0-mlogloss:1.92672
[200]	validation_0-mlogloss:1.91927
[300]	validation_0-mlogloss:1.91512
[400]	validation_0-mlogloss:1.91240
[500]	validation_0-mlogloss:1.91078
[600]	validation_0-mlogloss:1.91003
[700]	validation_0-mlogloss:1.90951
[800]	validation_0-mlogloss:1.90966
[822]	validation_0-mlogloss:1.90966
✅ FOLD 1: MAP@3 Score: 0.34445
############### FOLD 2 ###############
[0]	validation_0-mlogloss:1.94555
[100]	validation_0-mlogloss:1.92710
[200]	validation_0-mlogloss:1.92006
[300]	validation_0-mlogloss:1.91626
[400]	validation_0-mlogloss:1.91387
[500]	validation_0-mlogloss:1.91247
[600]	validation_0-mlogloss:1.91189
[700]	validation_0-mlogloss:1.91156
[800]	validation_0-mlogloss:1.91163
[860]	validation_0-mlogloss:1.91174
✅ FOLD 2: MAP@3 Score: 0.34309
############### FOLD 3 ###############
[0]	validation_0-mlogloss:1.94554
[100]	validation_0-mlogloss:1.92652
[200]	validation_0-mlogloss:1.91912

In [77]:
# Normalize fold weights
weights = np.array(fold_scores)
weights /= weights.sum()
print("Fold weights:", weights)

# Weighted average of test predictions
pred_prob = np.zeros_like(fold_preds[0])
for w, pred in zip(weights, fold_preds):
    pred_prob += w * pred

# Final top-3 predicted class indices for each row
final_top3 = np.argsort(pred_prob, axis=1)[:, -3:][:, ::-1]

# Decode top-3 predicted class indices to fertilizer names
decoded_flat = target_le.inverse_transform(final_top3.ravel())
decoded_2d = decoded_flat.reshape(final_top3.shape)
final_top3_labels = [' '.join(map(str, row)) for row in decoded_2d]

# Load sample submission IDs
df_sub = pd.read_csv("playground-series-s5e6/sample_submission.csv")

# Create submission DataFrame
submission_df = pd.DataFrame({
    'id': df_sub['id'],
    'Fertilizer Name': final_top3_labels
})

submission_df.head()


Fold weights: [0.20016442 0.19937934 0.20036212 0.20043732 0.1996568 ]


Unnamed: 0,id,Fertilizer Name
0,750000,DAP 28-28 10-26-26
1,750001,17-17-17 Urea 20-20
2,750002,20-20 DAP 10-26-26
3,750003,14-35-14 Urea 17-17-17
4,750004,20-20 10-26-26 Urea


In [78]:
submission_df.to_csv('Fet_w_xgboost_submission.csv', index=False)

In [79]:
n_classes = pred_prob.shape[1]

df = pd.DataFrame(pred_prob, columns=[f'class_{i}' for i in range(n_classes)])
df.to_csv('F_w_xgboost_prob.csv', index=False)