In [3]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import top_k_accuracy_score

from catboost import CatBoostClassifier, Pool

from copy import deepcopy

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

import joblib

In [4]:
# Read the train and test CSVs (in that order) from the competition folder
train_df, test_df = map(
    pd.read_csv,
    ('playground-series-s5e6/train.csv', 'playground-series-s5e6/test.csv'),
)
# Bare last expression: render the training frame as the cell output
train_df

Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
0,0,37,70,36,Clayey,Sugarcane,36,4,5,28-28
1,1,27,69,65,Sandy,Millets,30,6,18,28-28
2,2,29,63,32,Sandy,Millets,24,12,16,17-17-17
3,3,35,62,54,Sandy,Barley,39,12,4,10-26-26
4,4,35,58,43,Red,Paddy,37,2,16,DAP
...,...,...,...,...,...,...,...,...,...,...
749995,749995,25,69,30,Clayey,Maize,8,16,6,28-28
749996,749996,37,64,58,Loamy,Sugarcane,38,8,20,17-17-17
749997,749997,35,68,59,Sandy,Ground Nuts,6,11,29,10-26-26
749998,749998,31,68,29,Red,Cotton,9,11,12,20-20


In [5]:
# Encode categorical features
cat_features = ['Soil Type', 'Crop Type']
encoders = {}

# Encode into copies instead of overwriting the loaded frames. Writing the codes
# back into train_df/test_df makes this cell non-idempotent: on a second run the
# target encoder would be fitted on integer codes, so class_names would hold
# integers instead of fertilizer names and decoding predictions would fail.
train_enc = train_df.copy()
test_enc = test_df.copy()

for col in cat_features:
    le = LabelEncoder()
    # Fit on the training values only; transform() raises ValueError if the test
    # column contains a value the encoder did not see during fit.
    train_enc[col] = le.fit_transform(train_df[col])
    test_enc[col] = le.transform(test_df[col])
    encoders[col] = le  # kept so the integer codes can be mapped back later

# Encode target
target_le = LabelEncoder()
train_enc['Fertilizer Name'] = target_le.fit_transform(train_df['Fertilizer Name'])
# class_names[j] is the original label that was assigned integer code j
class_names = list(target_le.classes_)

# Prepare features and target
features = ['Temparature', 'Humidity', 'Moisture', 'Soil Type', 'Crop Type',
            'Nitrogen', 'Potassium', 'Phosphorous']
X = train_enc[features]
y = train_enc['Fertilizer Name']
X_test = test_enc[features]

In [6]:
# MAP@3 scorer
def mapk(actual, predicted, k=3):
    """Mean average precision at k (MAP@k) over a set of samples.

    Parameters
    ----------
    actual : iterable of sized containers
        For each sample, the collection of relevant labels.
    predicted : iterable of sequences
        For each sample, the predicted labels ordered from most to least likely.
    k : int, default 3
        Only the first ``k`` predictions of each sample are scored.

    Returns
    -------
    float
        Mean of the per-sample average-precision values.
    """
    def apk(a, p, k):
        """Average precision at k for one sample (relevant labels a, ranking p)."""
        # A sample with no relevant labels cannot be hit; score it 0.0 instead of
        # dividing by min(len(a), k) == 0 below.
        if len(a) == 0:
            return 0.0
        p = p[:k]
        score = 0.0
        hits = 0
        seen = set()
        for i, pred in enumerate(p):
            # Count each relevant label once, even if it is predicted repeatedly
            if pred in a and pred not in seen:
                hits += 1
                score += hits / (i + 1.0)  # precision at rank i + 1
                seen.add(pred)
        return score / min(len(a), k)
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])

In [15]:
FOLDS = 5
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# oof holds the out-of-fold class probabilities for every training row.
# pred_prob accumulates the SUM of the per-fold test probabilities; it is
# divided by FOLDS in a later cell.
oof = np.zeros((len(X), len(class_names)))
pred_prob = np.zeros((len(X_test), len(class_names)))

# Categorical feature indices (column names work too)
cat_features = ['Soil Type', 'Crop Type']

# The test features are identical for every fold, so wrap them in a Pool once
# instead of on every prediction call.
test_pool = Pool(X_test, cat_features=cat_features)

for i, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    print(f"############### FOLD {i+1} ###############")

    x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    # A fresh model per fold so nothing learned on one split carries over
    model = CatBoostClassifier(
        iterations=500,
        learning_rate=0.2,
        depth=6,
        subsample=0.86,
        bootstrap_type='Bernoulli',
        colsample_bylevel=0.467,
        l2_leaf_reg=2.7,
        random_seed=13,
        early_stopping_rounds=100,
        loss_function='MultiClass',
        eval_metric='MultiClass',
        verbose=100,
        use_best_model=True
    )

    train_pool = Pool(x_train, y_train, cat_features=cat_features)
    valid_pool = Pool(x_valid, y_valid, cat_features=cat_features)

    model.fit(train_pool, eval_set=valid_pool)

    # Predict on the Pools so the categorical columns are declared exactly as
    # they were for training.
    # NOTE(review): assumes the probability columns come back in label order
    # 0..n_classes-1, i.e. every class is present in each training split.
    oof[valid_idx] = model.predict_proba(valid_pool)
    pred_prob += model.predict_proba(test_pool)

    # Three highest-probability classes per validation row, most likely first
    top_3_preds = np.argsort(oof[valid_idx], axis=1)[:, -3:][:, ::-1]
    actual = [[label] for label in y_valid]
    fold_map3 = mapk(actual, top_3_preds)
    print(f"✅ FOLD {i+1}: MAP@3 Score: {fold_map3:.5f}")

# Every training row now has an out-of-fold prediction, so report a single
# cross-validated score for the whole training set.
oof_top3 = np.argsort(oof, axis=1)[:, -3:][:, ::-1]
overall_map3 = mapk([[label] for label in y], oof_top3)
print(f"Overall OOF MAP@3 Score: {overall_map3:.5f}")


############### FOLD 1 ###############
0:	learn: 1.9442083	test: 1.9442681	best: 1.9442681 (0)	total: 670ms	remaining: 5m 34s
100:	learn: 1.9285667	test: 1.9334911	best: 1.9334911 (100)	total: 1m 16s	remaining: 5m 4s
200:	learn: 1.9176183	test: 1.9289162	best: 1.9289162 (200)	total: 2m 39s	remaining: 3m 57s
300:	learn: 1.9098335	test: 1.9267103	best: 1.9267103 (300)	total: 4m	remaining: 2m 39s
400:	learn: 1.9032428	test: 1.9252772	best: 1.9252772 (400)	total: 5m 23s	remaining: 1m 19s
499:	learn: 1.8969015	test: 1.9241454	best: 1.9241454 (499)	total: 6m 44s	remaining: 0us

bestTest = 1.924145376
bestIteration = 499

✅ FOLD 1: MAP@3 Score: 0.32235
############### FOLD 2 ###############
0:	learn: 1.9442128	test: 1.9442269	best: 1.9442269 (0)	total: 950ms	remaining: 7m 54s
100:	learn: 1.9281312	test: 1.9330321	best: 1.9330321 (100)	total: 1m 15s	remaining: 4m 59s
200:	learn: 1.9169104	test: 1.9282972	best: 1.9282972 (200)	total: 2m 40s	remaining: 3m 58s
300:	learn: 1.9093912	test: 1.926224

In [16]:
# Convert the per-fold sum of test probabilities into a mean.
# This divides in place, so running the cell again divides a second time.
pred_prob /= FOLDS

# Class indices of the three largest probabilities per row, largest first
final_top3 = np.fliplr(np.argsort(pred_prob, axis=1)[:, -3:])


def decode_labels(row):
    """Return the names for one row of class indices, joined by single spaces."""
    names = (class_names[int(idx)] for idx in row)
    return ' '.join(names)


# One space-separated string of three names per test row
top3_frame = pd.DataFrame(final_top3)
final_top3_labels = top3_frame.apply(decode_labels, axis=1)

# Pair each test id with its decoded top-3 string
submission_df = pd.DataFrame({
    'id': test_df['id'],
    'Fertilizer Name': final_top3_labels,
})

submission_df.head()

Unnamed: 0,id,Fertilizer Name
0,750000,DAP 10-26-26 28-28
1,750001,17-17-17 20-20 10-26-26
2,750002,20-20 10-26-26 14-35-14
3,750003,14-35-14 17-17-17 10-26-26
4,750004,20-20 10-26-26 28-28


In [17]:
# Write the submission table to CSV; index=False leaves out the DataFrame index
submission_df.to_csv('catboost_submission.csv', index=False)

In [18]:
# Save the test-set probability matrix, one column per class index
n_classes = pred_prob.shape[1]
prob_columns = [f'class_{i}' for i in range(n_classes)]

df = pd.DataFrame(pred_prob, columns=prob_columns)
df.to_csv('model4_pred_prob.csv', index=False)