In [6]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import top_k_accuracy_score

# LightGBM classifier
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation

from copy import deepcopy

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

import joblib

In [2]:
# Read the competition train/test CSVs from the local data folder
train_path = 'playground-series-s5e6/train.csv'
test_path = 'playground-series-s5e6/test.csv'
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)
# Bare last expression: rich display of the training frame
train_df

Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
0,0,37,70,36,Clayey,Sugarcane,36,4,5,28-28
1,1,27,69,65,Sandy,Millets,30,6,18,28-28
2,2,29,63,32,Sandy,Millets,24,12,16,17-17-17
3,3,35,62,54,Sandy,Barley,39,12,4,10-26-26
4,4,35,58,43,Red,Paddy,37,2,16,DAP
...,...,...,...,...,...,...,...,...,...,...
749995,749995,25,69,30,Clayey,Maize,8,16,6,28-28
749996,749996,37,64,58,Loamy,Sugarcane,38,8,20,17-17-17
749997,749997,35,68,59,Sandy,Ground Nuts,6,11,29,10-26-26
749998,749998,31,68,29,Red,Cotton,9,11,12,20-20


In [3]:
# Integer-encode the categorical predictors. Each encoder is fitted on the
# training column only and then applied to both frames, so a category that
# appears only in the test set raises instead of being silently mis-mapped.
# NOTE: train_df / test_df are overwritten in place by this cell.
cat_features = ['Soil Type', 'Crop Type']
encoders = {}

for col in cat_features:
    le = LabelEncoder().fit(train_df[col])
    for frame in (train_df, test_df):
        frame[col] = le.transform(frame[col])
    encoders[col] = le

# Integer-encode the target; class_names[i] is the original label of class i
target_le = LabelEncoder().fit(train_df['Fertilizer Name'])
class_names = list(target_le.classes_)
train_df['Fertilizer Name'] = target_le.transform(train_df['Fertilizer Name'])

# Model inputs: the eight predictor columns, in a fixed order shared by
# the training and test matrices
features = [
    'Temparature', 'Humidity', 'Moisture',
    'Soil Type', 'Crop Type',
    'Nitrogen', 'Potassium', 'Phosphorous',
]
X, X_test = train_df[features], test_df[features]
y = train_df['Fertilizer Name']

In [4]:
# MAP@3 scorer
def mapk(actual, predicted, k=3):
    """Mean average precision at k (MAP@k).

    Parameters
    ----------
    actual : sequence of sequences
        For each sample, the collection of correct labels.
    predicted : sequence of sequences
        For each sample, the predicted labels ordered from most to least
        likely. Only the first ``k`` entries of each row are scored.
    k : int, default 3
        Cut-off rank.

    Returns
    -------
    float
        Mean of the per-sample average precision at ``k``. Returns 0.0 when
        there are no samples to score.
    """
    def apk(a, p, k):
        """Average precision at k for a single sample."""
        # Normaliser: the best achievable number of hits within the cut-off.
        # A sample with no correct labels (or a non-positive k) cannot score,
        # so return 0.0 rather than dividing by zero.
        denom = min(len(a), k)
        if denom <= 0:
            return 0.0

        p = p[:k]
        score = 0.0
        hits = 0
        seen = set()  # a label repeated in the prediction is credited only once
        for i, pred in enumerate(p):
            if pred in a and pred not in seen:
                hits += 1
                score += hits / (i + 1.0)  # precision at rank i + 1
                seen.add(pred)
        return score / denom

    scores = [apk(a, p, k) for a, p in zip(actual, predicted)]
    if not scores:
        # np.mean([]) would emit a RuntimeWarning and return NaN
        return 0.0
    return np.mean(scores)

In [8]:
FOLDS = 5
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# oof: out-of-fold class probabilities, one row per training sample.
# pred_prob: running SUM of test-set probabilities over the folds; it is
# turned into an average by dividing by FOLDS after this cell.
oof = np.zeros((len(X), len(class_names)))
pred_prob = np.zeros((len(X_test), len(class_names)))

# The hyper-parameters do not depend on the fold, so build them once
# instead of re-creating the dict on every iteration.
params = {
    'objective': 'multiclass',
    'num_class': len(class_names),
    'metric': 'multi_logloss',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': -1,
    'verbose': -1,
    'seed': 42,
}

for i, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    print(f"##### FOLD {i+1} #####")

    x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    lgb_train = lgb.Dataset(x_train, label=y_train)
    lgb_valid = lgb.Dataset(x_valid, label=y_valid)

    # Early stopping monitors the last entry of valid_sets ('valid')
    model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train, lgb_valid],
        valid_names=['train', 'valid'],
        num_boost_round=1000,
        callbacks=[
            early_stopping(stopping_rounds=100),
            log_evaluation(period=100),
        ],
    )

    # Predict the held-out fold once and reuse it for both the OOF matrix
    # and the fold score
    valid_prob = model.predict(x_valid)
    oof[valid_idx] = valid_prob
    pred_prob += model.predict(X_test)

    # Three most probable classes per row, most likely first
    top_3_preds = np.argsort(valid_prob, axis=1)[:, -3:][:, ::-1]
    actual = [[label] for label in y_valid]
    fold_map3 = mapk(actual, top_3_preds)
    print(f"✅ FOLD {i+1}: MAP@3 Score: {fold_map3:.5f}")

# Score the complete out-of-fold matrix so the run reports a single CV
# number in addition to the per-fold scores
oof_top3 = np.argsort(oof, axis=1)[:, -3:][:, ::-1]
oof_map3 = mapk([[label] for label in y], oof_top3)
print(f"Overall OOF MAP@3 Score: {oof_map3:.5f}")

##### FOLD 1 #####
Training until validation scores don't improve for 100 rounds
[100]	train's multi_logloss: 1.92006	valid's multi_logloss: 1.92895
[200]	train's multi_logloss: 1.90787	valid's multi_logloss: 1.92587
[300]	train's multi_logloss: 1.89732	valid's multi_logloss: 1.9243
[400]	train's multi_logloss: 1.88745	valid's multi_logloss: 1.92322
[500]	train's multi_logloss: 1.8781	valid's multi_logloss: 1.92241
[600]	train's multi_logloss: 1.86916	valid's multi_logloss: 1.92203
[700]	train's multi_logloss: 1.86045	valid's multi_logloss: 1.92157
[800]	train's multi_logloss: 1.85195	valid's multi_logloss: 1.92134
[900]	train's multi_logloss: 1.84376	valid's multi_logloss: 1.92106
[1000]	train's multi_logloss: 1.83569	valid's multi_logloss: 1.92086
Did not meet early stopping. Best iteration is:
[989]	train's multi_logloss: 1.83654	valid's multi_logloss: 1.92084
✅ FOLD 1: MAP@3 Score: 0.32817
##### FOLD 2 #####
Training until validation scores don't improve for 100 rounds
[100]	train'

In [9]:
# Turn the accumulated fold predictions into a mean.
# NOTE: this divides pred_prob in place, so executing this cell a second
# time without re-running the training cell divides by FOLDS again.
pred_prob /= FOLDS

# Column indices of the three largest probabilities per row, best first
ranked = np.argsort(pred_prob, axis=1)
final_top3 = np.fliplr(ranked[:, -3:])


def decode_labels(row):
    """Return the fertilizer names for a row of class indices, space-separated."""
    names = (class_names[int(idx)] for idx in row)
    return ' '.join(names)


# One space-separated string of three names per test row
final_top3_labels = pd.Series([decode_labels(row) for row in final_top3])

# Assemble the submission table: test id plus the three ranked guesses
submission_df = pd.DataFrame({
    'id': test_df['id'],
    'Fertilizer Name': final_top3_labels,
})

submission_df.head()

Unnamed: 0,id,Fertilizer Name
0,750000,DAP 28-28 10-26-26
1,750001,17-17-17 20-20 10-26-26
2,750002,20-20 14-35-14 10-26-26
3,750003,14-35-14 17-17-17 DAP
4,750004,20-20 10-26-26 28-28


In [11]:
# Write the submission table to disk without the pandas index column
submission_path = 'lightgbm_submission.csv'
submission_df.to_csv(submission_path, index=False)

In [12]:
# Save the test-set probability matrix (pred_prob in its current state)
# with one generically named column per class index
n_classes = pred_prob.shape[1]
prob_columns = [f'class_{i}' for i in range(n_classes)]

df = pd.DataFrame(data=pred_prob, columns=prob_columns)
df.to_csv('model3_pred_prob.csv', index=False)