In [18]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import top_k_accuracy_score
from copy import deepcopy

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

from sklearn.neighbors import KNeighborsClassifier

import joblib

In [19]:
# Load data
train_df = pd.read_csv('playground-series-s5e6/train.csv')
test_df = pd.read_csv('playground-series-s5e6/test.csv')
train_df

Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
0,0,37,70,36,Clayey,Sugarcane,36,4,5,28-28
1,1,27,69,65,Sandy,Millets,30,6,18,28-28
2,2,29,63,32,Sandy,Millets,24,12,16,17-17-17
3,3,35,62,54,Sandy,Barley,39,12,4,10-26-26
4,4,35,58,43,Red,Paddy,37,2,16,DAP
...,...,...,...,...,...,...,...,...,...,...
749995,749995,25,69,30,Clayey,Maize,8,16,6,28-28
749996,749996,37,64,58,Loamy,Sugarcane,38,8,20,17-17-17
749997,749997,35,68,59,Sandy,Ground Nuts,6,11,29,10-26-26
749998,749998,31,68,29,Red,Cotton,9,11,12,20-20


In [27]:
# Encode categorical features
cat_features = ['Soil Type', 'Crop Type']
encoders = {}

for col in cat_features:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])
    encoders[col] = le

# Encode target
target_le = LabelEncoder()
train_df['Fertilizer Name'] = target_le.fit_transform(train_df['Fertilizer Name'])
class_names = list(target_le.classes_)

# Prepare features and target
features = ['Temparature', 'Humidity', 'Moisture', 'Soil Type', 'Crop Type',
            'Nitrogen', 'Potassium', 'Phosphorous']
X = train_df[features]
y = train_df['Fertilizer Name']
X_test = test_df[features]

# Scale features
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=features)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=features)

In [21]:
# MAP@3 scorer
def mapk(actual, predicted, k=3):
    def apk(a, p, k):
        p = p[:k]
        score = 0.0
        hits = 0
        seen = set()
        for i, pred in enumerate(p):
            if pred in a and pred not in seen:
                hits += 1
                score += hits / (i + 1.0)
                seen.add(pred)
        return score / min(len(a), k)
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])

In [22]:
FOLDS = 5
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

oof = np.zeros((len(X), len(class_names)))
pred_prob = np.zeros((len(X_test), len(class_names)))

for i, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    print(f"##### FOLD {i+1} #####")
    
    x_train, x_valid = X_scaled.iloc[train_idx], X_scaled.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
    
    knn = KNeighborsClassifier(n_neighbors=15, weights='distance', n_jobs=-1)
    knn.fit(x_train, y_train)

    oof[valid_idx] = knn.predict_proba(x_valid)
    pred_prob += knn.predict_proba(X_test)

    top_3_preds = np.argsort(oof[valid_idx], axis=1)[:, -3:][:, ::-1]
    actual = [[label] for label in y_valid]
    fold_map3 = mapk(actual, top_3_preds)
    print(f"✅ FOLD {i+1}: MAP@3 Score: {fold_map3:.5f}")

    # Save model for this fold
    joblib.dump(knn, f'knn_model_fold{i+1}.joblib')
    print(f"💾 Saved model for fold {i+1} as knn_model_fold{i+1}.joblib")


##### FOLD 1 #####
✅ FOLD 1: MAP@3 Score: 0.27675
💾 Saved model for fold 1 as knn_model_fold1.joblib
##### FOLD 2 #####
✅ FOLD 2: MAP@3 Score: 0.27509
💾 Saved model for fold 2 as knn_model_fold2.joblib
##### FOLD 3 #####
✅ FOLD 3: MAP@3 Score: 0.27390
💾 Saved model for fold 3 as knn_model_fold3.joblib
##### FOLD 4 #####
✅ FOLD 4: MAP@3 Score: 0.27430
💾 Saved model for fold 4 as knn_model_fold4.joblib
##### FOLD 5 #####
✅ FOLD 5: MAP@3 Score: 0.27473
💾 Saved model for fold 5 as knn_model_fold5.joblib


In [23]:
# Average predictions from all folds
pred_prob /= FOLDS

# Final top-3 predictions
final_top3 = np.argsort(pred_prob, axis=1)[:, -3:][:, ::-1]
# Map class indices back to fertilizer names
def decode_labels(row):
    return ' '.join([class_names[int(i)] for i in row.values])

final_top3_df = pd.DataFrame(final_top3)
final_top3_labels = final_top3_df.apply(decode_labels, axis=1)

final_top3_labels = [' '.join(class_names[i] for i in row) for row in final_top3]

submission_df = pd.DataFrame({
    'id': test_df['id'],
    'Fertilizer Name': final_top3_labels
})

submission_df

Unnamed: 0,id,Fertilizer Name
0,750000,20-20 DAP 14-35-14
1,750001,20-20 14-35-14 10-26-26
2,750002,DAP 17-17-17 Urea
3,750003,10-26-26 20-20 14-35-14
4,750004,10-26-26 DAP 20-20
...,...,...
249995,999995,DAP 10-26-26 20-20
249996,999996,DAP 20-20 14-35-14
249997,999997,DAP 20-20 14-35-14
249998,999998,DAP 20-20 17-17-17


In [24]:
submission_df.to_csv('knn_submission.csv', index=False)

In [25]:
n_classes = pred_prob.shape[1]

df = pd.DataFrame(pred_prob, columns=[f'class_{i}' for i in range(n_classes)])
df.to_csv('model2_pred_prob.csv', index=False)