In [2]:
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

In [3]:
# Selects which Kaggle input folder the original dataset is read from.
uli = False
try:  # works on the kaggle website
    df_train = pd.read_csv("/kaggle/input/playground-series-s5e6/train.csv")
    df_test = pd.read_csv("/kaggle/input/playground-series-s5e6/test.csv")
    if uli:
        original = pd.read_csv("/kaggle/input/fertilizer-prediction/Fertilizer Prediction.csv")
    else:
        original = pd.read_csv("/kaggle/input/original/Fertilizer Prediction.csv")
except FileNotFoundError:
    # Only a missing Kaggle path triggers the local fallback. Any other failure
    # (malformed CSV, keyboard interrupt, ...) is surfaced instead of being
    # silently masked by a bare `except:`.
    df_train = pd.read_csv("train.csv")
    df_test = pd.read_csv("test.csv")
    original = pd.read_csv("Fertilizer Prediction.csv")

In [63]:
# Preview the `original` frame loaded by the data-loading cell.
original

Unnamed: 0,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
0,32,51,41,Red,Ground Nuts,7,3,19,14-35-14
1,35,58,35,Black,Cotton,4,14,16,Urea
2,27,55,43,Sandy,Sugarcane,28,0,17,20-20
3,33,56,56,Loamy,Ground Nuts,37,5,24,28-28
4,32,70,60,Red,Ground Nuts,4,6,9,14-35-14
...,...,...,...,...,...,...,...,...,...
99995,32,71,61,Black,Tobacco,23,1,25,20-20
99996,35,72,47,Loamy,Millets,38,1,32,17-17-17
99997,28,50,61,Sandy,Maize,10,11,14,14-35-14
99998,29,57,63,Loamy,Ground Nuts,7,10,4,DAP


In [64]:
# Preview the competition training frame.
df_train

Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
0,0,37,70,36,Clayey,Sugarcane,36,4,5,28-28
1,1,27,69,65,Sandy,Millets,30,6,18,28-28
2,2,29,63,32,Sandy,Millets,24,12,16,17-17-17
3,3,35,62,54,Sandy,Barley,39,12,4,10-26-26
4,4,35,58,43,Red,Paddy,37,2,16,DAP
...,...,...,...,...,...,...,...,...,...,...
749995,749995,25,69,30,Clayey,Maize,8,16,6,28-28
749996,749996,37,64,58,Loamy,Sugarcane,38,8,20,17-17-17
749997,749997,35,68,59,Sandy,Ground Nuts,6,11,29,10-26-26
749998,749998,31,68,29,Red,Cotton,9,11,12,20-20


In [65]:
# Preview the competition test frame (it has no "Fertilizer Name" column).
df_test

Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous
0,750000,31,70,52,Sandy,Wheat,34,11,24
1,750001,27,62,45,Red,Sugarcane,30,14,15
2,750002,28,72,28,Clayey,Ground Nuts,14,15,4
3,750003,37,53,57,Black,Ground Nuts,18,17,36
4,750004,31,55,32,Red,Pulses,13,19,14
...,...,...,...,...,...,...,...,...,...
249995,999995,26,66,30,Red,Sugarcane,14,7,18
249996,999996,33,62,55,Red,Pulses,28,14,7
249997,999997,36,53,64,Black,Paddy,28,11,27
249998,999998,36,67,26,Clayey,Paddy,33,0,10


In [4]:
# Stack the rows of `original` underneath the competition training rows.
# `original` has no "id" column, so after the outer join those rows carry
# NaN in "id". Re-running this cell appends `original` again.
df_train = pd.concat(
    objs=[df_train, original],
    axis=0,
    join="outer",
    ignore_index=True,
    sort=False,
)

In [5]:
# Pairwise ratio features for the training frame: three climate ratios
# followed by three nutrient ratios. A zero denominator (e.g. Potassium == 0)
# yields inf/NaN here; those values are replaced in a later cell.
train_ratio_specs = (
    # (new column, numerator column, denominator column)
    ("Temp_div_Humidity", "Temparature", "Humidity"),
    ("Temp_div_Moisture", "Temparature", "Moisture"),
    ("Humidity_div_Moisture", "Humidity", "Moisture"),
    ("Nitrogen_div_Potassium", "Nitrogen", "Potassium"),
    ("Nitrogen_div_Phosphorous", "Nitrogen", "Phosphorous"),
    ("Phosphorous_div_Potassium", "Phosphorous", "Potassium"),
)
for ratio_col, numerator_col, denominator_col in train_ratio_specs:
    df_train[ratio_col] = df_train[numerator_col] / df_train[denominator_col]

In [6]:
# Same six ratio features, computed on the test frame so that it ends up with
# the same engineered columns as the training frame.
test_ratio_specs = (
    # (new column, numerator column, denominator column)
    ("Temp_div_Humidity", "Temparature", "Humidity"),
    ("Temp_div_Moisture", "Temparature", "Moisture"),
    ("Humidity_div_Moisture", "Humidity", "Moisture"),
    ("Nitrogen_div_Potassium", "Nitrogen", "Potassium"),
    ("Nitrogen_div_Phosphorous", "Nitrogen", "Phosphorous"),
    ("Phosphorous_div_Potassium", "Phosphorous", "Potassium"),
)
for ratio_col, numerator_col, denominator_col in test_ratio_specs:
    df_test[ratio_col] = df_test[numerator_col] / df_test[denominator_col]


In [7]:
# Total nutrient content (N + P + K), added to the training and the test frame.
for npk_frame in (df_train, df_test):
    npk_frame["NPK_total"] = (
        npk_frame["Nitrogen"] + npk_frame["Phosphorous"] + npk_frame["Potassium"]
    )

In [8]:
# Replace +/-inf (produced by the ratio features when a denominator is 0)
# with 0, then fill every remaining NaN with 0 as well.
# Note: fillna(0) touches ALL columns, so the NaN "id" values of the rows that
# came from `original` become 0 too.
df_test = df_test.replace([np.inf, -np.inf], 0).fillna(0)

df_train = df_train.replace([np.inf, -np.inf], 0).fillna(0)

In [9]:
# Count NaNs per column and keep only the columns that actually contain some.
nan_counts = df_train.isnull().sum()
missing_values = nan_counts[nan_counts > 0]

if missing_values.empty:
    print("✅ No missing values found in the dataset.")
else:
    # Bar chart with one bar per column that still has missing entries.
    plt.figure(figsize=(10, 6))
    ax = sns.barplot(x=missing_values.index, y=missing_values.values, palette='viridis')
    plt.xticks(rotation=90)
    ax.set_xlabel('Features')
    ax.set_ylabel('Missing Values')
    ax.set_title('Missing Values per Feature')
    plt.tight_layout()
    plt.show()

✅ No missing values found in the dataset.


In [10]:
y = df_train["Fertilizer Name"]  # The target

# Columns fed to the model: raw measurements, the two categorical columns
# (one-hot encoded below) and the engineered ratio / total features.
# NOTE(review): "id" is a row identifier. Rows that came from `original` have
# no id and end up with id == 0 after the fillna(0) cell, and the test ids
# start above the training id range — consider dropping "id" from this list.
features = [
    "id",
    "Temparature", "Humidity", "Moisture",
    "Nitrogen", "Potassium", "Phosphorous",
    "Soil Type", "Crop Type",
    "Temp_div_Humidity", "Temp_div_Moisture", "Humidity_div_Moisture",
    "Nitrogen_div_Potassium", "Nitrogen_div_Phosphorous", "Phosphorous_div_Potassium",
    "NPK_total",
]

X = pd.get_dummies(df_train[features])  # One hot encode training set

# One hot encode test set. All columns are kept here (including "id", which
# the submission needs); the prediction step selects the training columns.
test = pd.get_dummies(df_test)

# The throw-away train_test_split(X, y, ...) that used to follow was removed:
# its results were never read, because the training cell overwrites
# X_train / X_test / y_train / y_test with its own per-seed stratified splits.

In [None]:
# NOTE: the training in this cell is slow (about 3 minutes per seed on local PC).

# Encode the string class names as integer labels; `le` is kept so the
# predictions can be mapped back to names later.
y_str = y.astype(str)
le = LabelEncoder()
y_enc = le.fit_transform(y_str)

# One model is trained per seed. Alternatives used before:
#   [42]   or   [42, 33, 77, 99, 101]
seeds = list(range(10))

# --- train one early-stopped model per seed and keep them all ---
# Hyper-parameters shared by every model; only the random seed varies.
xgb_params = dict(
    objective="multi:softprob",
    eval_metric="mlogloss",
    n_estimators=4000,
    learning_rate=0.06,
    max_depth=2,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method="hist",
    early_stopping_rounds=50,
)

models = []
for s in seeds:
    # Stage 1: stratified 80/20 split.
    # NOTE(review): the 20% part (X_test, y_test) is never scored in this
    # cell, so each model is fitted on 64% of the rows (80% of 80%).
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_enc, test_size=0.2, stratify=y_enc, random_state=s
    )

    # Stage 2: carve the early-stopping validation set out of the 80% part.
    X_tr, X_val, y_tr, y_val = train_test_split(
        X_train, y_train, test_size=0.2, stratify=y_train, random_state=s
    )

    mdl = XGBClassifier(random_state=s, **xgb_params)
    mdl.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=0)
    models.append(mdl)

# --- predict on your final test set and average probs ---
# Align the one-hot encoded test frame with the training columns (same set,
# same order). A category that never occurs in the test data has no dummy
# column there; such a column is added as all-zero instead of failing with a
# KeyError. Columns named in `features` are real inputs, so their absence is
# still reported as an error rather than silently zero-filled.
missing_cols = X.columns.difference(test.columns)
missing_inputs = [col for col in missing_cols if col in features]
if missing_inputs:
    raise KeyError(f"test set is missing feature columns: {missing_inputs}")
X_test_final = test.reindex(columns=X.columns, fill_value=0)

probs = []
for mdl in models:
    # Predict with the trees up to the early-stopping optimum when available.
    bi = getattr(mdl, "best_iteration", None)
    if bi is not None:
        p = mdl.predict_proba(X_test_final, iteration_range=(0, bi + 1))
    else:
        p = mdl.predict_proba(X_test_final)
    probs.append(p)                           # probs shape: seeds x samples x classes

avg_proba = np.mean(probs, axis=0)            # average over seeds   shape: samples x classes

# 1) Rank the classes of every row by averaged probability and keep the
#    three best, ordered from most to least likely.
ranked_idx = np.argsort(avg_proba, axis=1)
top3_idx = ranked_idx[:, ::-1][:, :3]

# 2) Map class indices back to class names via the fitted label encoder.
top3_names = le.classes_[top3_idx]

# 3) One string per row: the three names separated by a single space.
fert_col = list(map(" ".join, top3_names))

# 4) Two-column frame (id, Fertilizer Name) written without the index.
submission_columns = {
    "id": test["id"],
    "Fertilizer Name": fert_col,
}
submission = pd.DataFrame(submission_columns)

submission.to_csv("submission.csv", index=False)
print("Saved submission.csv with top-3 labels concatenated")


In [59]:
# Preview the submission frame that was written to submission.csv.
submission

Unnamed: 0,id,Fertilizer Name
0,750000,DAP 14-35-14 28-28
1,750001,17-17-17 20-20 10-26-26
2,750002,20-20 10-26-26 14-35-14
3,750003,14-35-14 17-17-17 10-26-26
4,750004,20-20 10-26-26 17-17-17
...,...,...
249995,999995,14-35-14 20-20 17-17-17
249996,999996,14-35-14 17-17-17 10-26-26
249997,999997,14-35-14 17-17-17 DAP
249998,999998,DAP 10-26-26 17-17-17


In [60]:
# List every class known to the label encoder, in encoded order
# (position i in this listing corresponds to encoded label i).
# Iterating le.classes_ directly avoids hard-coding the number of classes.
for class_name in le.classes_:
    print(class_name)

10-26-26
14-35-14
17-17-17
20-20
28-28
DAP
Urea


n_estimators=4000, learning_rate=0.06, max_depth=8 = 0.32378 - 0.32357
n_estimators=1000, learning_rate=0.03, max_depth=4 = 0.31992 - 0.32143
