In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root_dir, file_name) for file_name in file_names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import optuna

In [None]:
# Load the competition train / test / sample-submission files and the
# external fertilizer dataset.
train_data = pd.read_csv(r"/kaggle/input/playground-series-s5e6/train.csv")
test_data = pd.read_csv(r"/kaggle/input/playground-series-s5e6/test.csv")
original_data = pd.read_csv(r"/kaggle/input/fertilizer-data/Fertilizer Prediction.csv")
# `data` holds the sample submission; its `id` column is reused when the
# final submission frame is built.
data = pd.read_csv(r"/kaggle/input/playground-series-s5e6/sample_submission.csv")

# Report the (rows, columns) shape of each loaded frame.
for frame_label, frame in (
    ("train_data", train_data),
    ("test_data", test_data),
    ("original_data", original_data),
    ("data", data),
):
    print(f"{frame_label} shape :", frame.shape)

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# Missing-value count per training column, largest first.
train_data.isna().sum().sort_values(ascending=False)

In [None]:
# Preview the first rows of the test frame.
test_data.head()

In [None]:
# Missing-value count per test column, largest first.
test_data.isna().sum().sort_values(ascending=False)

In [None]:
# Preview the first rows of the external fertilizer dataset.
original_data.head()

In [None]:
# Drop the row identifier so it is not used as a feature.
# errors="ignore" keeps this cell re-runnable: on a second execution "id" is
# already gone, and without the flag `drop` would raise KeyError.
train_data = train_data.drop(columns="id", errors="ignore")
test_data = test_data.drop(columns="id", errors="ignore")

# Append the external dataset to the training rows, then remove exact
# duplicate rows. Re-running the cell appends `original_data` again, but
# drop_duplicates discards the repeats, so the resulting rows are the same.
train_data = pd.concat([train_data, original_data], ignore_index=True)
train_data = train_data.drop_duplicates()
print("shape of the data :",train_data.shape)

In [None]:
# Fixed XGBoost hyperparameters; unpacked into the XGBClassifier used as the
# "xgb" base model of the stack.
params = {
    'booster': 'gbtree',
    'lambda': 0.4852532041827346,
    'alpha': 5.681002524055748,
    'colsample_bytree': 0.40465381192194894,
    'subsample': 0.9318477513237314,
    'learning_rate': 0.2978528279037068,
    'max_depth': 10,
    'min_child_weight': 6,
}

In [None]:
# Split the training frame into target and features; work on a copy of the
# test frame so `test_data` itself stays untouched.
y = train_data["Fertilizer Name"]
X = train_data.drop(columns="Fertilizer Name")
test = test_data.copy()

# Integer-encode the target labels; `target_le` is kept to map predictions
# back to label strings later.
target_le = LabelEncoder()
y_encoded = target_le.fit_transform(y)
num_classes = np.unique(y_encoded).size

# Partition the feature columns by dtype.
num_cols = list(X.select_dtypes(include='number').columns)
cat_cols = list(X.select_dtypes(include='object').columns)

# Preprocessor: standardise numeric columns, one-hot encode object columns
# (categories unseen at fit time are ignored at transform time).
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)

# MAP@3 metric
def map3_score(y_true, y_proba, k=3):
    """Mean average precision at ``k`` for single-label classification.

    Each sample scores ``1 / rank`` when its true class is among the ``k``
    highest-probability classes (rank 1 = most probable) and 0 otherwise;
    the result is the mean over all samples.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True class indices, expressed as column positions of ``y_proba``.
        Lists, NumPy arrays and pandas Series are accepted; a Series is read
        positionally, so its index labels do not matter.
    y_proba : array-like of shape (n_samples, n_classes)
        Predicted scores/probabilities per class.
    k : int, default 3
        Number of top-ranked classes considered per sample.

    Returns
    -------
    float
        MAP@k in [0, 1]; 0.0 when there are no samples.

    Raises
    ------
    ValueError
        If ``y_proba`` is not 2-D or its row count differs from ``y_true``.
    """
    # Coerce to plain arrays so indexing is positional (a pandas Series with
    # a non-default index would otherwise be looked up by label).
    y_true = np.asarray(y_true).ravel()
    y_proba = np.asarray(y_proba)
    if y_proba.ndim != 2:
        raise ValueError("y_proba must be 2-D (n_samples, n_classes)")
    n_samples = len(y_true)
    if y_proba.shape[0] != n_samples:
        raise ValueError(
            f"y_true has {n_samples} samples but y_proba has {y_proba.shape[0]} rows"
        )
    if n_samples == 0:
        # Avoid dividing by zero on an empty evaluation set.
        return 0.0

    # Columns of top_k are class indices ordered from most to least probable.
    top_k = np.argsort(y_proba, axis=1)[:, -k:][:, ::-1]
    # Each class appears at most once per row, so every row of `hits` has at
    # most one True, located at the (0-based) rank of the true class.
    hits = top_k == y_true[:, None]
    reciprocal_ranks = 1.0 / np.arange(1, top_k.shape[1] + 1)
    return float((hits * reciprocal_ranks).sum() / n_samples)

# Optuna hyperparameter tuning for XGB example
def optuna_xgb(trial):
    """Optuna objective: mean MAP@3 of an XGBoost classifier over 3 stratified folds.

    Samples learning_rate, max_depth, subsample, colsample_bytree and gamma
    from ``trial``, then cross-validates on the module-level ``X`` /
    ``y_encoded``. The module-level ``preprocessor`` is refit on each
    training fold before transforming that fold and its validation fold.

    Parameters
    ----------
    trial : optuna trial object
        Supplies the hyperparameter suggestions.

    Returns
    -------
    float
        Mean of the per-fold ``map3_score`` values.
    """
    xgb_config = {
        'objective': 'multi:softprob',
        'num_class': num_classes,
        'eval_metric': 'mlogloss',
        'use_label_encoder': False,
        'learning_rate': trial.suggest_float("learning_rate", 0.01, 0.3),
        'max_depth': trial.suggest_int("max_depth", 3, 10),
        'subsample': trial.suggest_float("subsample", 0.5, 1.0),
        'colsample_bytree': trial.suggest_float("colsample_bytree", 0.5, 1.0),
        'gamma': trial.suggest_float("gamma", 0, 5),
    }

    splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    fold_scores = []
    for fit_rows, holdout_rows in splitter.split(X, y_encoded):
        X_fit_raw = X.iloc[fit_rows]
        X_holdout_raw = X.iloc[holdout_rows]
        y_fit = y_encoded[fit_rows]
        y_holdout = y_encoded[holdout_rows]

        # Fit the preprocessing on the training fold only, then apply it to both folds.
        X_fit = preprocessor.fit(X_fit_raw).transform(X_fit_raw)
        X_holdout = preprocessor.transform(X_holdout_raw)

        clf = XGBClassifier(**xgb_config)
        clf.fit(X_fit, y_fit)
        holdout_proba = clf.predict_proba(X_holdout)
        fold_scores.append(map3_score(y_holdout, holdout_proba))

    return np.mean(fold_scores)

# Run Optuna study
#study = optuna.create_study(direction="maximize")
#study.optimize(optuna_xgb, n_trials=20)
#print("Best MAP@3:", study.best_value)
#print("Best params:", study.best_params)

# Stacking stage: for every base model collect out-of-fold class probabilities
# on the training rows and fold-averaged class probabilities on the test rows.
FOLDS = 5
kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Base models; the XGBoost entry uses the fixed `params` dict.
base_models = dict(
    xgb=XGBClassifier(**params, use_label_encoder=False, eval_metric='mlogloss', num_class=num_classes, random_state=42),
    lgb=LGBMClassifier(objective='multiclass', num_class=num_classes, random_state=42, verbosity=-1),
    rf=RandomForestClassifier(n_estimators=100, random_state=42),
    gb=GradientBoostingClassifier(n_estimators=100, random_state=42),
    hgb=HistGradientBoostingClassifier(random_state=42),
    cat=CatBoostClassifier(iterations=100, verbose=0, random_seed=42),
)

# One block of `num_classes` columns per base model, in dict order.
n_meta_cols = len(base_models) * num_classes
meta_features = np.zeros((len(X), n_meta_cols))
test_preds = np.zeros((len(test), n_meta_cols))

for i, (name, model) in enumerate(base_models.items()):
    print(f"\n Training base model: {name}")
    oof = np.zeros((len(X), num_classes))
    test_fold_preds = np.zeros((len(test), num_classes))

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y_encoded), 1):
        X_train = X.iloc[train_idx]
        X_val = X.iloc[val_idx]
        y_train = y_encoded[train_idx]
        y_val = y_encoded[val_idx]

        # Refit the preprocessing on this training fold, then apply it to
        # the training fold, the validation fold and the test set.
        X_train_scaled = preprocessor.fit(X_train).transform(X_train)
        X_val_scaled = preprocessor.transform(X_val)
        test_scaled = preprocessor.transform(test)

        model.fit(X_train_scaled, y_train)
        # Validation rows receive predictions from a model that never saw them.
        oof[val_idx] = model.predict_proba(X_val_scaled)
        # Accumulate the mean of the per-fold test predictions.
        test_fold_preds += model.predict_proba(test_scaled) / FOLDS

    # Write this model's probabilities into its column block.
    model_cols = slice(i * num_classes, (i + 1) * num_classes)
    meta_features[:, model_cols] = oof
    test_preds[:, model_cols] = test_fold_preds

# Train the meta model on the stacked out-of-fold base-model probabilities.
# `multi_class` is left at its default: with the default lbfgs solver a
# multiclass target is already fit as multinomial, and passing the argument
# explicitly is deprecated in recent scikit-learn releases.
meta_model = LogisticRegression(max_iter=1000)
meta_model.fit(meta_features, y_encoded)

# Predict on test: take the three most probable classes per row, best first,
# and map the encoded class indices back to the original label strings.
final_test_preds = meta_model.predict_proba(test_preds)
top3 = np.argsort(final_test_preds, axis=1)[:, -3:][:, ::-1]
top3_labels = target_le.inverse_transform(top3.ravel()).reshape(-1, 3)

# Build the submission from the ids of the already-loaded sample submission
# (`data`) instead of re-reading it from a second, differently named input path.
# NOTE(review): assumes sample_submission rows are in the same order as test.csv - confirm.
submission = pd.DataFrame({
    "id": data["id"],
    "Fertilizer Name": [" ".join(row) for row in top3_labels.astype(str)]
})

# Write the real predictions under both file names. Previously
# "submission.csv" received an unmodified copy of the sample submission.
submission.to_csv("submission.csv", index=False)
submission.to_csv("stacked_meta_submission.csv", index=False)
submission.head()
