In [None]:
import pandas as pd
import numpy as np
from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, LabelEncoder
from category_encoders import TargetEncoder

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, accuracy_score
from lightgbm import LGBMClassifier

In [None]:
# Read the competition's training and test CSVs from the Kaggle input directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s5e6/train.csv',
        '/kaggle/input/playground-series-s5e6/test.csv',
    )
)

In [None]:
# Preview the first 10 rows of the training data.
train_df.head(10)

In [None]:
# Summary statistics of the training data (pandas `describe` defaults).
train_df.describe()

In [None]:
# Column-level overview of the training frame via `DataFrame.info()`.
train_df.info()

In [None]:
# Distinct values of the 'Fertilizer Name' column (used as the target further down).
train_df['Fertilizer Name'].unique()

In [None]:
# Mean of each numeric measurement, computed separately for every fertilizer label.
(
    train_df
    .groupby('Fertilizer Name')[
        ['Temparature', 'Humidity', 'Moisture', 'Phosphorous', 'Nitrogen', 'Potassium']
    ]
    .agg('mean')
)

In [None]:
categorical_cols = ['Soil Type', 'Crop Type']

# One grouped count plot per categorical feature, with bars split by fertilizer label.
for col in categorical_cols:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(x=col, hue='Fertilizer Name', data=train_df, ax=ax)
    ax.set_title(f'{col} Distribution by Fertilizer')
    plt.xticks(rotation=45)

    # Move legend outside the axes.
    ax.legend(title='Fertilizer Name', bbox_to_anchor=(1.05, 1), loc='upper left')

    # Run the layout pass only after the legend is in its final position, so the
    # axes are sized with the outside legend taken into account.
    fig.tight_layout()

    plt.show()

In [None]:
numerical_cols = [
    'Temparature',
    'Humidity',
    'Moisture',
    'Nitrogen',
    'Potassium',
    'Phosphorous',
]

# One box plot per numeric feature, grouped by fertilizer label.
for col in numerical_cols:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.boxplot(data=train_df, x='Fertilizer Name', y=col, ax=ax)
    ax.set_title(f'{col} Distribution by Fertilizer')
    plt.xticks(rotation=45)
    fig.tight_layout()
    plt.show()

In [None]:
# Pairwise correlations between the numeric features, drawn as an annotated heatmap.
corr_matrix = train_df.loc[:, numerical_cols].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix of Numerical Features')
plt.show()

In [None]:
# Separate the target column from the remaining feature columns.
y = train_df['Fertilizer Name']
X = train_df.drop(columns=['Fertilizer Name'])

In [None]:
# Convert the string fertilizer labels into integer class codes. The fitted
# encoder `le` is kept so predictions can be decoded back to names later.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

In [None]:
# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=10,
    stratify=y,
)

In [None]:
# Feature engineering: add a combined 'Soil Crop' key (soil type + '_' + crop type)
# to both splits. `assign` returns a new frame, so the originals are not modified.
X_train, X_test = (
    split.assign(**{'Soil Crop': split['Soil Type'] + '_' + split['Crop Type']})
    for split in (X_train, X_test)
)

In [None]:
# Column groups used by the preprocessing steps.

# Columns that get target-encoded (the two raw categoricals plus the engineered pair).
te_cols = [
    'Soil Type',
    'Crop Type',
    'Soil Crop',
]

# Numeric columns that get standardised.
# NOTE(review): 'Temparature' and 'Humidity' are not listed here — confirm that
# leaving them out of the model inputs is intentional.
all_num_cols = [
    'Moisture',
    'Nitrogen',
    'Potassium',
    'Phosphorous',
]

In [None]:
# Target-encode the categorical columns: fit on the training split only, then
# apply the fitted mapping to the hold-out split. The encoded values overwrite
# the original columns in place.
# NOTE(review): y_train holds LabelEncoder integer codes, so the encoder is
# given the class index as its target — confirm this is meaningful for a
# multiclass label.
# NOTE(review): X_train is encoded here, before GridSearchCV later splits it
# into folds, so every CV validation fold has already contributed its targets
# to these encodings — CV scores may be optimistic.
te = TargetEncoder(cols=te_cols, smoothing=10)
X_train[te_cols] = te.fit_transform(X_train[te_cols], y_train)
X_test[te_cols] = te.transform(X_test[te_cols])

In [None]:
# Single-step pipeline that standardises whatever columns it is given.
scaler_pipeline = Pipeline([('scaler', StandardScaler())])

In [None]:
# Standardise the numeric columns and the target-encoded columns; every other
# column is dropped from the model input.
col_trans = ColumnTransformer(
    [
        ('num_pipeline', scaler_pipeline, all_num_cols),
        ('te_cols', scaler_pipeline, te_cols),
    ],
    remainder='drop',
)

In [None]:
# Wrap the column transformer in a pipeline so it can be fitted and applied as one unit.
pipeline = Pipeline([('preprocessing', col_trans)])

In [None]:
# Fit the preprocessing pipeline on the training split, then reuse the fitted
# transformers on the hold-out split.
X_train_processed = pipeline.fit_transform(X_train)
X_test_processed = pipeline.transform(X_test)

In [None]:
# Metric function: MAP@k calculation
def mapk_metric(y_true, y_pred_topk, k=3):
    """Mean Average Precision at k for single-label targets.

    Each sample scores ``1 / rank`` when its true label is among the first
    ``k`` predictions (``rank`` is 1-based) and ``0.0`` otherwise; the result
    is the mean of those per-sample scores.

    Parameters
    ----------
    y_true : iterable
        True label of each sample.
    y_pred_topk : iterable of sequences
        Ranked predictions per sample, best first. Rows may be lists, tuples
        or NumPy arrays.
    k : int, default 3
        Number of leading predictions considered per sample.

    Returns
    -------
    float
        Mean score over all samples; ``0.0`` when there are no samples.
    """
    scores = []
    for true, pred in zip(y_true, y_pred_topk):
        # Materialise the row as a list so `.index` is available for any sequence type.
        top_k = list(pred)[:k]
        scores.append(1.0 / (top_k.index(true) + 1) if true in top_k else 0.0)
    # Avoid np.mean([]) (NaN plus a RuntimeWarning) on empty input.
    return np.mean(scores) if scores else 0.0


# Scoring function with estimator, X, y signature for GridSearchCV
def map3_scorer(estimator, X, y):
    """MAP@3 scorer with the ``(estimator, X, y)`` signature used by GridSearchCV.

    Parameters
    ----------
    estimator : fitted classifier
        Must provide ``predict_proba``. If it exposes ``classes_``, probability
        columns are translated to labels through it.
    X : array-like
        Features passed to ``estimator.predict_proba``.
    y : array-like
        True labels, in the same label space as ``estimator.classes_``.

    Returns
    -------
    float
        MAP@3 of the estimator's three most probable classes per sample.
    """
    proba = np.asarray(estimator.predict_proba(X))  # predicted probabilities
    # Column positions of the three largest probabilities, most probable first.
    top_3_cols = np.argsort(proba, axis=1)[:, -3:][:, ::-1]
    # Column j of predict_proba corresponds to classes_[j]; map positions to
    # labels so the comparison with `y` is correct even when labels are not 0..n-1.
    classes = getattr(estimator, 'classes_', None)
    top_3 = top_3_cols if classes is None else np.asarray(classes)[top_3_cols]
    return mapk_metric(y, top_3.tolist(), k=3)

In [None]:
# Models
# Base LightGBM classifier with a fixed seed; the remaining hyperparameters are
# supplied by the grid search that follows.
lgbm = LGBMClassifier(random_state=10)

In [None]:
# Hyperparameter grid for the LightGBM search. Three parameters have two
# candidate values each, giving 2 x 2 x 2 = 8 combinations; the rest are fixed.
param_grid_lgbm = dict(
    n_estimators=[500],
    learning_rate=[0.05],
    num_leaves=[127],
    min_data_in_leaf=[20, 50],
    feature_fraction=[0.6],
    bagging_fraction=[0.8],
    bagging_freq=[1],
    min_gain_to_split=[0, 0.1],
    lambda_l2=[0, 1],
)

In [None]:
# 3-fold grid search over the LightGBM grid, scored with MAP@3, using all CPU cores.
lgbm_cv = GridSearchCV(
    estimator=lgbm,
    param_grid=param_grid_lgbm,
    scoring=map3_scorer,
    cv=3,
    n_jobs=-1,
)

In [None]:
# Run the LightGBM grid search on the preprocessed training split.
lgbm_cv.fit(X_train_processed, y_train)

In [None]:
# Hyperparameter combination selected by the LightGBM grid search.
lgbm_cv.best_params_

In [None]:
# Best cross-validated score from the LightGBM search (the scorer passed in is map3_scorer).
lgbm_cv.best_score_

In [None]:
# from sklearn.inspection import permutation_importance

# # Run permutation importance using best estimator
# result = permutation_importance(
#     lgbm_cv.best_estimator_,  
#     X_test_processed,
#     y_test,
#     n_repeats=10,
#     random_state=10,
#     n_jobs=-1
# )

# # Handle feature names: If pipeline removed names, get them manually
# try:
#     feature_names = col_trans.get_feature_names_out()
# except:
#     feature_names = [f'Feature {i}' for i in range(X_test_processed.shape[1])]

# # Create DataFrame with results
# importance_df = pd.DataFrame({
#     'Feature': feature_names,
#     'Importance Mean': result.importances_mean,
#     'Importance Std': result.importances_std
# }).sort_values(by='Importance Mean', ascending=False)

# # Plot
# importance_df.head(20).plot(
#     kind='barh',
#     x='Feature',
#     y='Importance Mean',
#     xerr='Importance Std',
#     title='Top 20 Features by Permutation Importance',
#     figsize=(10, 8)
# )
# plt.gca().invert_yaxis()
# plt.tight_layout()
# plt.show()

# # get top 10 features
# top_features = importance_df.head(10)['Feature'].tolist()
# print("Top 10 Features:", top_features)

In [None]:
# Models
# Base XGBoost classifier with a fixed seed and the histogram tree method; the
# remaining hyperparameters are supplied by the grid search that follows.
xgb = XGBClassifier(random_state=10, tree_method = 'hist')

In [None]:
# Hyperparameter grid for the XGBoost search. Every parameter has exactly one
# candidate value, so the search evaluates a single configuration.
param_grid_xgb = {
    name: [value]
    for name, value in [
        ('n_estimators', 500),
        ('max_depth', 8),
        ('learning_rate', 0.1),
        ('subsample', 0.8),
        ('min_child_weight', 4),
        ('colsample_bytree', 0.4),
        ('gamma', 0),
        ('lambda', 5),
        ('alpha', 1),
    ]
}

In [None]:
# 3-fold grid search over the XGBoost grid, scored with MAP@3, using all CPU cores.
xgb_cv = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid_xgb,
    scoring=map3_scorer,
    cv=3,
    n_jobs=-1,
)

In [None]:
# Run the XGBoost grid search on the preprocessed training split.
xgb_cv.fit(X_train_processed, y_train)

In [None]:
# Hyperparameter combination selected by the XGBoost grid search.
xgb_cv.best_params_

In [None]:
# Best cross-validated score from the XGBoost search (the scorer passed in is map3_scorer).
xgb_cv.best_score_

In [None]:
# Final stage: refit preprocessing on the full training set, train a soft-voting
# XGBoost + LightGBM ensemble on each of 5 stratified folds, average the
# per-fold test probabilities, and write the top-3 labels per row to a CSV.

# Use full training data
X_full = train_df.drop('Fertilizer Name', axis=1).copy()
y_full = train_df['Fertilizer Name']
y_full_encoded = le.transform(y_full)

# Add new feature
X_full['Soil Crop'] = X_full['Soil Type'] + '_' + X_full['Crop Type']

# Apply target encoding (fitted on the full training set only)
te_full = TargetEncoder(cols=te_cols, smoothing=10)
X_full[te_cols] = te_full.fit_transform(X_full[te_cols], y_full_encoded)

# Preprocess features (this refits the shared `pipeline` on the full data)
X_full_processed = pipeline.fit_transform(X_full)

# Prepare test set in a separate frame. `test_df` itself stays raw, so this
# cell can be re-run: the string concatenation below would fail if the
# categorical columns had already been replaced by their encoded values.
test_features = test_df.copy()
test_features['Soil Crop'] = test_features['Soil Type'] + '_' + test_features['Crop Type']
test_features[te_cols] = te_full.transform(test_features[te_cols])
test_processed = pipeline.transform(test_features)

# Set up 5-fold cross-validation
n_splits = 5
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=10)
test_preds = np.zeros((test_processed.shape[0], len(le.classes_)))

# Train and predict for each fold. Only the training part of each split is
# used; the validation indices are not evaluated here.
for fold, (train_idx, _val_idx) in enumerate(kf.split(X_full_processed, y_full_encoded)):
    print(f" Training fold {fold+1}")

    X_train_fold = X_full_processed[train_idx]
    y_train_fold = y_full_encoded[train_idx]

    # Carry over the fixed settings of the tuned base estimators
    # (random_state=10, and tree_method='hist' for XGBoost) so the final models
    # match the configuration the grid searches actually evaluated.
    # NOTE(review): `use_label_encoder` is deprecated in recent xgboost
    # releases — confirm it is still wanted for the installed version.
    model = VotingClassifier(
        estimators=[
            ('xgb', XGBClassifier(
                **xgb_cv.best_params_,
                random_state=10,
                tree_method='hist',
                use_label_encoder=False,
                eval_metric='mlogloss',
                verbosity=0,
            )),
            ('lgbm', LGBMClassifier(**lgbm_cv.best_params_, random_state=10)),
        ],
        voting='soft',
        n_jobs=-1
    )

    model.fit(X_train_fold, y_train_fold)

    # Average test predictions over the folds
    test_preds += model.predict_proba(test_processed) / n_splits

# Get top-3 predictions (column positions, most probable first)
top_3_indices = np.argsort(test_preds, axis=1)[:, -3:][:, ::-1]

# Decode the class indices to original labels; le.classes_[i] is the label for code i
top_3_labels = le.classes_[top_3_indices]

# Format predictions into space-separated strings
final_preds = [' '.join(row) for row in top_3_labels]

# Create submission file
submission = pd.DataFrame({
    'id': test_df['id'],
    'Fertilizer Name': final_preds
})

submission.to_csv('submission_simple_ensemble.csv', index=False)