In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [3]:
# Read the train and test splits from the local dataset folder
train_df, test_df = (
    pd.read_csv('playground-series-s5e6/train.csv'),
    pd.read_csv('playground-series-s5e6/test.csv'),
)
# Bare last expression: render the training frame as the cell output
train_df

Unnamed: 0,id,Temparature,Humidity,Moisture,Soil Type,Crop Type,Nitrogen,Potassium,Phosphorous,Fertilizer Name
0,0,37,70,36,Clayey,Sugarcane,36,4,5,28-28
1,1,27,69,65,Sandy,Millets,30,6,18,28-28
2,2,29,63,32,Sandy,Millets,24,12,16,17-17-17
3,3,35,62,54,Sandy,Barley,39,12,4,10-26-26
4,4,35,58,43,Red,Paddy,37,2,16,DAP
...,...,...,...,...,...,...,...,...,...,...
749995,749995,25,69,30,Clayey,Maize,8,16,6,28-28
749996,749996,37,64,58,Loamy,Sugarcane,38,8,20,17-17-17
749997,749997,35,68,59,Sandy,Ground Nuts,6,11,29,10-26-26
749998,749998,31,68,29,Red,Cotton,9,11,12,20-20


In [4]:
# Encode categorical features
# NOTE: the encoded integers overwrite the original columns of train_df and
# test_df in place, so reload the data before running this cell a second time
# (otherwise the encoders are re-fitted on already-encoded values).
cat_features = ['Soil Type', 'Crop Type']
encoders = {name: LabelEncoder() for name in cat_features}

for name, encoder in encoders.items():
    # Learn the mapping from the training column, then reuse it for test
    train_df[name] = encoder.fit_transform(train_df[name])
    test_df[name] = encoder.transform(test_df[name])

# Encode target
target_le = LabelEncoder()
train_df['Fertilizer Name'] = target_le.fit_transform(train_df['Fertilizer Name'])
# Original label values, positioned by their encoded integer
class_names = list(target_le.classes_)

# Prepare features and target
# ('Temparature' is spelled exactly as in the CSV header)
features = [
    'Temparature', 'Humidity', 'Moisture', 'Soil Type', 'Crop Type',
    'Nitrogen', 'Potassium', 'Phosphorous',
]
y = train_df['Fertilizer Name']
X = train_df[features]
X_test = test_df[features]

In [5]:
# Load each model's saved prediction table as a plain NumPy array
# (presumably one row per test sample and one column per class — confirm
# against the notebooks that wrote these files)
xg = pd.read_csv('./pred_probs/xgboost_pred_prob.csv').to_numpy()
xg3 = pd.read_csv('./pred_probs/3xgboost_prob.csv').to_numpy()
lg = pd.read_csv('./pred_probs/lightgbm_pred_prob.csv').to_numpy()
cat = pd.read_csv('./pred_probs/catboost_pred_prob.csv').to_numpy()
wxg = pd.read_csv('./pred_probs/w_xgboost_prob.csv').to_numpy()

# Number of columns in the first prediction table
n_classes = xg.shape[1]

In [11]:
# Stack all model predictions
all_preds = [xg, xg3, wxg, cat, lg]

# Weighted ensemble over three of the loaded tables: xg, xg3 and wxg.
# `cat` and `lg` remain in `all_preds` but carry no weight in this blend.
weights = [0.15, 0.7, 0.15]  # order: [xg, xg3, wxg]

# NumPy would silently broadcast a table that has a single row or column,
# so require identical shapes before combining.
assert xg.shape == xg3.shape == wxg.shape, (
    f"prediction shapes differ: xg={xg.shape}, xg3={xg3.shape}, wxg={wxg.shape}"
)

# Element-wise weighted sum; the result has the same shape as each input
ensemble_pred = weights[0]*xg + weights[1]*xg3 + weights[2]*wxg

In [12]:
# Top 3 predictions per row (most confident classes).
# argsort orders each row ascending by score; the reversed slice then walks
# back from the last column, so the highest-scoring class index comes first.
top3_preds = np.argsort(ensemble_pred, axis=1)[:, :-4:-1]  # shape: (n_samples, 3)

In [13]:
def decode_labels(row, labels=None):
    """Turn a sequence of class indices into a space-separated label string.

    Parameters
    ----------
    row : iterable
        Class indices; each entry is converted with ``int()`` before lookup.
    labels : sequence of str, optional
        Lookup table mapping an index to its label. When omitted, the
        notebook-level ``class_names`` list is used, so existing one-argument
        calls (e.g. ``DataFrame.apply(decode_labels, axis=1)``) keep working.

    Returns
    -------
    str
        The labels for ``row`` joined by single spaces, in the order given.
    """
    if labels is None:
        # Resolved at call time so the function can be defined before the
        # encoding cell has run.
        labels = class_names
    return ' '.join(labels[int(i)] for i in row)

# Decode every row of top-3 class indices into one space-separated string
final_top3_labels = pd.DataFrame(top3_preds).apply(decode_labels, axis=1)

# The DataFrame constructor below aligns its Series inputs by index, so a
# row-count mismatch would silently produce NaN rows; fail loudly instead.
assert len(final_top3_labels) == len(test_df), (
    f"got {len(final_top3_labels)} prediction rows for {len(test_df)} test rows"
)

# Form submission dataframe
submission_df = pd.DataFrame({
    'id': test_df['id'],
    'Fertilizer Name': final_top3_labels
})

In [14]:
# Preview the assembled submission (id plus the space-separated top-3 labels)
submission_df

Unnamed: 0,id,Fertilizer Name
0,750000,10-26-26 DAP 20-20
1,750001,17-17-17 20-20 10-26-26
2,750002,20-20 28-28 Urea
3,750003,14-35-14 DAP 17-17-17
4,750004,20-20 Urea 10-26-26
...,...,...
249995,999995,Urea 28-28 17-17-17
249996,999996,10-26-26 14-35-14 28-28
249997,999997,DAP 10-26-26 Urea
249998,999998,10-26-26 28-28 17-17-17


In [15]:
# Write the submission to CSV; index=False keeps the row index out of the file
submission_df.to_csv('w_ensembel_submission1.csv', index=False)