# Baseline Model: XGBoost + LightGBM

Following the winning solution patterns:
- Start with XGBoost and LightGBM as foundation
- Minimal feature engineering (product and ratio features for NPK interactions)
- Stratified K-Fold validation (5 folds)
- Treat features as categorical where appropriate

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import lightgbm as lgb
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Read the train and test tables from their fixed locations on disk
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/home/data/train.csv', '/home/data/test.csv')
)

# Quick sanity report: table sizes and the most frequent target values
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print("Target distribution:")
target_counts = train['Fertilizer Name'].value_counts()
print(target_counts.head())

In [None]:
# Minimal feature engineering: derive NPK interaction columns.
# The same columns are added in place to both the train and test frames.
for frame in (train, test):
    nitrogen = frame['Nitrogen']
    potassium = frame['Potassium']
    phosphorous = frame['Phosphorous']

    frame['NPK_product'] = nitrogen * potassium * phosphorous
    # Denominators are shifted by 1 so a zero nutrient value cannot divide by zero
    frame['N_P_ratio'] = nitrogen / (phosphorous + 1)
    frame['N_K_ratio'] = nitrogen / (potassium + 1)
    frame['P_K_ratio'] = phosphorous / (potassium + 1)

# Column groups used later to assemble the feature matrix
categorical_features = ['Soil Type', 'Crop Type']
numerical_features = [
    'Temparature', 'Humidity', 'Moisture',
    'Nitrogen', 'Potassium', 'Phosphorous',
    'NPK_product', 'N_P_ratio', 'N_K_ratio', 'P_K_ratio',
]

print(f"Categorical features: {categorical_features}")
print(f"Numerical features: {numerical_features}")

In [None]:
# Integer-encode each categorical column and keep the fitted encoders
le_dict = {}
for col in categorical_features:
    encoded_col = col + '_encoded'
    # The encoder is fitted on train and test values together, so a category
    # that appears only in the test set still receives a code
    encoder = LabelEncoder()
    encoder.fit(pd.concat([train[col], test[col]], axis=0))
    train[encoded_col] = encoder.transform(train[col])
    test[encoded_col] = encoder.transform(test[col])
    le_dict[col] = encoder

# Assemble the model inputs: encoded categoricals first, then numeric columns
feature_cols = [name + '_encoded' for name in categorical_features] + numerical_features
X, X_test = train[feature_cols].copy(), test[feature_cols].copy()
y = train['Fertilizer Name'].copy()

print(f"Feature matrix shape: {X.shape}")
print(f"Test feature matrix shape: {X_test.shape}")
print(f"Number of classes: {y.nunique()}")

In [None]:
# Map the fertilizer names to integer class ids; the fitted encoder is kept
# so predictions can be mapped back to names later
le_target = LabelEncoder()
y_encoded = le_target.fit_transform(y)

target_classes = le_target.classes_
print(f"Classes: {target_classes}")
print(f"Number of classes: {len(target_classes)}")

In [None]:
# Stratified K-Fold: shuffled, seeded splitter shared by both models
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Number of target classes, taken from the fitted target encoder
n_classes = len(le_target.classes_)

# Initialize predictions
# One row per sample, one column per class. The test buffers are filled by
# accumulating each fold's probabilities divided by n_splits.
oof_predictions = np.zeros((len(X), n_classes))
test_predictions_xgb = np.zeros((len(X_test), n_classes))
test_predictions_lgb = np.zeros((len(X_test), n_classes))

# XGBoost parameters (following winning solution patterns)
xgb_params = {
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'num_class': n_classes,
    'learning_rate': 0.05,
    'max_depth': 8,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'n_estimators': 500,
    'tree_method': 'hist',
    'device': 'cuda',
    'random_state': 42,
    'verbosity': 0
}

# LightGBM parameters
lgb_params = {
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'num_class': n_classes,
    'learning_rate': 0.05,
    'max_depth': 8,
    'n_estimators': 500,
    'subsample': 0.8,
    # LightGBM only applies row subsampling (bagging) when the bagging
    # frequency is non-zero; with the default of 0 the `subsample` value above
    # is silently ignored. Resample on every iteration so it takes effect.
    'subsample_freq': 1,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'verbose': -1
}

print("Starting cross-validation...")

In [None]:
# Cross-validation loop
# For every fold: train XGBoost and LightGBM on the training part, average
# their class probabilities on the validation part (stored as out-of-fold
# predictions), and accumulate each model's test-set probabilities.
fold_scores = []

# Patience (in boosting rounds) used for early stopping in both models
early_stopping_rounds = 50

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y_encoded)):
    print(f"\nFold {fold + 1}/{n_splits}")

    # Split data
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y_encoded[train_idx], y_encoded[val_idx]

    # Train XGBoost
    # `early_stopping_rounds` is given to the estimator constructor: recent
    # XGBoost releases no longer accept it as a fit() argument.
    print("Training XGBoost...")
    model_xgb = xgb.XGBClassifier(
        **xgb_params,
        early_stopping_rounds=early_stopping_rounds,
    )
    model_xgb.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    # Train LightGBM
    print("Training LightGBM...")
    model_lgb = lgb.LGBMClassifier(**lgb_params)
    model_lgb.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[
            lgb.early_stopping(early_stopping_rounds),
            lgb.log_evaluation(0),
        ]
    )

    # Predict on validation set
    pred_xgb = model_xgb.predict_proba(X_val)
    pred_lgb = model_lgb.predict_proba(X_val)

    # Average predictions (equal-weight blend) and store them out-of-fold
    pred_avg = (pred_xgb + pred_lgb) / 2
    oof_predictions[val_idx] = pred_avg

    # Predict on test set
    test_pred_xgb = model_xgb.predict_proba(X_test)
    test_pred_lgb = model_lgb.predict_proba(X_test)

    # Each fold contributes 1/n_splits, so the buffers end up as fold means
    test_predictions_xgb += test_pred_xgb / n_splits
    test_predictions_lgb += test_pred_lgb / n_splits

    # Calculate fold score: top-3 accuracy, i.e. the fraction of validation
    # rows whose true class is among the three most probable classes.
    # (Rank-insensitive stand-in for MAP@3.)
    top3_pred = np.argsort(pred_avg, axis=1)[:, -3:][:, ::-1]
    # A row can match at most one of its three distinct predicted classes,
    # so the number of matches equals the number of rows that were hit.
    correct = np.sum(top3_pred == y_val.reshape(-1, 1))
    fold_score = correct / len(y_val)
    fold_scores.append(fold_score)

    print(f"Fold {fold + 1} top-3 accuracy: {fold_score:.4f}")

print(f"\nMean top-3 accuracy: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")

In [None]:
# Calculate proper MAP@3 score
def map_at_3(predictions, true_labels):
    """Calculate the MAP@3 score for single-label predictions.

    For every observation the three highest-scoring classes are ranked from
    most to least probable. The observation scores ``1 / rank`` if its true
    label sits at ``rank`` (1, 2 or 3) and ``0`` otherwise; the result is the
    mean of these per-observation scores.

    Args:
        predictions: 2-D array-like of shape (n_samples, n_classes) holding
            one score per class; higher means more likely.
        true_labels: 1-D array-like of length n_samples holding the column
            index of the true class for each row.

    Returns:
        The mean score as a float in [0, 1]. Returns 0.0 when there are no
        observations.

    Raises:
        ValueError: If ``predictions`` and ``true_labels`` do not have the
            same number of rows.
    """
    scores = np.asarray(predictions)
    labels = np.asarray(true_labels).reshape(-1)

    if scores.shape[0] != labels.shape[0]:
        raise ValueError(
            f"predictions has {scores.shape[0]} rows but true_labels has "
            f"{labels.shape[0]} entries"
        )
    if labels.shape[0] == 0:
        # Mean over zero observations is undefined; report 0.0 instead of NaN
        return 0.0

    # Column indices of the (up to) three best classes, best first
    top3 = np.argsort(scores, axis=1)[:, -3:][:, ::-1]

    # Boolean hit mask; at most one True per row because indices are distinct
    hits = top3 == labels.reshape(-1, 1)

    # Precision at the hit position is 1/rank, since there is one true label
    ranks = np.arange(1, top3.shape[1] + 1)
    per_row = (hits / ranks).sum(axis=1)

    return np.mean(per_row)

# Score the out-of-fold blended probabilities against the encoded targets
cv_score = map_at_3(predictions=oof_predictions, true_labels=y_encoded)
print(f"CV MAP@3 Score: {cv_score:.4f}")

In [None]:
import os

# Average predictions from both models (equal-weight blend of the
# fold-averaged test probabilities)
final_test_predictions = (test_predictions_xgb + test_predictions_lgb) / 2

# Get top 3 predictions for each test sample, most probable class first
top3_predictions = np.argsort(final_test_predictions, axis=1)[:, -3:][:, ::-1]

# Convert back to fertilizer names.
# All indices are decoded in a single call and reshaped back to one row per
# sample, instead of calling the encoder once per test row.
top3_names = le_target.inverse_transform(
    top3_predictions.ravel()
).reshape(top3_predictions.shape)
# Each row's names are joined with single spaces, best guess first
predicted_names = [' '.join(row_names) for row_names in top3_names]

# Create submission
test_ids = test['id'].values
submission = pd.DataFrame({
    'id': test_ids,
    'Fertilizer Name': predicted_names
})

print("Submission format:")
print(submission.head())
print(f"\nSubmission shape: {submission.shape}")

# Save submission; make sure the target directory exists before writing
submission_path = '/home/submission/submission.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)
print("Submission saved to /home/submission/submission.csv")

In [None]:
import os
import pickle

# Make sure the experiment folder exists before anything is written to it
os.makedirs('experiments/001_baseline', exist_ok=True)

# Persist the prediction arrays so they can be reused for ensembling later
for npy_path, array in (
    ('experiments/001_baseline/oof_predictions.npy', oof_predictions),
    ('experiments/001_baseline/test_predictions_xgb.npy', test_predictions_xgb),
    ('experiments/001_baseline/test_predictions_lgb.npy', test_predictions_lgb),
):
    np.save(npy_path, array)

# Persist the feature column list and the fitted target encoder
for pkl_path, payload in (
    ('experiments/001_baseline/feature_cols.pkl', feature_cols),
    ('experiments/001_baseline/le_target.pkl', le_target),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(payload, f)

print("Experiment artifacts saved to experiments/001_baseline/")