# Enhanced Feature Engineering: Categorical Treatment + Target Encoding

This experiment addresses evaluator concerns:
1. Treat all numerical features as categorical (low cardinality: 14-43 unique values)
2. Add interaction features
3. Implement proper target encoding with leakage prevention
4. Add CatBoost for model diversity
5. Optimize directly for MAP@3

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import TargetEncoder
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
from tqdm import tqdm
import warnings
import pickle
import os
warnings.filterwarnings('ignore')

# Read the competition train/test tables from their fixed locations.
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

# Report the size of each table, then the class balance of the target column.
for split_name, split_frame in (("Train", train), ("Test", test)):
    print(f"{split_name} shape: {split_frame.shape}")
print("Target distribution:")
print(train['Fertilizer Name'].value_counts())

In [None]:
# Define MAP@3 metric for optimization
def map_at_3(predictions, true_labels):
    """Calculate the MAP@3 score.

    Each observation has exactly one correct label, so its average precision
    is ``1 / rank`` when the label is among the three highest-scored classes
    (rank 1, 2 or 3) and ``0`` otherwise. The result is the mean over all
    observations.

    Args:
        predictions: 2-D array-like of shape (n_samples, n_classes) holding a
            score per class; higher means more likely.
        true_labels: 1-D array-like of length n_samples holding the integer
            class index of each observation. Values are read positionally, so
            a pandas Series with an arbitrary index is handled correctly.

    Returns:
        The mean average precision at 3 as a float (``nan`` for empty input).

    Raises:
        ValueError: if ``predictions`` and ``true_labels`` differ in length.
    """
    labels = np.asarray(true_labels)
    if labels.size == 0:
        # Mean over zero observations is undefined.
        return np.nan

    scores = np.asarray(predictions)
    if scores.shape[0] != labels.shape[0]:
        raise ValueError(
            f"predictions has {scores.shape[0]} rows but true_labels has "
            f"{labels.shape[0]} entries"
        )

    # Indices of the (up to) three best classes per row, best first.
    top_k = np.argsort(scores, axis=1)[:, -3:][:, ::-1]

    # Class indices within a row are distinct, so at most one position matches.
    hits = top_k == labels[:, None]
    rank_weights = 1.0 / np.arange(1, top_k.shape[1] + 1)

    return np.mean((hits * rank_weights).sum(axis=1))

print("MAP@3 metric defined")

In [None]:
# Feature Engineering - Phase 1: Categorical Treatment
# All numerical features have low cardinality (14-43 unique values) - treat as categorical

# Original numerical features
numerical_features = ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous']
categorical_features = ['Soil Type', 'Crop Type']

print("Converting numerical features to categorical via binning...")

# Create binned versions of numerical features (up to 10 equal-frequency bins each).
# Bin edges are learned on train ONLY and then applied to both train and test:
# running qcut separately on each split would give the same bin code different
# value ranges in train and test, making the feature inconsistent at inference.
for col in numerical_features:
    _, bin_edges = pd.qcut(train[col], q=10, retbins=True, duplicates='drop')
    bin_edges = np.array(bin_edges, dtype=float)

    # Open the outer edges so test values outside the train range still land
    # in the first/last bin instead of becoming NaN.
    bin_edges[0], bin_edges[-1] = -np.inf, np.inf
    bin_labels = list(range(len(bin_edges) - 1))

    # pd.cut with explicit labels returns a categorical whose category set is
    # identical for train and test, so category codes line up across splits.
    for df in (train, test):
        df[f'{col}_binned'] = pd.cut(df[col], bins=bin_edges, labels=bin_labels, ordered=False)

print("Binned features created")

# Keep original numerical features as backup
train[numerical_features] = train[numerical_features].astype(float)
test[numerical_features] = test[numerical_features].astype(float)

In [None]:
# Feature Engineering - Phase 2: Interaction Features
print("Creating interaction features...")

# Debug: column inventory before any interaction column is added
print(f"Current train columns BEFORE Soil_Crop: {train.columns.tolist()}")

# Pairwise products, listed in the order the columns must be created:
# three environment x environment terms, then three nutrient x environment terms.
pairwise_products = [
    ('Temp_Humidity', 'Temparature', 'Humidity'),
    ('Temp_Moisture', 'Temparature', 'Moisture'),
    ('Humidity_Moisture', 'Humidity', 'Moisture'),
    ('N_Temp', 'Nitrogen', 'Temparature'),
    ('P_Humidity', 'Phosphorous', 'Humidity'),
    ('K_Moisture', 'Potassium', 'Moisture'),
]

# Share of each nutrient in the (sum + 1) total; the +1 avoids division by zero.
nutrient_shares = [
    ('NPK_balance', 'Nitrogen'),
    ('P_balance', 'Phosphorous'),
    ('K_balance', 'Potassium'),
]

for df in [train, test]:
    for new_name, left, right in pairwise_products:
        df[new_name] = df[left] * df[right]

    df['NPK_sum'] = df['Nitrogen'] + df['Phosphorous'] + df['Potassium']
    for new_name, nutrient in nutrient_shares:
        df[new_name] = df[nutrient] / (df['NPK_sum'] + 1)

    # Combined soil/crop key (used later as a target-encoding input)
    soil_crop_key = df['Soil Type'].astype(str) + '_' + df['Crop Type'].astype(str)
    df['Soil_Crop'] = soil_crop_key
    df['Soil_Crop'] = df['Soil_Crop'].astype('category')

# Debug: column inventory after the interaction columns were added
print(f"Current train columns AFTER Soil_Crop: {train.columns.tolist()}")
print(f"Soil_Crop feature created with {train['Soil_Crop'].nunique()} unique values")

print("Interaction features created")

In [None]:
# Prepare feature lists
binned_features = [f'{col}_binned' for col in numerical_features]
interaction_features = ['Temp_Humidity', 'Temp_Moisture', 'Humidity_Moisture', 
                       'N_Temp', 'P_Humidity', 'K_Moisture',
                       'NPK_sum', 'NPK_balance', 'P_balance', 'K_balance']

# All categorical features (binned + original categorical + interactions)
# Use the EXACT column names from the CSV (with spaces)
all_categorical = binned_features + ['Soil Type', 'Crop Type', 'Soil_Crop']
all_numerical = numerical_features + interaction_features

print(f"Categorical features ({len(all_categorical)}): {all_categorical}")
# Print the list itself (the count is already shown in the parentheses),
# matching the categorical line above.
print(f"Numerical features ({len(all_numerical)}): {all_numerical}")
print(f"Total features: {len(all_categorical) + len(all_numerical)}")

# Verify all features exist
print(f"\nVerifying all features exist in train:")
missing_features = [col for col in all_categorical + all_numerical if col not in train.columns]
for col in missing_features:
    print(f"  MISSING: {col}")

if not missing_features:
    print("  All features present!")

print(f"\nSoil_Crop sample values:")
print(train['Soil_Crop'].value_counts().head())

In [None]:
# Map fertilizer names to integer class ids; le_target is kept so that
# predictions can be mapped back to names later.
le_target = LabelEncoder()
le_target.fit(train['Fertilizer Name'])
y_encoded = le_target.transform(train['Fertilizer Name'])

fertilizer_classes = le_target.classes_
print(f"Classes: {fertilizer_classes}")
print(f"Number of classes: {len(fertilizer_classes)}")

In [None]:
# Prepare feature matrices
# Use EXACT column names as they appear in the CSV
# Order: binned features, raw categoricals, Soil_Crop, numeric interactions,
# original numerical features.
feature_columns = [
    *binned_features,
    'Soil Type', 'Crop Type',
    'Soil_Crop',
    *interaction_features,
    *numerical_features,
]

# DEBUG: Check what's actually in train before selecting columns
print(f"DEBUG: train.columns = {train.columns.tolist()}")
print(f"DEBUG: 'Soil_Crop' in train.columns = {'Soil_Crop' in train.columns}")
print(f"DEBUG: 'Crop Type' in train.columns = {'Crop Type' in train.columns}")

# Verify all features exist (in both splits, since both are indexed below)
print("Verifying all features exist in train:")
missing_features = [col for col in feature_columns if col not in train.columns]
for col in missing_features:
    print(f"  MISSING: {col}")

if missing_features:
    raise ValueError(f"Missing features: {missing_features}")

missing_in_test = [col for col in feature_columns if col not in test.columns]
if missing_in_test:
    raise ValueError(f"Missing features in test: {missing_in_test}")

print("All features verified!")

# Create feature matrices
X = train[feature_columns].copy()
X_test = test[feature_columns].copy()

# Convert categorical features to category dtype.
# Train and test must share ONE category set per column: converting each
# split independently assigns integer codes from that split's own values, so
# the same code could denote different categories in train and test.
categorical_for_conversion = binned_features + ['Soil Type', 'Crop Type', 'Soil_Crop']
for col in categorical_for_conversion:
    if col in X.columns:
        train_categories = X[col].astype('category').cat.categories
        test_categories = X_test[col].astype('category').cat.categories
        # Union keeps test-only values as valid categories instead of NaN.
        shared_dtype = pd.CategoricalDtype(categories=train_categories.union(test_categories))
        X[col] = X[col].astype(shared_dtype)
        X_test[col] = X_test[col].astype(shared_dtype)
        print(f"Converted {col} to category dtype")

print(f"\nFinal feature matrix shape: {X.shape}")
print(f"Final test matrix shape: {X_test.shape}")
print(f"\nFeature columns ({len(feature_columns)}): {feature_columns}")

In [None]:
# Target Encoding with Leakage Prevention
# Only the column list is prepared here; the encoder itself is fitted inside
# the CV loop so that each fold's statistics come from its training part only.

print("Setting up target encoding...")

# Columns to target encode - EXACT column names from the CSV
target_encode_features = ['Soil Type', 'Crop Type', 'Soil_Crop']

# Report the cardinality of each candidate column, or warn if it is absent
for col in target_encode_features:
    if col not in train.columns:
        print(f"WARNING: {col} not found in data")
        continue
    unique_values = train[col].nunique()
    print(f"{col}: {unique_values} unique values")

# Alias used by the CV loop
target_encode_cols = target_encode_features

print(f"Target encoding will be applied to: {target_encode_cols}")

In [None]:
# Stratified K-Fold setup
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

n_classes = len(le_target.classes_)

# Initialize predictions
# oof_predictions holds the averaged ensemble probabilities for each training
# row (filled fold by fold); the test arrays accumulate per-model fold averages.
oof_predictions = np.zeros((len(X), n_classes))
test_predictions_xgb = np.zeros((len(X_test), n_classes))
test_predictions_lgb = np.zeros((len(X_test), n_classes))
test_predictions_cat = np.zeros((len(X_test), n_classes))

# Model parameters
xgb_params = {
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'num_class': n_classes,
    'learning_rate': 0.05,
    'max_depth': 7,  # Shallower depth for categorical features
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'n_estimators': 500,
    'tree_method': 'hist',
    # NOTE(review): GPU is hard-coded here while CatBoost below falls back to
    # CPU when CUDA_VISIBLE_DEVICES is unset - confirm a GPU is always present.
    'device': 'cuda',
    'enable_categorical': True,  # Enable native categorical support
    'random_state': 42,
    'verbosity': 0
}

lgb_params = {
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'num_class': n_classes,
    'learning_rate': 0.05,
    'max_depth': 7,
    'n_estimators': 500,
    'subsample': 0.8,
    # LightGBM only performs row subsampling when the bagging frequency is
    # non-zero; without this, 'subsample' above is silently ignored.
    'subsample_freq': 1,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'verbose': -1
}

cat_params = {
    'iterations': 500,
    'learning_rate': 0.05,
    'depth': 7,
    'loss_function': 'MultiClass',
    'eval_metric': 'MultiClass',
    'random_seed': 42,
    'verbose': False,
    # Use the GPU only when the environment advertises one
    'task_type': 'GPU' if os.environ.get('CUDA_VISIBLE_DEVICES') else 'CPU'
}

print("Starting cross-validation with target encoding...")

In [None]:
# Cross-validation loop with proper target encoding
def _append_target_encoding(frame, encoded, names):
    """Return a copy of `frame` with the encoded columns appended on the right."""
    encoded_frame = pd.DataFrame(np.asarray(encoded), columns=names, index=frame.index)
    return pd.concat([frame, encoded_frame], axis=1)


fold_scores = []
fold_scores_xgb = []
fold_scores_lgb = []
fold_scores_cat = []

# CatBoost takes categorical columns by position. Target-encoded columns are
# appended AFTER the original ones, so positions computed on X remain valid
# for every fold and can be computed once.
categorical_cols_for_catboost = binned_features + ['Soil Type', 'Crop Type', 'Soil_Crop']
cat_features_indices = [
    X.columns.get_loc(col) for col in categorical_cols_for_catboost if col in X.columns
]
print(f"CatBoost categorical features: {categorical_cols_for_catboost}")
print(f"CatBoost categorical indices: {cat_features_indices}")
print(f"CatBoost will use columns: {[X.columns[i] for i in cat_features_indices]}")

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y_encoded)):
    print(f"\nFold {fold + 1}/{n_splits}")

    # Split data
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y_encoded[train_idx], y_encoded[val_idx]

    # Target encoding.
    # - fit_transform applies internal cross-fitting, so a training row is never
    #   encoded with statistics that include its own target (plain
    #   fit(...).transform(...) on the same rows would leak the target).
    # - validation/test rows are encoded with the encoder fitted on the full
    #   training part of this fold.
    # - the encoder is always called with the same column set it was fitted on;
    #   scikit-learn rejects a transform call with a different set of features.
    encoder = TargetEncoder(target_type='multiclass', smooth='auto', random_state=42)
    train_te = encoder.fit_transform(X_train[target_encode_cols], y_train)
    val_te = encoder.transform(X_val[target_encode_cols])
    test_te = encoder.transform(X_test[target_encode_cols])

    # Keep one column per (feature, class). Averaging across classes would
    # collapse per-class target rates into a near-constant value and discard
    # the signal, so every encoded column is retained.
    te_columns = [f'{name}_te' for name in encoder.get_feature_names_out()]

    X_train_enc = _append_target_encoding(X_train, train_te, te_columns)
    X_val_enc = _append_target_encoding(X_val, val_te, te_columns)
    X_test_enc = _append_target_encoding(X_test, test_te, te_columns)

    # Ensure all datasets have the same columns in the same order
    feature_cols = X_train_enc.columns.tolist()
    X_val_enc = X_val_enc[feature_cols]
    X_test_enc = X_test_enc[feature_cols]

    print(f"DEBUG: Columns in X_train_enc: {feature_cols}")

    # Train XGBoost
    print("Training XGBoost...")
    model_xgb = xgb.XGBClassifier(**xgb_params)
    model_xgb.fit(X_train_enc, y_train)

    # Train LightGBM
    print("Training LightGBM...")
    model_lgb = lgb.LGBMClassifier(**lgb_params)
    model_lgb.fit(X_train_enc, y_train)

    # Train CatBoost (Pool objects carry the categorical column positions)
    print("Training CatBoost...")
    train_pool = Pool(X_train_enc, y_train, cat_features=cat_features_indices)
    val_pool = Pool(X_val_enc, y_val, cat_features=cat_features_indices)

    model_cat = CatBoostClassifier(**cat_params)
    # NOTE(review): passing the validation fold as eval_set may let CatBoost
    # select its best iteration on that same fold, which would make the CAT
    # fold score slightly optimistic - confirm this is intended.
    model_cat.fit(train_pool, eval_set=val_pool, verbose=False)

    # Predict on validation set
    pred_xgb = model_xgb.predict_proba(X_val_enc)
    pred_lgb = model_lgb.predict_proba(X_val_enc)
    pred_cat = model_cat.predict_proba(val_pool)

    # Average predictions (equal weighting for now)
    pred_avg = (pred_xgb + pred_lgb + pred_cat) / 3
    oof_predictions[val_idx] = pred_avg

    # Predict on test set
    test_pred_xgb = model_xgb.predict_proba(X_test_enc)
    test_pred_lgb = model_lgb.predict_proba(X_test_enc)

    test_pool = Pool(X_test_enc, cat_features=cat_features_indices)
    test_pred_cat = model_cat.predict_proba(test_pool)

    # Each fold contributes 1/n_splits of the final test prediction
    test_predictions_xgb += test_pred_xgb / n_splits
    test_predictions_lgb += test_pred_lgb / n_splits
    test_predictions_cat += test_pred_cat / n_splits

    # Calculate fold scores using proper MAP@3
    score_xgb = map_at_3(pred_xgb, y_val)
    score_lgb = map_at_3(pred_lgb, y_val)
    score_cat = map_at_3(pred_cat, y_val)
    score_avg = map_at_3(pred_avg, y_val)

    fold_scores_xgb.append(score_xgb)
    fold_scores_lgb.append(score_lgb)
    fold_scores_cat.append(score_cat)
    fold_scores.append(score_avg)

    print(f"Fold {fold + 1} MAP@3 - XGB: {score_xgb:.4f}, LGB: {score_lgb:.4f}, CAT: {score_cat:.4f}, AVG: {score_avg:.4f}")

print(f"\nMean MAP@3 - XGB: {np.mean(fold_scores_xgb):.4f} ± {np.std(fold_scores_xgb):.4f}")
print(f"Mean MAP@3 - LGB: {np.mean(fold_scores_lgb):.4f} ± {np.std(fold_scores_lgb):.4f}")
print(f"Mean MAP@3 - CAT: {np.mean(fold_scores_cat):.4f} ± {np.std(fold_scores_cat):.4f}")
print(f"Mean MAP@3 - AVG: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")

In [None]:
# Score the full out-of-fold ensemble predictions against the encoded target
# and compare with the reference score of the earlier baseline run.
baseline_map3 = 0.3311
cv_score = map_at_3(oof_predictions, y_encoded)
improvement = cv_score - baseline_map3
print(f"\nFinal CV MAP@3 Score: {cv_score:.4f}")
print(f"Improvement over baseline: {improvement:.4f}")

In [None]:
# Blend the three models' fold-averaged test probabilities with equal weight
final_test_predictions = (test_predictions_xgb + test_predictions_lgb + test_predictions_cat) / 3

# Class indices sorted by ascending probability; walking backwards from the
# end yields the three most probable classes, best first.
ranked_classes = np.argsort(final_test_predictions, axis=1)
top3_predictions = ranked_classes[:, :-4:-1]

# Map class indices back to fertilizer names, space-separated per test row
predicted_names = [
    ' '.join(le_target.inverse_transform(row_top3))
    for row_top3 in top3_predictions
]

# Assemble the submission table: one row per test id
test_ids = test['id'].values
submission = pd.DataFrame({
    'id': test_ids,
    'Fertilizer Name': predicted_names
})

print("Submission format:")
print(submission.head())
print(f"\nSubmission shape: {submission.shape}")

# Write the submission file
submission.to_csv('/home/submission/submission.csv', index=False)
print("Submission saved to /home/submission/submission.csv")

In [None]:
# Make sure the experiment folder exists before writing anything into it
os.makedirs('experiments/002_categorical_target_encoding', exist_ok=True)

# Prediction arrays to persist, keyed by destination path
# (out-of-fold ensemble predictions plus per-model test predictions)
prediction_artifacts = {
    'experiments/002_categorical_target_encoding/oof_predictions.npy': oof_predictions,
    'experiments/002_categorical_target_encoding/test_predictions_xgb.npy': test_predictions_xgb,
    'experiments/002_categorical_target_encoding/test_predictions_lgb.npy': test_predictions_lgb,
    'experiments/002_categorical_target_encoding/test_predictions_cat.npy': test_predictions_cat,
}
for artifact_path, artifact_array in prediction_artifacts.items():
    np.save(artifact_path, artifact_array)

# Feature lists and the fitted label encoder, pickled together
feature_info = {
    'categorical_features': all_categorical,
    'numerical_features': all_numerical,
    'target_encode_cols': target_encode_cols,
    'le_target': le_target
}
with open('experiments/002_categorical_target_encoding/feature_info.pkl', 'wb') as f:
    pickle.dump(feature_info, f)

print("Experiment artifacts saved to experiments/002_categorical_target_encoding/")