# Enhanced Feature Engineering: Categorical Treatment + Target Encoding

This experiment addresses evaluator concerns:
1. Treat all numerical features as categorical (low cardinality: 14-43 unique values)
2. Add interaction features
3. Implement proper target encoding with leakage prevention
4. Add CatBoost for model diversity
5. Optimize directly for MAP@3

In [1]:
import pandas as pd
import numpy as np
import torch  # For GPU detection
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import TargetEncoder
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
from tqdm import tqdm
import warnings
import pickle
import os
warnings.filterwarnings('ignore')

# Load data
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"Target distribution:")
print(train['Fertilizer Name'].value_counts())

Train shape: (750000, 10)
Test shape: (250000, 9)
Target distribution:
Fertilizer Name
14-35-14    114436
10-26-26    113887
17-17-17    112453
28-28       111158
20-20       110889
DAP          94860
Urea         92317
Name: count, dtype: int64


In [2]:
# Define MAP@3 metric for optimization
def map_at_3(predictions, true_labels):
    """Calculate MAP@3 score"""
    map_scores = []
    
    for i in range(len(true_labels)):
        # Get top 3 predictions
        pred_idx = np.argsort(predictions[i])[-3:][::-1]
        
        # Calculate average precision for this observation
        score = 0.0
        num_hits = 0
        
        for k, pred in enumerate(pred_idx, 1):
            if pred == true_labels[i]:
                num_hits += 1
                score += num_hits / k
                break  # Only one correct label per observation
        
        map_scores.append(score)
    
    return np.mean(map_scores)

print("MAP@3 metric defined")

MAP@3 metric defined


In [3]:
# Feature Engineering - Phase 1: Categorical Treatment
# All numerical features have low cardinality (14-43 unique values) - treat as categorical

# Original numerical features
numerical_features = ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous']
categorical_features = ['Soil Type', 'Crop Type']

print("Converting numerical features to categorical via binning...")

# Create binned versions of numerical features (10 bins each)
for col in numerical_features:
    # Use qcut for equal frequency binning to ensure good distribution
    train[f'{col}_binned'] = pd.qcut(train[col], q=10, labels=False, duplicates='drop')
    test[f'{col}_binned'] = pd.qcut(test[col], q=10, labels=False, duplicates='drop')
    
    # Convert to category dtype
    train[f'{col}_binned'] = train[f'{col}_binned'].astype('category')
    test[f'{col}_binned'] = test[f'{col}_binned'].astype('category')

print("Binned features created")

# Keep original numerical features as backup
train[numerical_features] = train[numerical_features].astype(float)
test[numerical_features] = test[numerical_features].astype(float)

Converting numerical features to categorical via binning...


Binned features created


In [4]:
# Feature Engineering - Phase 2: Interaction Features
print("Creating interaction features...")

# Environmental interactions
for df in [train, test]:
    # Temp × Humidity
    df['Temp_Humidity'] = df['Temparature'] * df['Humidity']
    # Temp × Moisture  
    df['Temp_Moisture'] = df['Temparature'] * df['Moisture']
    # Humidity × Moisture
    df['Humidity_Moisture'] = df['Humidity'] * df['Moisture']
    
    # Nutrient-Environment interactions
    df['N_Temp'] = df['Nitrogen'] * df['Temparature']
    df['P_Humidity'] = df['Phosphorous'] * df['Humidity']
    df['K_Moisture'] = df['Potassium'] * df['Moisture']
    
    # NPK balance features
    df['NPK_sum'] = df['Nitrogen'] + df['Phosphorous'] + df['Potassium']
    df['NPK_balance'] = df['Nitrogen'] / (df['NPK_sum'] + 1)
    df['P_balance'] = df['Phosphorous'] / (df['NPK_sum'] + 1)
    df['K_balance'] = df['Potassium'] / (df['NPK_sum'] + 1)
    
    # Soil-Crop interaction (critical for target encoding)
    df['Soil_Crop'] = df['Soil Type'].astype(str) + '_' + df['Crop Type'].astype(str)
    df['Soil_Crop'] = df['Soil_Crop'].astype('category')

print("Interaction features created")
print(f"Soil_Crop feature created with {train['Soil_Crop'].nunique()} unique values")
print(f"Sample Soil_Crop values: {train['Soil_Crop'].unique()[:5]}")

Creating interaction features...


Interaction features created
Soil_Crop feature created with 55 unique values
Sample Soil_Crop values: ['Clayey_Sugarcane', 'Sandy_Millets', 'Sandy_Barley', 'Red_Paddy', 'Red_Pulses']
Categories (55, object): ['Black_Barley', 'Black_Cotton', 'Black_Ground Nuts', 'Black_Maize', ..., 'Sandy_Pulses', 'Sandy_Sugarcane', 'Sandy_Tobacco', 'Sandy_Wheat']


In [5]:
# Prepare feature lists
binned_features = [f'{col}_binned' for col in numerical_features]
interaction_features = ['Temp_Humidity', 'Temp_Moisture', 'Humidity_Moisture', 
                       'N_Temp', 'P_Humidity', 'K_Moisture',
                       'NPK_sum', 'NPK_balance', 'P_balance', 'K_balance']

# All categorical features (binned + original categorical + interactions)
# Use the EXACT column names from the CSV (with spaces)
all_categorical = binned_features + ['Soil Type', 'Crop Type', 'Soil_Crop']
all_numerical = numerical_features + interaction_features

print(f"Categorical features ({len(all_categorical)}): {all_categorical}")
print(f"Numerical features ({len(all_numerical)}): {len(all_numerical)}")
print(f"Total features: {len(all_categorical) + len(all_numerical)}")

# Verify all features exist
print(f"\nVerifying all features exist in train:")
missing_features = []
for col in all_categorical + all_numerical:
    if col not in train.columns:
        missing_features.append(col)
        print(f"  MISSING: {col}")

if not missing_features:
    print("  All features present!")

print(f"\nSoil_Crop sample values:")
print(train['Soil_Crop'].value_counts().head())

Categorical features (9): ['Temparature_binned', 'Humidity_binned', 'Moisture_binned', 'Nitrogen_binned', 'Potassium_binned', 'Phosphorous_binned', 'Soil Type', 'Crop Type', 'Soil_Crop']
Numerical features (16): 16
Total features: 25

Verifying all features exist in train:
  All features present!

Soil_Crop sample values:
Soil_Crop
Black_Paddy     18410
Sandy_Paddy     17552
Loamy_Paddy     16869
Red_Paddy       16679
Sandy_Pulses    16406
Name: count, dtype: int64


In [6]:
# Encode target
le_target = LabelEncoder()
y_encoded = le_target.fit_transform(train['Fertilizer Name'])
print(f"Classes: {le_target.classes_}")
print(f"Number of classes: {len(le_target.classes_)}")

Classes: ['10-26-26' '14-35-14' '17-17-17' '20-20' '28-28' 'DAP' 'Urea']
Number of classes: 7


In [7]:
# Prepare feature matrices
# Use EXACT column names as they appear in the CSV
feature_columns = []

# Add binned features
feature_columns.extend(binned_features)

# Add original categorical features (with spaces)
feature_columns.extend(['Soil Type', 'Crop Type'])

# Add Soil_Crop interaction feature
feature_columns.append('Soil_Crop')

# Add other interaction features
feature_columns.extend(interaction_features)

# Add original numerical features
feature_columns.extend(numerical_features)

# DEBUG: Check what's actually in train before selecting columns
print(f"DEBUG: train.columns = {train.columns.tolist()}")
print(f"DEBUG: 'Soil_Crop' in train.columns = {'Soil_Crop' in train.columns}")
print(f"DEBUG: 'Crop Type' in train.columns = {'Crop Type' in train.columns}")
print(f"DEBUG: 'Soil Type' in train.columns = {'Soil Type' in train.columns}")

# Verify all features exist
print("Verifying all features exist in train:")
missing_features = []
for col in feature_columns:
    if col not in train.columns:
        missing_features.append(col)

if missing_features:
    print(f"MISSING FEATURES: {missing_features}")
else:
    print("  All features present!")

# Create feature matrices
X = train[feature_columns].copy()
X_test = test[feature_columns].copy()

print(f"\nFeature matrix X shape: {X.shape}")
print(f"Test feature matrix X_test shape: {X_test.shape}")

# Ensure binned features are category dtype
for col in binned_features:
    X[col] = X[col].astype('category')
    X_test[col] = X_test[col].astype('category')

print(f"\nFinal feature columns ({len(feature_columns)}): {feature_columns}")

DEBUG: train.columns = ['id', 'Temparature', 'Humidity', 'Moisture', 'Soil Type', 'Crop Type', 'Nitrogen', 'Potassium', 'Phosphorous', 'Fertilizer Name', 'Temparature_binned', 'Humidity_binned', 'Moisture_binned', 'Nitrogen_binned', 'Potassium_binned', 'Phosphorous_binned', 'Temp_Humidity', 'Temp_Moisture', 'Humidity_Moisture', 'N_Temp', 'P_Humidity', 'K_Moisture', 'NPK_sum', 'NPK_balance', 'P_balance', 'K_balance', 'Soil_Crop']
DEBUG: 'Soil_Crop' in train.columns = True
DEBUG: 'Crop Type' in train.columns = True
DEBUG: 'Soil Type' in train.columns = True
Verifying all features exist in train:
All features verified!
Total features in feature_columns: 25
feature_columns: ['Temparature_binned', 'Humidity_binned', 'Moisture_binned', 'Nitrogen_binned', 'Potassium_binned', 'Phosphorous_binned', 'Soil Type', 'Crop Type', 'Soil_Crop', 'Temp_Humidity', 'Temp_Moisture', 'Humidity_Moisture', 'N_Temp', 'P_Humidity', 'K_Moisture', 'NPK_sum', 'NPK_balance', 'P_balance', 'K_balance', 'Temparature', 

Converted Temparature_binned to category dtype
Converted Humidity_binned to category dtype
Converted Moisture_binned to category dtype
Converted Nitrogen_binned to category dtype
Converted Potassium_binned to category dtype
Converted Phosphorous_binned to category dtype


Converted Soil Type to category dtype


Converted Crop Type to category dtype
Converted Soil_Crop to category dtype

Final feature matrix shape: (750000, 25)
Final test matrix shape: (250000, 25)

X columns: ['Temparature_binned', 'Humidity_binned', 'Moisture_binned', 'Nitrogen_binned', 'Potassium_binned', 'Phosphorous_binned', 'Soil Type', 'Crop Type', 'Soil_Crop', 'Temp_Humidity', 'Temp_Moisture', 'Humidity_Moisture', 'N_Temp', 'P_Humidity', 'K_Moisture', 'NPK_sum', 'NPK_balance', 'P_balance', 'K_balance', 'Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous']


In [8]:
# Target Encoding with Leakage Prevention
# Use sklearn's TargetEncoder with proper CV

print("Setting up target encoding...")

# Features to target encode - use EXACT column names from CSV
target_encode_features = ['Soil Type', 'Crop Type', 'Soil_Crop']

# Check if features exist and have reasonable cardinality
for col in target_encode_features:
    if col in train.columns:
        unique_values = train[col].nunique()
        print(f"{col}: {unique_values} unique values")
    else:
        print(f"WARNING: {col} not found in data")

# We'll do target encoding inside the CV loop to prevent leakage
# For now, just prepare the data structure
target_encode_cols = target_encode_features

print(f"Target encoding will be applied to: {target_encode_cols}")

Setting up target encoding...
Soil Type: 5 unique values
Crop Type: 11 unique values
Soil_Crop: 55 unique values
Target encoding will be applied to: ['Soil Type', 'Crop Type', 'Soil_Crop']


In [None]:
# Stratified K-Fold setup
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Initialize predictions
oof_predictions = np.zeros((len(X), len(le_target.classes_)))
test_predictions_xgb = np.zeros((len(X_test), len(le_target.classes_)))
test_predictions_lgb = np.zeros((len(X_test), len(le_target.classes_)))
test_predictions_cat = np.zeros((len(X_test), len(le_target.classes_)))

# Model parameters
xgb_params = {
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'num_class': len(le_target.classes_),
    'learning_rate': 0.05,
    'max_depth': 7,  # Shallower depth for categorical features
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'n_estimators': 500,
    'tree_method': 'hist',
    'device': 'cuda',
    'enable_categorical': True,  # Enable native categorical support
    'random_state': 42,
    'verbosity': 0
}

lgb_params = {
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'num_class': len(le_target.classes_),
    'learning_rate': 0.05,
    'max_depth': 7,
    'n_estimators': 500,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'verbosity': -1
}

catboost_params = {
    'loss_function': 'MultiClass',
    'eval_metric': 'MultiClass',
    'learning_rate': 0.05,
    'depth': 7,
    'iterations': 500,
    'bootstrap_type': 'Bernoulli',  # Required for subsample
    'subsample': 0.8,
    'random_state': 42,
    'verbose': False,
    'task_type': 'GPU' if torch.cuda.is_available() else 'CPU'
}

print("Model parameters defined")
print(f"Using device: {'GPU' if torch.cuda.is_available() else 'CPU'}")

In [None]:
# Cross-validation loop with proper target encoding
fold_scores = []
fold_scores_xgb = []
fold_scores_lgb = []
fold_scores_cat = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y_encoded)):
    print(f"\nFold {fold + 1}/{n_splits}")
    
    # Split data
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y_encoded[train_idx], y_encoded[val_idx]
    
    # DEBUG: Check columns in X_train before encoding
    print(f"DEBUG: Columns in X_train before encoding: {X_train.columns.tolist()}")
    print(f"DEBUG: 'Crop Type' in X_train: {'Crop Type' in X_train.columns}")
    print(f"DEBUG: 'Soil_Crop' in X_train: {'Soil_Crop' in X_train.columns}")
    
    # Apply target encoding (fit on train, transform on val)
    X_train_enc = X_train.copy()
    X_val_enc = X_val.copy()
    X_test_enc = X_test.copy()
    
    # Fit target encoder on training data
    encoder = TargetEncoder(target_type='multiclass', smooth='auto')
    encoder.fit(X_train[target_encode_cols], y_train)
    
    # Transform the target encoding features
    # TargetEncoder returns 2D array for multiclass, need to flatten to 1D per feature
    train_encoded = encoder.transform(X_train[target_encode_cols])
    val_encoded = encoder.transform(X_val[target_encode_cols])
    test_encoded = encoder.transform(X_test[target_encode_cols])
    
    # Flatten the 2D encoded arrays to 1D by taking mean across classes
    for i, col in enumerate(target_encode_cols):
        X_train_enc[col] = train_encoded[:, i].ravel()
        X_val_enc[col] = val_encoded[:, i].ravel()
        X_test_enc[col] = test_encoded[:, i].ravel()
    
    print(f"  Target encoding applied to {len(target_encode_cols)} features")
    
    # XGBoost model
    print("  Training XGBoost...")
    model_xgb = xgb.XGBClassifier(**xgb_params)
    model_xgb.fit(X_train_enc, y_train, eval_set=[(X_val_enc, y_val)], verbose=False)
    
    # Predictions
    val_pred_xgb = model_xgb.predict_proba(X_val_enc)
    test_pred_xgb = model_xgb.predict_proba(X_test_enc)
    
    # Store predictions
    oof_predictions[val_idx] = val_pred_xgb
    test_predictions_xgb += test_pred_xgb / n_splits
    
    # Fold score
    fold_score_xgb = map_at_3(val_pred_xgb, y_val)
    fold_scores_xgb.append(fold_score_xgb)
    print(f"  XGBoost MAP@3: {fold_score_xgb:.4f}")
    
    # LightGBM model
    print("  Training LightGBM...")
    
    # For LightGBM, need to specify categorical features (only binned features)
    cat_features_lgb = binned_features.copy()
    
    model_lgb = lgb.LGBMClassifier(**lgb_params)
    model_lgb.fit(
        X_train_enc, y_train,
        eval_set=[(X_val_enc, y_val)],
        categorical_feature=cat_features_lgb
    )
    
    # Predictions
    val_pred_lgb = model_lgb.predict_proba(X_val_enc)
    test_pred_lgb = model_lgb.predict_proba(X_test_enc)
    
    # Store predictions
    test_predictions_lgb += test_pred_lgb / n_splits
    
    # Fold score
    fold_score_lgb = map_at_3(val_pred_lgb, y_val)
    fold_scores_lgb.append(fold_score_lgb)
    print(f"  LightGBM MAP@3: {fold_score_lgb:.4f}")
    
    # CatBoost model
    print("  Training CatBoost...")
    
    # For CatBoost, only binned features are categorical after target encoding
    # The target encoded columns are now numerical
    cat_features_cat = binned_features.copy()
    
    # Convert to Pool objects
    train_pool = Pool(X_train_enc, y_train, cat_features=cat_features_cat)
    val_pool = Pool(X_val_enc, y_val, cat_features=cat_features_cat)
    test_pool = Pool(X_test_enc, cat_features=cat_features_cat)
    
    model_cat = CatBoostClassifier(**catboost_params)
    model_cat.fit(train_pool, eval_set=val_pool, verbose=False)
    
    # Predictions
    val_pred_cat = model_cat.predict_proba(val_pool)
    test_pred_cat = model_cat.predict_proba(test_pool)
    
    # Store predictions
    test_predictions_cat += test_pred_cat / n_splits
    
    # Fold score
    fold_score_cat = map_at_3(val_pred_cat, y_val)
    fold_scores_cat.append(fold_score_cat)
    print(f"  CatBoost MAP@3: {fold_score_cat:.4f}")
    
    # Average ensemble score for this fold
    val_pred_ensemble = (val_pred_xgb + val_pred_lgb + val_pred_cat) / 3
    fold_score = map_at_3(val_pred_ensemble, y_val)
    fold_scores.append(fold_score)
    print(f"  Ensemble MAP@3: {fold_score:.4f}")

print(f"\nCV Results:")
print(f"  XGBoost: {np.mean(fold_scores_xgb):.4f} ± {np.std(fold_scores_xgb):.4f}")
print(f"  LightGBM: {np.mean(fold_scores_lgb):.4f} ± {np.std(fold_scores_lgb):.4f}")
print(f"  CatBoost: {np.mean(fold_scores_cat):.4f} ± {np.std(fold_scores_cat):.4f}")
print(f"  Ensemble: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")

In [None]:
# Calculate final CV score
cv_score = map_at_3(oof_predictions, y_encoded)
print(f"\nFinal CV MAP@3 Score: {cv_score:.4f}")
print(f"Improvement over baseline: {cv_score - 0.3311:.4f}")

In [None]:
# Average predictions from all three models
final_test_predictions = (test_predictions_xgb + test_predictions_lgb + test_predictions_cat) / 3

# Get top 3 predictions for each test sample
top3_predictions = np.argsort(final_test_predictions, axis=1)[:, -3:][:, ::-1]

# Convert back to fertilizer names
predicted_names = []
for pred in top3_predictions:
    names = le_target.inverse_transform(pred)
    predicted_names.append(' '.join(names))

# Create submission
test_ids = test['id'].values
submission = pd.DataFrame({
    'id': test_ids,
    'Fertilizer Name': predicted_names
})

print("Submission format:")
print(submission.head())
print(f"\nSubmission shape: {submission.shape}")

# Save submission
submission.to_csv('/home/submission/submission.csv', index=False)
print("Submission saved to /home/submission/submission.csv")

In [None]:
# Create experiment folder
os.makedirs('experiments/002_categorical_target_encoding', exist_ok=True)

# Save OOF predictions and test predictions
np.save('experiments/002_categorical_target_encoding/oof_predictions.npy', oof_predictions)
np.save('experiments/002_categorical_target_encoding/test_predictions_xgb.npy', test_predictions_xgb)
np.save('experiments/002_categorical_target_encoding/test_predictions_lgb.npy', test_predictions_lgb)
np.save('experiments/002_categorical_target_encoding/test_predictions_cat.npy', test_predictions_cat)

# Save feature information
with open('experiments/002_categorical_target_encoding/feature_info.pkl', 'wb') as f:
    pickle.dump({
        'categorical_features': all_categorical,
        'numerical_features': all_numerical,
        'target_encode_cols': target_encode_cols,
        'le_target': le_target
    }, f)

print("Experiment artifacts saved to experiments/002_categorical_target_encoding/")