# Evolver Loop 2: Pure Categorical Treatment Analysis

This notebook validates the pure categorical approach recommended by the evaluator.
We test whether treating numerical features as categorical WITHOUT binning improves performance.

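**Hypothesis**: Direct categorical treatment (no binning) keeps every original value of each low-cardinality numerical feature as its own category, so no information is lost to bin boundaries and XGBoost/LightGBM can learn optimal splits over the original values. Caveat: a pandas `category` dtype is unordered by default, so the ordinal relationship between values is not itself encoded by this treatment.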

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Read the train and test splits from disk
train, test = map(pd.read_csv, ('/home/data/train.csv', '/home/data/test.csv'))

# Report the size of each split, then the class balance of the target column
print("Dataset shapes:")
for split_label, split_frame in (("Train", train), ("Test", test)):
    print(f"{split_label}: {split_frame.shape}")
print("\nTarget distribution:")
print(train['Fertilizer Name'].value_counts())

Dataset shapes:
Train: (750000, 10)
Test: (250000, 9)

Target distribution:
Fertilizer Name
14-35-14    114436
10-26-26    113887
17-17-17    112453
28-28       111158
20-20       110889
DAP          94860
Urea         92317
Name: count, dtype: int64


## 1. Cardinality Analysis: Why Binning is Harmful

In [6]:
# Test pure categorical treatment (no binning, no target encoding, no interactions)

# Define features.
# The numeric columns are listed explicitly so this cell runs on a fresh kernel
# instead of relying on a `numerical_features` variable that no visible cell defines.
numerical_features = ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium',
                      'Phosphorous']
categorical_features = ['Soil Type', 'Crop Type']
feature_cols = numerical_features + categorical_features

# Create a copy for categorical treatment (the raw frames stay untouched)
train_cat = train.copy()
test_cat = test.copy()

# Convert numerical features to categorical dtype (NO BINNING - original values are the categories).
# Train and test share ONE CategoricalDtype per column so that a given value maps to the
# same category code in both frames. Calling .astype('category') on each frame separately
# derives the codes from whichever values happen to be present in that frame.
for col in numerical_features:
    observed_values = pd.concat([train_cat[col], test_cat[col]], ignore_index=True).dropna().unique()
    shared_dtype = pd.CategoricalDtype(categories=np.sort(observed_values))
    train_cat[col] = train_cat[col].astype(shared_dtype)
    test_cat[col] = test_cat[col].astype(shared_dtype)

# Label encoders for the string features (Soil Type, Crop Type) and for the target
le_soil = LabelEncoder()
le_crop = LabelEncoder()
le_target = LabelEncoder()

# Label encode each string feature (fit on train only), then convert the integer codes
# to a category dtype whose categories are the full code range of the fitted encoder,
# again so train and test agree on the category set.
for col, encoder in (('Soil Type', le_soil), ('Crop Type', le_crop)):
    train_cat[col] = encoder.fit_transform(train_cat[col])
    test_cat[col] = encoder.transform(test_cat[col])
    code_dtype = pd.CategoricalDtype(categories=np.arange(len(encoder.classes_)))
    train_cat[col] = train_cat[col].astype(code_dtype)
    test_cat[col] = test_cat[col].astype(code_dtype)

# Encode target as integer class ids for XGBoost
train_cat['Fertilizer Name_encoded'] = le_target.fit_transform(train_cat['Fertilizer Name'])

print("Feature types after categorical conversion:")
print(train_cat[feature_cols].dtypes)
print("\nNumber of categories per feature:")
for col in feature_cols:
    # nunique() counts the values actually present in the training frame
    print(f"{col:12s}: {train_cat[col].nunique()} categories")

print(f"\nTarget classes: {len(le_target.classes_)} - {list(le_target.classes_)}")

Feature types after categorical conversion:
Temparature    category
Humidity       category
Moisture       category
Nitrogen       category
Potassium      category
Phosphorous    category
Soil Type      category
Crop Type      category
dtype: object

Number of categories per feature:
Temparature : 14 categories
Humidity    : 23 categories
Moisture    : 41 categories
Nitrogen    : 39 categories
Potassium   : 20 categories
Phosphorous : 43 categories
Soil Type   : 5 categories
Crop Type   : 11 categories

Target classes: 7 - ['10-26-26', '14-35-14', '17-17-17', '20-20', '28-28', 'DAP', 'Urea']


# Quick test to see if pure categorical treatment works
import xgboost as xgb
from sklearn.model_selection import cross_val_score
import time

# Assemble the model inputs: integer-encoded target first, then the feature matrix
y = train_cat['Fertilizer Name_encoded']  # Use encoded target
X = train_cat[feature_cols]

# Define MAP@3 metric for XGBoost
def map_at_3_xgboost(predt, dtrain):
    """Mean Average Precision at 3, shaped as an XGBoost custom evaluation metric.

    Every sample has exactly one true class, so its average precision is
    ``1 / rank`` when the true class is among the three highest-scoring
    classes (rank 1, 2 or 3) and ``0`` otherwise. The per-sample value is NOT
    divided by the number of predictions: the AP normaliser is
    ``min(3, number of relevant items)``, which is 1 here.

    Args:
        predt: Per-class scores, either shaped ``(n_samples, n_classes)`` or
            flattened row-major to length ``n_samples * n_classes``.
        dtrain: Object exposing ``get_label()`` that returns the true class
            id of every sample.

    Returns:
        Tuple ``('MAP@3', score)`` where ``score`` is a float in ``[0, 1]``;
        ``0.0`` when there are no samples.
    """
    y_true = np.asarray(dtrain.get_label())
    n_samples = len(y_true)
    if n_samples == 0:
        return 'MAP@3', 0.0

    # Accept both the 2-D and the flattened prediction layout
    scores = np.asarray(predt).reshape(n_samples, -1)

    # Column indices of the top-k classes per row, best first
    top_k = min(3, scores.shape[1])
    top_idx = np.argsort(scores, axis=1)[:, -top_k:][:, ::-1]

    # At most one hit per row; a hit at position r (1-based) is worth 1/r
    hits = top_idx == y_true.reshape(-1, 1)
    rank_weights = 1.0 / np.arange(1, top_k + 1)

    return 'MAP@3', float((hits * rank_weights).sum(axis=1).mean())

# Test with small number of trees first
print("Testing pure categorical treatment with XGBoost...")
print("="*50)

# Baseline CV score this quick test is compared against
BASELINE_CV = 0.3311

start_time = time.time()

# Simple XGBoost model with categorical support
model_test = xgb.XGBClassifier(
    objective='multi:softprob',
    num_class=7,
    eval_metric='mlogloss',
    tree_method='hist',
    enable_categorical=True,
    max_depth=7,
    learning_rate=0.05,
    n_estimators=100,  # Small number for quick test
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=4,
    device='cuda'
)

# Single fold validation for quick test: take only the first of the 5 stratified folds
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
train_idx, val_idx = next(skf.split(X, y))

X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

# Train
model_test.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=False
)

# Predict and evaluate
y_pred_proba = model_test.predict_proba(X_val)
y_val_true = y_val.values

# Calculate MAP@3.
# Each row has a single true label, so its average precision is 1/rank when the label is
# among the top three predictions and 0 otherwise. The previous version additionally
# divided by the number of predictions (3), which capped the score at 1/3.
top3_idx = np.argsort(y_pred_proba, axis=1)[:, -3:][:, ::-1]
hits = top3_idx == np.asarray(y_val_true).reshape(-1, 1)
map_scores = (hits / np.arange(1, top3_idx.shape[1] + 1)).sum(axis=1)

map3_score = map_scores.mean()
test_time = time.time() - start_time  # covers model construction, training, prediction and scoring

print("Quick test results (100 trees, 1 fold):")
print(f"MAP@3: {map3_score:.4f}")
print(f"Test time: {test_time:.1f} seconds")
print(f"\nBaseline CV: {BASELINE_CV:.4f}")
print(f"Difference: {map3_score - BASELINE_CV:+.4f}")

if map3_score > BASELINE_CV:
    print("\n✅ PURE CATEGORICAL TREATMENT WORKS! (Better than baseline)")
else:
    print("\n❌ Pure categorical treatment needs more tuning or features")

In [7]:
# Test pure categorical treatment (no binning, no target encoding, no interactions)

# Define features.
# The numeric columns are listed explicitly so this cell runs on a fresh kernel
# instead of relying on a `numerical_features` variable that no visible cell defines.
numerical_features = ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium',
                      'Phosphorous']
categorical_features = ['Soil Type', 'Crop Type']
feature_cols = numerical_features + categorical_features

# Create a copy for categorical treatment (the raw frames stay untouched)
train_cat = train.copy()
test_cat = test.copy()

# Convert numerical features to categorical dtype (NO BINNING - original values are the categories).
# Train and test share ONE CategoricalDtype per column so that a given value maps to the
# same category code in both frames. Calling .astype('category') on each frame separately
# derives the codes from whichever values happen to be present in that frame.
for col in numerical_features:
    observed_values = pd.concat([train_cat[col], test_cat[col]], ignore_index=True).dropna().unique()
    shared_dtype = pd.CategoricalDtype(categories=np.sort(observed_values))
    train_cat[col] = train_cat[col].astype(shared_dtype)
    test_cat[col] = test_cat[col].astype(shared_dtype)

# Label encoders for the string features (Soil Type, Crop Type)
le_soil = LabelEncoder()
le_crop = LabelEncoder()

# Label encode each string feature (fit on train only), then convert the integer codes
# to a category dtype whose categories are the full code range of the fitted encoder,
# again so train and test agree on the category set.
for col, encoder in (('Soil Type', le_soil), ('Crop Type', le_crop)):
    train_cat[col] = encoder.fit_transform(train_cat[col])
    test_cat[col] = encoder.transform(test_cat[col])
    code_dtype = pd.CategoricalDtype(categories=np.arange(len(encoder.classes_)))
    train_cat[col] = train_cat[col].astype(code_dtype)
    test_cat[col] = test_cat[col].astype(code_dtype)

print("Feature types after categorical conversion:")
print(train_cat[feature_cols].dtypes)
print("\nNumber of categories per feature:")
for col in feature_cols:
    # nunique() counts the values actually present in the training frame
    print(f"{col:12s}: {train_cat[col].nunique()} categories")

Feature types after categorical conversion:
Temparature    category
Humidity       category
Moisture       category
Nitrogen       category
Potassium      category
Phosphorous    category
Soil Type      category
Crop Type      category
dtype: object

Number of categories per feature:
Temparature : 14 categories
Humidity    : 23 categories
Moisture    : 41 categories
Nitrogen    : 39 categories
Potassium   : 20 categories
Phosphorous : 43 categories
Soil Type   : 5 categories
Crop Type   : 11 categories


## 3. Quick Performance Test with XGBoost

In [None]:
# Quick test to see if pure categorical treatment works
import xgboost as xgb
from sklearn.model_selection import cross_val_score
import time

# Prepare data
X = train_cat[feature_cols]

# Encode the string target as integer class ids 0..K-1. XGBClassifier is trained on
# these ids, and the MAP@3 code further down compares integer argsort indices against
# the labels, so the raw fertilizer names would never match a prediction.
le_target = LabelEncoder()
y = pd.Series(
    le_target.fit_transform(train_cat['Fertilizer Name']),
    index=train_cat.index,
    name='Fertilizer Name_encoded',
)

# Define MAP@3 metric for XGBoost
def map_at_3_xgboost(predt, dtrain):
    """Mean Average Precision at 3, shaped as an XGBoost custom evaluation metric.

    Every sample has exactly one true class, so its average precision is
    ``1 / rank`` when the true class is among the three highest-scoring
    classes (rank 1, 2 or 3) and ``0`` otherwise. The per-sample value is NOT
    divided by the number of predictions: the AP normaliser is
    ``min(3, number of relevant items)``, which is 1 here.

    Args:
        predt: Per-class scores, either shaped ``(n_samples, n_classes)`` or
            flattened row-major to length ``n_samples * n_classes``.
        dtrain: Object exposing ``get_label()`` that returns the true class
            id of every sample.

    Returns:
        Tuple ``('MAP@3', score)`` where ``score`` is a float in ``[0, 1]``;
        ``0.0`` when there are no samples.
    """
    y_true = np.asarray(dtrain.get_label())
    n_samples = len(y_true)
    if n_samples == 0:
        return 'MAP@3', 0.0

    # Accept both the 2-D and the flattened prediction layout
    scores = np.asarray(predt).reshape(n_samples, -1)

    # Column indices of the top-k classes per row, best first
    top_k = min(3, scores.shape[1])
    top_idx = np.argsort(scores, axis=1)[:, -top_k:][:, ::-1]

    # At most one hit per row; a hit at position r (1-based) is worth 1/r
    hits = top_idx == y_true.reshape(-1, 1)
    rank_weights = 1.0 / np.arange(1, top_k + 1)

    return 'MAP@3', float((hits * rank_weights).sum(axis=1).mean())

# Test with small number of trees first
print("Testing pure categorical treatment with XGBoost...")
print("="*50)

# Baseline CV score this quick test is compared against
BASELINE_CV = 0.3311

start_time = time.time()

# Simple XGBoost model with categorical support
model_test = xgb.XGBClassifier(
    objective='multi:softprob',
    num_class=7,
    eval_metric='mlogloss',
    tree_method='hist',
    enable_categorical=True,
    max_depth=7,
    learning_rate=0.05,
    n_estimators=100,  # Small number for quick test
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=4,
    device='cuda'
)

# Single fold validation for quick test: take only the first of the 5 stratified folds
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
train_idx, val_idx = next(skf.split(X, y))

X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

# Train
model_test.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=False
)

# Predict and evaluate
y_pred_proba = model_test.predict_proba(X_val)
y_val_true = y_val.values

# Calculate MAP@3.
# Each row has a single true label, so its average precision is 1/rank when the label is
# among the top three predictions and 0 otherwise. The previous version additionally
# divided by the number of predictions (3), which capped the score at 1/3.
top3_idx = np.argsort(y_pred_proba, axis=1)[:, -3:][:, ::-1]
hits = top3_idx == np.asarray(y_val_true).reshape(-1, 1)
map_scores = (hits / np.arange(1, top3_idx.shape[1] + 1)).sum(axis=1)

map3_score = map_scores.mean()
test_time = time.time() - start_time  # covers model construction, training, prediction and scoring

print("Quick test results (100 trees, 1 fold):")
print(f"MAP@3: {map3_score:.4f}")
print(f"Test time: {test_time:.1f} seconds")
print(f"\nBaseline CV: {BASELINE_CV:.4f}")
print(f"Difference: {map3_score - BASELINE_CV:+.4f}")

if map3_score > BASELINE_CV:
    print("\n✅ PURE CATEGORICAL TREATMENT WORKS! (Better than baseline)")
else:
    print("\n❌ Pure categorical treatment needs more tuning or features")

## 4. Why the Previous Approach Failed

In [None]:
# Analyze why exp_002 failed
# Prints a fixed, hand-written post-mortem; nothing here is computed from the data.

print("="*70)
print("ANALYSIS: Why exp_002 (categorical_target_encoding) FAILED")
print("="*70)

print("\n1. BINNING DESTROYED INFORMATION:")
print("   - Original numericals: 14-43 unique values (low cardinality)")
print("   - After binning: 10 bins (forced reduction)")
print("   - Result: Lost natural ordinal relationships")
print("   - Example: Temperature values 25, 26, 27... became bin 0, 1, 2...")
print("   - XGBoost couldn't learn optimal splits")

print("\n2. TARGET ENCODING OVERWROTE ORIGINAL FEATURES:")
print("   - Replaced 'Soil Type', 'Crop Type' with encoded versions")
print("   - Lost original categorical information that XGBoost could leverage")
print("   - Flattened multiclass encoding with .mean(axis=1) destroyed class-specific info")

print("\n3. INTERACTION FEATURES ADDED NOISE:")
# The multiplication sign was previously stored mis-encoded and printed as garbage
print("   - Simple multiplicative interactions (Temp×Humidity, etc.)")
print("   - No validation that these interactions actually help")
print("   - Added 12+ features without testing individual impact")

print("\n4. NO HYPERPARAMETER TUNING:")
print("   - Used same hyperparameters as baseline (depth=7, lr=0.05)")
print("   - Categorical features may need different regularization")
print("   - No exploration of depth 6-8 range")

print("\n5. HYBRID APPROACH CONFUSED MODELS:")
print("   - Mix of binned categoricals, original numericals, AND interactions")
print("   - No clear feature engineering strategy")
print("   - Too many changes at once (binning + interactions + target encoding + CatBoost)")

print("\n" + "="*70)
print("SOLUTION: PURE CATEGORICAL TREATMENT")
print("="*70)
print("- Convert numericals to categorical WITHOUT binning")
print("- Keep original features (no target encoding)")
print("- Remove interaction features (for now)")
print("- Tune hyperparameters specifically for categorical features")
print("- Add back complexity incrementally ONLY if it helps")

## 5. Recommended Next Experiment Design

In [None]:
# Markdown-formatted plan for the follow-up experiment, kept as a string and printed below.
# The check marks were previously stored mis-encoded and printed as garbage characters.
next_experiment_design = """
## Experiment 003: Pure Categorical Treatment (No Binning, No Target Encoding)

**Goal**: Test the core hypothesis that direct categorical treatment improves performance

**Key Changes from exp_002:**
1. ✅ NO BINNING: Convert numericals to categorical using original values as categories
2. ✅ NO TARGET ENCODING: Keep original Soil Type, Crop Type features
3. ✅ NO INTERACTIONS: Remove all multiplicative interaction features
4. ✅ HYPERPARAMETER TUNING: Tune depth (6, 7, 8) and learning rate (0.03, 0.05, 0.07)
5. ✅ KEEP CATBOOST: Native categorical support is valuable

**Feature Engineering Pipeline:**
- Convert ALL numerical features to category dtype (use original values as categories)
- Label encode Soil Type, Crop Type, then convert to category dtype
- Total features: 8 (6 numerical-turned-categorical + 2 categorical)
- NO target encoding
- NO interaction features

**Models:**
- XGBoost: tree_method='hist', enable_categorical=True, depth=6-8, lr=0.03-0.07
- LightGBM: categorical_feature=cat_cols, depth=6-8, lr=0.03-0.07  
- CatBoost: native categorical support, depth=6-8, lr=0.03-0.07

**Hyperparameter Search:**
- Test depths: 6, 7, 8
- Test learning rates: 0.03, 0.05, 0.07
- Test regularization: reg_alpha=0, 0.1, 0.5

**Validation:**
- Stratified 5-fold CV
- Monitor MAP@3 and fold variance
- Compare against baseline (0.3311) and exp_002 (0.3217)

**Expected CV:** 0.340-0.350 (improvement of 0.009-0.019 from baseline)
**If successful**: Then carefully add back target encoding and selective interactions
**If fails**: Reconsider entire strategy - maybe numerical treatment is better
"""

print(next_experiment_design)

## 6. Summary and Recommendations

In [None]:
# Print the closing summary. All text is fixed; nothing here is computed from the data.
# The emoji in the section titles were previously stored mis-encoded and printed as
# garbage characters.
print("="*70)
print("SUMMARY: Pure Categorical Treatment Strategy")
print("="*70)

print("\n✅ WHAT WORKS:")
print("- Low cardinality numerical features (14-43 unique values)")
print("- XGBoost native categorical support")
print("- CatBoost native categorical support")
print("- Stratified 5-fold CV")

print("\n❌ WHAT DOESN'T WORK:")
print("- Binning low-cardinality features (destroys information)")
print("- Target encoding that overwrites original features")
print("- Flattening multiclass target encoding with .mean(axis=1)")
print("- Adding interactions without validation")
print("- No hyperparameter tuning for new feature types")

print("\n🎯 NEXT STEPS:")
print("1. Implement pure categorical treatment (exp_003)")
print("2. Tune hyperparameters specifically for categorical features")
print("3. Validate CV improvement vs baseline (0.3311)")
print("4. If successful (>0.340), add back target encoding properly")
print("5. Then add selective interactions if needed")
print("6. Finally, ensemble diverse models and stack")

print("\n📊 EXPECTED IMPROVEMENT:")
print("- Conservative: 0.340 (+0.009 from baseline)")
print("- Optimistic: 0.350 (+0.019 from baseline)")
print("- Target: 0.3865 (need additional strategies after this)")

print("\n⚠️ RISK MITIGATION:")
print("- If pure categorical fails, test numerical + interactions approach")
print("- If still failing, research winning solutions more deeply")
print("- Consider data augmentation with original dataset")
print("- Explore alternative encoding strategies (target encoding per class)")

print("\n" + "="*70)