# Experiment 003: Pure Categorical Treatment (No Binning)

**Hypothesis**: Converting numerical features directly to categorical dtype (as strings) without binning will preserve information and yield significant improvement over baseline.

**Expected CV**: 0.430+ (improvement of +0.100 from baseline 0.3311)

**Approach**:
- Convert all numerical features (Temp, Humidity, Moisture, N, P, K) to strings then category dtype
- Keep Soil Type and Crop Type as categorical
- Use XGBoost, LightGBM, CatBoost with native categorical support
- Hyperparameters: max_depth=6, learning_rate=0.07 (same for all three models)
- Stratified 5-fold CV

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
from tqdm import tqdm
import warnings
import pickle
import os
warnings.filterwarnings('ignore')

# Read the competition training and test tables from the fixed data directory
train = pd.read_csv('/home/data/train.csv')
test = pd.read_csv('/home/data/test.csv')

# Report the size of each table, then the class balance of the target column
train_shape, test_shape = train.shape, test.shape
print(f"Train shape: {train_shape}")
print(f"Test shape: {test_shape}")
target_counts = train['Fertilizer Name'].value_counts()
print("\nTarget distribution:")
print(target_counts)

In [None]:
# Define MAP@3 metric for optimization
def map_at_3(predictions, true_labels):
    """Calculate the MAP@3 score for single-label classification.

    Every row has exactly one relevant class, so its average precision is
    ``1 / rank`` when the true class is among the three highest-scoring
    classes (rank 1, 2 or 3) and ``0`` otherwise.

    Args:
        predictions: Array-like of shape (n_samples, n_classes) with one
            score per class; a higher score means a more likely class.
        true_labels: Array-like of length n_samples with the index of the
            true class of each row. It is read positionally, so a pandas
            Series carrying a non-default index is handled correctly.

    Returns:
        The mean of the per-row scores (NaN for empty input, which is what
        ``np.mean`` yields for an empty list).
    """
    # Convert up front so rows and labels are paired by position instead of
    # by whatever index the incoming containers carry.
    predictions = np.asarray(predictions)
    true_labels = np.asarray(true_labels)

    map_scores = []
    for i, true_label in enumerate(true_labels):
        # Indices of the three highest scores, best first
        top3 = np.argsort(predictions[i])[-3:][::-1]

        # Only one class is relevant, so the first hit decides the score
        score = 0.0
        for rank, pred in enumerate(top3, 1):
            if pred == true_label:
                score = 1.0 / rank
                break

        map_scores.append(score)

    return np.mean(map_scores)

print("MAP@3 metric defined")

In [None]:
# Encode target: map each fertilizer name to an integer class id
le_target = LabelEncoder()
y_encoded = le_target.fit_transform(train['Fertilizer Name'])
print(f"Classes: {le_target.classes_}")
print(f"Number of classes: {len(le_target.classes_)}")

# Save target encoder so predictions can be mapped back to names later.
# The context manager guarantees the file is flushed and closed.
os.makedirs('experiments/003_pure_categorical', exist_ok=True)
with open('experiments/003_pure_categorical/target_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(le_target, encoder_file)

In [None]:
# Pure Categorical Treatment - Convert numericals to categorical WITHOUT binning
# All numerical features have low cardinality (14-43 unique values)

numerical_features = ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous']
categorical_features = ['Soil Type', 'Crop Type']

print("Converting numerical features to categorical dtype (as strings)...")
print("\nOriginal cardinalities:")
for col in numerical_features:
    unique_count = train[col].nunique()
    print(f"  {col}: {unique_count} unique values")

# Convert numericals to strings then category dtype.
# Train and test must share ONE category dtype per column: casting each frame
# on its own assigns integer codes from that frame's values only, so the same
# value could get a different code in train and test whenever the two value
# sets differ, and models that consume the codes would misread the test data.
for col in numerical_features:
    train_values = train[col].astype(str)
    test_values = test[col].astype(str)
    shared_dtype = pd.concat([train_values, test_values], ignore_index=True).astype('category').dtype
    train[col] = train_values.astype(shared_dtype)
    test[col] = test_values.astype(shared_dtype)

# Convert original categorical features to category dtype (same shared-dtype rule)
for col in categorical_features:
    shared_dtype = pd.concat([train[col], test[col]], ignore_index=True).astype('category').dtype
    train[col] = train[col].astype(shared_dtype)
    test[col] = test[col].astype(shared_dtype)

print("\n✓ All features converted to categorical dtype")
print(f"\nCategorical features ({len(numerical_features + categorical_features)}):")
for col in numerical_features + categorical_features:
    print(f"  {col}")

In [None]:
# Assemble the model inputs: converted numericals first, then the original
# categoricals, taken as independent copies of the train/test frames
feature_columns = [*numerical_features, *categorical_features]

X = train.loc[:, feature_columns].copy()
X_test = test.loc[:, feature_columns].copy()

print(f"\nFeature matrix X shape: {X.shape}")
print(f"Test feature matrix X_test shape: {X_test.shape}")
print(f"\nFinal feature columns ({len(feature_columns)}):")
for position, column_name in enumerate(feature_columns, start=1):
    print(f"  {position}. {column_name}")

In [None]:
# Stratified K-Fold setup
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Initialize predictions: one probability column per target class
oof_predictions = np.zeros((len(X), len(le_target.classes_)))
test_predictions_xgb = np.zeros((len(X_test), len(le_target.classes_)))
test_predictions_lgb = np.zeros((len(X_test), len(le_target.classes_)))
test_predictions_cat = np.zeros((len(X_test), len(le_target.classes_)))

# Check GPU availability once so XGBoost and CatBoost make the same choice
# instead of XGBoost unconditionally requesting CUDA.
use_gpu = torch.cuda.is_available()
device = 'GPU' if use_gpu else 'CPU'  # CatBoost task_type spelling

# Model parameters (optimized for categorical features)
xgb_params = {
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'num_class': len(le_target.classes_),
    'max_depth': 6,  # Shallower for categorical
    'learning_rate': 0.07,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'tree_method': 'hist',
    'device': 'cuda' if use_gpu else 'cpu',
    'enable_categorical': True,
    'random_state': 42,
    'verbosity': 0
}

lgb_params = {
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'num_class': len(le_target.classes_),
    'max_depth': 6,
    'learning_rate': 0.07,
    'subsample': 0.8,
    # LightGBM only applies row subsampling when the bagging frequency is
    # non-zero; without this the 'subsample' setting above is ignored.
    'subsample_freq': 1,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'verbose': -1
}

print(f"Using device: {device}")

print("Model parameters defined")

In [None]:
# Cross-validation loop: per fold, train XGBoost, LightGBM and CatBoost on the
# training split, score each model and their equal-weight average on the
# validation split with MAP@3, and accumulate fold-averaged test predictions.
fold_scores = []
fold_scores_xgb = []
fold_scores_lgb = []
fold_scores_cat = []

# Every column is a pandas 'category' column; CatBoost does not infer that
# from the dtype and must be told explicitly which columns are categorical.
cat_feature_names = list(X.columns)

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y_encoded)):
    print(f"\nFold {fold + 1}/{n_splits}")
    
    # Split data
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y_encoded[train_idx], y_encoded[val_idx]
    
    # Train XGBoost
    print("  Training XGBoost...")
    model_xgb = xgb.XGBClassifier(**xgb_params)
    model_xgb.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )
    
    # Predict XGBoost
    val_pred_xgb = model_xgb.predict_proba(X_val)
    score_xgb = map_at_3(val_pred_xgb, y_val)
    fold_scores_xgb.append(score_xgb)
    
    # Train LightGBM
    print("  Training LightGBM...")
    train_data_lgb = lgb.Dataset(X_train, label=y_train)
    val_data_lgb = lgb.Dataset(X_val, label=y_val, reference=train_data_lgb)
    
    model_lgb = lgb.train(
        lgb_params,
        train_data_lgb,
        num_boost_round=500,
        valid_sets=[val_data_lgb],
        valid_names=['valid'],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
    )
    
    # Predict LightGBM (a Booster returns class probabilities from predict())
    val_pred_lgb = model_lgb.predict(X_val, num_iteration=model_lgb.best_iteration)
    score_lgb = map_at_3(val_pred_lgb, y_val)
    fold_scores_lgb.append(score_lgb)
    
    # Train CatBoost
    print("  Training CatBoost...")
    model_cat = CatBoostClassifier(
        iterations=500,
        depth=6,
        learning_rate=0.07,
        loss_function='MultiClass',
        random_seed=42,
        verbose=False,
        task_type=device
    )
    
    model_cat.fit(
        X_train, y_train,
        cat_features=cat_feature_names,
        eval_set=(X_val, y_val),
        verbose=False
    )
    
    # Predict CatBoost
    val_pred_cat = model_cat.predict_proba(X_val)
    score_cat = map_at_3(val_pred_cat, y_val)
    fold_scores_cat.append(score_cat)
    
    # Ensemble predictions for this fold
    val_pred_ensemble = (val_pred_xgb + val_pred_lgb + val_pred_cat) / 3
    score_ensemble = map_at_3(val_pred_ensemble, y_val)
    fold_scores.append(score_ensemble)
    
    # Keep the ensemble as the out-of-fold prediction so that any score
    # computed from oof_predictions describes the same blend that is used
    # for the test predictions, not XGBoost alone.
    oof_predictions[val_idx] = val_pred_ensemble
    
    print(f"  XGBoost MAP@3: {score_xgb:.4f}")
    print(f"  LightGBM MAP@3: {score_lgb:.4f}")
    print(f"  CatBoost MAP@3: {score_cat:.4f}")
    print(f"  Ensemble MAP@3: {score_ensemble:.4f}")
    
    # Predict on test set. lgb.train returns a Booster, which has no
    # predict_proba; predict() yields the per-class probabilities.
    test_pred_xgb = model_xgb.predict_proba(X_test)
    test_pred_lgb = model_lgb.predict(X_test, num_iteration=model_lgb.best_iteration)
    test_pred_cat = model_cat.predict_proba(X_test)
    
    # Each fold contributes 1/n_splits, giving the fold average at the end
    test_predictions_xgb += test_pred_xgb / n_splits
    test_predictions_lgb += test_pred_lgb / n_splits
    test_predictions_cat += test_pred_cat / n_splits

print("\n" + "="*50)
print("CV Results:")
print(f"  XGBoost: {np.mean(fold_scores_xgb):.4f} ± {np.std(fold_scores_xgb):.4f}")
print(f"  LightGBM: {np.mean(fold_scores_lgb):.4f} ± {np.std(fold_scores_lgb):.4f}")
print(f"  CatBoost: {np.mean(fold_scores_cat):.4f} ± {np.std(fold_scores_cat):.4f}")
print(f"  Ensemble: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")

In [None]:
# Score the stored out-of-fold predictions with MAP@3 and compare the result
# against the fixed baseline score of 0.3311
baseline_score = 0.3311
cv_score = map_at_3(oof_predictions, y_encoded)

improvement = cv_score - baseline_score
relative_improvement = improvement / baseline_score * 100
separator = '=' * 50

print(f"\n{separator}")
print(f"Final CV MAP@3 Score: {cv_score:.4f}")
print(f"Baseline Score: {baseline_score:.4f}")
print(f"Improvement: {improvement:.4f}")
print(f"Relative Improvement: {relative_improvement:.2f}%")

# Anything that is not strictly above the baseline is reported as a regression
verdict = (
    f"\n✓ SUCCESS: Beat baseline by {improvement:.4f}"
    if cv_score > baseline_score
    else f"\n✗ REGRESSION: Worse than baseline by {baseline_score - cv_score:.4f}"
)
print(verdict)

In [None]:
# Blend the three models' accumulated test probabilities with equal weight
final_test_predictions = (test_predictions_xgb + test_predictions_lgb + test_predictions_cat) / 3

# Rank classes per test row and keep the three most probable, best first
# (walking the ascending argsort backwards from its last column)
top3_predictions = np.argsort(final_test_predictions, axis=1)[:, :-4:-1]

# Map class ids back to fertilizer names, space-separated per row
predicted_names = [
    ' '.join(le_target.inverse_transform(class_ids))
    for class_ids in top3_predictions
]

# Assemble the submission table: one row per test id
test_ids = test['id'].values
submission = pd.DataFrame({'id': test_ids, 'Fertilizer Name': predicted_names})

print("\nSubmission format:")
print(submission.head())
print(f"\nSubmission shape: {submission.shape}")

# Write the submission file, creating the output directory when needed
os.makedirs('/home/submission', exist_ok=True)
submission.to_csv('/home/submission/submission.csv', index=False)
print("\n✓ Submission saved to /home/submission/submission.csv")

In [None]:
# Save experiment artifacts: out-of-fold predictions, the per-model
# fold-averaged test predictions, and the list of feature columns.
# Make sure the target directory exists so this cell also works on its own.
os.makedirs('experiments/003_pure_categorical', exist_ok=True)
np.save('experiments/003_pure_categorical/oof_predictions.npy', oof_predictions)
np.save('experiments/003_pure_categorical/test_predictions_xgb.npy', test_predictions_xgb)
np.save('experiments/003_pure_categorical/test_predictions_lgb.npy', test_predictions_lgb)
np.save('experiments/003_pure_categorical/test_predictions_cat.npy', test_predictions_cat)
# The context manager guarantees the pickle file is flushed and closed
with open('experiments/003_pure_categorical/feature_columns.pkl', 'wb') as columns_file:
    pickle.dump(feature_columns, columns_file)

print("\n✓ Experiment artifacts saved to experiments/003_pure_categorical/")