In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb

# 1. Load Data
df = pd.read_csv('mp_agriculture_stagewise_10000rows_district_season.csv')

# 2. Define Features and a SINGLE Target for Tuning
features = ['crop', 'seed_type', 'soil', 'district', 'season']
# We focus on the main target for tuning
tune_target = 'total_duration_estimate'

X = df[features]
y = df[tune_target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Create a Pipeline for the Single-Target Model
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), features)]
)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', xgb.XGBRegressor(random_state=42, n_jobs=-1))
])

# 4. Define the Search Space for Hyperparameters
# We give it a range of settings to try for each key parameter
param_dist = {
    'regressor__n_estimators': [100, 300, 500, 700],
    'regressor__max_depth': [3, 5, 7, 9],
    'regressor__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'regressor__subsample': [0.7, 0.8, 0.9, 1.0],
    'regressor__colsample_bytree': [0.7, 0.8, 0.9, 1.0]
}

# 5. Set up and Run the Randomized Search
# n_iter=25 means it will test 25 different combinations. n_jobs=-1 uses all CPU cores.
random_search = RandomizedSearchCV(
    pipeline, param_distributions=param_dist, n_iter=25,
    cv=5, scoring='neg_mean_absolute_error', n_jobs=-1, random_state=42, verbose=1
)

print("ðŸš€ Starting hyperparameter search... (this can take several minutes)")
random_search.fit(X_train, y_train)

# 6. Get the Best Parameters
print("\nâœ… Search complete!")
best_params = random_search.best_params_
print("Found the best parameters:")
print(best_params)

ðŸš€ Starting hyperparameter search... (this can take several minutes)
Fitting 5 folds for each of 25 candidates, totalling 125 fits

âœ… Search complete!
Found the best parameters:
{'regressor__subsample': 0.8, 'regressor__n_estimators': 700, 'regressor__max_depth': 3, 'regressor__learning_rate': 0.01, 'regressor__colsample_bytree': 0.9}


In [4]:
from sklearn.multioutput import RegressorChain
from sklearn.metrics import mean_absolute_error

# --- PASTE THE BEST PARAMETERS FROM THE SEARCH ABOVE ---
# Example format: {'regressor__subsample': 0.8, 'regressor__n_estimators': 500, ...}
best_params_from_search = {
    'subsample': 0.9,
    'n_estimators': 700,
    'max_depth': 7,
    'learning_rate': 0.05,
    'colsample_bytree': 0.9
}
# Remove the 'regressor__' prefix for the final model
cleaned_params = {k.replace('regressor__', ''): v for k, v in best_params_from_search.items()}


# Define all 78 targets again
targets = ['total_duration_estimate'] + [col for col in df.columns if col.endswith(('_tmin', '_tmax', '_rh', '_rain', '_wind', '_solar_rad'))] + [col for col in df.columns if col.endswith('_stage_dur')]
Y = df[targets]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


# Create the final, optimized pipeline
preprocessor_final = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), features)]
)

# Use the BEST parameters found in our base model
base_model_tuned = xgb.XGBRegressor(**cleaned_params, random_state=42, n_jobs=-1)

final_model = RegressorChain(base_estimator=base_model_tuned)

pipeline_final = Pipeline(steps=[
    ('preprocessor', preprocessor_final),
    ('regressor', final_model)
])

# Train and evaluate the final, tuned model
print("\nðŸš€ Training the final, optimized multi-output model...")
pipeline_final.fit(X_train, Y_train)
print("âœ… Final model training complete!")

predictions_final = pipeline_final.predict(X_test)
mae_final = mean_absolute_error(Y_test, predictions_final, multioutput='uniform_average')

print(f"\nðŸ“Š Overall Model Performance (Tuned Model):")
print(f"   The final average error is: {mae_final:.4f}")


ðŸš€ Training the final, optimized multi-output model...
âœ… Final model training complete!

ðŸ“Š Overall Model Performance (Tuned Model):
   The final average error is: 10.6854
