In [1]:
import pandas as pd
import joblib

# Load model and scaler
model = joblib.load("xgboost_model.pkl")
scaler = joblib.load("scaler.pkl")

# Create full input (2 rows)
full_input = pd.DataFrame([
    {
        'Year': 1966,
        'Decade': 1960,
        'State_encoded': 3,
        'Crop_encoded': 3,
        'Season_encoded': 4,
        'Area_ha': 548000.0,
        'N_req_kg_per_ha': 8.43975,
        'P_req_kg_per_ha': 4.05108,
        'K_req_kg_per_ha': 7.42698,
        'Temperature_C': 25,
        'Humidity_%': 80,
        'pH': 6.5,
        'Rainfall_mm': 1200,
        'Wind_Speed_m_s': 2.0,
        'Solar_Radiation_MJ_m2_day': 18
    },
    {
        'Year': 1966,
        'Decade': 1960,
        'State_encoded': 3,
        'Crop_encoded': 2,
        'Season_encoded': 4,
        'Area_ha': 3000.0,
        'N_req_kg_per_ha': 18.00009,
        'P_req_kg_per_ha': 8.00004,
        'K_req_kg_per_ha': 11.33339,
        'Temperature_C': 22,
        'Humidity_%': 70,
        'pH': 6.0,
        'Rainfall_mm': 800,
        'Wind_Speed_m_s': 2.5,
        'Solar_Radiation_MJ_m2_day': 20
    }
])

# Scale full 15 features
scaled_full = scaler.transform(full_input)

scaled_full_df = pd.DataFrame(
    scaled_full,
    columns=scaler.feature_names_in_
)

# Select only features used by model (10 features)
model_features = model.get_booster().feature_names
final_input = scaled_full_df[model_features]

# Predict
predictions = model.predict(final_input)

# Print results
for i, pred in enumerate(predictions):
    print(f"ðŸŒ¾ Row {i+1} Predicted Yield: {round(pred,2)} kg/ha")

ðŸŒ¾ Row 1 Predicted Yield: 337.7699890136719 kg/ha
ðŸŒ¾ Row 2 Predicted Yield: 669.0700073242188 kg/ha


In [2]:
import joblib
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

# Reload dataset
df = pd.read_csv("merged_preprocessed_crop_yield.csv")

# Prepare features (NO scaling)
X = df.drop(columns=['Yield_kg_per_ha', 'State Name', 'Dist Name', 'Crop', 'Season'])
y = df['Yield_kg_per_ha']

# Select only final 10 features (same as your RFE result)
selected_features = [
    'Year', 'Decade', 'Crop_encoded', 'Season_encoded',
    'Area_ha', 'N_req_kg_per_ha', 'P_req_kg_per_ha',
    'K_req_kg_per_ha', 'Humidity_%', 'Rainfall_mm'
]

X = X[selected_features]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Train clean XGBoost
xgb_model_clean = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    random_state=42
)

xgb_model_clean.fit(X_train, y_train)

# Save model
joblib.dump(xgb_model_clean, "xgboost_model_clean.pkl")

print("âœ… Clean model saved successfully!")

âœ… Clean model saved successfully!
