In [None]:
import joblib
import pandas as pd

# Load the model
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files that come from a trusted source.
with open('/content/groundnut.pkl', 'rb') as f:
    model = joblib.load(f)

# Raw input for a single field; the two categorical features are still strings here
input_data = {
  "temp": 28.52,
  "humidity": 46.8,
  "rainfall": 832.6,
  "solar_radiation": 16.61,
  "soil_bdod": 144.5,
  "soil_cec": 19.49,
  "soil_cfvo": 10.61,
  "soil_clay": 173.4,
  "soil_sand": 377.0,
  "soil_silt": 276.8,
  "soil_nitrogen": 158.8,
  "soil_ocd": 44.1,
  "soil_ocs": 36.6,
  "soil_phh2o": 8.45,
  "soil_soc": 24.73,
  "soil_wv0010": 33.78,

  "previous_crop": "maize",
  "previous_fertilizer": "NPK",

  "fertilizer_amount": 249.7,
  "cultivation_break": 0
}

# Convert to a single-row DataFrame
input_raw_df = pd.DataFrame([input_data])

# One-hot encode the categorical features
input_encoded_df = pd.get_dummies(input_raw_df, columns=['previous_crop', 'previous_fertilizer'])

# Get the expected feature names from the model
expected_features = model.get_booster().feature_names

# reindex() silently drops columns the model does not know and invents missing
# ones as 0. Report both cases so an encoding mismatch (e.g. the model expecting
# a single encoded `previous_crop` column rather than one-hot columns) is
# visible instead of quietly turning real inputs into constant zeros.
if expected_features is not None:
    expected_set = set(expected_features)
    encoded_set = set(input_encoded_df.columns)
    dropped_columns = [c for c in input_encoded_df.columns if c not in expected_set]
    zero_filled_columns = [c for c in expected_features if c not in encoded_set]
    if dropped_columns:
        print("Warning: input columns not used by the model (dropped):", dropped_columns)
    if zero_filled_columns:
        print("Warning: model features missing from the input (filled with 0):", zero_filled_columns)

# Reindex the input DataFrame to match the expected features, filling missing values with 0
input_df = input_encoded_df.reindex(columns=expected_features, fill_value=0)

# Make the prediction
prediction = model.predict(input_df)

# Print the prediction
print("Model Prediction:", prediction)

Model Prediction: [2.8009412]


In [None]:
# SHAP explanations using model.predict as the callable (run after training)
import shap
import numpy as np
import pandas as pd

# Load the dataset to use as background data for SHAP
try:
    background_data = pd.read_csv('/content/groundnut_yield_dataset.csv')
except FileNotFoundError:
    # The message must name the file that was actually requested above.
    print("Error: groundnut_yield_dataset.csv not found. Please upload the file.")
    background_data = None

if background_data is not None:
    # Apply the same preprocessing as the inference cell: one-hot encode the
    # categorical features, then align to the model's feature list.
    background_data = pd.get_dummies(background_data, columns=['previous_crop', 'previous_fertilizer'])

    # Align columns with the model's expected features, filling missing values with 0.
    # Columns not listed in expected_features are dropped by reindex().
    expected_features = model.get_booster().feature_names
    background_data = background_data.reindex(columns=expected_features, fill_value=0)

    # Fail fast on text columns: left in place they surface later as an opaque
    # "unsupported operand type(s) for -: 'float' and 'str'" raised from inside SHAP.
    non_numeric_columns = background_data.select_dtypes(exclude=['number', 'bool']).columns.tolist()
    if non_numeric_columns:
        raise TypeError(
            "Background data contains non-numeric columns that SHAP cannot process: "
            f"{non_numeric_columns}. Encode them exactly as they were encoded for training "
            "before building the explainer."
        )

    # pick a representative sample from the background data (seeded for reproducibility)
    sample = background_data.sample(n=min(100, len(background_data)), random_state=42)

    # create an explainer using the model.predict callable and sample as background data
    explainer = shap.Explainer(model.predict, sample)

    # compute SHAP values for the input instance (returns a shap.Explanation object)
    # Using input_df which was created and aligned in the previous cell
    shap_values = explainer(input_df)

    # extract numeric SHAP matrix (n_samples, n_features)
    # Since input_df has only one instance, shap_values.values will have shape (1, n_features)
    shap_matrix = shap_values.values

    print("shap_matrix.shape:", shap_matrix.shape)

    # compute mean absolute SHAP per feature (for the single instance, this is just the absolute SHAP value)
    mean_abs_shap = np.abs(shap_matrix).mean(axis=0)

    # build and show importance DataFrame
    # input_df columns are already aligned with expected_features
    importance_df = pd.DataFrame({
        'feature': input_df.columns,
        'mean_abs_shap': mean_abs_shap
    }).sort_values('mean_abs_shap', ascending=False).reset_index(drop=True)

    print("Top features by average absolute SHAP value:")
    display(importance_df.head(10).style.background_gradient(cmap='Blues'))
else:
    print("SHAP explanation cannot be generated without the background dataset.")


Data types of input_df before SHAP explainer:
temp                   float64
humidity               float64
rainfall               float64
solar_radiation        float64
soil_bdod              float64
soil_cec               float64
soil_cfvo              float64
soil_clay              float64
soil_sand              float64
soil_silt              float64
soil_nitrogen          float64
soil_ocd               float64
soil_ocs               float64
soil_phh2o             float64
soil_soc               float64
soil_wv0010            float64
previous_crop            int64
previous_fertilizer      int64
fertilizer_amount      float64
cultivation_break        int64
yield_category           int64
dtype: object

Data types of sample before SHAP explainer:
temp                   float64
humidity               float64
rainfall               float64
solar_radiation        float64
soil_bdod              float64
soil_cec               float64
soil_cfvo              float64
soil_clay              flo

TypeError: unsupported operand type(s) for -: 'float' and 'str'

In [None]:
import numpy as np, json, pandas as pd

def instance_explain_and_recommend(instance_df, explainer, model, top_k=5):
    """Explain one prediction with SHAP and attach rule-based advice per top factor.

    Parameters
    ----------
    instance_df : pandas.DataFrame
        Single-row frame whose columns are the model's features.
    explainer : callable
        Called as ``explainer(instance_df)``; the result must expose ``.values``
        holding exactly one SHAP value per column of ``instance_df``
        (shapes ``(n,)``, ``(1, n)`` and ``(1, n, 1)`` are accepted).
    model : object
        Must provide ``predict(instance_df)``; element 0 of its result is
        reported as the predicted yield.
    top_k : int, default 5
        Number of factors to keep, ranked by absolute SHAP value. At least
        five are always kept (``max(top_k, 5)``) so callers can filter
        positive/negative factors afterwards.

    Returns
    -------
    dict
        ``predicted_yield`` (float), ``all_top_factors`` (list of
        ``{"feature", "shap", "direction"}``), ``factor_directions``
        (feature -> direction), ``recommendations`` (list of
        ``{"feature", "recommendation"}``) and ``explanation`` (str).
        A SHAP value that is not strictly positive is labelled "negative".

    Raises
    ------
    ValueError
        If the explainer does not return exactly one SHAP value per feature
        (e.g. multi-row input or a multi-output model).
    """
    feat_names = instance_df.columns.tolist()

    # Flatten whatever single-instance layout the explainer produced into one
    # value per feature, and reject layouts that cannot be paired 1:1 with the
    # columns instead of silently zipping features against rows/outputs.
    sv = np.asarray(explainer(instance_df).values, dtype=float).reshape(-1)
    if sv.size != len(feat_names):
        raise ValueError(
            f"Expected one SHAP value per feature ({len(feat_names)}), got {sv.size}; "
            "instance_df must be a single row and the model single-output."
        )

    pred = float(model.predict(instance_df)[0])

    # Rank by absolute SHAP value, largest first; keep at least five factors so
    # callers can still split them into positive/negative groups.
    paired_sorted = sorted(
        zip(feat_names, sv), key=lambda x: abs(x[1]), reverse=True
    )[:max(top_k, 5)]

    all_top_factors = []
    factor_directions = {}
    for f, s in paired_sorted:
        direction = "positive" if s > 0 else "negative"
        all_top_factors.append({"feature": f, "shap": round(float(s), 6), "direction": direction})
        factor_directions[f] = direction

    # simple intervention rules tailored for farmers
    recs = []
    for tf in all_top_factors:
        f = tf["feature"]
        direction = tf["direction"]
        fname = f.lower()

        recommendation_text = f"For {f}: " # Start with the feature name

        if ("fertilizer_amount" in fname) or ("npk" in fname) or ("nitrogen" in fname):
            if direction == "negative":
                recommendation_text += "Consider increasing the amount of fertilizer, especially if soil tests indicate a deficiency in key nutrients like Nitrogen, Phosphorus, or Potassium (NPK). Ensure the fertilizer type and application timing are optimized for the crop's growth stage."
            else: # Positive direction
                recommendation_text += "Your current fertilizer level is likely beneficial. Continue with your planned fertilizer application, focusing on proper timing and method."
        # Water rules are tested BEFORE the generic soil rules: the soil-water
        # feature is named "soil_wv0010", so a preceding "soil" check would
        # swallow it and the "wv0010" rule could never apply.
        elif ("rain" in fname) or ("humid" in fname) or ("wv0010" in fname):
            if direction == "negative":
                recommendation_text += "Focus on improving water management. This could involve optimizing irrigation schedules, using water-saving techniques (like mulching or drip irrigation), or improving drainage if waterlogging is an issue."
            else: # Positive direction
                recommendation_text += "Water availability and humidity levels are currently favorable. Continue to monitor weather patterns and adjust watering as needed to avoid stress during dry periods or excessive moisture during wet periods."
        elif ("soil" in fname) or ("phh2o" in fname) or ("soc" in fname) or ("ocd" in fname) or ("ocs" in fname):
            if direction == "negative":
                recommendation_text += "Test your soil to understand its specific needs. Consider adding organic matter (like compost or manure) to improve soil health, or adjust soil pH with lime (for acidic soil) or gypsum (for alkaline soil) as recommended by soil test results."
            else: # Positive direction
                recommendation_text += "Your soil conditions appear to be supportive of good yield. Continue with practices that maintain healthy soil, such as incorporating organic matter and appropriate tillage."
        elif ("temp" in fname) or ("solar_radiation" in fname):
            # Climate factors get the same advice regardless of direction.
            recommendation_text += "Consider using crop varieties known to be more resilient to local climate conditions. For high solar radiation or temperature, practices like mulching, shading (if feasible), or adjusting planting dates might help."
        elif ("cultivation_break" in fname):
            if direction == "negative":
                recommendation_text += "Consider extending or optimizing your cultivation break (fallow period) to allow the soil to recover and build up nutrients and moisture. This can help improve soil structure and reduce pest/disease cycles."
            else: # Positive direction (less likely to be a strong positive factor, but included for completeness)
                recommendation_text += "Your current cultivation break practices seem appropriate. Continue to evaluate if the break period is sufficient for soil health and pest management."
        else:
            recommendation_text += "Review this factor and consult with a local agricultural extension officer or agronomist for specific guidance."

        recs.append({"feature": f, "recommendation": recommendation_text})

    out = {
        "predicted_yield": pred,
        "all_top_factors": all_top_factors, # Return all top factors to allow filtering
        "factor_directions": factor_directions, # Directions for all top factors
        "recommendations": recs, # One recommendation per top factor, same order
        "explanation": "Top factors listed with SHAP contributions and suggested interventions."
    }
    return out

In [None]:
# Explain the sample instance and fetch the rule-based recommendations
# (top 5 factors overall; they are split by direction below).
explanation_result = instance_explain_and_recommend(input_df, explainer, model, top_k=5)

# Headline numbers and section headers
print("Predicted Yield:", explanation_result["predicted_yield"])
print("\nKey Factors Influencing Predicted Yield (Based on SHAP values):")
print("\nTop Factors by Absolute SHAP Value:")

# Re-rank by absolute SHAP value so the slice below is guaranteed to be the overall top factors
all_top_factors_sorted_abs = sorted(
    explanation_result["all_top_factors"],
    key=lambda entry: abs(entry["shap"]),
    reverse=True,
)

# Keep the five strongest factors, then split them by direction
top_k_factors = all_top_factors_sorted_abs[:5]

positive_factors = []
negative_factors = []
for factor in top_k_factors:
    if factor["direction"] == "positive":
        positive_factors.append(factor)
    elif factor["direction"] == "negative":
        negative_factors.append(factor)

# One section per direction: heading, then either the factors or a fallback line
for heading, factors, empty_message in (
    (
        "\nFactors that positively influenced the yield prediction:",
        positive_factors,
        "None of the top major factors had a positive influence on this prediction.",
    ),
    (
        "\nFactors that negatively influenced the yield prediction:",
        negative_factors,
        "None of the top major factors had a negative influence on this prediction.",
    ),
):
    print(heading)
    if not factors:
        print(empty_message)
        continue
    for factor in factors:
        print(f"- {factor['feature']}: SHAP value = {factor['shap']}")

# Farmer-friendly recommendations, limited to the factors shown above
print("\nFarmer-Friendly Recommendations for Top Factors:")

# Names of the features that were displayed (top 5 overall)
displayed_features = {entry['feature'] for entry in top_k_factors}

# Walk the recommendations in their original order, skipping undisplayed features
for rec in explanation_result["recommendations"]:
    if rec["feature"] not in displayed_features:
        continue
    print(f"- {rec['recommendation']}")

Predicted Yield: 2.800941228866577

Key Factors Influencing Predicted Yield (Based on SHAP values):

Top Factors by Absolute SHAP Value:

Factors that positively influenced the yield prediction:
- fertilizer_amount: SHAP value = 0.037956
- soil_soc: SHAP value = 0.037405
- soil_nitrogen: SHAP value = 0.022297

Factors that negatively influenced the yield prediction:
- solar_radiation: SHAP value = -0.059191
- soil_sand: SHAP value = -0.024894

Farmer-Friendly Recommendations for Top Factors:
- For solar_radiation: Consider using crop varieties known to be more resilient to local climate conditions. For high solar radiation or temperature, practices like mulching, shading (if feasible), or adjusting planting dates might help.
- For fertilizer_amount: Your current fertilizer level is likely beneficial. Continue with your planned fertilizer application, focusing on proper timing and method.
- For soil_soc: Your soil conditions appear to be supportive of good yield. Continue with practices