# RISK CLASSIFICATION MODEL (Model E)
Predict crop risk level (Low/Medium/High) using Random Forest:
- Scratch implementation (educational)
- scikit-learn implementation (production)


In [1]:
# Line 1-12: Import all necessary libraries
import time
import math
import pickle
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier  # NOTE: Classifier, not Regressor!
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix  # Classification metrics
import joblib
import random
import sys


In [None]:
# Line 13-25: Load the dataset
DATAFILE = "global_large_12year_yield_dataset.csv"  # Using existing dataset
try:
    df = pd.read_csv(DATAFILE)
    print(f"Dataset loaded successfully. Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
except FileNotFoundError:
    print(f"ERROR: {DATAFILE} not found. Place dataset in this folder.")
    sys.exit(1)

Dataset loaded successfully. Shape: (2438, 18)
Columns: ['field_id', 'latitude', 'longitude', 'year', 'season_number', 'season_start', 'season_end', 'season_length_days', 'yield_kg_ha', 'ndvi_peak', 'ndvi_mean', 'ndvi_auc', 'ndvi_slope_mid', 'precip_cum', 'rh_mean', 'gdd_cum', 'solar_cum', 'water_stress_index']


In [None]:
df['ws_norm'] = (df['water_stress_index'] - df['water_stress_index'].min()) / \
                (df['water_stress_index'].max() - df['water_stress_index'].min())


In [None]:
# Line 26-38: Create risk categories from water_stress_index using discrete risk classes
# This function converts water stress index (0-1) into discrete risk classes
def categorize_risk(ws_norm):
    # Example thresholds, you can tune
    if ws_norm < 0.33:
        return 0  # Low
    elif ws_norm  < 0.66:
        return 1  # Medium
    else:
        return 2  # High

if 'water_stress_index' in df.columns:
    df['risk_class'] = df['water_stress_index'].apply(categorize_risk)  # Apply categorization
else:
    print("ERROR: No 'water_stress_index' column found to categorize risk.")
    sys.exit(1)


In [4]:
# Line 39-50: Prepare features and target for classification
target_col = 'risk_class'  # Our categorical target (0, 1, 2)

# Select numeric features (excluding target)
feature_cols = [c for c in df.columns if c != target_col and np.issubdtype(df[c].dtype, np.number)]
if len(feature_cols) == 0:
    raise ValueError("No numeric features detected for classification.")

X = df[feature_cols].values  # Feature matrix
y = df[target_col].values    # Target vector (categorical)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
# Line 51-55: Scale features (optional for Random Forest but good practice)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [6]:
# Line 56-67: Scratch Random Forest Classifier - Node class
class ScratchTreeNode:
    __slots__ = ('pred', 'feat', 'thresh', 'left', 'right')
    def __init__(self, pred=None, feat=None, thresh=None, left=None, right=None):
        self.pred = pred  # majority class at leaf (for classification)
        self.feat = feat
        self.thresh = thresh
        self.left = left
        self.right = right


In [7]:
# Line 68-75: Gini impurity function for classification splits
# This measures how "mixed" the classes are in a node
def gini_loss(y):
    if len(y) == 0:
        return 0.0
    classes, counts = np.unique(y, return_counts=True)  # Count each class
    probs = counts / counts.sum()  # Calculate probabilities
    return float(1 - np.sum(probs ** 2))  # Gini = 1 - sum(p²)


In [8]:
# Line 76-95: Find best split using Gini impurity (not MSE like regression)
def best_split_quick(X, y, feature_indices, n_thresholds=10, min_samples_leaf=3):
    n, d = X.shape
    best_feat, best_thresh, best_score = None, None, float('inf')
    for feat in feature_indices:
        col = X[:, feat]
        if np.all(col == col[0]):  # Skip if all values are same
            continue
        thresholds = np.percentile(col, np.linspace(5,95,n_thresholds))  # Candidate thresholds
        for thresh in thresholds:
            left_mask = col <= thresh
            right_mask = ~left_mask
            if left_mask.sum() < min_samples_leaf or right_mask.sum() < min_samples_leaf:
                continue
            score = gini_loss(y[left_mask]) + gini_loss(y[right_mask])  # Combined Gini
            if score < best_score:
                best_feat = feat
                best_thresh = float(thresh)
                best_score = float(score)
    return best_feat, best_thresh, best_score if best_feat is not None else (None, None, None)


In [9]:
# Line 96-116: Build decision tree for classification
def build_scratch_tree(X, y, depth=0, max_depth=6, min_samples_leaf=5):
    # Stopping conditions
    if depth >= max_depth or len(y) <= 2*min_samples_leaf or np.all(y == y[0]):
        counts = np.bincount(y)  # Count occurrences of each class
        pred_class = np.argmax(counts)  # Majority class
        return ScratchTreeNode(pred=pred_class)
    
    m, d = X.shape
    k = max(1, int(math.sqrt(d)))  # Random feature subset
    features = np.random.choice(d, k, replace=False)
    
    feat, thresh, score = best_split_quick(X, y, features, n_thresholds=8, min_samples_leaf=min_samples_leaf)
    if feat is None:  # No good split found
        counts = np.bincount(y)
        pred_class = np.argmax(counts)
        return ScratchTreeNode(pred=pred_class)
    
    # Recursively build left and right subtrees
    left_idx = X[:, feat] <= thresh
    right_idx = ~left_idx
    left = build_scratch_tree(X[left_idx], y[left_idx], depth+1, max_depth=max_depth, min_samples_leaf=min_samples_leaf)
    right = build_scratch_tree(X[right_idx], y[right_idx], depth+1, max_depth=max_depth, min_samples_leaf=min_samples_leaf)
    return ScratchTreeNode(pred=None, feat=feat, thresh=thresh, left=left, right=right)


In [10]:
# Line 117-125: Predict using a single tree
def predict_tree(node, x):
    while node.pred is None:  # Traverse until leaf
        if x[node.feat] <= node.thresh:
            node = node.left
        else:
            node = node.right
    return node.pred  # Return class prediction


In [11]:
# Line 126-150: Scratch Random Forest Classifier class
class ScratchRandomForestClassifier:
    def __init__(self, n_trees=10, max_depth=6, min_samples_leaf=5, bootstrap=True, random_state=None):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.bootstrap = bootstrap
        self.trees = []
        if random_state is not None:
            random.seed(random_state)
            np.random.seed(random_state)
    
    def fit(self, X, y):
        n = X.shape[0]
        self.trees = []
        for t in range(self.n_trees):
            idx = np.random.choice(n, n, replace=True) if self.bootstrap else np.arange(n)
            tree = build_scratch_tree(X[idx], y[idx], max_depth=self.max_depth, min_samples_leaf=self.min_samples_leaf)
            self.trees.append(tree)
    
    def predict(self, X):
        preds = np.zeros((len(self.trees), X.shape[0]), dtype=int)  # Class predictions
        for i, tree in enumerate(self.trees):
            preds[i] = np.array([predict_tree(tree, x) for x in X])
        # Majority vote across trees
        return np.apply_along_axis(lambda row: np.bincount(row).argmax(), axis=0, arr=preds)


In [13]:
# Line 151-165: Train scratch Random Forest classifier
print("\n=== Training Scratch Random Forest Classifier ===")
start_time = time.time()
scratch_rf = ScratchRandomForestClassifier(n_trees=12, max_depth=7, min_samples_leaf=5, random_state=42)
scratch_rf.fit(X_train, y_train)
t_scratch_train = time.time() - start_time
print(f"Scratch RF training time: {t_scratch_train:.2f}s")

y_pred_scratch = scratch_rf.predict(X_test)
acc_scratch = accuracy_score(y_test, y_pred_scratch)  # Classification accuracy
print(f"Scratch RF accuracy: {acc_scratch:.3f}")
print(classification_report(y_test, y_pred_scratch))  # Detailed classification metrics
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_scratch))

with open("rf_risk_scratch.pkl", "wb") as f:
    pickle.dump(scratch_rf, f)



=== Training Scratch Random Forest Classifier ===
Scratch RF training time: 0.22s
Scratch RF accuracy: 0.973
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       456
           1       1.00      0.59      0.75        32

    accuracy                           0.97       488
   macro avg       0.99      0.80      0.87       488
weighted avg       0.97      0.97      0.97       488

Confusion matrix:
 [[456   0]
 [ 13  19]]


In [14]:
# Line 166-179: Train scikit-learn RandomForestClassifier
print("\n=== Training scikit-learn RandomForestClassifier ===")
start_time = time.time()
rf_sklearn = RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42, n_jobs=-1)
rf_sklearn.fit(X_train_scaled, y_train)
t_sklearn_train = time.time() - start_time
y_pred_sklearn = rf_sklearn.predict(X_test_scaled)
acc_sklearn = accuracy_score(y_test, y_pred_sklearn)
print(f"sklearn RF accuracy: {acc_sklearn:.3f}")
print(classification_report(y_test, y_pred_sklearn))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_sklearn))

joblib.dump({'model': rf_sklearn, 'scaler': scaler, 'features': feature_cols}, "rf_risk_sklearn.joblib")



=== Training scikit-learn RandomForestClassifier ===
sklearn RF accuracy: 1.000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       456
           1       1.00      1.00      1.00        32

    accuracy                           1.00       488
   macro avg       1.00      1.00      1.00       488
weighted avg       1.00      1.00      1.00       488

Confusion matrix:
 [[456   0]
 [  0  32]]


['rf_risk_sklearn.joblib']

In [15]:
# Line 180-187: Example JSON output for integration
example_vector = X_test[0].reshape(1,-1)
risk_class = int(rf_sklearn.predict(example_vector)[0])  # Predicted class
risk_probs = rf_sklearn.predict_proba(example_vector)[0].tolist()  # Class probabilities
print(f"\nExample JSON output for risk model:")
print({"risk_class": risk_class, "risk_probs": risk_probs})

print("\n" + "="*60)
print("KEY DIFFERENCES FROM REGRESSION MODEL:")
print("="*60)
print("1. Uses RandomForestClassifier (not Regressor)")
print("2. Target is categorical (0, 1, 2) not continuous")
print("3. Uses Gini impurity for splits (not MSE)")
print("4. Uses accuracy, classification_report (not RMSE, R²)")
print("5. Predicts class probabilities (not continuous values)")
print("6. Majority vote for final prediction (not averaging)")
print("7. Uses water_stress_index to create risk categories")
print("="*60)



Example JSON output for risk model:
{'risk_class': 0, 'risk_probs': [0.68, 0.32, 0.0]}

KEY DIFFERENCES FROM REGRESSION MODEL:
1. Uses RandomForestClassifier (not Regressor)
2. Target is categorical (0, 1, 2) not continuous
3. Uses Gini impurity for splits (not MSE)
4. Uses accuracy, classification_report (not RMSE, R²)
5. Predicts class probabilities (not continuous values)
6. Majority vote for final prediction (not averaging)
7. Uses water_stress_index to create risk categories
