<a href="https://colab.research.google.com/github/Safarys/crop_yield_prediction_using_ML/blob/main/randomforest_scrath_cypuml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# --- Decision Tree from Scratch ---
class DecisionTree:
    def __init__(self, max_depth=10, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def fit(self, X, y):
        self.tree = self._grow_tree(X, y)

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.tree) for x in X])

    def _grow_tree(self, X, y, depth=0):
        num_samples, num_features = X.shape
        if (depth >= self.max_depth or num_samples < self.min_samples_split or len(np.unique(y)) == 1):
            return self._most_common_label(y)

        # Randomly select a subset of features
        feature_indices = np.random.choice(num_features, int(np.sqrt(num_features)), replace=False)

        # Find the best split
        best_feature, best_threshold = self._best_split(X, y, feature_indices)

        # Grow the children recursively
        left_indices, right_indices = self._split(X[:, best_feature], best_threshold)
        left = self._grow_tree(X[left_indices, :], y[left_indices], depth + 1)
        right = self._grow_tree(X[right_indices, :], y[right_indices], depth + 1)
        return {"feature": best_feature, "threshold": best_threshold, "left": left, "right": right}

    def _best_split(self, X, y, feature_indices):
        best_gain = -1
        split_idx, split_thresh = None, None
        for feature_index in feature_indices:
            X_column = X[:, feature_index]
            thresholds = np.unique(X_column)
            for threshold in thresholds:
                gain = self._information_gain(X_column, y, threshold)
                if gain > best_gain:
                    best_gain = gain
                    split_idx = feature_index
                    split_thresh = threshold
        return split_idx, split_thresh

    def _information_gain(self, X_column, y, split_thresh):
        parent_loss = self._variance(y)
        left_indices, right_indices = self._split(X_column, split_thresh)
        if len(left_indices) == 0 or len(right_indices) == 0:
            return 0
        n, n_left, n_right = len(y), len(left_indices), len(right_indices)
        child_loss = (n_left / n) * self._variance(y[left_indices]) + (n_right / n) * self._variance(y[right_indices])
        return parent_loss - child_loss

    def _variance(self, y):
        return np.var(y)

    def _split(self, X_column, split_thresh):
        left_indices = np.argwhere(X_column <= split_thresh).flatten()
        right_indices = np.argwhere(X_column > split_thresh).flatten()
        return left_indices, right_indices

    def _most_common_label(self, y):
        return np.mean(y)

    def _traverse_tree(self, x, tree):
        if isinstance(tree, dict):
            feature_val = x[tree['feature']]
            if feature_val <= tree['threshold']:
                return self._traverse_tree(x, tree['left'])
            else:
                return self._traverse_tree(x, tree['right'])
        return tree

# --- RandomForest from Scratch ---
class RandomForest:
    def __init__(self, n_trees=10, max_depth=10, min_samples_split=2):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.trees = []

    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_trees):
            tree = DecisionTree(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
            X_sample, y_sample = self._bootstrap_sample(X, y)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def predict(self, X):
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        return np.mean(tree_preds, axis=0)

    def _bootstrap_sample(self, X, y):
        n_samples = X.shape[0]
        indices = np.random.choice(n_samples, n_samples, replace=True)
        return X[indices], y[indices]

# --- Load Data ---
df = pd.read_csv('/content/Crop_production.csv')

# Drop unnecessary columns
df_clean = df.drop(columns=['Unnamed: 0', 'State_Name'])

# Handle categorical 'Crop_Type' by converting to numerical labels
df_clean['Crop_Type'] = pd.factorize(df_clean['Crop_Type'])[0]

# Features and target for yield prediction
X = df_clean[['Crop_Type', 'N', 'P', 'K', 'pH', 'rainfall', 'temperature', 'Area_in_hectares']].values
y = df_clean['Yield_ton_per_hec'].values

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Train RandomForest from Scratch ---
rf = RandomForest(n_trees=10, max_depth=10)
rf.fit(X_train, y_train)

# --- Make Predictions ---
y_pred = rf.predict(X_test)

# --- Calculate Mean Squared Error (MSE) and R-squared (R²) Score ---
mse = np.mean((y_test - y_pred) ** 2)
print(f"Mean Squared Error: {mse}")

r2 = r2_score(y_test, y_pred)
print(f"R-squared (R²) Score: {r2}")

# --- Calculate Mean Absolute Percentage Error (MAPE) ---
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
print(f"Mean Absolute Percentage Error (MAPE): {mape}%")

# --- Calculate Accuracy from MAPE ---
accuracy = 100 - mape
print(f"Regression Accuracy: {accuracy}%")


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Mean Squared Error: nan


ValueError: Input contains NaN.

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# --- Decision Tree from Scratch ---
class DecisionTree:
    def __init__(self, max_depth=5, min_samples_split=5):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def fit(self, X, y):
        self.tree = self._grow_tree(X, y)

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.tree) for x in X])

    def _grow_tree(self, X, y, depth=0):
        num_samples, num_features = X.shape
        if (depth >= self.max_depth or num_samples < self.min_samples_split or len(np.unique(y)) == 1):
            return self._most_common_label(y)

        # Randomly select a subset of features
        feature_indices = np.random.choice(num_features, int(np.sqrt(num_features)), replace=False)

        # Find the best split
        best_feature, best_threshold = self._best_split(X, y, feature_indices)

        # Split data and grow children recursively
        left_indices, right_indices = self._split(X[:, best_feature], best_threshold)
        left = self._grow_tree(X[left_indices, :], y[left_indices], depth + 1)
        right = self._grow_tree(X[right_indices, :], y[right_indices], depth + 1)
        return {"feature": best_feature, "threshold": best_threshold, "left": left, "right": right}

    def _best_split(self, X, y, feature_indices):
        best_gain = -1
        split_idx, split_thresh = None, None
        for feature_index in feature_indices:
            X_column = X[:, feature_index]
            thresholds = np.unique(X_column)
            for threshold in thresholds:
                gain = self._information_gain(X_column, y, threshold)
                if gain > best_gain:
                    best_gain = gain
                    split_idx = feature_index
                    split_thresh = threshold
        return split_idx, split_thresh

    def _information_gain(self, X_column, y, split_thresh):
        parent_loss = self._variance(y)
        left_indices, right_indices = self._split(X_column, split_thresh)
        if len(left_indices) == 0 or len(right_indices) == 0:
            return 0
        n, n_left, n_right = len(y), len(left_indices), len(right_indices)
        child_loss = (n_left / n) * self._variance(y[left_indices]) + (n_right / n) * self._variance(y[right_indices])
        return parent_loss - child_loss

    def _variance(self, y):
        return np.var(y)

    def _split(self, X_column, split_thresh):
        left_indices = np.argwhere(X_column <= split_thresh).flatten()
        right_indices = np.argwhere(X_column > split_thresh).flatten()
        return left_indices, right_indices

    def _most_common_label(self, y):
        return np.mean(y)

    def _traverse_tree(self, x, tree):
        if isinstance(tree, dict):
            feature_val = x[tree['feature']]
            if feature_val <= tree['threshold']:
                return self._traverse_tree(x, tree['left'])
            else:
                return self._traverse_tree(x, tree['right'])
        return tree

# --- RandomForest from Scratch ---
class RandomForest:
    def __init__(self, n_trees=5, max_depth=5, min_samples_split=5):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.trees = []

    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_trees):
            tree = DecisionTree(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
            X_sample, y_sample = self._bootstrap_sample(X, y)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def predict(self, X):
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        return np.mean(tree_preds, axis=0)

    def _bootstrap_sample(self, X, y):
        n_samples = X.shape[0]
        indices = np.random.choice(n_samples, n_samples, replace=True)
        return X[indices], y[indices]

# --- Load Data ---
df = pd.read_csv('/content/Crop_production.csv')

# Drop unnecessary columns
df_clean = df.drop(columns=['Unnamed: 0', 'State_Name'])

# Handle categorical 'Crop_Type' by converting to numerical labels
df_clean['Crop_Type'] = pd.factorize(df_clean['Crop_Type'])[0]

# Features and target for yield prediction
X = df_clean[['Crop_Type', 'N', 'P', 'K', 'pH', 'rainfall', 'temperature', 'Area_in_hectares']].values
y = df_clean['Yield_ton_per_hec'].values

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Train RandomForest from Scratch ---
rf = RandomForest(n_trees=5, max_depth=5)  # Reduced n_trees and max_depth
rf.fit(X_train, y_train)

# --- Make Predictions ---
y_pred = rf.predict(X_test)

# --- Calculate Metrics ---
mse = np.mean((y_test - y_pred) ** 2)
r2 = r2_score(y_test, y_pred)

# --- Print Results ---
print(f"Mean Squared Error: {mse}")
print(f"R-squared (R²) Score: {r2}")


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Mean Squared Error: 486.91476630091853
R-squared (R²) Score: -0.2871462071409441


In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# --- Decision Tree from Scratch ---
class DecisionTree:
    def __init__(self, max_depth=5, min_samples_split=5):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def fit(self, X, y):
        self.tree = self._grow_tree(X, y)

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.tree) for x in X])

    def _grow_tree(self, X, y, depth=0):
        num_samples, num_features = X.shape
        if (depth >= self.max_depth or num_samples < self.min_samples_split or len(np.unique(y)) == 1):
            return self._most_common_label(y)

        feature_indices = np.random.choice(num_features, int(np.sqrt(num_features)), replace=False)
        best_feature, best_threshold = self._best_split(X, y, feature_indices)
        left_indices, right_indices = self._split(X[:, best_feature], best_threshold)
        left = self._grow_tree(X[left_indices, :], y[left_indices], depth + 1)
        right = self._grow_tree(X[right_indices, :], y[right_indices], depth + 1)
        return {"feature": best_feature, "threshold": best_threshold, "left": left, "right": right}

    def _best_split(self, X, y, feature_indices):
        best_gain = -1
        split_idx, split_thresh = None, None
        for feature_index in feature_indices:
            X_column = X[:, feature_index]
            thresholds = np.unique(X_column)
            for threshold in thresholds:
                gain = self._information_gain(X_column, y, threshold)
                if gain > best_gain:
                    best_gain = gain
                    split_idx = feature_index
                    split_thresh = threshold
        return split_idx, split_thresh

    def _information_gain(self, X_column, y, split_thresh):
        parent_loss = self._variance(y)
        left_indices, right_indices = self._split(X_column, split_thresh)
        if len(left_indices) == 0 or len(right_indices) == 0:
            return 0
        n, n_left, n_right = len(y), len(left_indices), len(right_indices)
        child_loss = (n_left / n) * self._variance(y[left_indices]) + (n_right / n) * self._variance(y[right_indices])
        return parent_loss - child_loss

    def _variance(self, y):
        return np.var(y)

    def _split(self, X_column, split_thresh):
        left_indices = np.argwhere(X_column <= split_thresh).flatten()
        right_indices = np.argwhere(X_column > split_thresh).flatten()
        return left_indices, right_indices

    def _most_common_label(self, y):
        return np.mean(y)

    def _traverse_tree(self, x, tree):
        if isinstance(tree, dict):
            feature_val = x[tree['feature']]
            if feature_val <= tree['threshold']:
                return self._traverse_tree(x, tree['left'])
            else:
                return self._traverse_tree(x, tree['right'])
        return tree

# --- RandomForest from Scratch ---
class RandomForest:
    def __init__(self, n_trees=5, max_depth=5, min_samples_split=5):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.trees = []

    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_trees):
            tree = DecisionTree(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
            X_sample, y_sample = self._bootstrap_sample(X, y)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def predict(self, X):
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        return np.mean(tree_preds, axis=0)

    def _bootstrap_sample(self, X, y):
        n_samples = X.shape[0]
        indices = np.random.choice(n_samples, n_samples, replace=True)
        return X[indices], y[indices]

# --- Load Data ---
df = pd.read_csv('/content/Crop_production.csv')

# Drop unnecessary columns
df_clean = df.drop(columns=['Unnamed: 0', 'State_Name'])

# Handle categorical 'Crop_Type' by converting to numerical labels
df_clean['Crop_Type'] = pd.factorize(df_clean['Crop_Type'])[0]

# Features and target for yield prediction
X = df_clean[['Crop_Type', 'N', 'P', 'K', 'pH', 'rainfall', 'temperature', 'Area_in_hectares']].values
y = df_clean['Yield_ton_per_hec'].values

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Train RandomForest from Scratch ---
rf = RandomForest(n_trees=5, max_depth=5)  # Reduced n_trees and max_depth for simplicity
rf.fit(X_train, y_train)

# --- Make Predictions ---
y_pred = rf.predict(X_test)

# --- Calculate Metrics ---
mse = np.mean((y_test - y_pred) ** 2)
r2 = r2_score(y_test, y_pred)

# --- Calculate MAPE ---
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

# --- Calculate MAPE-based Accuracy ---
accuracy = 100 - mape

# --- Print Results ---
print(f"Mean Squared Error: {mse}")
print(f"R-squared (R²) Score: {r2}")
print(f"Mean Absolute Percentage Error (MAPE): {mape}%")
print(f"Regression Accuracy (based on MAPE): {accuracy}%")


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Mean Squared Error: 406.0644619613484
R-squared (R²) Score: -0.07342058249526473
Mean Absolute Percentage Error (MAPE): inf%
Regression Accuracy (based on MAPE): -inf%


  mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100


In [7]:
# --- Calculate MAPE with Handling for Zero y_test Values ---
# Filter out zero actual values to avoid division by zero in MAPE calculation
non_zero_indices = y_test != 0
y_test_non_zero = y_test[non_zero_indices]
y_pred_non_zero = y_pred[non_zero_indices]

# If there are any non-zero values, calculate MAPE and accuracy
if len(y_test_non_zero) > 0:
    mape = np.mean(np.abs((y_test_non_zero - y_pred_non_zero) / y_test_non_zero)) * 100
    accuracy = 100 - mape
else:
    mape = np.inf
    accuracy = 0

# --- Calculate Metrics ---
mse = np.mean((y_test - y_pred) ** 2)
r2 = r2_score(y_test, y_pred)

# --- Print Results ---
print(f"Mean Squared Error: {mse}")
print(f"R-squared (R²) Score: {r2}")
print(f"Mean Absolute Percentage Error (MAPE): {mape}%")
print(f"Regression Accuracy (based on MAPE): {accuracy}%")


Mean Squared Error: 406.0644619613484
R-squared (R²) Score: -0.07342058249526473
Mean Absolute Percentage Error (MAPE): 242.08420377529674%
Regression Accuracy (based on MAPE): -142.08420377529674%


#Scratch code of Random Forest

In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# --- Decision Tree from Scratch ---
class DecisionTree:
    def __init__(self, max_depth=5, min_samples_split=5):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def fit(self, X, y):
        self.tree = self._grow_tree(X, y)

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.tree) for x in X])

    def _grow_tree(self, X, y, depth=0):
        num_samples, num_features = X.shape
        if (depth >= self.max_depth or num_samples < self.min_samples_split or len(np.unique(y)) == 1):
            return self._most_common_label(y)

        # Randomly select a subset of features
        feature_indices = np.random.choice(num_features, int(np.sqrt(num_features)), replace=False)

        # Find the best split
        best_feature, best_threshold = self._best_split(X, y, feature_indices)

        # Split data and grow children recursively
        left_indices, right_indices = self._split(X[:, best_feature], best_threshold)
        left = self._grow_tree(X[left_indices, :], y[left_indices], depth + 1)
        right = self._grow_tree(X[right_indices, :], y[right_indices], depth + 1)
        return {"feature": best_feature, "threshold": best_threshold, "left": left, "right": right}

    def _best_split(self, X, y, feature_indices):
        best_gain = -1
        split_idx, split_thresh = None, None
        for feature_index in feature_indices:
            X_column = X[:, feature_index]
            thresholds = np.unique(X_column)
            for threshold in thresholds:
                gain = self._information_gain(X_column, y, threshold)
                if gain > best_gain:
                    best_gain = gain
                    split_idx = feature_index
                    split_thresh = threshold
        return split_idx, split_thresh

    def _information_gain(self, X_column, y, split_thresh):
        parent_loss = self._variance(y)
        left_indices, right_indices = self._split(X_column, split_thresh)
        if len(left_indices) == 0 or len(right_indices) == 0:
            return 0
        n, n_left, n_right = len(y), len(left_indices), len(right_indices)
        child_loss = (n_left / n) * self._variance(y[left_indices]) + (n_right / n) * self._variance(y[right_indices])
        return parent_loss - child_loss

    def _variance(self, y):
        return np.var(y)

    def _split(self, X_column, split_thresh):
        left_indices = np.argwhere(X_column <= split_thresh).flatten()
        right_indices = np.argwhere(X_column > split_thresh).flatten()
        return left_indices, right_indices

    def _most_common_label(self, y):
        return np.mean(y)

    def _traverse_tree(self, x, tree):
        if isinstance(tree, dict):
            feature_val = x[tree['feature']]
            if feature_val <= tree['threshold']:
                return self._traverse_tree(x, tree['left'])
            else:
                return self._traverse_tree(x, tree['right'])
        return tree

# --- RandomForest from Scratch ---
class RandomForest:
    def __init__(self, n_trees=5, max_depth=5, min_samples_split=5):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.trees = []

    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_trees):
            tree = DecisionTree(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
            X_sample, y_sample = self._bootstrap_sample(X, y)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def predict(self, X):
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        return np.mean(tree_preds, axis=0)

    def _bootstrap_sample(self, X, y):
        n_samples = X.shape[0]
        indices = np.random.choice(n_samples, n_samples, replace=True)
        return X[indices], y[indices]

# --- Load Data ---
df = pd.read_csv('/content/Crop_production.csv')

# Drop unnecessary columns
df_clean = df.drop(columns=['Unnamed: 0', 'State_Name'])

# Handle categorical 'Crop_Type' by converting to numerical labels
df_clean['Crop_Type'] = pd.factorize(df_clean['Crop_Type'])[0]

# Features and target for yield prediction
X = df_clean[['Crop_Type', 'N', 'P', 'K', 'pH', 'rainfall', 'temperature', 'Area_in_hectares']].values
y = df_clean['Yield_ton_per_hec'].values

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Train RandomForest from Scratch ---
rf = RandomForest(n_trees=5, max_depth=5)  # Reduced n_trees and max_depth
rf.fit(X_train, y_train)

# --- Make Predictions ---
y_pred = rf.predict(X_test)

# --- Calculate Metrics ---
mse = np.mean((y_test - y_pred) ** 2)
r2 = r2_score(y_test, y_pred)

# --- Calculate MAE for Accuracy ---
mae = np.mean(np.abs(y_test - y_pred))
accuracy = 100 - mae

# --- Print Results ---
print(f"Mean Squared Error: {mse}")
print(f"R-squared (R²) Score: {r2}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Regression Accuracy (based on MAE): {accuracy}%")


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Mean Squared Error: 335.33259308177935
R-squared (R²) Score: 0.1135572276951593
Mean Absolute Error (MAE): 2.317430070537241
Regression Accuracy (based on MAE): 97.68256992946276%
