In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as mp
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [6]:
df =pd.read_csv('housing.csv')

In [7]:
df = pd.DataFrame(df)
df.drop(['longitude','latitude'],axis=1)

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...
20635,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [15]:
class DecisionTreeRegressorScratch:
    def __init__(self, max_depth=3, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def _mse(self, y):
        """Calculate Mean Squared Error."""
        return np.var(y) * len(y)

    def _split(self, X, y, feature, threshold):
        """Split dataset based on feature and threshold."""
        left_idx = X[:, feature] <= threshold
        right_idx = X[:, feature] > threshold
        return X[left_idx], X[right_idx], y[left_idx], y[right_idx]

    def _find_best_split(self, X, y):
        """Find the best split for the data."""
        best_feature, best_threshold, best_mse = None, None, float("inf")
        n_samples, n_features = X.shape

        for feature in range(n_features):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                X_left, X_right, y_left, y_right = self._split(X, y, feature, threshold)
                if len(y_left) < self.min_samples_split or len(y_right) < self.min_samples_split:
                    continue
                mse_left, mse_right = self._mse(y_left), self._mse(y_right)
                weighted_mse = (len(y_left) * mse_left + len(y_right) * mse_right) / n_samples
                if weighted_mse < best_mse:
                    best_feature, best_threshold, best_mse = feature, threshold, weighted_mse

        return best_feature, best_threshold

    def _build_tree(self, X, y, depth=0):
        """Recursively build the tree."""
        n_samples, n_features = X.shape

        if depth >= self.max_depth or n_samples < self.min_samples_split or len(np.unique(y)) == 1:
            return {"value": np.mean(y)}

        feature, threshold = self._find_best_split(X, y)
        if feature is None:
            return {"value": np.mean(y)}

        X_left, X_right, y_left, y_right = self._split(X, y, feature, threshold)

        return {
            "feature": feature,
            "threshold": threshold,
            "left": self._build_tree(X_left, y_left, depth + 1),
            "right": self._build_tree(X_right, y_right, depth + 1),
        }

    def fit(self, X, y):
        """Fit the model to the data."""
        self.tree = self._build_tree(X, y)

    def _predict_sample(self, x, tree):
        """Predict a single sample."""
        if "value" in tree:
            return tree["value"]
        if x[tree["feature"]] <= tree["threshold"]:
            return self._predict_sample(x, tree["left"])
        return self._predict_sample(x, tree["right"])

    def predict(self, X):
        """Predict for all samples in X."""
        return np.array([self._predict_sample(x, self.tree) for x in X])

class RandomForestRegressorScratch:
    def __init__(self, n_estimators=10, max_depth=3, min_samples_split=2):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.trees = []

    def _bootstrap_sample(self, X, y):
        """Create a bootstrap sample."""
        n_samples = X.shape[0]
        indices = np.random.choice(n_samples, n_samples, replace=True)
        return X[indices], y[indices]

    def fit(self, X, y):
        """Fit the Random Forest model."""
        for _ in range(self.n_estimators):
            tree = DecisionTreeRegressorScratch(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
            X_sample, y_sample = self._bootstrap_sample(X, y)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def predict(self, X):
        """Predict using the Random Forest model."""
        predictions = np.array([tree.predict(X) for tree in self.trees])
        return np.mean(predictions, axis=0)


# Convert X and y to NumPy arrays


X = df[['total_rooms', 'total_bedrooms','population','median_income']].to_numpy()
y = df['median_house_value'].to_numpy()

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Random Forest model
model = RandomForestRegressorScratch(n_estimators=10, max_depth=5)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)


rmse = np.sqrt(mse)

# Calculate range of the target variable
target_range = y_test.max() - y_test.min()

# Convert MSE to percentage accuracy
percentage_error = (rmse / target_range) * 100
percentage_accuracy = 100 - percentage_error

print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Percentage Error (%):", percentage_error)
print("Percentage Accuracy (%):", percentage_accuracy)




Mean Squared Error (MSE): 6652533764.213474
Root Mean Squared Error (RMSE): 81563.06617712133
Percentage Error (%): 16.817057698137603
Percentage Accuracy (%): 83.1829423018624
