# Random Forest Regression

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

In [3]:
# Reading the encoded dataset into pandas
file_path = '../data/dataset_with_encoded_location.zip'
df = pd.read_csv(file_path, compression='zip')
df.head()


Unnamed: 0,bath,balcony,price,House_size,new_total_sqft,L1,L2,L3,L4,L5,...,L7,L8,L9,L10,L11,L12,L13,L14,L15,L16
0,2,3,62,3,1440,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,1,95,3,1521,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,1,51,2,1200,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,1,63,3,1310,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,2,70,3,1800,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


## Splitting the data

In [4]:
# initializing X and y
X = df.drop(columns='price')
y = df['price']

# splitting the data into tests and trains
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Training the Model

In [7]:
# defining the hyperparameters
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 10, 20],       # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],   # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4]      # Minimum number of samples required at each leaf node
}

# initializing the random forest regressor
model = RandomForestRegressor(random_state=42)

# initializing the gridsearch
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)

# performing the grid search 
grid_search.fit(X_train, y_train)

# Getting the best parameters and the best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

print("Best Parameters:", best_params)

# Making predictions using the best estimator
y_pred = best_estimator.predict(X_test)

Best Parameters: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 300}


## Evaluating the Model's Performance

In [8]:
# Mean Squared error
mse = mean_squared_error(y_test, y_pred)

# Root Mean Squared Error
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# R-Squared
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("RMSE:", rmse)
print("R-squared:", r2)

Mean Squared Error: 8.450318175569452
RMSE: 2.9069430980962547
R-squared: 0.9897097408525412


# Comparing the model's performance to this dataset and the original dataset with group encoding

In [9]:

# Read the cleaned data into a dataframe
df = pd.read_csv('../data/cleaned_df.zip', compression='zip', index_col=0)
df.head()

def group_location(threshold= 0.01):
    '''
    This function takes in a threshold and groups the unique locations whose total number of
    rows/observations does not go meet the set threshold into the general category 'Other'.

    The function returns the result of the value_counts() method of the location column.

    Input:
    threshold - float between 0 and 1 

    Return:
    It returns the unique categories and the total number of values each unique category has


    '''
    counts = df['location'].value_counts(normalize=True)


    # Get the categories that represent less than set threshold
    other_categories = counts[counts < threshold].index

    # Replace these categorwies with 'Other' 
    df['location'] = df['location'].replace(other_categories, 'Other')

    return df['location'].value_counts()

In [10]:
#Encode the categorical column
encoded_df = pd.get_dummies(df, drop_first=True)

# Filter columns that start with 'location_'
encoded_cols = encoded_df.filter(like='location_').columns

# Convert selected columns to integer data type
encoded_df[encoded_cols] = encoded_df[encoded_cols].astype(int)

encoded_df.head()


Unnamed: 0,bath,balcony,price,House_size,new_total_sqft,location_ Devarachikkanahalli,location_ Mysore Highway,location_1st Block BEL Layout,location_1st Block HRBR Layout,location_1st Block Jayanagar,...,location_elachenahalli,location_kadubisnahalli,location_kanakapura road,location_manyata,location_manyata park,location_manyata tech park,location_mvj engineering college,"location_ravindra nagar, T.dasarahalli peenya",location_rr nagar,location_tc.palya
2,2.0,3.0,62.0,3.0,1440.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3.0,1.0,95.0,3.0,1521.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2.0,1.0,51.0,2.0,1200.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,3.0,1.0,63.25,3.0,1310.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,2.0,2.0,70.0,3.0,1800.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Splitting the data

In [11]:
# initializing X and y
X2 = encoded_df.drop(columns='price')
y2 = encoded_df['price']

# splitting the data into tests and trains
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)

## Training the Model 
(This code may take around 10-12 mins to run)

In [None]:
# defining the hyperparameters
param_grid_2 = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [5, 10, 20],       # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],   # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4]      # Minimum number of samples required at each leaf node
}

# initializing the random forest regressor
model_2 = RandomForestRegressor(random_state=42)

# initializing the gridsearch
grid_search = GridSearchCV(estimator=model_2, param_grid=param_grid_2, scoring='neg_mean_squared_error', cv=5)

# performing the grid search 
grid_search.fit(X2_train, y2_train)

# Getting the best parameters and the best estimator
best_params_2 = grid_search.best_params_
best_estimator_2 = grid_search.best_estimator_

print("Best Parameters:", best_params_2)

# Making predictions using the best estimator
y_pred_2 = best_estimator_2.predict(X2_test)

KeyboardInterrupt: 

In [12]:
# Reduced parameter grid with fewer values for initial exploration
param_grid_2 = {
  'n_estimators': [100, 200],  # Number of trees in the forest
  'max_depth': [10, 20],        # Maximum depth of the trees
  'min_samples_split': [2, 5],   # Minimum number of samples required to split a node
  'min_samples_leaf': [1, 2]     # Minimum number of samples required at each leaf node
}

# initializing the random forest regressor
model_2 = RandomForestRegressor(random_state=42)

# Using RandomizedSearchCV for more efficient exploration
from sklearn.model_selection import RandomizedSearchCV

# Randomized search with fewer iterations for initial exploration (can be adjusted)
grid_search = RandomizedSearchCV(estimator=model_2, param_distributions=param_grid_2, scoring='neg_mean_squared_error', cv=5, n_iter=20)

# performing the grid search 
grid_search.fit(X2_train, y2_train)

# Getting the best parameters and the best estimator
best_params_2 = grid_search.best_params_
best_estimator_2 = grid_search.best_estimator_

print("Best Parameters:", best_params_2)

# Making predictions using the best estimator
y_pred_2 = best_estimator_2.predict(X2_test)




Best Parameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 20}


## Evaluating the model's performance

In [13]:
# Mean Squared error
mse_2 = mean_squared_error(y2_test, y_pred_2)

# Root Mean Squared Error
rmse_2 = np.sqrt(mean_squared_error(y2_test, y_pred_2))

# R-Squared
r2_2 = r2_score(y2_test, y_pred_2)

print("Mean Squared Error:", mse_2)
print("RMSE:", rmse_2)
print("R-squared:", r2_2)

Mean Squared Error: 445.4960617476684
RMSE: 21.106777625863888
R-squared: 0.45539700766531677


## Comparing both

In [17]:
# Dataset with grouping encoding on the location column
print("Dataset with grouping encoding on the location column:")
print("Mean Squared Error:", mse_2)
print("RMSE:", rmse_2)
print("R-squared:", r2_2)

# Dataset with the price_range encoding
print("\nDataset with the price_range encoding on the location column:")
print("Mean Squared Error:", mse)
print("RMSE:", rmse)
print("R-squared:", r2)

Dataset with grouping encoding on the location column:
Mean Squared Error: 445.4960617476684
RMSE: 21.106777625863888
R-squared: 0.45539700766531677

Dataset with the price_range encoding on the location column:
Mean Squared Error: 8.450318175569452
RMSE: 2.9069430980962547
R-squared: 0.9897097408525412


## Conclusion

After conducting an extensive grid search using Random Forest Regressor with hyperparameter tuning, the optimal set of hyperparameters for our model was determined to be `{'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 300}.` This configuration resulted in a model that exhibited exceptional performance on our dataset.

The evaluation metrics further support the effectiveness of our model. The `Mean Squared Error (MSE)` was found to be approximately `8.45`, indicating that, on average, the squared difference between predicted and actual values was relatively low. Additionally, the `Root Mean Squared Error (RMSE)` was approximately `2.91`, implying that the average magnitude of errors in our predictions was small. Furthermore, the `R-squared` value of approximately `0.99` indicates that our model explains approximately 99% of the variance in the target variable, suggesting an excellent fit to the data.

In summary, the model performed better on the dataset that had `price range encoding`. The optimized **Random Forest Regressor** model demonstrates outstanding predictive performance, achieving low errors and high explanatory power. These results instill confidence in the model's ability to accurately predict house prices in Bengaluru based on the provided features.