In [72]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, root_mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import RandomizedSearchCV, cross_val_score


df = pd.read_csv('rentprediction_dataset_v5.csv')

# Target is the 'rent' column; every remaining column is used as a feature.
y = df['rent']
X = df.drop(columns=['rent'])

# Hold out 20% of the rows for testing. The seed is 42, the same value the
# evaluation cells later in this notebook pass to train_test_split, so the
# rows held out here are the same rows those cells score on.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate values sampled by the randomized search.
param_distributions = {
    'n_estimators': np.arange(100, 1001, 100),
    'max_depth': np.arange(1, 101, 5),
    'min_samples_split': np.arange(2, 11, 2),
    'min_samples_leaf': np.arange(1, 11, 2),
}

rfr = RandomForestRegressor(random_state=42)

# Setup RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rfr,
    param_distributions=param_distributions,
    n_iter=100,  # Number of parameter settings sampled
    cv=5,        # 5-fold cross-validation (100 candidates x 5 folds = 500 fits)
    verbose=2,
    random_state=42,
    n_jobs=-1,   # Use all available cores
    scoring='neg_mean_squared_error'  # Optimize for MSE
)

# Fit the search on the training split only. Fitting on the full (X, y) would
# let the held-out test rows influence the chosen hyperparameters and make any
# later test-set score optimistically biased.
random_search.fit(X_train, y_train)

# Retrieve the best parameters and the best model
best_params = random_search.best_params_
best_model = random_search.best_estimator_

# Report the hyperparameters selected by the search
print("Best Hyperparameters:", best_params)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


KeyboardInterrupt: 

In [None]:
# Read the selected estimator and its hyperparameters off the search object.
best_model = random_search.best_estimator_
best_params = random_search.best_params_

# Show the chosen hyperparameters first, then the estimator's repr.
print("Best Hyperparameters:", best_params)
print("Best Model:", best_model)

Best Hyperparameters: {'n_estimators': 700, 'min_samples_split': 4, 'min_samples_leaf': 3, 'max_depth': 81}
Best Model: RandomForestRegressor(max_depth=81, min_samples_leaf=3, min_samples_split=4,
                      n_estimators=700, random_state=42)
Selected Features R-squared value: -1.0950798471999472
Selected Features Mean Squared Error: 3222865.1846061316
Selected Features Mean Squared Error: 1795.2340194543249


In [83]:
from sklearn.metrics import r2_score, mean_squared_error, root_mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

# Reload the dataset and separate the 'rent' target from the remaining columns.
df = pd.read_csv('rentprediction_dataset_v5.csv')
X = df.drop(columns='rent')
y = df['rent']

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Hyperparameters are hard-coded in this cell rather than read from the
# search object.
rfr = RandomForestRegressor(
    n_estimators=700,
    max_depth=81,
    min_samples_split=4,
    min_samples_leaf=3,
    random_state=42,
)
rfr.fit(X_train, y_train)
y_pred = rfr.predict(X_test)

# Score the hold-out predictions with each regression metric.
mape = mean_absolute_percentage_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# NOTE(review): the labels below say "Selected Features", but this cell
# trains on every column of X - confirm whether the wording is intended.
print(f'Selected Features R-squared value: {r2}')
print(f'Selected Features Mean Squared Error: {mse}')
print(f'Selected Features Root Mean Squared Error: {rmse}')
print(f'Selected Features Mean Absolute Error: {mae}')
print(f'Selected Features Mean Absolute Percentage Error: {mape}')

In [None]:
# Importance score reported by the fitted forest for each column of X.
feature_importances = rfr.feature_importances_

# Pair each column name with its score and order the table from the most
# to the least important feature.
importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Keep the names of the 20 highest-ranked features and restrict X to them.
top_20_features = importance_df['Feature'].head(20)
X_20_features = X[top_20_features]

   sqmtr  eucl_dist_to_Roelof Hartplein  eucl_dist_to_Station RAI Amsterdam  \
0   44.0                       0.045183                            0.055306   
1   44.0                       0.122084                            0.116259   
2   50.0                       0.042235                            0.058403   
3   44.0                       0.122062                            0.116238   
4   56.0                       0.046004                            0.056154   

   bathrooms  eucl_dist_to_Elandsgracht  eucl_dist_to_Drenthepark  \
0   1.896561                   0.042444                  0.055840   
1   1.763281                   0.127342                  0.116505   
2   1.226973                   0.028300                  0.059651   
3   1.761819                   0.127318                  0.116484   
4   1.901344                   0.043131                  0.056688   

   eucl_dist_to_Leidseplein  age  eucl_dist_to_Rijksmuseum    elabel    floor  \
0                  0.045605  

In [None]:
# Repeat the hold-out evaluation using only the columns in X_20_features.
X_train, X_test, y_train, y_test = train_test_split(
    X_20_features, y, random_state=42, test_size=0.2
)

# Same hard-coded forest configuration, fitted on the reduced feature set.
rfr = RandomForestRegressor(
    random_state=42,
    n_estimators=700,
    max_depth=81,
    min_samples_leaf=3,
    min_samples_split=4,
).fit(X_train, y_train)
y_pred = rfr.predict(X_test)

# Apply every metric to the same (y_test, y_pred) pair in one pass.
r2, mse, rmse, mae, mape = (
    metric(y_test, y_pred)
    for metric in (
        r2_score,
        mean_squared_error,
        root_mean_squared_error,
        mean_absolute_error,
        mean_absolute_percentage_error,
    )
)

print(f'Selected Features R-squared value: {r2}')
print(f'Selected Features Mean Squared Error: {mse}')
print(f'Selected Features Root Mean Squared Error: {rmse}')
print(f'Selected Features Mean Absolute Error: {mae}')
print(f'Selected Features Mean Absolute Percentage Error: {mape}')

Selected Features R-squared value: 0.8355782306615993
Selected Features Mean Squared Error: 424788.24698409473
Selected Features Root Mean Squared Error: 651.75781313621
Selected Features Mean Absolute Error: 332.2551104391318
Selected Features Mean Absolute Percentage Error: 0.10518863624215763
