In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris  # Example dataset
from sklearn.metrics import *
from sklearn.model_selection import GridSearchCV
import joblib
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Load Data

In [None]:
# Load data

data = joblib.load('/Users/sebastian/Documents/UNI/Kandidat/2. Semester/DSP/Second Part/dsp_project/dsp_project/2. Feature engineering/housing_data_engineered.pkl')

# Inspect Data

## Set display options to show all columns
pd.set_option('display.max_columns', None)

## Now when you display your DataFrame, all columns will be shown
print(data)




     Year_built  Rooms  Balcony  Garden  Parking  Fireplace  Garage  Basement  \
0          1911      3     True    True     True       True    True     False   
1          1930      3    False   False    False      False   False     False   
2          1970      4    False    True     True      False   False     False   
3          1900      4    False    True    False       True   False     False   
4          1970      2    False   False    False      False   False     False   
..          ...    ...      ...     ...      ...        ...     ...       ...   
192        1872      3    False    True     True      False   False     False   
193        1967      4    False    True     True      False    True     False   
194        1954      2    False    True     True      False    True      True   
195        1960      5    False    True     True      False    True     False   
196        1972      2     True   False    False      False   False     False   

     Elevator  Type_Andelsb

# Partition into test and train sets

In [None]:
## Check for near-zero variance

# Calculate the variance of each feature
variance = data.var()

# Set a threshold for near-zero variance 
threshold = 0.01

# Identify features with near-zero variance
near_zero_variance_features = variance[variance < threshold]

# Print the features with near-zero variance
print("Features with near-zero variance:")
print(near_zero_variance_features) # No features with zero or near-zero variance

# Extract features (X) and target (y) from the data
y = np.exp(data['Ask.price_log']) # Transfrom variable back to original state
X = data.drop(columns=['Ask.price_log'])

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)



Features with near-zero variance:
Series([], dtype: float64)


# Make Model

In [None]:
# Make hyperparameter grid for tuning
param_grid = {
    'n_estimators': [50, 100, 200, 500], # Amount of trees
    'max_depth': [None, 10, 20, 30], # How deep the trees in the forest can go
    'min_samples_split': [2, 5, 10], # Specifies the minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4], # Specifies the minimum number of samples required to be at a leaf node.
    'max_features': ['sqrt', 'log2'], # Specifies the method used to select the number of random features to be considered at each split

}


# Initialize Random Forest regressor
rf_regressor = RandomForestRegressor(random_state = 123, # Seed
                                       bootstrap = True) # Small dataset - Use bootstrapping
                                       
# Perform grid search using cross-validation
grid_search = GridSearchCV(estimator=rf_regressor, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)



Best Parameters: {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best Score: -2811159774597.3037


# Make Predictions & Assess Accuracy

In [None]:

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluate MSE
rmse = root_mean_squared_error(y_test, y_pred)
print("RMSE:", rmse) # RMSE 1148518 DKK

# Create DataFrame for y_pred and y_test
df = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})

# Print the DataFrame
print(df)

# Save model

joblib.dump(best_model, 'Full_Random_Forest_Reg.pkl')

RMSE: 1148518.8239029888
        y_test      y_pred
78   3848000.0  3198520.00
26   1195000.0  2408020.00
42   3495000.0  3197580.00
4    1199000.0  2207524.84
167  1695000.0  4402380.00
146  1695000.0  2343880.00
92   5998000.0  3719200.00
64   2795000.0  2807300.00
54   3595000.0  2646960.00
116  2295000.0  2809740.00
19   8995000.0  6108680.00
158  1195000.0  1868550.46
168  1548000.0  1996364.84
43   5995000.0  3965760.00
63   2495000.0  3020120.00
106  1995000.0  2525084.84
112  1595000.0  2800916.82
154  1995000.0  2731976.82
102  4299000.0  5155740.00
162  3695000.0  3764820.00
188  6750000.0  5542184.84
155  1995000.0  2242993.64
169  5495000.0  4040300.00
31   1395000.0  2327720.00
95   4695000.0  3467696.82
176  2998000.0  3003260.00
149  2995000.0  3064456.82
130  4998000.0  3631820.00
160  1750000.0  2686040.00
87   1506016.0  3491268.16
175  4195000.0  3942884.84
38   2850000.0  2912473.64
83   3695000.0  4332280.00
118  5295000.0  3631820.00
110  4195000.0  3916160.00
34 

['Full_Random_Forest_Reg.pkl']

# Assess Feature Importance

In [2]:
# Fit optimal RF model
rf_regressor.fit(X_train, y_train)

# Get feature importances
feature_importances = rf_regressor.feature_importances_

# Sort feature importances in descending order
indices = np.argsort(feature_importances)[::-1]

# Print the feature ranking with names
print("Feature ranking:")
for f in range(X_train.shape[1]):
    feature_name = X_train.columns[indices[f]]  # Get the name of the feature
    print(f"{f + 1}. {feature_name} ({feature_importances[indices[f]]})")

# Plot the feature importances
plt.figure(figsize=(10, 6))
plt.title("Feature Importances")
plt.bar(range(X_train.shape[1]), feature_importances[indices], color="b", align="center")
plt.xticks(range(X_train.shape[1]), X_train.columns[indices], rotation=90)  # Use feature names for xticks
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.show()


NameError: name 'rf_regressor' is not defined

# Make New Model With Top-10 Most Important Features

In [None]:
# Get indices of the top 10 most important features
top_feature_indices = indices[:10]

# Select top 10 for x test and x train
X_train_top_features = X_train.iloc[:, top_feature_indices]
X_test_top_features = X_test.iloc[:, top_feature_indices]

# Print the shape of the new X_train DataFrame to check if selection has been done correctly
print(X_train_top_features.shape)
print(X_test_top_features.shape)

(151, 10)
(38, 10)


# Re-estimate Model

In [None]:
# Make hyperparameter grid for tuning
param_grid = {
    'n_estimators': [50, 100, 200, 500], # Amount of trees
    'max_depth': [None, 10, 20, 30], # How deep the trees in the forest can go
    'min_samples_split': [2, 5, 10], # Specifies the minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4], # Specifies the minimum number of samples required to be at a leaf node.
    'max_features': ['sqrt', 'log2'], # Specifies the method used to select the number of random features to be considered at each split

}


# Initialize Random Forest regressor
rf_regressor = RandomForestRegressor(random_state = 123, # Seed
                                       bootstrap = True) # Small dataset - Use bootstrapping
                                       
# Perform grid search using cross-validation
grid_search = GridSearchCV(estimator=rf_regressor, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train_top_features, y_train) # Change X_train to modified training set

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best Score: -1958377041018.8594


# Assess Model

In [None]:
# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_top_features)

# Evaluate MSE
rmse = root_mean_squared_error(y_test, y_pred)
print("RMSE:", rmse) # RMSE 1373840 DKK

# Create DataFrame for y_pred and y_test
df = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})

# Print the DataFrame
print(df)

# Save model

joblib.dump(best_model, 'Top10_features_Random_Forest_Reg.pkl')

RMSE: 1373840.5070363523
        y_test      y_pred
78   3848000.0  2543340.00
26   1195000.0  1756960.00
42   3495000.0  3865804.84
4    1199000.0  1855640.00
167  1695000.0  4879900.00
146  1695000.0  2214060.00
92   5998000.0  3489600.00
64   2795000.0  2521000.00
54   3595000.0  2848660.00
116  2295000.0  2497180.00
19   8995000.0  4642176.82
158  1195000.0  1322820.00
168  1548000.0  1373680.00
43   5995000.0  7730400.00
63   2495000.0  2402440.00
106  1995000.0  2405709.68
112  1595000.0  2400340.00
154  1995000.0  2372280.00
102  4299000.0  4773080.00
162  3695000.0  3905800.00
188  6750000.0  4074500.00
155  1995000.0  2194020.00
169  5495000.0  3951620.00
31   1395000.0  1566060.00
95   4695000.0  4129193.64
176  2998000.0  2944144.84
149  2995000.0  3476700.00
130  4998000.0  4045996.82
160  1750000.0  2254120.00
87   1506016.0  5263096.96
175  4195000.0  3919300.00
38   2850000.0  2743120.00
83   3695000.0  4731620.00
118  5295000.0  4045996.82
110  4195000.0  4740860.00
34 

['Top10_features_Random_Forest_Reg.pkl']