In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Path to the income dataset. Kept relative so the notebook is not tied to one user's
# home directory. Assumes the kernel's working directory is the folder that holds
# victoria_income.csv (the notebooks/ directory) -- adjust if run from elsewhere.
INCOME_DATA_PATH = 'victoria_income.csv'
victoria_income_full_data = pd.read_csv(INCOME_DATA_PATH)

# Column that the models in the following cells are trained to predict.
target_variable = 'Personal income: Median employee income ($) (Data year: 2020)'


In [3]:
# Predictors retained after the correlation analysis.
# NOTE(review): every predictor is itself an income measure closely related to the
# target -- confirm this is intended and is not target leakage.
top_features_selected = [
    'Personal income: Median total income (excl. Government pensions and allowances) ($) (Data year: 2020)',
    'Personal income: Mean employee income ($) (Data year: 2020)',
    'Personal income: Total income (excl. Government pensions and allowances): Highest Quartile (%) (Data year: 2020)',
    'Total personal income: Persons earning $2000-$2999 per week (%) (Data year: 2021)',
    'Median equivalised total household income (weekly) ($) (Data year: 2021)',
]

# Keep only the rows where every selected feature and the target are present.
required_columns = [*top_features_selected, target_variable]
aligned_data_for_target_selected = victoria_income_full_data.dropna(subset=required_columns)

# Feature matrix and target vector taken from the complete rows.
X_aligned_target_selected = aligned_data_for_target_selected[top_features_selected]
y_aligned_target_selected = aligned_data_for_target_selected[target_variable]

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
(
    X_train_selected,
    X_test_selected,
    y_train_selected,
    y_test_selected,
) = train_test_split(
    X_aligned_target_selected,
    y_aligned_target_selected,
    test_size=0.2,
    random_state=42,
)

# Baseline random forest with default hyperparameters (fit() returns the fitted estimator).
model_selected = RandomForestRegressor(random_state=42).fit(X_train_selected, y_train_selected)

# Predictions on both splits so in-sample and held-out error can be compared.
y_train_pred_selected = model_selected.predict(X_train_selected)
y_test_pred_selected = model_selected.predict(X_test_selected)

# MSE and R^2 per split.
# NOTE(review): the recorded run shows train R^2 ~0.99 vs test R^2 ~0.87, which
# suggests the baseline overfits the training split.
train_mse_selected, train_r2_selected = (
    mean_squared_error(y_train_selected, y_train_pred_selected),
    r2_score(y_train_selected, y_train_pred_selected),
)
test_mse_selected, test_r2_selected = (
    mean_squared_error(y_test_selected, y_test_pred_selected),
    r2_score(y_test_selected, y_test_pred_selected),
)

train_mse_selected, train_r2_selected, test_mse_selected, test_r2_selected


(np.float64(394156.9670947789),
 0.9921061987398571,
 np.float64(6546285.964082541),
 0.8699512899559063)

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
# Search space for tuning the forest: three candidate values for each of four
# hyperparameters, i.e. 81 combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
rf_model = RandomForestRegressor(random_state=42)

# Exhaustive grid search with 5-fold cross-validation on the training split, ranking
# candidates by (negated) mean squared error and using every available core.
# fit() returns the fitted search object.
grid_search_selected = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
).fit(X_train_selected, y_train_selected)

# Best configuration found by the search.
best_rf_model = grid_search_selected.best_estimator_

# Per-fold R^2 of the tuned configuration on the training split.
# NOTE(review): this reuses the same training data the search was tuned on, so the
# resulting score is likely optimistic compared with the held-out test score below.
cv_scores_selected = cross_val_score(
    best_rf_model,
    X_train_selected,
    y_train_selected,
    scoring='r2',
    cv=5,
)

# Held-out evaluation of the tuned model.
y_test_pred_tuned = best_rf_model.predict(X_test_selected)
test_mse_tuned, test_r2_tuned = (
    mean_squared_error(y_test_selected, y_test_pred_tuned),
    r2_score(y_test_selected, y_test_pred_tuned),
)

cv_scores_selected.mean(), test_mse_tuned, test_r2_tuned


(np.float64(0.9247074522184839),
 np.float64(6569737.859094096),
 0.869485393917897)

In [5]:

# Assumed flat year-on-year growth used to extrapolate beyond the modelled year
# (1.02 == +2% per year). This is a fixed assumption, not something the model learns.
ANNUAL_GROWTH_FACTOR = 1.02
FORECAST_YEARS = (2020, 2021, 2022, 2023)

# Model estimate of the 2020 target for every complete row (training and test rows alike).
y_pred_2020_best_model = best_rf_model.predict(X_aligned_target_selected)

# Build one column per year. The first year is the model prediction itself; each later
# year compounds the previous one by the growth factor (done iteratively so the values
# match a step-by-step multiplication exactly).
forecast_columns = {}
running_forecast = y_pred_2020_best_model
for forecast_year in FORECAST_YEARS:
    forecast_columns[f'{forecast_year}_Forecast'] = running_forecast
    running_forecast = running_forecast * ANNUAL_GROWTH_FACTOR

# assign() returns a new frame instead of writing into the dropna() result in place,
# which avoids SettingWithCopyWarning and keeps this cell safe to re-run. The name is
# rebound so any later cell still finds the forecast columns on this frame.
aligned_data_for_target_selected = aligned_data_for_target_selected.assign(**forecast_columns)

# Area identifiers plus the four forecast columns, in export order.
forecast_output_best_model = aligned_data_for_target_selected[
    [
        'Statistical Areas Level 2 2021 code',
        'Statistical Areas Level 2 2021 name',
        '2020_Forecast',
        '2021_Forecast',
        '2022_Forecast',
        '2023_Forecast',
    ]
]

# Write the forecasts next to the notebook and echo the path as the cell output.
forecast_output_best_model_path = 'median_income_predictions 2020-2023 final.csv'
forecast_output_best_model.to_csv(forecast_output_best_model_path, index=False)
forecast_output_best_model_path


'median_income_predictions 2020-2023 final.csv'