### Dependancy Imports

In [150]:
# Basic imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime as dt

# Specific imports
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV

### Import Data

In [151]:
# Assign data to variable
dataset = pd.read_csv('HousingData.csv')
dataset = dataset.head(229)

# Convert the date strings into datetime objects
dataset['Date'] = pd.to_datetime(dataset['Date'])

# Convert Date Features to a number format
dataset['Year'] = dataset['Date'].dt.year
dataset['Month'] = dataset['Date'].dt.month

# Take a small look at the dataframe produced
print(dataset.head())
print(dataset.shape)

        Date  Composite_HPI  Single_Family_HPI  One_Storey_HPI  \
0 2005-01-01          100.0              100.0           100.0   
1 2005-02-01          101.0              101.0           101.2   
2 2005-03-01          102.1              102.1           102.4   
3 2005-04-01          103.1              103.2           103.8   
4 2005-05-01          103.8              103.8           104.6   

   Two_Storey_HPI  Townhouse_HPI  Apartment_HPI  Composite_Benchmark  \
0           100.0          100.0          100.0             237700.0   
1           100.9          100.9          101.3             240100.0   
2           101.9          101.7          102.3             242600.0   
3           102.9          102.5          103.1             245100.0   
4           103.4          103.2          104.1             246700.0   

   Single_Family_Benchmark  One_Storey_Benchmark  Two_Storey_Benchmark  \
0                 257600.0              205000.0              300000.0   
1                 2601

### Split data in to features

In [152]:
# Split data into X and y sets
X = dataset[[
    'Year',
    'Month',
    'Composite_HPI',
    'Single_Family_Benchmark',
    'One_Storey_Benchmark',
    'Two_Storey_Benchmark',
    'Townhouse_Benchmark',
    'Apartment_Benchmark'
    ]]

y = dataset['Composite_Benchmark']

### Split the dataset

In [153]:
# Set up your sets into train and test variables
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 0
    )

# Show the user a sample of the dataset for reference
print(f"X-train sample: {X_train.head()}")
print(f"y-train sample: {y_train.head()}")

X-train sample:      Year  Month  Composite_HPI  Single_Family_Benchmark  \
33   2007     10          134.1                 343000.0   
71   2010     12          139.9                 358500.0   
154  2017     11          229.0                 594500.0   
205  2022      2          355.7                 943800.0   
106  2013     11          160.2                 417300.0   

     One_Storey_Benchmark  Two_Storey_Benchmark  Townhouse_Benchmark  \
33               284700.0              388500.0             277200.0   
71               283400.0              413900.0             288800.0   
154              442100.0              703900.0             466200.0   
205              722200.0             1099800.0             754200.0   
106              322400.0              486400.0             312300.0   

     Apartment_Benchmark  
33              239700.0  
71              251900.0  
154             402500.0  
205             567500.0  
106             266400.0  
y-train sample: 33     31870

### Setup Basic Model

In [154]:
# Plant the seeds to grow a basic forest
forest_model = RandomForestRegressor(random_state = 0)

### Hyper Parameters

In [155]:
# Setup parameters
param_dist = {'n_estimators': [
    50,
    100,
    150,
    200,
    250,
    300,
    350,
    400,
    450,
    500,
    550,
    600,
    650,
    700,
    750
    ]}

# Initialize the random search
random_search = RandomizedSearchCV(
    estimator = forest_model,
    param_distributions = param_dist,
    n_iter = 15,
    cv = 5,
    scoring = 'neg_mean_squared_error',
    random_state = 0
)

# Fit the model
random_search.fit(X_train, y_train)

# Store the best selection for later
selected_params = random_search.best_params_

# Show the user the best parameter
print(f"Selected Param: {selected_params}")

Selected Param: {'n_estimators': 350}


### Final Model Set up

In [156]:
# Re-Initialize Random Forest using the selected Params
final_forest = RandomForestRegressor(
    n_estimators = selected_params['n_estimators'],
    random_state = 0
)

# Fit data to the new model
final_forest.fit(X_train, y_train)

# Make prediction to get a score to check
score_predict = final_forest.predict(X_test)

### Scoring

In [157]:
# Pass in your y test data and check it against your y predicted data
mse = mean_squared_error(y_test, score_predict)

# Share with the world!
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 13609729.002661912


### Create Future Data

In [158]:
# Create a dataset for future dates
future_dates = pd.date_range(start='2024-01-01', periods=120, freq='M')
future_data = pd.DataFrame({'Date': future_dates})

# # Create future dataset with date features
future_data['Year'] = future_data['Date'].dt.year
future_data['Month'] = future_data['Date'].dt.month

both_sets = [dataset, future_data]

# Add the old data to the new set
new_data = pd.concat(both_sets)

### Linear Regression to backfill the Nulls

In [159]:
# Identify the missing data columns
missing_data_cols = [
    'Composite_HPI',
    'Single_Family_Benchmark',
    'One_Storey_Benchmark',
    'Two_Storey_Benchmark',
    'Townhouse_Benchmark',
    'Apartment_Benchmark'
]

# Iterate over the list of columns and predict the future for each
for col in missing_data_cols:
  print(col)

  missing_data = new_data[
      new_data[col].isnull()
      ]

  completed_data = new_data.dropna(subset = [col])

  # Break out the X and y sets for the backfill model
  # X_backfill = completed_data.select_dtypes(
  #     include = ['float64', 'int64']
  #     ).drop(
  #         columns = missing_data_cols
  #         )

  X_backfill = new_data[col].dropna()
  X_backfill = X_backfill.array.reshape(-1, 1)

  y_backfill = completed_data[col]

  # Instatiate a Linear Regression Model
  backfill_model = LinearRegression()

  # Fit the backfill model
  backfill_model.fit(X_backfill, y_backfill)

  # Predict what the missing values would be
  X_missing = missing_data[col].replace([np.nan], 0)
  X_missing = X_missing.array.reshape(-1, 1)

  imputer = SimpleImputer(
      strategy = 'mean',
      # missing_values=np.nan
      )

  X_missing_transformed = imputer.fit_transform(X_missing)

  predicted_future_data = backfill_model.predict(X_missing_transformed)

  print(predicted_future_data)

  # # Add the predicted data to the OG new_data object
  # new_data.iloc[
  #     missing_data.index,
  #     col
  #     ] = predicted_future_data


Composite_HPI
[2.84217094e-14 2.84217094e-14 2.84217094e-14 2.84217094e-14
 2.84217094e-14 2.84217094e-14 2.84217094e-14 2.84217094e-14
 2.84217094e-14 2.84217094e-14 2.84217094e-14 2.84217094e-14
 2.84217094e-14 2.84217094e-14 2.84217094e-14 2.84217094e-14
 2.84217094e-14 2.84217094e-14 2.84217094e-14 2.84217094e-14
 2.84217094e-14 2.84217094e-14 2.84217094e-14 2.84217094e-14
 2.84217094e-14 2.84217094e-14 2.84217094e-14 2.84217094e-14
 2.84217094e-14 2.84217094e-14 2.84217094e-14 2.84217094e-14
 2.84217094e-14 2.84217094e-14 2.84217094e-14 2.84217094e-14
 2.84217094e-14 2.84217094e-14 2.84217094e-14 2.84217094e-14
 2.84217094e-14 2.84217094e-14 2.84217094e-14 2.84217094e-14
 2.84217094e-14 2.84217094e-14 2.84217094e-14 2.84217094e-14
 2.84217094e-14 2.84217094e-14 2.84217094e-14 2.84217094e-14
 2.84217094e-14 2.84217094e-14 2.84217094e-14 2.84217094e-14
 2.84217094e-14 2.84217094e-14 2.84217094e-14 2.84217094e-14
 2.84217094e-14 2.84217094e-14 2.84217094e-14 2.84217094e-14
 2.8421709

### Predict Outcome

In [161]:
# Grab features for prediction
features_for_prediction = new_data[[
    'Year',
    'Month',
    'Composite_HPI',
    'Single_Family_Benchmark',
    'One_Storey_Benchmark',
    'Two_Storey_Benchmark',
    'Townhouse_Benchmark',
    'Apartment_Benchmark'
    ]]

# Print a sample of the normalized future dataset
print(f"Normalized Future Data:\n {features_for_prediction}")

# Predict future prices
future_predictions = final_forest.predict(features_for_prediction)

print(future_predictions)

# Add predicted prices to future_data
future_data['Composite_Benchmark_Predicted'] = future_predictions

Normalized Future Data:
      Year  Month  Composite_HPI  Single_Family_Benchmark  \
0    2005      1          100.0                 257600.0   
1    2005      2          101.0                 260100.0   
2    2005      3          102.1                 263000.0   
3    2005      4          103.1                 265800.0   
4    2005      5          103.8                 267500.0   
..    ...    ...            ...                      ...   
115  2033      8            NaN                      NaN   
116  2033      9            NaN                      NaN   
117  2033     10            NaN                      NaN   
118  2033     11            NaN                      NaN   
119  2033     12            NaN                      NaN   

     One_Storey_Benchmark  Two_Storey_Benchmark  Townhouse_Benchmark  \
0                205000.0              300000.0             199600.0   
1                207400.0              302700.0             201400.0   
2                210000.0             

ValueError: Input X contains NaN.
RandomForestRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

### Graphs

In [None]:
# Plot the predicted Composite Benchmark prices over time
plt.plot(
    future_data['Date'],
    future_data['Composite_Benchmark_Predicted'],
    label='Predicted Prices'
    )

# Set up the axis
plt.xlabel('Year')
plt.ylabel('Predicted Prices')
plt.title('Next Ten Years Predicted Composite Benchmark Prices')

# Make the graph look nice
plt.tight_layout()
plt.legend()
plt.grid(True)

# Save the graph for a rainy day
plt.savefig('PredictionGraph.svg', format='svg', bbox_inches='tight')

# Display the graph to the end user
plt.show()