In [1]:
import numpy as np 
import pandas as pd 
from matplotlib import pyplot as plt 

In [2]:
df1 = pd.read_csv('rdfn_wa_county_updated_.csv')
df2 = pd.read_csv('4_regimes_monthly_updated_.csv')

In [11]:
def model_ridge_reg(lookback, lookforward, df1):
    import numpy as np 
    features = []
    for i in df1.keys():
        if i not in ['Region', 'Month of Period End']:
            features.append(i)
    #df1 = history[['open', 'high', 'low', 'close', 'volume']].reset_index(drop=True)
    df1 = df1[features].reset_index(drop=True)

    #function to split X
    def ts_split(ts, feature_steps, target_steps):
        n_obs = len(ts) - feature_steps - target_steps + 1
        X = np.array([ts[idx:idx + feature_steps] for idx in range(n_obs)])
        return X
    # Forecasting target t+n days ahead
    def ts_split_y_n(ts, feature_steps, target_steps):
        n_obs = len(ts) - feature_steps - target_steps + 1
        y = np.array([ts[idx + feature_steps + target_steps - 1]
                    for idx in range(n_obs)])
        return y
    # Forecasting target t+1 ahead
    def ts_split_y_1(ts, feature_steps, target_steps):
        n_obs = len(ts) - feature_steps - target_steps + 1
        y = np.array([ts[idx + feature_steps:idx + feature_steps + target_steps]
                    for idx in range(n_obs)])
        return y

    value_features = lookback #we use last 6 months
    value_target = lookforward #to forecast 1 month median sale price
    #split the data into features with correct format

    df1_features_dict = {}  # Dictionary to hold the dynamically created variables
    for feature in features:
        feature_name = f"df1_{feature}"  # Dynamically create a variable name
        df1_features_dict[feature_name] = ts_split(np.array(df1[feature]), feature_steps=value_features, target_steps=value_target)

    #coin_high = ts_split(np.array(df1['high']), feature_steps = value_features, target_steps = value_target)
    #coin_low = ts_split(np.array(df1['low']), feature_steps = value_features, target_steps = value_target)
    #coin_close = ts_split(np.array(df1['close']), feature_steps = value_features, target_steps = value_target)
    #coin_vol = ts_split(np.array(df1['volume']), feature_steps = value_features, target_steps = value_target)

    #Target var of predicting price t+1 month ahead
    y_coin = ts_split_y_1(np.array(df1['Median Sale Price']), feature_steps = value_features, target_steps = value_target)

    #create the features matrix
    #combined_array = np.stack((coin_open, coin_high , coin_low, coin_close, coin_vol), axis=1)
    combined_array = np.stack(list(df1_features_dict.values()), axis=1)
    X = combined_array

    from sklearn.preprocessing import StandardScaler

    # Determine the size of the validation set
    train_idx = round(len(X)*0.5)
    X_train_full= X[:train_idx] ## all same for each model

    #test data for X and y targets
    X_test= X[train_idx:]
    y_test_coin = y_coin[train_idx:]

    val_idx = round(train_idx * 0.1)  # 10% of the training set
    train_idx_final = train_idx - val_idx

    # Split the training data into training and validation sets
    X_train = X_train_full[:train_idx_final]
    X_val = X_train_full[train_idx_final:train_idx]

    #train and validation for targets
    y_train_coin = y_coin[:train_idx_final]
    y_val_coin = y_coin[train_idx_final:train_idx]

    #reshape and convert all into pandas series
    X_train_reshaped = X_train.reshape(X_train.shape[0], -1)
    X_train_reshaped = pd.DataFrame(X_train_reshaped)
    X_val_reshaped = X_val.reshape(X_val.shape[0], -1)
    X_val_reshaped = pd.DataFrame(X_val_reshaped)

    y_train_coin = pd.DataFrame(y_train_coin)
    y_val_coin = pd.DataFrame(y_val_coin)

    #reshape  test
    X_test_reshaped = X_test.reshape(X_test.shape[0], -1)
    X_test_reshaped = pd.DataFrame(X_test_reshaped)

    y_test_coin_reshaped = y_test_coin.reshape(y_test_coin.shape[0], -1)
    y_test_coin_reshaped = pd.DataFrame(y_test_coin_reshaped)

    # Initialize the StandardScaler
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_reshaped)
    X_val_scaled = scaler.transform(X_val_reshaped)
    X_test_scaled = scaler.transform(X_test_reshaped)

    from sklearn.linear_model import Ridge, ElasticNet
    from sklearn.metrics import mean_squared_error
    import numpy as np
    import matplotlib.pyplot as plt
    # Create and train the Linear Regression model
    ridge_reg = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42)
    model_ridge = ridge_reg.fit(X_train_scaled, y_train_coin.values.ravel())

    return model_ridge, X_train_scaled, X_test_scaled, y_train_coin.values, y_test_coin_reshaped


In [14]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error

df_train = df1[:-12]
lookback = 24
lookforward = 1
model_ridge, X_train, X_test, y_train, y_test = model_ridge_reg(lookback, lookforward, df_train)
#X_train, X_test, y_train, y_test = train_test_split(model_ridge, y, test_size=0.5, random_state=42)

# Define the hyperparameter grid (tuning the regularization strength)
alpha_vals = [0.01, 0.05,0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 1, 10, 100]
#param_grid = {'alpha': alpha_vals}
param_grid = {
    'alpha': alpha_vals,  # Prior for the precision of the weights
    'l1_ratio': alpha_vals  # Prior for the precision of the noise
}

# Initialize GridSearchCV to search for the best hyperparameter
grid_search = GridSearchCV(estimator=model_ridge, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)
best_ridge = grid_search.best_estimator_

# Print the best hyperparameter
print(f"Best Ridge alpha: {grid_search.best_params_}")

# Make predictions and evaluate on the test set
y_pred = best_ridge.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Ridge Regression MSE: {mse}")

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Best Ridge alpha: {'alpha': 0.01, 'l1_ratio': 0.1}
Ridge Regression MSE: 1179268748.2615354


  model = cd_fast.enet_coordinate_descent(


In [5]:
def model_lasso_reg(lookback, lookforward, df1):
    import numpy as np 
    features = []
    for i in df1.keys():
        if i not in ['Region', 'Month of Period End']:
            features.append(i)
    #df1 = history[['open', 'high', 'low', 'close', 'volume']].reset_index(drop=True)
    df1 = df1[features].reset_index(drop=True)

    #function to split X
    def ts_split(ts, feature_steps, target_steps):
        n_obs = len(ts) - feature_steps - target_steps + 1
        X = np.array([ts[idx:idx + feature_steps] for idx in range(n_obs)])
        return X
    # Forecasting target t+n days ahead
    def ts_split_y_n(ts, feature_steps, target_steps):
        n_obs = len(ts) - feature_steps - target_steps + 1
        y = np.array([ts[idx + feature_steps + target_steps - 1]
                    for idx in range(n_obs)])
        return y
    # Forecasting target t+1 ahead
    def ts_split_y_1(ts, feature_steps, target_steps):
        n_obs = len(ts) - feature_steps - target_steps + 1
        y = np.array([ts[idx + feature_steps:idx + feature_steps + target_steps]
                    for idx in range(n_obs)])
        return y

    value_features = lookback #we use last 6 months
    value_target = lookforward #to forecast 1 month median sale price
    #split the data into features with correct format

    df1_features_dict = {}  # Dictionary to hold the dynamically created variables
    for feature in features:
        feature_name = f"df1_{feature}"  # Dynamically create a variable name
        df1_features_dict[feature_name] = ts_split(np.array(df1[feature]), feature_steps=value_features, target_steps=value_target)

    #coin_high = ts_split(np.array(df1['high']), feature_steps = value_features, target_steps = value_target)
    #coin_low = ts_split(np.array(df1['low']), feature_steps = value_features, target_steps = value_target)
    #coin_close = ts_split(np.array(df1['close']), feature_steps = value_features, target_steps = value_target)
    #coin_vol = ts_split(np.array(df1['volume']), feature_steps = value_features, target_steps = value_target)

    #Target var of predicting price t+1 month ahead
    y_coin = ts_split_y_1(np.array(df1['Median Sale Price']), feature_steps = value_features, target_steps = value_target)

    #create the features matrix
    #combined_array = np.stack((coin_open, coin_high , coin_low, coin_close, coin_vol), axis=1)
    combined_array = np.stack(list(df1_features_dict.values()), axis=1)
    X = combined_array

    from sklearn.preprocessing import StandardScaler

    # Determine the size of the validation set
    train_idx = round(len(X)*0.5)
    X_train_full= X[:train_idx] ## all same for each model

    #test data for X and y targets
    X_test= X[train_idx:]
    y_test_coin = y_coin[train_idx:]

    val_idx = round(train_idx * 0.1)  # 10% of the training set
    train_idx_final = train_idx - val_idx

    # Split the training data into training and validation sets
    X_train = X_train_full[:train_idx_final]
    X_val = X_train_full[train_idx_final:train_idx]

    #train and validation for targets
    y_train_coin = y_coin[:train_idx_final]
    y_val_coin = y_coin[train_idx_final:train_idx]

    #reshape and convert all into pandas series
    X_train_reshaped = X_train.reshape(X_train.shape[0], -1)
    X_train_reshaped = pd.DataFrame(X_train_reshaped)
    X_val_reshaped = X_val.reshape(X_val.shape[0], -1)
    X_val_reshaped = pd.DataFrame(X_val_reshaped)

    y_train_coin = pd.DataFrame(y_train_coin)
    y_val_coin = pd.DataFrame(y_val_coin)

    #reshape  test
    X_test_reshaped = X_test.reshape(X_test.shape[0], -1)
    X_test_reshaped = pd.DataFrame(X_test_reshaped)

    y_test_coin_reshaped = y_test_coin.reshape(y_test_coin.shape[0], -1)
    y_test_coin_reshaped = pd.DataFrame(y_test_coin_reshaped)

    # Initialize the StandardScaler
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_reshaped)
    X_val_scaled = scaler.transform(X_val_reshaped)
    X_test_scaled = scaler.transform(X_test_reshaped)

    from sklearn.linear_model import Lasso
    from sklearn.metrics import mean_squared_error
    import numpy as np
    import matplotlib.pyplot as plt
    # Create and train the Linear Regression model
    lasso_reg = Lasso(alpha=0.01)
    model_lasso = lasso_reg.fit(X_train_scaled, y_train_coin.values.ravel())

    # Predict on the validation and test sets
    #y_val_pred = model_lasso.predict(X_val_scaled)
    #y_test_pred_lasso = model_lasso.predict(X_test_scaled)

    return model_lasso, X_train_scaled, X_test_scaled, y_train_coin.values, y_test_coin_reshaped


In [6]:
df_train = df1[:-12]
lookback = 24
lookforward = 1
model_lasso, X_train, X_test, y_train, y_test = model_lasso_reg(lookback, lookforward, df_train)
# Define the hyperparameter grid (tuning the alpha regularization parameter)
alpha_vals = [0.01,0.05,0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 1, 10, 100]
param_grid = {'alpha': alpha_vals}

# Initialize GridSearchCV to search for the best alpha parameter
grid_search = GridSearchCV(estimator=model_lasso, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the model using GridSearchCV
grid_search.fit(X_train, y_train)

# Retrieve the best model after hyperparameter tuning
best_lasso = grid_search.best_estimator_

# Print the best hyperparameter
print(f"Best Lasso alpha: {grid_search.best_params_}")

# Make predictions using the best Lasso model
y_pred = best_lasso.predict(X_test)

# Evaluate the model performance using Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f"Lasso Regression MSE: {mse}")

Best Lasso alpha: {'alpha': 10}
Lasso Regression MSE: 1115454861.4587917


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [7]:
def model_bayes_reg(lookback, lookforward, df1):
    import numpy as np 
    features = []
    for i in df1.keys():
        if i not in ['Region', 'Month of Period End']:
            features.append(i)
    #df1 = history[['open', 'high', 'low', 'close', 'volume']].reset_index(drop=True)
    df1 = df1[features].reset_index(drop=True)

    #function to split X
    def ts_split(ts, feature_steps, target_steps):
        n_obs = len(ts) - feature_steps - target_steps + 1
        X = np.array([ts[idx:idx + feature_steps] for idx in range(n_obs)])
        return X
    # Forecasting target t+n days ahead
    def ts_split_y_n(ts, feature_steps, target_steps):
        n_obs = len(ts) - feature_steps - target_steps + 1
        y = np.array([ts[idx + feature_steps + target_steps - 1]
                    for idx in range(n_obs)])
        return y
    # Forecasting target t+1 ahead
    def ts_split_y_1(ts, feature_steps, target_steps):
        n_obs = len(ts) - feature_steps - target_steps + 1
        y = np.array([ts[idx + feature_steps:idx + feature_steps + target_steps]
                    for idx in range(n_obs)])
        return y

    value_features = lookback #we use last 6 months
    value_target = lookforward #to forecast 1 month median sale price
    #split the data into features with correct format

    df1_features_dict = {}  # Dictionary to hold the dynamically created variables
    for feature in features:
        feature_name = f"df1_{feature}"  # Dynamically create a variable name
        df1_features_dict[feature_name] = ts_split(np.array(df1[feature]), feature_steps=value_features, target_steps=value_target)

    #coin_high = ts_split(np.array(df1['high']), feature_steps = value_features, target_steps = value_target)
    #coin_low = ts_split(np.array(df1['low']), feature_steps = value_features, target_steps = value_target)
    #coin_close = ts_split(np.array(df1['close']), feature_steps = value_features, target_steps = value_target)
    #coin_vol = ts_split(np.array(df1['volume']), feature_steps = value_features, target_steps = value_target)

    #Target var of predicting price t+1 month ahead
    y_coin = ts_split_y_1(np.array(df1['Median Sale Price']), feature_steps = value_features, target_steps = value_target)

    #create the features matrix
    #combined_array = np.stack((coin_open, coin_high , coin_low, coin_close, coin_vol), axis=1)
    combined_array = np.stack(list(df1_features_dict.values()), axis=1)
    X = combined_array

    from sklearn.preprocessing import StandardScaler

    # Determine the size of the validation set
    train_idx = round(len(X)*0.5)
    X_train_full= X[:train_idx] ## all same for each model

    #test data for X and y targets
    X_test= X[train_idx:]
    y_test_coin = y_coin[train_idx:]

    val_idx = round(train_idx * 0.1)  # 10% of the training set
    train_idx_final = train_idx - val_idx

    # Split the training data into training and validation sets
    X_train = X_train_full[:train_idx_final]
    X_val = X_train_full[train_idx_final:train_idx]

    #train and validation for targets
    y_train_coin = y_coin[:train_idx_final]
    y_val_coin = y_coin[train_idx_final:train_idx]

    #reshape and convert all into pandas series
    X_train_reshaped = X_train.reshape(X_train.shape[0], -1)
    X_train_reshaped = pd.DataFrame(X_train_reshaped)
    X_val_reshaped = X_val.reshape(X_val.shape[0], -1)
    X_val_reshaped = pd.DataFrame(X_val_reshaped)

    y_train_coin = pd.DataFrame(y_train_coin)
    y_val_coin = pd.DataFrame(y_val_coin)

    #reshape  test
    X_test_reshaped = X_test.reshape(X_test.shape[0], -1)
    X_test_reshaped = pd.DataFrame(X_test_reshaped)

    y_test_coin_reshaped = y_test_coin.reshape(y_test_coin.shape[0], -1)
    y_test_coin_reshaped = pd.DataFrame(y_test_coin_reshaped)

    # Initialize the StandardScaler
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_reshaped)
    X_val_scaled = scaler.transform(X_val_reshaped)
    X_test_scaled = scaler.transform(X_test_reshaped)

    from sklearn.linear_model import BayesianRidge
    import numpy as np
    import matplotlib.pyplot as plt

    # Create and train the Bayesian Ridge Regression model
    bayesian_reg = BayesianRidge()
    model_bayes = bayesian_reg.fit(X_train_scaled, y_train_coin.values.ravel())

    return model_bayes, X_train_scaled, X_test_scaled, y_train_coin.values, y_test_coin_reshaped


In [8]:
df_train = df1[:-12]
lookback = 24
lookforward = 1
model_bayes, X_train, X_test, y_train, y_test = model_bayes_reg(lookback, lookforward, df_train)

param_grid = {
    'alpha_1': [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3],  # Prior for the precision of the weights
    'lambda_1': [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3]  # Prior for the precision of the noise
}

# Initialize GridSearchCV to search for the best hyperparameters
grid_search = GridSearchCV(estimator=model_bayes, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the model using GridSearchCV
grid_search.fit(X_train, y_train)

# Retrieve the best model after hyperparameter tuning
best_bayes_ridge = grid_search.best_estimator_

# Print the best hyperparameters
print(f"Best Bayesian Ridge hyperparameters: {grid_search.best_params_}")

# Make predictions using the best Bayesian Ridge model
y_pred = best_bayes_ridge.predict(X_test)

# Evaluate the model performance using Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f"Bayesian Ridge Regression MSE: {mse}")

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Best Bayesian Ridge hyperparameters: {'alpha_1': 1e-08, 'lambda_1': 1e-08}
Bayesian Ridge Regression MSE: 4057034776.1400814


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
