In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from darts import TimeSeries
from darts.metrics import smape
# Stable alias for the darts metric: the notebook later defines its own Keras
# `smape` function, which rebinds the plain name.
from darts.metrics import smape as darts_smape
from keras import backend as K
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.layers import LSTM, Dense, BatchNormalization, Dropout, Bidirectional
from keras.optimizers import Adam
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential

In [None]:
# Read the submarket panel; the CSV's first column is used as the row index.
df = pd.read_csv(
    '/home/zqiao/data_flake/imputed data/pho_t_data.csv',
    index_col=0,
)
df.head()

In [None]:
df.shape

In [None]:
# Parse dates once on the source frame, then take an explicit copy of the
# key + target columns. Without .copy(), later column assignments on df_new
# write to a slice of df and raise SettingWithCopyWarning.
df['date'] = pd.to_datetime(df['date'])
df_new = df[['date', 'research_submkt_id', 'real_hedonic_rent_submarket']].copy()

In [None]:
num_lags = 36

# groupby().shift() lags by row position, so each submarket's rows must be in
# date order *before* shifting; otherwise "N months ago" can point at the wrong row.
df_new = df_new.sort_values(['research_submkt_id', 'date'])
rent_by_submkt = df_new.groupby('research_submkt_id')['real_hedonic_rent_submarket']
df_new = df_new.assign(
    **{f'rent_{lag}months_ago': rent_by_submkt.shift(lag) for lag in range(1, num_lags + 1)}
)

# The first num_lags rows of every submarket have incomplete lag history.
df_new = df_new.dropna()
df_new = df_new.sort_values(['date', 'research_submkt_id']).reset_index(drop=True)

df_new.head()

In [None]:
# Keys, target and the wide set of candidate driver columns.
# Each column is listed exactly once: selecting the same label twice would
# produce a frame with duplicate column names, which breaks label-based access.
df_new = df[['date',
             'research_submkt_id',
             'real_hedonic_rent_submarket',
             'tech_employment_histfc',
             'real_market_level_rent',
             'nominal_earnings_byresidence_histfc',
             'gdp_histfc',
             'manufacturing_employment_histfc',
             'population_histfc',
             'real_bricks_and_mortar_retail_sales',
             'compltn_rate',
             'imports_us',
             'nominal_retail_sales_histfc',
             'real_retail_sales_ex_gas',
             'unemployment_rate_histfc',
             'median_sfh_sale_price_histfc',
             'baa_credit_spreads',
             'employment_histfc',
             'real_ecommerce',
             'spread_3m10y',
             'ecomm^2_pop',
             'weighted_pop_estimate_cryr',
             'weighted_hh_estimate_cryr']]

df_new.head()

In [None]:
# Narrow the panel to the keys, the rent target (third column) and the chosen drivers.
df_new = df.loc[:, [
    'date',
    'research_submkt_id',
    'real_hedonic_rent_submarket',
    'real_market_level_rent',
    'gdp_histfc',
    'nominal_retail_sales_histfc',
    'employment_histfc',
    'real_ecommerce',
    'spread_3m10y',
    'real_retail_sales_ex_gas',
    'imports_us',
    'ecomm^2_pop',
    'weighted_pop_estimate_cryr',
    'weighted_hh_estimate_cryr',
]]
df_new.head()

In [None]:
# Positional split: columns 0-1 are the keys and column 2 is the target.
# X = keys + every column after the target; Y = keys + target.
X = df_new.iloc[:, np.r_[0:2, 3:df_new.shape[1]]]
Y = df_new.iloc[:, 0:3]

In [None]:
X

In [None]:
Y

In [None]:
# One scaler per role. Re-fitting a single scaler on every feature column would
# leave it fitted on the last feature, so it could no longer invert rent predictions.
# `scaler` stays the name of the target scaler; `feature_scaler` handles the drivers.
# NOTE(review): both scalers are fitted on every row, including periods that are
# later held out for testing - confirm whether that look-ahead is acceptable.
scaler = MinMaxScaler(feature_range=(0, 1))
feature_scaler = MinMaxScaler(feature_range=(0, 1))

Y_scaled = Y.copy()
Y_scaled['real_hedonic_rent_submarket'] = scaler.fit_transform(
    Y['real_hedonic_rent_submarket'].values.reshape(-1, 1)
).ravel()

# The first two columns of X are the date / submarket keys and are left unscaled.
# MinMaxScaler scales each column independently, so a single fit over the whole
# feature matrix gives the same values as scaling column by column.
feature_cols = X.columns[2:]
X_scaled = X.copy()
if len(feature_cols):
    X_scaled[feature_cols] = feature_scaler.fit_transform(X[feature_cols].to_numpy())

In [None]:
X_scaled

In [None]:
Y_scaled

In [None]:
def all_split_data_by_submarket(data, ntest, num_lags, submkt_id):
    """Split one submarket's rows into train / test / warm-up+test frames.

    Rows are split by position, so `data` is expected to already be in
    chronological order for the submarket.

    Parameters
    ----------
    data : pandas.DataFrame
        Panel containing a 'research_submkt_id' column.
    ntest : int
        Number of trailing rows to hold out as the test set (0 is allowed).
    num_lags : int
        Number of rows immediately before the test set to prepend to it, so
        that a full look-back window exists for the first test row.
    submkt_id
        Value of 'research_submkt_id' to select.

    Returns
    -------
    (train_data, test_data, seq_pred_train_data) : tuple of pandas.DataFrame
        Independent copies: everything before the test rows, the last `ntest`
        rows, and the last `num_lags + ntest` rows respectively.

    Raises
    ------
    ValueError
        If `ntest` or `num_lags` is negative.
    """
    if ntest < 0 or num_lags < 0:
        raise ValueError('ntest and num_lags must be non-negative')

    submarket_data = data[data['research_submkt_id'] == submkt_id]

    # Use explicit positive offsets: negative slicing breaks for ntest == 0,
    # because iloc[:-0] is empty and iloc[-0:] is the whole frame.
    split_at = max(len(submarket_data) - ntest, 0)
    warmup_start = max(split_at - num_lags, 0)

    # Copies let callers attach columns (e.g. predictions) without writing to a slice.
    train_data = submarket_data.iloc[:split_at].copy()
    test_data = submarket_data.iloc[split_at:].copy()
    seq_pred_train_data = submarket_data.iloc[warmup_start:].copy()
    return train_data, test_data, seq_pred_train_data

In [None]:
# Split the scaled frames for one submarket. The helper takes
# (data, ntest, num_lags, submkt_id) and returns three frames, so all four
# arguments are passed and all three results are unpacked.
# Last 24 rows = test window; 12 warm-up rows are kept in front of it.
Y_train, Y_test, Y_seq_test = all_split_data_by_submarket(Y_scaled, 24, 12, 'PHO037')
y_train = Y_train.iloc[:, -1]
y_test = Y_test.iloc[:, -1]
y_seq_test = Y_seq_test.iloc[:, -1]
X_train, X_test, X_seq_test = all_split_data_by_submarket(X_scaled, 24, 12, 'PHO037')
x_train = X_train.iloc[:, 2:]
x_test = X_test.iloc[:, 2:]
x_seq_test = X_seq_test.iloc[:, 2:]


In [None]:
# Split the unscaled target and feature frames for submarket PHO037:
# 24 trailing rows form the test set, with 12 warm-up rows kept for the *_seq_test frames.
Y_train, Y_test, Y_seq_test = all_split_data_by_submarket(Y, 24, 12, 'PHO037')
X_train, X_test, X_seq_test = all_split_data_by_submarket(X, 24, 12, 'PHO037')

# Target series = last column of each Y frame.
y_train, y_test, y_seq_test = (
    part.iloc[:, -1] for part in (Y_train, Y_test, Y_seq_test)
)
# Model inputs = every column after the two key columns of each X frame.
x_train, x_test, x_seq_test = (
    part.iloc[:, 2:] for part in (X_train, X_test, X_seq_test)
)

In [None]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape, x_seq_test.shape, y_seq_test.shape

In [None]:
x_seq_test

In [None]:
def create_sequences(data, labels, num_lags):
    """Build sliding look-back windows and their next-step labels.

    For every position i from `num_lags` to the end of `data`, the window is
    rows i-num_lags .. i-1 of `data` and the label is `labels` at position i.

    Returns a pair of numpy arrays (windows, labels).
    """
    window_ends = range(num_lags, len(data))
    windows = [data.iloc[end - num_lags:end, :].values for end in window_ends]
    targets = [labels.iloc[end] for end in window_ends]
    return np.array(windows), np.array(targets)

# Turn each (features, target) pair into 12-step windows with next-step labels.
(x_train_seq, y_train_seq), (x_test_seq, y_test_seq), (x_test_seq_seq, y_test_seq_seq) = (
    create_sequences(features, target, 12)
    for features, target in (
        (x_train, y_train),
        (x_test, y_test),
        (x_seq_test, y_seq_test),
    )
)

In [None]:
# Shapes of every (windows, labels) pair; the last entry is the label array, not the windows again.
x_train_seq.shape, y_train_seq.shape, x_test_seq.shape, y_test_seq.shape, x_test_seq_seq.shape, y_test_seq_seq.shape

In [None]:
x_test_seq

In [None]:
def smape(y_true, y_pred):
    """Smoothed symmetric MAPE computed with Keras backend ops, for use as a training metric.

    Returns 2 * mean(|y_true - y_pred| / (|y_true| + |y_pred| + 0.1)) over the last axis.

    NOTE: defining this rebinds the name `smape`, hiding the function of the
    same name imported from `darts.metrics` at the top of the notebook.
    """
    epsilon = 0.1  # keeps the denominator away from zero
    abs_error = K.abs(y_true - y_pred)
    scale = K.abs(y_true) + K.abs(y_pred) + epsilon
    return 2.0 * K.mean(abs_error / scale, axis=-1)


# Stacked bidirectional-LSTM regressor: 512 -> 128 -> 64 units, with batch
# normalisation after the first recurrent layer, dropout between layers, and a
# single linear output.
model = Sequential([
    Bidirectional(LSTM(512, return_sequences=True), input_shape=x_train_seq.shape[1:]),
    BatchNormalization(),
    Dropout(0.2),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.1),
    Bidirectional(LSTM(64)),
    Dropout(0.1),
    Dense(1),
])

# NOTE(review): this Adam instance is created but compile() below is called
# without an `optimizer` argument, so Keras falls back to its default optimizer.
optimizer = Adam(learning_rate=0.001, clipvalue=0.5)
model.compile(loss='mean_squared_error', metrics=[smape])

# NOTE(review): no callbacks are passed, so all 500 epochs always run; the
# windows built from the test split are used as validation data.
model.fit(
    x_train_seq,
    y_train_seq,
    epochs=500,
    batch_size=32,
    validation_data=(x_test_seq, y_test_seq),
)



In [None]:
y_pred = model.predict(x_test_seq_seq)

In [None]:
y_pred.shape

In [None]:
# model.predict returns an (n, 1) array; flatten it to one value per test row.
# assign() builds a new frame instead of writing a column onto Y_test in place,
# which avoids SettingWithCopyWarning and keeps the cell safe to re-run.
Y_test = Y_test.assign(y_pred=y_pred.ravel())
Y_test

In [None]:
def all_plot_submkt_forecast(Y_test_pred, submkt_id):
    """Plot actual vs. predicted rent over time for one submarket.

    `Y_test_pred` must contain the columns 'research_submkt_id', 'date',
    'real_hedonic_rent_submarket' and 'y_pred'. Returns whatever
    `plt.show()` returns.
    """
    rows = Y_test_pred[Y_test_pred['research_submkt_id'] == submkt_id]
    dates = rows['date']
    actual = rows['real_hedonic_rent_submarket']
    predicted = rows['y_pred']

    plt.plot(dates, actual, label='test')
    plt.plot(dates, predicted, label='pred')
    plt.title(f'{submkt_id} submkt_rent forecasting')
    plt.legend()

    return plt.show()

In [None]:
all_plot_submkt_forecast(Y_test, 'PHO037')

In [None]:
# Wrap the actual and predicted rent columns as darts TimeSeries for metric evaluation.
y_t, y_p = (
    TimeSeries.from_series(Y_test[column])
    for column in ('real_hedonic_rent_submarket', 'y_pred')
)

In [None]:
# Score with the darts metric via its alias: the plain name `smape` is rebound
# to the Keras tensor metric defined in the model cell, which cannot take TimeSeries.
darts_smape(y_t, y_p)

In [None]:
def _make_lag_windows(features, target, num_lags):
    """Pair each run of `num_lags` consecutive feature rows with the target of the row that follows it."""
    ends = range(num_lags, len(features))
    windows = np.array([features[end - num_lags:end] for end in ends])
    labels = np.array([target[end] for end in ends])
    return windows, labels


def _build_submkt_lstm(n_steps, n_features):
    """Two stacked 50-unit LSTM layers with one dense output, compiled with MSE loss and Adam."""
    model = Sequential()
    model.add(LSTM(50, return_sequences=True, input_shape=(n_steps, n_features)))
    model.add(LSTM(50))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model


def all_get_submkt_forecast(df, num_lags=None):
    """Train one LSTM per submarket and predict its most recent observed rent.

    For every submarket the function builds `num_lags` lagged-rent columns,
    combines them with a fixed list of driver columns, slides a
    `num_lags`-step window over the result, and holds out the final window
    (whose label is the submarket's last observation) as the test sample.
    The remaining windows are split 80/20, in time order, into training and
    validation sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Panel with 'date', 'research_submkt_id', 'real_hedonic_rent_submarket'
        and the driver columns listed in `driver_cols` below. Not modified.
    num_lags : int, optional
        Number of lagged-rent features and window length. Defaults to 36.

    Returns
    -------
    pandas.DataFrame
        One row per forecast submarket with columns 'date',
        'research_submkt_id', 'real_hedonic_rent_submarket' (the actual value
        at that date) and 'y_pred' (the prediction, in original rent units).

    Raises
    ------
    ValueError
        If `num_lags` is smaller than 1.
    """
    if num_lags is None:
        num_lags = 36
    if num_lags < 1:
        raise ValueError('num_lags must be a positive integer')

    id_col = 'research_submkt_id'
    target_col = 'real_hedonic_rent_submarket'
    driver_cols = [
        'base_sf',
        'tech_employment_histfc',
        'real_market_level_rent',
        'nominal_earnings_byresidence_histfc',
        'gdp_histfc',
        'manufacturing_employment_histfc',
        'population_histfc',
        'real_bricks_and_mortar_retail_sales',
        'compltn_rate',
        'imports_us',
        'nominal_retail_sales_histfc',
        'real_retail_sales_ex_gas',
        'unemployment_rate_histfc',
        'median_sfh_sale_price_histfc',
        'baa_credit_spreads',
    ]

    # Work on a private copy so the caller's frame is never mutated.
    panel = df[['date', id_col, target_col] + driver_cols].copy()
    panel['date'] = pd.to_datetime(panel['date'])

    # shift() lags by row position, so order each submarket by date first.
    panel = panel.sort_values([id_col, 'date']).reset_index(drop=True)
    rent_by_submkt = panel.groupby(id_col)[target_col]
    lag_cols = [f'rent_{lag}months_ago' for lag in range(1, num_lags + 1)]
    lagged = pd.DataFrame(
        {name: rent_by_submkt.shift(lag) for lag, name in enumerate(lag_cols, start=1)}
    )
    panel = pd.concat([panel, lagged], axis=1)
    # Drop rows without a full lag history (driver columns are not checked here).
    panel = panel.dropna(subset=['date', id_col, target_col] + lag_cols)

    feature_cols = lag_cols + driver_cols
    forecasts = []

    for submarket, history in panel.groupby(id_col):
        # Windows available in total, minus the one held out for testing.
        n_fit = len(history) - num_lags - 1
        train_size = int(n_fit * 0.8)
        if train_size < 1 or n_fit - train_size < 1:
            warnings.warn(
                f'Skipping submarket {submarket}: only {len(history)} usable rows, '
                f'too few for num_lags={num_lags}'
            )
            continue

        features = history[feature_cols].to_numpy(dtype=float)
        target = history[[target_col]].to_numpy(dtype=float)

        # Separate scalers for features and target, fitted only on the rows the
        # training windows can see. A shared scaler would end up fitted on the
        # last feature and could not invert the rent prediction.
        n_train_rows = num_lags + train_size
        feature_scaler = MinMaxScaler(feature_range=(0, 1)).fit(features[:n_train_rows])
        target_scaler = MinMaxScaler(feature_range=(0, 1)).fit(target[:n_train_rows])

        X_seq, y_seq = _make_lag_windows(
            feature_scaler.transform(features),
            target_scaler.transform(target),
            num_lags,
        )

        # Chronological split: train | validation | final held-out window.
        X_train, y_train = X_seq[:train_size], y_seq[:train_size]
        X_val, y_val = X_seq[train_size:n_fit], y_seq[train_size:n_fit]
        X_holdout = X_seq[n_fit:]

        model = _build_submkt_lstm(num_lags, features.shape[1])
        model.fit(X_train, y_train, epochs=50, batch_size=16,
                  validation_data=(X_val, y_val), verbose=0)

        # The held-out window ends right before the last row, so the prediction
        # lines up with that row's actual rent.
        y_pred = target_scaler.inverse_transform(model.predict(X_holdout, verbose=0))

        forecasts.append({
            'date': history['date'].iloc[-1],
            id_col: submarket,
            target_col: history[target_col].iloc[-1],
            'y_pred': float(y_pred[0, 0]),
        })

    # Build the result once instead of growing a frame with concat inside the loop.
    return pd.DataFrame(forecasts, columns=['date', id_col, target_col, 'y_pred'])


In [None]:

# Run the per-submarket forecast on another dataset.
# Note: this rebinds `df` and `num_lags`, replacing the values used earlier in the notebook.
num_lags = 36  # look-back length; adjust as needed
df = pd.read_csv(
    '/home/zqiao/data_flake/imputed data/dal_t_data.csv',  # replace with your data file
    index_col=0,
)
Y_test_pred = all_get_submkt_forecast(df, num_lags)


In [None]:
Y_test_pred