In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from darts.metrics import smape
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold


import ast
import multiprocessing as mp
from datetime import datetime as dtm
from typing import Optional, Sequence
from darts.metrics import mape, mae
from darts.utils.statistics import check_seasonality
from darts.utils.statistics import plot_acf
from darts.utils.statistics import plot_pacf
from darts.models.forecasting.arima import ARIMA
from darts.timeseries import TimeSeries as TS
from sklearn.model_selection import ParameterGrid as PG

In [None]:
# Submarket identifiers carrying the 'PHO' prefix (presumably Phoenix research
# submarkets, matching values of the 'research_submkt_id' column — confirm).
# Note that 'PHO052' is not in the list and 'PHO055' is listed last rather
# than in numeric order.
pho_submkt = ['PHO037', 'PHO038', 'PHO039', 'PHO040', 'PHO041', 'PHO042',
              'PHO043', 'PHO044', 'PHO045', 'PHO046', 'PHO047', 'PHO048',
              'PHO049', 'PHO050', 'PHO051', 'PHO053', 'PHO054', 'PHO056',
              'PHO057', 'PHO058', 'PHO059', 'PHO060', 'PHO061', 'PHO062',
              'PHO063', 'PHO064', 'PHO065', 'PHO055']
# Submarket identifiers carrying 'DAL' / 'FTW' prefixes (presumably
# Dallas–Fort Worth — confirm). Several entries are hyphen-joined groups of
# individual codes that are treated as a single submarket identifier string.
dal_submkt = ['DAL031', 'DAL034-DAL035-DAL041', 'DAL037-DAL038-DAL039-DAL040',
              'DAL042-DAL043-DAL044-DAL045-DAL046', 'DAL047-DAL048', 'DAL049',
              'DAL050-DAL051-DAL053-DAL054-DAL055', 'DAL052-DAL056',
              'DAL057-FTW031', 'FTW029', 'FTW032-FTW033-FTW034-FTW039',
              'FTW035-FTW036-FTW037-FTW040', 'FTW038-FTW041-FTW042-FTW043']

In [None]:
def all_split_data_by_submarket(data, ntest):
    """
    Split a panel into train/test sets by holding out the tail of each submarket.

    For every distinct value of 'research_submkt_id' (in order of first
    appearance), the last ``ntest`` rows of that submarket -- in the row order
    of ``data`` -- go to the test set and all preceding rows go to the train
    set. Rows are not re-sorted here, so ``data`` must already be in
    chronological order within each submarket for the hold-out to be the most
    recent observations. Original index labels are preserved.

    Parameters:
        data (pandas.DataFrame): Frame containing a 'research_submkt_id' column.
        ntest (int): Number of trailing rows per submarket to hold out.
            ``0`` puts every row in the train set; a value greater than or
            equal to a submarket's row count puts that whole submarket in the
            test set.

    Returns:
        tuple: ``(train_data, test_data)`` as two DataFrames.

    Raises:
        ValueError: If ``ntest`` is negative.
    """
    if ntest < 0:
        raise ValueError("ntest must be non-negative, got {}".format(ntest))

    train_parts = []
    test_parts = []
    for submarket in data['research_submkt_id'].unique():
        submarket_data = data[data['research_submkt_id'] == submarket]
        # Use an explicit split position: slicing with ``-ntest`` breaks for
        # ntest == 0 because ``iloc[:-0]`` is empty and ``iloc[-0:]`` is everything.
        split_at = max(len(submarket_data) - ntest, 0)
        train_parts.append(submarket_data.iloc[:split_at])
        test_parts.append(submarket_data.iloc[split_at:])

    if not train_parts:
        # No submarkets at all: return empty frames that keep the input columns.
        return data.iloc[:0], data.iloc[:0]

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(train_parts), pd.concat(test_parts)


def smape(y_true, y_pred):
    """
    Calculates the Symmetric Mean Absolute Percentage Error (SMAPE) between two arrays.

    Each element contributes ``|y_pred - y_true| / ((|y_true| + |y_pred|) / 2)``
    and the mean of those terms is returned as a percentage. An element whose
    true and predicted values are both zero is an exact match and contributes
    0 instead of the NaN that 0/0 would otherwise inject into the mean.

    NOTE: this definition shadows the ``smape`` name imported from
    ``darts.metrics`` at the top of the file; it operates on plain array-likes.

    Parameters:
        y_true (array-like): Array of true values.
        y_pred (array-like): Array of predicted values.

    Returns:
        float: SMAPE value in percent; NaN for empty input.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2

    if numerator.size == 0:
        return float('nan')

    # Divide only where the denominator is non-zero; the remaining slots keep
    # the 0.0 they were initialised with (perfect prediction of a zero value).
    ratio = np.divide(numerator, denominator,
                      out=np.zeros_like(numerator),
                      where=denominator != 0)

    return np.mean(ratio) * 100


def all_get_submkt_forecast(df, num_lags, ntest=24):
    """
    Fit a grid-searched XGBoost regressor on lagged rents plus exogenous
    features and predict the held-out tail of every submarket.

    Steps:
        1. Build ``num_lags`` lag columns of 'real_hedonic_rent_submarket'
           per 'research_submkt_id' (``rent_{k}months_ago``).
        2. Drop rows with any missing value (this removes at least the first
           ``num_lags`` rows of each submarket).
        3. Left-join the exogenous columns on ('date', 'research_submkt_id').
        4. Hold out the last ``ntest`` rows of each submarket as the test set.
        5. Grid-search XGBRegressor hyper-parameters with 3-fold CV, selecting
           the combination with the LOWEST SMAPE.
        6. Predict the test rows and print the overall test SMAPE.

    The input frame is not modified.

    Parameters:
        df (pandas.DataFrame): Panel with 'date', 'research_submkt_id',
            'real_hedonic_rent_submarket' and the exogenous columns listed in
            ``exog_cols`` below.
        num_lags (int or None): Number of rent lags to use as features;
            ``None`` means 36.
        ntest (int): Trailing rows per submarket held out for testing
            (default 24, the value previously hard-coded).

    Returns:
        pandas.DataFrame: The held-out rows with columns 'date',
        'research_submkt_id', 'real_hedonic_rent_submarket' and 'y_pred'.
    """
    if num_lags is None:
        num_lags = 36

    key_cols = ['date', 'research_submkt_id']
    target_col = 'real_hedonic_rent_submarket'
    exog_cols = ["real_market_level_rent",
                 "gdp_histfc",
                 "nominal_retail_sales_histfc",
                 "employment_histfc",
                 "real_ecommerce",
                 "spread_3m10y",
                 "real_retail_sales_ex_gas",
                 "imports_us",
                 "ecomm^2_pop",
                 "weighted_pop_estimate_cryr",
                 "weighted_hh_estimate_cryr"]

    # Work on copies so the caller's frame is left untouched and column
    # assignment does not hit a view of ``df`` (SettingWithCopy).
    df_new = df[key_cols + [target_col]].copy()
    df_new['date'] = pd.to_datetime(df_new['date'])

    # Order each submarket chronologically before shifting; otherwise
    # shift(k) is "k rows earlier" rather than "k periods earlier".
    df_new = df_new.sort_values(['research_submkt_id', 'date']).reset_index(drop=True)

    rent_by_submkt = df_new.groupby('research_submkt_id')[target_col]
    lag_features = pd.DataFrame(
        {'rent_{}months_ago'.format(lag): rent_by_submkt.shift(lag)
         for lag in range(1, num_lags + 1)},
        index=df_new.index,
    )
    df_new = pd.concat([df_new, lag_features], axis=1)
    df_new = df_new.dropna()
    df_new = df_new.sort_values(['date', 'research_submkt_id']).reset_index(drop=True)

    df_sel = df[key_cols + exog_cols].copy()
    df_sel['date'] = pd.to_datetime(df_sel['date'])
    df_new = df_new.merge(df_sel, on=key_cols, how='left')

    # Everything except the keys and the target is a model feature
    # (lag columns first, then the exogenous columns).
    feature_cols = [col for col in df_new.columns
                    if col not in key_cols and col != target_col]

    train, test = all_split_data_by_submarket(df_new, ntest)
    x_train, y_train = train[feature_cols], train[target_col]
    x_test, y_test = test[feature_cols], test[target_col]

    param_grid = {
        'n_estimators': [100, 150, 200, 250, 300],
        'max_depth': [3, 4, 5],
        'learning_rate': [0.15, 0.1, 0.01, 0.001],
        'gamma': [0, 0.1, 0.2, 0.3],
        'colsample_bytree': [0.5, 0.75, 1],
        'reg_alpha': [0, 0.1, 0.5, 1],
        'reg_lambda': [0, 0.1, 0.5, 1]}

    # SMAPE is an error, so it must be registered with greater_is_better=False;
    # with the default (True) the search would pick the WORST combination.
    # The scorer then reports negated SMAPE, which is undone below.
    # NOTE(review): cv=3 is a plain K-fold over the training rows; a
    # time-aware splitter may be more appropriate for this panel -- confirm.
    grid_search = GridSearchCV(estimator=XGBRegressor(),
                               param_grid=param_grid,
                               cv=3,
                               scoring=make_scorer(smape, greater_is_better=False))
    grid_search.fit(x_train, y_train)

    best_params = grid_search.best_params_
    best_score = -grid_search.best_score_
    print("Best Parameters:", best_params)
    print("Best Score (SMAPE):", best_score)

    # GridSearchCV refits the best combination on the full training set by
    # default, so there is no need to fit a second identical model.
    best_model = grid_search.best_estimator_

    # predict
    y_pred = best_model.predict(x_test)

    smape_value = smape(y_test, y_pred)
    print(f"SMAPE: {smape_value:.2f}%")

    Y_test_pred = test[key_cols + [target_col]].copy()
    Y_test_pred['y_pred'] = y_pred

    return Y_test_pred

    
def all_plot_submkt_forecast(Y_test_pred, submkt_id):
    """
    Plot actual versus predicted rent over time for one submarket.

    Parameters:
        Y_test_pred (pandas.DataFrame): Frame with 'date',
            'research_submkt_id', 'real_hedonic_rent_submarket' and 'y_pred'
            columns.
        submkt_id: Value of 'research_submkt_id' selecting the rows to plot.

    Returns:
        Whatever ``plt.show()`` returns.
    """
    is_selected = Y_test_pred['research_submkt_id'] == submkt_id
    selected_rows = Y_test_pred[is_selected]

    # Pull all three series before drawing anything.
    dates = selected_rows['date']
    actual = selected_rows['real_hedonic_rent_submarket']
    predicted = selected_rows['y_pred']

    for series, legend_label in ((actual, 'test'), (predicted, 'pred')):
        plt.plot(dates, series, label=legend_label)
    plt.title(f'{submkt_id} submkt_rent forecasting')
    plt.legend()

    return plt.show()
    
    
def get_single_smape(Y_test_pred, submkt_id):
    """
    Compute the SMAPE between actual and predicted rent for one submarket.

    Parameters:
        Y_test_pred (pandas.DataFrame): Frame with 'research_submkt_id',
            'real_hedonic_rent_submarket' and 'y_pred' columns.
        submkt_id: Value of 'research_submkt_id' selecting the rows to score.

    Returns:
        float: Result of ``smape`` on the selected actual and predicted values.
    """
    rows = Y_test_pred[Y_test_pred['research_submkt_id'] == submkt_id]
    y = rows['real_hedonic_rent_submarket']
    y_pred = rows['y_pred']

    return smape(y, y_pred)

    
def get_whole_smape_df(Y_test_pred, submkt_list):
    """
    Build a table of per-submarket SMAPE values.

    Parameters:
        Y_test_pred (pandas.DataFrame): Frame with 'research_submkt_id',
            'real_hedonic_rent_submarket' and 'y_pred' columns.
        submkt_list (iterable): Submarket identifiers to score, in the order
            they should appear in the result.

    Returns:
        pandas.DataFrame: One row per entry of ``submkt_list`` with columns
        'research_submkt_id' and 'smape'.
    """
    # Materialise once so a one-shot iterable can feed both the loop and the
    # result column.
    submkt_ids = list(submkt_list)

    smp_ls = []
    for submkt in submkt_ids:
        # Filter on the loop variable itself, one mask per submarket.
        rows = Y_test_pred[Y_test_pred['research_submkt_id'] == submkt]
        smp_ls.append(smape(rows['real_hedonic_rent_submarket'], rows['y_pred']))

    smp_df = pd.DataFrame({'research_submkt_id': submkt_ids,
                           'smape': smp_ls})

    return smp_df