In [None]:
import numpy as np
import pandas as pd
import ast
import matplotlib.pyplot as plt
import pmdarima as pm
from pmdarima.arima.stationarity import ADFTest
from pmdarima.arima import ndiffs
from sklearn.metrics import mean_squared_error
from pmdarima.metrics import smape
from statsmodels.tsa.arima.model import ARIMA

import multiprocessing as mp
from datetime import datetime as dtm
from typing import Optional, Sequence
import itertools

In [None]:
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error (sMAPE), in percent.

    Each term is ``|y_pred - y_true| / ((|y_true| + |y_pred|) / 2)``; the
    result is the mean of those terms multiplied by 100.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, matched to ``y_true`` by position (any pandas index
        is discarded by the array conversion).

    Returns
    -------
    float
        The sMAPE as a percentage. Terms where the observation and the
        prediction are both zero contribute 0 rather than NaN.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    abs_error = np.abs(y_pred - y_true)
    scale = (np.abs(y_true) + np.abs(y_pred)) / 2

    # A zero denominator means both values are exactly zero, i.e. a perfect
    # prediction; score it as 0 instead of letting 0/0 turn the mean into NaN.
    ratios = np.divide(
        abs_error,
        scale,
        out=np.zeros_like(abs_error),
        where=scale != 0,
    )

    return np.mean(ratios) * 100


def _get_list_intersect_size(list1, list2):
    return len(set(list1) & set(list2))


def get_feature_subsets(
        feature_space,
        subset_size,
        include_features=None,
        intersect_size=1,
        num_processes=None,
):
    """Enumerate fixed-size combinations of features, optionally filtered.

    Parameters
    ----------
    feature_space : sequence
        Candidate features to combine.
    subset_size : int
        Number of features per subset. Clamped to at least 1 and at most
        ``len(feature_space)``.
    include_features : iterable, optional
        When given, a subset is kept only if it shares at least
        ``min(subset_size, intersect_size)`` features with this collection.
        When ``None``, every combination is returned.
    intersect_size : int, default 1
        Minimum overlap with ``include_features`` required to keep a subset.
    num_processes : int, optional
        Accepted for backward compatibility and ignored: enumerating the
        combinations needs no worker processes.

    Returns
    -------
    list of list
        The retained subsets, in ``itertools.combinations`` order.
    """
    subset_size = max(1, subset_size)
    subset_size = min(subset_size, len(feature_space))
    combos = itertools.combinations(feature_space, subset_size)

    if include_features is None:
        return [list(combo) for combo in combos]

    # The overlap used to be computed and thrown away, so the filter never
    # took effect; apply it here.
    required = set(include_features)
    min_overlap = min(subset_size, intersect_size)
    return [
        list(combo)
        for combo in combos
        if len(required.intersection(combo)) >= min_overlap
    ]


def _process_subset(subset):
    return subset

def run_auto_arima_experiment(name, exo, group, params, ntest, feature_sel):
    """Fit an auto-ARIMA model with exogenous regressors and score a holdout.

    The last ``ntest`` observations of ``group['real_hedonic_rent_submarket']``
    are held out. The differencing order is the larger of the orders suggested
    by the KPSS and ADF tests, and the model is selected by
    ``pm.auto_arima`` on the remaining observations.

    Parameters
    ----------
    name : object
        Label printed alongside the stationarity diagnostics.
    exo : pandas.DataFrame
        Exogenous regressors, row-aligned with ``group``.
    group : pandas.DataFrame
        Data containing the ``real_hedonic_rent_submarket`` target column.
    params : object
        Accepted for interface compatibility; not used by this function.
    ntest : int
        Number of trailing observations to hold out for evaluation.
    feature_sel : list
        Columns of ``exo`` to use as regressors.

    Returns
    -------
    float
        sMAPE (in percent) of the forecast over the held-out observations.

    Raises
    ------
    ValueError
        If ``ntest`` does not leave at least one training and one test
        observation.
    """
    target = group['real_hedonic_rent_submarket']

    # Slicing with -0 (or with ntest >= len) silently produces an empty
    # training set, so reject unusable holdout sizes up front.
    n_obs = len(target)
    if ntest < 1 or ntest >= n_obs:
        raise ValueError(
            f"ntest must satisfy 1 <= ntest < {n_obs} (number of observations); got {ntest}"
        )

    adf_test = ADFTest(alpha=0.05)
    p_val, should_diff = adf_test.should_diff(target)
    kpss_diffs = ndiffs(target, alpha=0.05, test='kpss', max_d=6)
    adf_diffs = ndiffs(target, alpha=0.05, test='adf', max_d=6)
    n_diffs = max(adf_diffs, kpss_diffs)
    print(name,': ',p_val,should_diff,n_diffs)

    # Split strictly by position so the target and the regressors are cut at
    # the same row regardless of the index type.
    Y_train = target.iloc[:-ntest]
    Y_test = target.iloc[-ntest:]
    X_train = exo[feature_sel].iloc[:-ntest, :]
    X_test = exo[feature_sel].iloc[-ntest:, :]

    model = pm.auto_arima(Y_train, X_train, d=n_diffs,
                          suppress_warnings=True, error_action="ignore",
                          min_p=1, min_q=1, max_p=6, max_q=6,
                          stepwise=True, scoring=smape,
                          max_order=None, trace=True)

    y_pred = model.predict(ntest, X_test)

    return smape(Y_test, y_pred)

def run_auto_arima_pipeline(df, submkt_id, ntest):
    """Score auto-ARIMA models over exogenous feature subsets for one submarket.

    Rows of ``df`` whose ``research_submkt_id`` equals ``submkt_id`` are
    selected, every 2-feature subset of the candidate features is generated,
    and ``run_auto_arima_experiment`` is run once per subset in a process pool.

    Parameters
    ----------
    df : pandas.DataFrame
        Panel containing ``research_submkt_id``, the target column used by
        ``run_auto_arima_experiment`` and the exogenous columns listed below.
    submkt_id : object
        Submarket identifier to model.
    ntest : int
        Number of trailing observations held out by each experiment.

    Returns
    -------
    dict
        Maps ``str({"subset_li": subset})`` to the score returned by the
        experiment for that subset (0 when the experiment returned ``None``).
        Empty when ``submkt_id`` does not occur in ``df``.
    """
    results = {}

    group = df[df['research_submkt_id'] == submkt_id]
    if group.empty:
        # Unknown submarket: nothing to train. Previously this path hit an
        # unbound ``results`` at the return statement.
        return results

    exo = group[[
        "real_market_level_rent",
        "gdp_histfc",
        "employment_histfc",
        "real_ecommerce",
        "spread_3m10y",
        "imports_us",
        "ecomm^2_pop",
        "weighted_pop_estimate_cryr",
        "weighted_hh_estimate_cryr"]]

    # Candidate pool the subsets are drawn from; uncomment entries to widen
    # the search.
    # NOTE(review): the original passed an undefined global ``feature_space``
    # here; using this curated list is an assumption -- confirm.
    feature_sel = [
        "real_market_level_rent",
        "gdp_histfc",
        #"employment_histfc",
        #"real_ecommerce",
        #"spread_3m10y",
        #"imports_us",
        #"ecomm^2_pop",
        #"weighted_pop_estimate_cryr",
        "weighted_hh_estimate_cryr"]
    subset_li = get_feature_subsets(
        feature_sel,
        subset_size=2,
        include_features=None,
        intersect_size=1,
        num_processes=None,)

    # One parameter dict per subset. This is what a single-key parameter grid
    # expands to, and it avoids the undefined ``PG`` name.
    param_grid = [{"subset_li": subset} for subset in subset_li]
    num_params = len(param_grid)

    # NOTE(review): under the "spawn" start method, functions defined in a
    # notebook cell may not be importable by worker processes -- confirm on
    # the target platform, or move the worker into a module.
    pending = {}
    with mp.Pool(processes=mp.cpu_count()) as pool:
        for idx, params in enumerate(param_grid):
            print(f"training model {idx}/{num_params - 1}: {params}")

            pending[str(params)] = pool.apply_async(
                run_auto_arima_experiment,
                kwds={
                    "name": name_for(submkt_id) if False else submkt_id,
                    "exo": exo,
                    "group": group,
                    "params": params,
                    "ntest": ntest,
                    # Train on the subset this grid entry stands for, so each
                    # result matches the key it is stored under.
                    "feature_sel": params["subset_li"],
                },
            )

        # Leaving the ``with`` block terminates the pool, so wait for the
        # workers and collect their results before exiting it.
        pool.close()
        pool.join()

        for key, async_result in pending.items():
            score = async_result.get()
            results[key] = score if score is not None else 0

    return results
            

In [None]:
# Read the submarket train/test CSV, using its first column as the row index,
# then preview the first rows.
df = pd.read_csv(
    filepath_or_buffer='/Users/qiaozihui/Desktop/model/pho_submkt_train_test_data.csv',
    index_col=0,
)
df.head()

In [None]:
# Run the feature-subset search for submarket PHO037, holding out the last 24 observations.
results = run_auto_arima_pipeline(df, submkt_id='PHO037', ntest=24)