Testing for 2 states

In [4]:
import numpy as np, pandas as pd
from hmmlearn.hmm import GaussianHMM

import os  # needed only for the optional path override below

# Location of the price/return CSV. The default is the author's local path;
# set the SP500_CSV_PATH environment variable to run the notebook on another
# machine without editing the code.
FILE_PATH = os.environ.get(
    "SP500_CSV_PATH",
    r"C:\Users\roosd\Downloads\econometrie jaar 3\Thesis\sp500_weekly_closed.csv",
)
# One HMM fit is run per seed; each seed is passed as `random_state` to the model.
SEEDS = range(10)

def load_returns(path):
    """Load log returns from a CSV file as an ``(n_obs, 1)`` float array.

    If the file has a ``LogReturn`` column it is used directly. Otherwise log
    returns are derived from the ``Close`` column (or ``Adj Close`` when
    ``Close`` is absent) as ``log(P_t / P_{t-1})``.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file; passed straight to ``pandas.read_csv``.

    Returns
    -------
    numpy.ndarray
        Column vector of finite log returns with shape ``(n_obs, 1)``.
        Missing, non-numeric and infinite values are removed.

    Raises
    ------
    KeyError
        If the file contains none of ``LogReturn``, ``Close`` or ``Adj Close``.
    """
    df = pd.read_csv(path)

    if "LogReturn" in df.columns:
        returns = pd.to_numeric(df["LogReturn"], errors="coerce")
    else:
        price_col = next((c for c in ("Close", "Adj Close") if c in df.columns), None)
        if price_col is None:
            raise KeyError(
                "CSV must contain a 'LogReturn', 'Close' or 'Adj Close' column; "
                f"found {list(df.columns)}"
            )
        prices = pd.to_numeric(df[price_col], errors="coerce")
        # A non-positive price has no logarithm: log(0 / p) is -inf and
        # log(p / 0) is +inf. Mask such prices so the affected returns become
        # NaN and are dropped below instead of reaching the model.
        prices = prices.where(prices > 0)
        returns = np.log(prices / prices.shift(1))

    values = returns.to_numpy(dtype=float)
    # dropna() alone would keep +/-inf, so filter on finiteness instead.
    return values[np.isfinite(values)].reshape(-1, 1)

def n_params(n):
    """Return the free-parameter count used for AIC/BIC of an ``n``-state model.

    The total equals ``n**2 + 2*n - 1``, written here as the sum of its parts
    for a hidden Markov model with one mean and one variance per state.
    """
    transition_terms = n * (n - 1)  # each row of the transition matrix sums to 1
    initial_terms = n - 1           # initial state distribution sums to 1
    emission_terms = 2 * n          # one mean and one variance per state
    return transition_terms + initial_terms + emission_terms

def main(n_states=2):
    """Fit an ``n_states``-state Gaussian HMM once per seed and print AIC/BIC.

    The returns are loaded from ``FILE_PATH``; one model is fitted for every
    value in ``SEEDS`` (used as ``random_state``). For each fit the value of
    ``score(X)`` is stored as ``logL`` and combined with ``n_params(n_states)``
    into AIC and BIC. A per-seed table, the mean and sample standard deviation
    of both criteria, and the seed with the lowest BIC are printed.

    Parameters
    ----------
    n_states : int, default 2
        Number of hidden states. The default reproduces the original
        hard-coded 2-state run.

    Raises
    ------
    ValueError
        If ``SEEDS`` is empty, since no model would be fitted.
    """
    X = load_returns(FILE_PATH)
    seeds = list(SEEDS)
    if not seeds:
        raise ValueError("SEEDS is empty: at least one seed is required")

    # Both penalty terms depend only on the model size and sample size,
    # so compute them once rather than on every iteration.
    k = n_params(n_states)
    aic_penalty = 2 * k
    bic_penalty = k * np.log(len(X))

    metrics = []
    for s in seeds:
        m = GaussianHMM(n_components=n_states, covariance_type="diag",
                        n_iter=1000, random_state=s).fit(X)
        ll = m.score(X)
        metrics.append({"seed": s, "logL": ll,
                        "AIC": aic_penalty - 2*ll, "BIC": bic_penalty - 2*ll})

    df = pd.DataFrame(metrics)
    # Header is derived from the actual configuration so it cannot drift
    # from SEEDS or n_states.
    print(f"\n=== {n_states}-State HMM: AIC & BIC across {len(seeds)} seeds ===")
    print(df.round(2))
    print("\nMean ± SD:")
    print(f"AIC: {df['AIC'].mean():.2f} ± {df['AIC'].std(ddof=1):.2f}")
    print(f"BIC: {df['BIC'].mean():.2f} ± {df['BIC'].std(ddof=1):.2f}")

    best = df.loc[df["BIC"].idxmin()]
    print(f"\nBest (lowest BIC) → seed={int(best.seed)} | logL={best.logL:.2f} | "
          f"AIC={best.AIC:.2f} | BIC={best.BIC:.2f}")

# Run the analysis when this cell is executed or the file is run as a script;
# importing the code as a module does not trigger the model fits.
if __name__ == "__main__":
    main()


Model is not converging.  Current: 1294.2309320732895 is not greater than 1294.2351959713071. Delta is -0.004263898017597967
Model is not converging.  Current: 1294.2443253866825 is not greater than 1294.250027656945. Delta is -0.0057022702624180965



=== 2-State HMM: AIC & BIC across 10 seeds ===
   seed     logL      AIC      BIC
0     0  1294.21 -2574.42 -2544.62
1     1  1294.25 -2574.49 -2544.69
2     2  1294.22 -2574.43 -2544.63
3     3  1294.14 -2574.28 -2544.48
4     4  1294.17 -2574.34 -2544.53
5     5  1294.21 -2574.41 -2544.61
6     6  1294.23 -2574.46 -2544.65
7     7  1294.23 -2574.45 -2544.65
8     8  1294.29 -2574.57 -2544.77
9     9  1294.14 -2574.28 -2544.47

Mean ± SD:
AIC: -2574.41 ± 0.09
BIC: -2544.61 ± 0.09

Best (lowest BIC) → seed=8 | logL=1294.29 | AIC=-2574.57 | BIC=-2544.77




Testing for 3 states

In [5]:
import numpy as np, pandas as pd
from hmmlearn.hmm import GaussianHMM

import os  # needed only for the optional path override below

# Location of the price/return CSV. The default is the author's local path;
# set the SP500_CSV_PATH environment variable to run the notebook on another
# machine without editing the code.
FILE_PATH = os.environ.get(
    "SP500_CSV_PATH",
    r"C:\Users\roosd\Downloads\econometrie jaar 3\Thesis\sp500_weekly_closed.csv",
)
# One HMM fit is run per seed; each seed is passed as `random_state` to the model.
SEEDS = range(10)

def load_returns(path):
    """Load log returns from a CSV file as an ``(n_obs, 1)`` float array.

    If the file has a ``LogReturn`` column it is used directly. Otherwise log
    returns are derived from the ``Close`` column (or ``Adj Close`` when
    ``Close`` is absent) as ``log(P_t / P_{t-1})``.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file; passed straight to ``pandas.read_csv``.

    Returns
    -------
    numpy.ndarray
        Column vector of finite log returns with shape ``(n_obs, 1)``.
        Missing, non-numeric and infinite values are removed.

    Raises
    ------
    KeyError
        If the file contains none of ``LogReturn``, ``Close`` or ``Adj Close``.
    """
    df = pd.read_csv(path)

    if "LogReturn" in df.columns:
        returns = pd.to_numeric(df["LogReturn"], errors="coerce")
    else:
        price_col = next((c for c in ("Close", "Adj Close") if c in df.columns), None)
        if price_col is None:
            raise KeyError(
                "CSV must contain a 'LogReturn', 'Close' or 'Adj Close' column; "
                f"found {list(df.columns)}"
            )
        prices = pd.to_numeric(df[price_col], errors="coerce")
        # A non-positive price has no logarithm: log(0 / p) is -inf and
        # log(p / 0) is +inf. Mask such prices so the affected returns become
        # NaN and are dropped below instead of reaching the model.
        prices = prices.where(prices > 0)
        returns = np.log(prices / prices.shift(1))

    values = returns.to_numpy(dtype=float)
    # dropna() alone would keep +/-inf, so filter on finiteness instead.
    return values[np.isfinite(values)].reshape(-1, 1)

def n_params(n):
    """Return the free-parameter count used for AIC/BIC of an ``n``-state model.

    The total equals ``n**2 + 2*n - 1``, written here as the sum of its parts
    for a hidden Markov model with one mean and one variance per state.
    """
    transition_terms = n * (n - 1)  # each row of the transition matrix sums to 1
    initial_terms = n - 1           # initial state distribution sums to 1
    emission_terms = 2 * n          # one mean and one variance per state
    return transition_terms + initial_terms + emission_terms

def main(n_states=3):
    """Fit an ``n_states``-state Gaussian HMM once per seed and print AIC/BIC.

    The returns are loaded from ``FILE_PATH``; one model is fitted for every
    value in ``SEEDS`` (used as ``random_state``). For each fit the value of
    ``score(X)`` is stored as ``logL`` and combined with ``n_params(n_states)``
    into AIC and BIC. A per-seed table, the mean and sample standard deviation
    of both criteria, and the seed with the lowest BIC are printed.

    Parameters
    ----------
    n_states : int, default 3
        Number of hidden states. The default reproduces the original
        hard-coded 3-state run.

    Raises
    ------
    ValueError
        If ``SEEDS`` is empty, since no model would be fitted.
    """
    X = load_returns(FILE_PATH)
    seeds = list(SEEDS)
    if not seeds:
        raise ValueError("SEEDS is empty: at least one seed is required")

    # Both penalty terms depend only on the model size and sample size,
    # so compute them once rather than on every iteration.
    k = n_params(n_states)
    aic_penalty = 2 * k
    bic_penalty = k * np.log(len(X))

    metrics = []
    for s in seeds:
        m = GaussianHMM(n_components=n_states, covariance_type="diag",
                        n_iter=1000, random_state=s).fit(X)
        ll = m.score(X)
        metrics.append({"seed": s, "logL": ll,
                        "AIC": aic_penalty - 2*ll, "BIC": bic_penalty - 2*ll})

    df = pd.DataFrame(metrics)
    # Header is derived from the actual configuration so it cannot drift
    # from SEEDS or n_states.
    print(f"\n=== {n_states}-State HMM: AIC & BIC across {len(seeds)} seeds ===")
    print(df.round(2))
    print("\nMean ± SD:")
    print(f"AIC: {df['AIC'].mean():.2f} ± {df['AIC'].std(ddof=1):.2f}")
    print(f"BIC: {df['BIC'].mean():.2f} ± {df['BIC'].std(ddof=1):.2f}")

    best = df.loc[df["BIC"].idxmin()]
    print(f"\nBest (lowest BIC) → seed={int(best.seed)} | logL={best.logL:.2f} | "
          f"AIC={best.AIC:.2f} | BIC={best.BIC:.2f}")

# Run the analysis when this cell is executed or the file is run as a script;
# importing the code as a module does not trigger the model fits.
if __name__ == "__main__":
    main()


Model is not converging.  Current: 1306.7456779269296 is not greater than 1306.750667111547. Delta is -0.004989184617443243



=== 3-State HMM: AIC & BIC across 10 seeds ===
   seed     logL      AIC      BIC
0     0  1295.99 -2563.97 -2504.37
1     1  1292.80 -2557.61 -2498.00
2     2  1297.88 -2567.76 -2508.16
3     3  1306.72 -2585.44 -2525.84
4     4  1297.00 -2566.00 -2506.39
5     5  1307.18 -2586.35 -2526.75
6     6  1296.01 -2564.02 -2504.41
7     7  1306.56 -2585.12 -2525.51
8     8  1294.09 -2560.19 -2500.58
9     9  1307.12 -2586.24 -2526.63

Mean ± SD:
AIC: -2572.27 ± 11.97
BIC: -2512.66 ± 11.97

Best (lowest BIC) → seed=5 | logL=1307.18 | AIC=-2586.35 | BIC=-2526.75


Testing for 4 states

In [6]:
import numpy as np, pandas as pd
from hmmlearn.hmm import GaussianHMM

import os  # needed only for the optional path override below

# Location of the price/return CSV. The default is the author's local path;
# set the SP500_CSV_PATH environment variable to run the notebook on another
# machine without editing the code.
FILE_PATH = os.environ.get(
    "SP500_CSV_PATH",
    r"C:\Users\roosd\Downloads\econometrie jaar 3\Thesis\sp500_weekly_closed.csv",
)
# One HMM fit is run per seed; each seed is passed as `random_state` to the model.
SEEDS = range(10)

def load_returns(path):
    """Load log returns from a CSV file as an ``(n_obs, 1)`` float array.

    If the file has a ``LogReturn`` column it is used directly. Otherwise log
    returns are derived from the ``Close`` column (or ``Adj Close`` when
    ``Close`` is absent) as ``log(P_t / P_{t-1})``.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file; passed straight to ``pandas.read_csv``.

    Returns
    -------
    numpy.ndarray
        Column vector of finite log returns with shape ``(n_obs, 1)``.
        Missing, non-numeric and infinite values are removed.

    Raises
    ------
    KeyError
        If the file contains none of ``LogReturn``, ``Close`` or ``Adj Close``.
    """
    df = pd.read_csv(path)

    if "LogReturn" in df.columns:
        returns = pd.to_numeric(df["LogReturn"], errors="coerce")
    else:
        price_col = next((c for c in ("Close", "Adj Close") if c in df.columns), None)
        if price_col is None:
            raise KeyError(
                "CSV must contain a 'LogReturn', 'Close' or 'Adj Close' column; "
                f"found {list(df.columns)}"
            )
        prices = pd.to_numeric(df[price_col], errors="coerce")
        # A non-positive price has no logarithm: log(0 / p) is -inf and
        # log(p / 0) is +inf. Mask such prices so the affected returns become
        # NaN and are dropped below instead of reaching the model.
        prices = prices.where(prices > 0)
        returns = np.log(prices / prices.shift(1))

    values = returns.to_numpy(dtype=float)
    # dropna() alone would keep +/-inf, so filter on finiteness instead.
    return values[np.isfinite(values)].reshape(-1, 1)

def n_params(n):
    """Return the free-parameter count used for AIC/BIC of an ``n``-state model.

    The total equals ``n**2 + 2*n - 1``, written here as the sum of its parts
    for a hidden Markov model with one mean and one variance per state.
    """
    transition_terms = n * (n - 1)  # each row of the transition matrix sums to 1
    initial_terms = n - 1           # initial state distribution sums to 1
    emission_terms = 2 * n          # one mean and one variance per state
    return transition_terms + initial_terms + emission_terms

def main(n_states=4):
    """Fit an ``n_states``-state Gaussian HMM once per seed and print AIC/BIC.

    The returns are loaded from ``FILE_PATH``; one model is fitted for every
    value in ``SEEDS`` (used as ``random_state``). For each fit the value of
    ``score(X)`` is stored as ``logL`` and combined with ``n_params(n_states)``
    into AIC and BIC. A per-seed table, the mean and sample standard deviation
    of both criteria, and the seed with the lowest BIC are printed.

    Parameters
    ----------
    n_states : int, default 4
        Number of hidden states. The default reproduces the original
        hard-coded 4-state run.

    Raises
    ------
    ValueError
        If ``SEEDS`` is empty, since no model would be fitted.
    """
    X = load_returns(FILE_PATH)
    seeds = list(SEEDS)
    if not seeds:
        raise ValueError("SEEDS is empty: at least one seed is required")

    # Both penalty terms depend only on the model size and sample size,
    # so compute them once rather than on every iteration.
    k = n_params(n_states)
    aic_penalty = 2 * k
    bic_penalty = k * np.log(len(X))

    metrics = []
    for s in seeds:
        m = GaussianHMM(n_components=n_states, covariance_type="diag",
                        n_iter=1000, random_state=s).fit(X)
        ll = m.score(X)
        metrics.append({"seed": s, "logL": ll,
                        "AIC": aic_penalty - 2*ll, "BIC": bic_penalty - 2*ll})

    df = pd.DataFrame(metrics)
    # Header is derived from the actual configuration so it cannot drift
    # from SEEDS or n_states.
    print(f"\n=== {n_states}-State HMM: AIC & BIC across {len(seeds)} seeds ===")
    print(df.round(2))
    print("\nMean ± SD:")
    print(f"AIC: {df['AIC'].mean():.2f} ± {df['AIC'].std(ddof=1):.2f}")
    print(f"BIC: {df['BIC'].mean():.2f} ± {df['BIC'].std(ddof=1):.2f}")

    best = df.loc[df["BIC"].idxmin()]
    print(f"\nBest (lowest BIC) → seed={int(best.seed)} | logL={best.logL:.2f} | "
          f"AIC={best.AIC:.2f} | BIC={best.BIC:.2f}")

# Run the analysis when this cell is executed or the file is run as a script;
# importing the code as a module does not trigger the model fits.
if __name__ == "__main__":
    main()


Model is not converging.  Current: 1306.8548434113081 is not greater than 1306.8591225248008. Delta is -0.004279113492657416
Model is not converging.  Current: 1300.8981430796716 is not greater than 1300.8991472268776. Delta is -0.0010041472060038359
Model is not converging.  Current: 1297.1713222039866 is not greater than 1297.1809615002664. Delta is -0.009639296279829068



=== 4-State HMM: AIC & BIC across 10 seeds ===
   seed     logL      AIC      BIC
0     0  1306.84 -2567.68 -2469.75
1     1  1304.35 -2562.70 -2464.77
2     2  1300.89 -2555.77 -2457.85
3     3  1292.49 -2538.99 -2441.06
4     4  1307.99 -2569.98 -2472.05
5     5  1292.55 -2539.09 -2441.17
6     6  1303.35 -2560.70 -2462.77
7     7  1309.25 -2572.50 -2474.58
8     8  1312.61 -2579.21 -2481.29
9     9  1297.15 -2548.29 -2450.37

Mean ± SD:
AIC: -2559.49 ± 13.84
BIC: -2461.57 ± 13.84

Best (lowest BIC) → seed=8 | logL=1312.61 | AIC=-2579.21 | BIC=-2481.29
