# Stock Price Forecasting using ARIMA model

# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from arch import arch_model
import matplotlib.pyplot as plt
from pmdarima import auto_arima
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score, mean_absolute_error

# Importing Data Sets

In [2]:
df1 = pd.read_csv("./data/SPX_Real.csv", index_col=False)
df1.columns = ['Timestamp', 'Close']
df1.set_index("Timestamp", inplace=True)
df1.index = pd.to_datetime(df1.index, format='%d-%m-%Y')
df2 = pd.read_csv("./data/AAPL_Real.csv")
df2.columns = ['Timestamp', 'Close']
df2.set_index("Timestamp", inplace=True)
df2.index = pd.to_datetime(df2.index, format='%d-%m-%Y')
df3 = pd.read_csv("./data/TWSE_Real.csv")
df3.columns = ['Timestamp', 'Close']
df3.set_index("Timestamp", inplace=True)
df3.index = pd.to_datetime(df3.index, format='%m/%d/%Y')

# Formatting Data Sets

In [13]:
def auto_garch(arr, unscaled, name, prediction_size = 1): 
    train_data=pd.DataFrame()
    test_data=pd.DataFrame() 

    unscaled_train=pd.DataFrame()
    unscaled_test=pd.DataFrame() 
    split_index = int(len(arr)*0.9)
    train_data, test_data = arr[:split_index], arr[split_index:]
    unscaled_train, unscaled_test = unscaled[:split_index], unscaled[split_index:]
    
    best_model = None
    best_aic = np.inf
    best_order = (0, 0)

    for p in range(1, 10):
        for q in range(1, 10):
            try:
                if not (p + q) > 8:
                    # fit GARCH(p, q)
                    am = arch_model(train_data, vol='GARCH', p=p, q=q, dist='normal', mean='Constant')
                    res = am.fit(disp='off')
                    
                    # Compare criteria (AIC, BIC, etc.)
                    if res.aic < best_aic:
                        best_aic = res.aic
                        best_order = (p, q)
                        best_model = res
            except Exception as e:
                # Some combos might fail to converge
                pass
    
    print(f"Best (p, q): {best_order}")
    print(f"Best AIC: {best_aic}")
    print(best_model.summary())
    
    predictions = []
    unscaled_predictions = []
    unscaled_real = []
    real = []
    dates = []

    for i in range(1, len(test_data) - prediction_size, prediction_size):
        # Data up to this test day (expanding window)
        data_until_now = arr.iloc[:split_index + i]

        # Fit GARCH(1,1) on the returns up to this point
        garch_model = arch_model(data_until_now, vol='Garch', p=best_order[0], q=best_order[1], dist='normal', mean='Constant')
        res = garch_model.fit(disp='off')

        # Forecast the next prediction_size returns
        forecast_vals = res.forecast(horizon=prediction_size)

        # Extract the forecasts from the last in-sample date
        mean_forecasts = forecast_vals.mean.iloc[-1, :]

        # Iterate through forecasted returns
        for j, mean_return in enumerate(mean_forecasts):
            predictions.append(mean_return)

            # Calculate unscaled prediction
            prev_price = unscaled_test.iloc[i - 1 + j]
            unscaled_predictions.append(prev_price * (1 + mean_return))

            # Record actual values for later comparison
            real.append(test_data.iloc[i + j])
            unscaled_real.append(unscaled_test.iloc[i + j])
            dates.append(unscaled_test.index[i + j])
        print(i)



    mape = mean_absolute_percentage_error(unscaled_real, unscaled_predictions)
    mse = mean_squared_error(unscaled_real, unscaled_predictions)
    r2 = r2_score(unscaled_real, unscaled_predictions)
    mae = mean_absolute_error(unscaled_real, unscaled_predictions)
    print(f"{name} MAPE: {mape:.7f}% for prediction size {prediction_size}")
    print(f"{name} MSE: {mse:.5f} for prediction size {prediction_size}" )
    print(f"{name} R2: {r2:.5f} for prediction size {prediction_size}")
    print(f"{name} MAE: {mae:.5f} for prediction size {prediction_size}")
    
    # Plotting
    plt.figure(figsize=(12,8))
    plt.plot(dates[::5], real[::5], label='True Return')
    plt.plot(dates[::5], predictions[::5], label='Predicted Return')
    plt.title(f"Return Prediction for {name}")
    plt.xlabel('Date')
    plt.ylabel('Return ($)')
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    plt.legend(by_label.values(), by_label.keys())
#    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0)
    plt.show()
    plt.close()
    
    plt.figure(figsize=(12,8))
    plt.plot(dates[::5], unscaled_real[::5], label='True Price')
    plt.plot(dates[::5], unscaled_predictions[::5], label='Predicted Price')
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    plt.legend(by_label.values(), by_label.keys())
    plt.title(f"Stock Price Prediction for {name}")
    plt.xlabel('Date')
    plt.ylabel('Stock Price ($)')
    plt.show()
    plt.close()
    return (mape, mse, r2, mae)

Best fit ARIMA (p,d,q) parameters:
* EXX5 : (1,1,1)
* IQQE : (0,1,0)

#Below is the same process but using interday change instead of log transform

In [11]:
dfs = {'SPX': df1, 'AAPL': df2, "TWSE" : df3}

In [6]:
import warnings
warnings.simplefilter("ignore", category=UserWarning)
warnings.simplefilter("ignore")

from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning


In [None]:
@ignore_warnings(category=ConvergenceWarning)
def run_preds():
    res = []
    for name in dfs:
        for pred_size in [1, 10, 30]:
            df = dfs[name]
            df_adj = df.iloc[1::]
            df_interday = df.pct_change()
            df_interday = df_interday.dropna()
            mape, mse, r2, mae = auto_garch(df_interday, df_adj, name, pred_size)
            res.append({
                "DataFrame": name,
                "Prediction_Size": pred_size,
                "MAE": mae,
                "MSE": mse,
                "MAPE": mape,
                "r2": r2
            })
    results_df = pd.DataFrame(res)
    results_df.to_csv("garch_errors.csv", index=False)

run_preds()


Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.

Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.

Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.

Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.

Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.

Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.

Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.

Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.



Best (p, q): (2, 3)
Best AIC: -52191.53915363001
                     Constant Mean - GARCH Model Results                      
Dep. Variable:                  Close   R-squared:                       0.000
Mean Model:             Constant Mean   Adj. R-squared:                  0.000
Vol Model:                      GARCH   Log-Likelihood:                26102.8
Distribution:                  Normal   AIC:                          -52191.5
Method:            Maximum Likelihood   BIC:                          -52142.7
                                        No. Observations:                 7947
Date:                Sun, Feb 16 2025   Df Residuals:                     7946
Time:                        12:43:48   Df Model:                            1
                                 Mean Model                                 
                 coef    std err          t      P>|t|      95.0% Conf. Int.
----------------------------------------------------------------------------
mu       

Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.



4
5
6
7
8
9


Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.



10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30


Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.

Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.



31
32
33
34
35
36
37
38
39
40
41
42
43


Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.

Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.



44
45
46
47
48
49
50
51
52
53
54


Inequality constraints incompatible
See scipy.optimize.fmin_slsqp for code meaning.



In [37]:
df3 = pd.read_csv("./data/TWSE_Real.csv")
df3.columns = ['Timestamp', 'Close']
df3.set_index("Timestamp", inplace=True)
df3.index = pd.to_datetime(df3.index, format='%m/%d/%Y')
df3

Unnamed: 0_level_0,Close
Timestamp,Unnamed: 1_level_1
1990-01-04,9853.15
1990-01-05,9862.42
1990-01-06,9927.06
1990-01-08,9964.72
1990-01-09,9805.40
...,...
2025-01-16,23025.10
2025-01-17,23148.08
2025-01-20,23266.82
2025-01-21,23300.01


In [39]:
df3_dict = {"TWSE" : df3}
for name in df3_dict:
    df = df3_dict[name]
    df_adj = df.iloc[1::]
    df_interday = df.diff()
    df_interday = df_interday.dropna()
    arima_auto(df_interday, df_adj, name)

 ARIMA(0,0,0)(0,0,0)[1] intercept   : AIC=87189.385, Time=0.06 sec
 ARIMA(0,0,1)(0,0,0)[1] intercept   : AIC=87161.018, Time=0.16 sec
 ARIMA(0,0,2)(0,0,0)[1] intercept   : AIC=87162.879, Time=0.28 sec
 ARIMA(0,0,3)(0,0,0)[1] intercept   : AIC=87147.859, Time=0.35 sec
 ARIMA(0,0,4)(0,0,0)[1] intercept   : AIC=87141.683, Time=0.48 sec


KeyboardInterrupt: 