In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import joblib
import warnings

warnings.filterwarnings("ignore")

In [3]:
### 1. Load & Clean Data
def load_and_clean_data(filepath, ticker):
    """Read one ticker's price CSV and return a typed, date-indexed frame.

    The first three rows that ``pd.read_csv`` yields after the header line are
    discarded (presumably extra header/metadata rows written by the export
    tool -- TODO confirm against the CSV files). The six columns are then
    named positionally and a constant ``Ticker`` column is inserted.

    Parameters
    ----------
    filepath : str or path-like
        CSV file to read. It must contain exactly six columns, interpreted in
        the order Date, Close, High, Low, Open, Volume.
    ticker : str
        Symbol written into every row of the ``Ticker`` column.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Date`` (datetime) with columns Ticker, Close, High, Low,
        Open, Volume. Prices are floats and Volume is a 64-bit integer.

    Raises
    ------
    ValueError
        If the file does not have six columns or a value cannot be converted.
    """
    df = pd.read_csv(filepath)
    # Columns are renamed positionally right below, so whatever header
    # read_csv parsed is irrelevant and needs no intermediate fix-up.
    df = df[3:].reset_index(drop=True)
    df.columns = ["Date", "Close", "High", "Low", "Open", "Volume"]
    df.insert(1, "Ticker", ticker)
    df = df.astype({
        "Open": float,
        "High": float,
        "Low": float,
        "Close": float,
        # Explicit 64-bit width: plain ``int`` follows the platform default,
        # which can be 32-bit and too narrow for large share volumes.
        "Volume": "int64",
    })
    df["Date"] = pd.to_datetime(df["Date"])
    df.set_index("Date", inplace=True)
    return df

In [5]:
### 2. Preprocessing: Moving Averages
def add_moving_averages(df):
    """Attach 7-row and 30-row rolling means of ``Close`` to ``df``.

    The frame is modified in place (columns ``MA7`` and ``MA30`` are added)
    and the same object is returned. Rows that do not yet have a full window
    behind them hold NaN.
    """
    closes = df["Close"]
    for span in (7, 30):
        df[f"MA{span}"] = closes.rolling(window=span).mean()
    return df

### 3. ADF Test
def adf_test(series):
    """Run ``adfuller`` on ``series`` (NaNs dropped) and tabulate the result.

    Returns a ``pandas.Series`` whose first four entries are the test
    statistic, p-value, number of lags used and number of observations,
    followed by one ``Critical Value (<level>)`` entry per critical value
    reported by ``adfuller``.
    """
    adf_result = adfuller(series.dropna(), autolag='AIC')
    headline_labels = ['ADF Statistic', 'p-value', '# Lags Used', 'Number of Observations']
    summary = dict(zip(headline_labels, adf_result[:4]))
    # Element 4 of the result maps significance level -> critical value.
    summary.update(
        (f'Critical Value ({level})', threshold)
        for level, threshold in adf_result[4].items()
    )
    return pd.Series(summary)

In [7]:
### 4a. ARIMA Data Prep
def prepare_arima_data(df):
    """Build the ARIMA input frame from ``df``.

    Returns a new frame holding ``Close`` and its first difference
    ``Close_diff``, with every row containing a NaN removed (which always
    includes the first row, whose difference is undefined). ``df`` itself is
    left untouched.
    """
    close_only = df[["Close"]]
    with_diff = close_only.assign(Close_diff=close_only["Close"].diff())
    return with_diff.dropna()

### 4b. ARIMAX Data Prep
def prepare_arimax_data(df):
    """Build the ARIMAX input frame from ``df``.

    Returns a new frame with the columns Close, MA7, MA30 and Volume (in that
    order), keeping only rows where all four are present. ``df`` itself is
    left untouched.
    """
    feature_columns = ["Close", "MA7", "MA30", "Volume"]
    return df.loc[:, feature_columns].dropna()

### 4c. LSTM Data Prep
def prepare_lstm_data(df, lookback=60):
    """Turn the ``Close`` column into scaled sliding windows for an LSTM.

    ``Close`` is scaled with a freshly fitted ``MinMaxScaler``; sample ``k``
    is then the ``lookback`` consecutive scaled values starting at row ``k``
    and its target is the scaled value of the row right after that window.

    NOTE(review): the scaler is fitted on every row of ``df``, including rows
    that a later train/test split assigns to the test set -- confirm this is
    acceptable for the evaluation.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Close`` column; it is not modified.
    lookback : int, default 60
        Number of past rows in each input window (must be >= 1).

    Returns
    -------
    X : numpy.ndarray, shape (len(df) - lookback, lookback, 1)
    y : numpy.ndarray, shape (len(df) - lookback,)
    scaler : MinMaxScaler
        The fitted scaler, needed to map predictions back to prices.

    Raises
    ------
    ValueError
        If ``lookback`` is smaller than 1 or ``df`` has too few rows to form
        even one window plus target.
    """
    if lookback < 1:
        raise ValueError(f"lookback must be at least 1, got {lookback}")
    n_rows = len(df)
    if n_rows <= lookback:
        # Without this guard the failure is an opaque IndexError raised while
        # reshaping an empty array.
        raise ValueError(
            f"need more than {lookback} rows to build LSTM windows, got {n_rows}"
        )

    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(df[["Close"]]).ravel()

    # One window per start row; the final window is dropped because no row
    # follows it to serve as its target.
    windows = np.lib.stride_tricks.sliding_window_view(scaled, lookback)[:-1]
    # np.array() copies the read-only strided view into a normal array.
    X = np.array(windows).reshape(-1, lookback, 1)
    y = scaled[lookback:].copy()
    return X, y, scaler

In [9]:
### 5. Train-Test Split
def split_data(data, train_ratio=0.8):
    """Cut ``data`` into a leading and a trailing part without shuffling.

    The cut position is ``len(data) * train_ratio`` truncated to an integer;
    everything before it is returned first, everything from it onward second.
    """
    cut = int(train_ratio * len(data))
    head = data[:cut]
    tail = data[cut:]
    return head, tail

In [11]:
### 6a. Train ARIMA Model
def train_arima(train_data, test_data):
    """Fit ARIMA(1, 1, 1) on the training ``Close`` series and forecast ahead.

    ``test_data`` is only used for its length, which sets the number of
    forecast steps. Returns ``(forecast, fitted_model)``.
    """
    horizon = len(test_data)
    model_fit = ARIMA(train_data["Close"], order=(1, 1, 1)).fit()
    return model_fit.forecast(steps=horizon), model_fit

### 6b. Train ARIMAX Model
def train_arimax(train_data, test_data):
    """Fit SARIMAX(1, 1, 1) with exogenous regressors and forecast ahead.

    ``Close`` is the target; Volume, MA7 and MA30 are the exogenous columns,
    taken from ``train_data`` for fitting and from ``test_data`` for the
    forecast horizon. Returns ``(forecast, fitted_model)``.

    NOTE(review): the forecast-period regressors come from ``test_data``
    itself. If MA7/MA30 are rolling means that include the same day's Close,
    they carry information about the value being predicted -- confirm this is
    intended.
    """
    exog_columns = ["Volume", "MA7", "MA30"]
    model_fit = SARIMAX(
        train_data["Close"],
        exog=train_data[exog_columns],
        order=(1, 1, 1),
    ).fit()
    forecast = model_fit.forecast(steps=len(test_data), exog=test_data[exog_columns])
    return forecast, model_fit

### 6c. Train LSTM Model
def train_lstm(X_train, y_train, X_test, y_test, epochs=10, batch_size=16, patience=5):
    """Build, train and apply a three-layer stacked LSTM regressor.

    The network is three 50-unit LSTM layers, each followed by 20% dropout,
    and a single-unit dense output. It is trained with Adam on mean squared
    error, with early stopping on the validation loss.

    NOTE(review): ``X_test``/``y_test`` are passed as the validation data, so
    the same samples that are later scored also decide when training stops
    and which weights are restored -- consider a separate validation split.

    Parameters
    ----------
    X_train, X_test : array-like, shape (samples, timesteps, features)
        Input windows; the timestep and feature counts are read from
        ``X_train.shape``.
    y_train, y_test : array-like, shape (samples,)
        Targets matching the inputs.
    epochs : int, default 10
        Maximum number of training epochs.
    batch_size : int, default 16
        Mini-batch size passed to ``fit``.
    patience : int, default 5
        Epochs without validation-loss improvement before stopping.

    Returns
    -------
    tuple
        ``(pred, model, history)``: predictions for ``X_test``, the trained
        model, and the object returned by ``model.fit``.
    """
    # Take the feature count from the data instead of assuming one feature.
    timesteps, n_features = X_train.shape[1], X_train.shape[2]
    model = Sequential([
        LSTM(50, return_sequences=True, input_shape=(timesteps, n_features)),
        Dropout(0.2),
        LSTM(50, return_sequences=True),
        Dropout(0.2),
        LSTM(50),
        Dropout(0.2),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mean_squared_error')
    early_stop = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)
    history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
                        validation_data=(X_test, y_test), callbacks=[early_stop], verbose=1)
    pred = model.predict(X_test)
    return pred, model, history

In [27]:
### 7. Evaluation
def evaluate_model(true_values, predicted_values, label="Model"):
    """Print MAE, RMSE and R² for a set of predictions and return the MAE.

    Parameters
    ----------
    true_values, predicted_values : array-like
        Observed and predicted values of equal length.
    label : str, default "Model"
        Prefix for each printed metric line.

    Returns
    -------
    float
        The mean absolute error (RMSE and R² are printed only).
    """
    mae = mean_absolute_error(true_values, predicted_values)
    # Take the root explicitly: the ``squared=False`` flag of
    # mean_squared_error is deprecated and absent from newer scikit-learn.
    rmse = float(np.sqrt(mean_squared_error(true_values, predicted_values)))
    r2 = r2_score(true_values, predicted_values)

    print(f"{label} MAE: {mae}")
    print(f"{label} RMSE: {rmse}")
    print(f"{label} R² Score: {r2}")
    return mae

### 8. Plotting Functions
def plot_predictions(index, actual, predicted, title):
    """Show actual (green) and predicted (red) series on one 12x5 figure.

    Both series are drawn against ``index`` with a legend, grid, the given
    ``title`` and "Date"/"Price" axis labels; the figure is then displayed.
    """
    _, ax = plt.subplots(figsize=(12, 5))
    for series, name, colour in (
        (actual, "Actual", "green"),
        (predicted, "Predicted", "red"),
    ):
        ax.plot(index, series, label=name, color=colour)
    ax.set_title(title)
    ax.set_xlabel("Date")
    ax.set_ylabel("Price")
    ax.legend()
    ax.grid()
    plt.show()

def plot_loss(history):
    """Show the per-epoch training and validation loss on one 10x4 figure.

    ``history`` must expose a ``history`` mapping with ``'loss'`` and
    ``'val_loss'`` sequences.
    """
    _, ax = plt.subplots(figsize=(10, 4))
    curves = (
        ('loss', 'Train Loss', 'blue'),
        ('val_loss', 'Val Loss', 'orange'),
    )
    for key, name, colour in curves:
        ax.plot(history.history[key], label=name, color=colour)
    ax.set_title("Training & Validation Loss Curve")
    ax.set_xlabel("Epochs")
    ax.set_ylabel("Loss")
    ax.legend()
    plt.show()

In [29]:
if __name__ == "__main__":
    # Directory holding one "<TICKER>_data.csv" file per ticker.
    data_dir = "C:/Users/Shush/Final_Sem_Project/Stocks_Data"
    tickers = ["MSFT", "AAPL", "NVDA", "AMZN", "TSLA"]  # Add your tickers here
    lookback = 60  # number of past closes in each LSTM input window

    for ticker in tickers:
        print(f"\n================= Processing {ticker} =================")

        # === Load and clean data ===
        filepath = f"{data_dir}/{ticker}_data.csv"
        df = load_and_clean_data(filepath, ticker)
        df = add_moving_averages(df)

        # model name -> (test MAE, fitted model)
        model_mae_scores = {}

        # === ARIMA ===
        df_arima = prepare_arima_data(df)
        train_arima_df, test_arima_df = split_data(df_arima)
        arima_pred, arima_model = train_arima(train_arima_df, test_arima_df)
        arima_mae = evaluate_model(test_arima_df["Close"], arima_pred, "ARIMA")
        model_mae_scores["ARIMA"] = (arima_mae, arima_model)

        # === ARIMAX ===
        df_arimax = prepare_arimax_data(df)
        train_arimax_df, test_arimax_df = split_data(df_arimax)
        arimax_pred, arimax_model = train_arimax(train_arimax_df, test_arimax_df)
        arimax_mae = evaluate_model(test_arimax_df["Close"], arimax_pred, "ARIMAX")
        model_mae_scores["ARIMAX"] = (arimax_mae, arimax_model)

        # === LSTM ===
        X, y, scaler = prepare_lstm_data(df, lookback)
        X_train, X_test = split_data(X)
        y_train, y_test = split_data(y)
        lstm_pred_scaled, lstm_model, lstm_history = train_lstm(X_train, y_train, X_test, y_test)
        # Map scaled predictions and targets back to price units before scoring.
        lstm_pred = scaler.inverse_transform(lstm_pred_scaled)
        y_test_unscaled = scaler.inverse_transform(y_test.reshape(-1, 1))
        lstm_mae = evaluate_model(y_test_unscaled, lstm_pred, "LSTM")
        model_mae_scores["LSTM"] = (lstm_mae, lstm_model)

        # === Summarise and persist ===
        best_name, (best_mae, _) = min(model_mae_scores.items(), key=lambda item: item[1][0])
        print(f"Best model for {ticker} by MAE: {best_name} ({best_mae})")

        # This line used to dump an undefined name (`best_model`), which
        # raises NameError on a fresh kernel. The target file is named for
        # the ARIMAX model, so that fitted model is what gets saved.
        joblib.dump(arimax_model, f"{ticker}_arimax_model.pkl")



ARIMA MAE: 28.079933150379446
ARIMA RMSE: 31.03484748773525
ARIMA R² Score: -1.688279222303784
ARIMAX MAE: 5.835796814140705
ARIMAX RMSE: 7.52511489094235
ARIMAX R² Score: 0.7597101307483876
Epoch 1/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 250ms/step - loss: 0.1852 - val_loss: 0.0758
Epoch 2/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 143ms/step - loss: 0.0330 - val_loss: 0.0296
Epoch 3/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 143ms/step - loss: 0.0316 - val_loss: 0.0412
Epoch 4/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 137ms/step - loss: 0.0286 - val_loss: 0.0427
Epoch 5/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 124ms/step - loss: 0.0276 - val_loss: 0.0381
Epoch 6/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 147ms/step - loss: 0.0275 - val_loss: 0.0454
Epoch 7/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 141ms/step -