In [1]:
pip install pmdarima

Collecting pmdarima
  Downloading pmdarima-2.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl.metadata (7.8 kB)
Downloading pmdarima-2.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pmdarima
Successfully installed pmdarima-2.0.4


In [2]:
import numpy as np
import pandas as pd
from pmdarima import auto_arima
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dense


class FinancialModeling:
    def __init__(self, file_path='shampoo.csv'):
        self.file_path = file_path
        self.raw_data = None
        self.returns = None
        self.train = None
        self.test = None
        self.scaler = None
        self.lstm_model = None
        self.arima_model = None
        self.window_size = 5

    def download_data(self):
        self.raw_data = pd.read_csv(self.file_path)

        self.raw_data[['YearPart', 'MonthPart']] = self.raw_data['Month'].str.split('-', expand=True)
        self.raw_data['Date'] = pd.to_datetime(
            '200' + self.raw_data['YearPart'].str.strip() + '-' + self.raw_data['MonthPart'].str.strip() + '-01',
            format='%Y-%m-%d'
        )

        self.raw_data.set_index('Date', inplace=True)
        self.raw_data['Sales'] = pd.to_numeric(self.raw_data['Sales'], errors='coerce')
        self.raw_data.dropna(subset=['Sales'], inplace=True)
        self.raw_data.sort_index(inplace=True)

        self.returns = np.log(self.raw_data['Sales'] / self.raw_data['Sales'].shift(1)).dropna()

        if self.returns.empty:
            raise ValueError("Log returns are empty. Check the Sales data.")

    def adf_test(self):
        result = adfuller(self.returns)
        return {'ADF Statistic': result[0], 'p-value': result[1], 'Critical Values': result[4]}

    def plot_acf_pacf(self, lags=None):
        if self.returns is None:
            raise ValueError("Data not downloaded yet. Run download_data() first.")
        max_lags = (len(self.returns) // 2) - 1
        if lags is None or lags > max_lags:
            lags = max_lags
        if lags < 1:
            raise ValueError("Not enough data points to compute ACF/PACF.")

        print(f"Using {lags} lags for ACF and PACF.")

        plt.figure(figsize=(14, 6))
        plt.subplot(1, 2, 1)
        plot_acf(self.returns, lags=lags, ax=plt.gca(), title='ACF - Log Returns')
        plt.subplot(1, 2, 2)
        plot_pacf(self.returns, lags=lags, ax=plt.gca(), method='ywm', title='PACF - Log Returns')
        plt.tight_layout()
        plt.show()

    def train_auto_arima(self):
        split = int(len(self.returns) * 0.92)
        self.train = self.returns.iloc[:split]
        self.test = self.returns.iloc[split:]

        model = auto_arima(
            self.train,
            seasonal=False,
            stepwise=True,
            trace=True,
            suppress_warnings=True,
            error_action='ignore'
        )
        self.arima_model = model
        model.summary()

        pred_train = model.predict_in_sample()
        pred_test = model.predict(n_periods=len(self.test))

        train_mse = mean_squared_error(self.train, pred_train)
        test_mse = mean_squared_error(self.test, pred_test)

        return {
            'model': model,
            'train_pred': pred_train,
            'test_pred': pred_test,
            'train_mse': train_mse,
            'test_mse': test_mse
        }

    def prepare_lstm_data(self):
        data = self.returns.values.reshape(-1, 1)
        self.scaler = MinMaxScaler()
        scaled = self.scaler.fit_transform(data)

        X, y = [], []
        for i in range(self.window_size, len(scaled)):
            X.append(scaled[i - self.window_size:i])
            y.append(scaled[i])
        X, y = np.array(X), np.array(y)

        train_size = int(0.92 * len(X))
        return X[:train_size], y[:train_size], X[train_size:], y[train_size:]

    def train_lstm(self, X_train, y_train, X_test, y_test, epochs=30, batch_size=4):
        X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
        X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

        model = Sequential()
        model.add(Input(shape=(X_train.shape[1], 1)))
        model.add(LSTM(32))
        model.add(Dense(1))
        model.compile(optimizer='adam', loss='mean_squared_error')
        model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)

        self.lstm_model = model

        train_pred = model.predict(X_train)
        test_pred = model.predict(X_test)

        train_pred_inv = self.scaler.inverse_transform(train_pred)
        test_pred_inv = self.scaler.inverse_transform(test_pred)

        y_train_inv = self.scaler.inverse_transform(y_train.reshape(-1, 1))
        y_test_inv = self.scaler.inverse_transform(y_test.reshape(-1, 1))

        train_mse = mean_squared_error(y_train_inv, train_pred_inv)
        test_mse = mean_squared_error(y_test_inv, test_pred_inv)

        return {
            'model': model,
            'train_pred': train_pred_inv.flatten(),
            'test_pred': test_pred_inv.flatten(),
            'train_true': y_train_inv.flatten(),
            'test_true': y_test_inv.flatten(),
            'train_mse': train_mse,
            'test_mse': test_mse
        }

    def plot_predictions(self, arima_result, lstm_result):
        full_index = self.returns.index[self.window_size:]
        split_index = int(0.92 * len(full_index))

        lstm_pred_full = np.concatenate([lstm_result['train_pred'], lstm_result['test_pred']])
        lstm_true_full = np.concatenate([lstm_result['train_true'], lstm_result['test_true']])
        arima_pred_full = np.concatenate([arima_result['train_pred'], arima_result['test_pred']])
        arima_index = self.returns.index[:len(arima_pred_full)]

        plt.figure(figsize=(14, 5))
        plt.plot(full_index, lstm_true_full, label='True Log Returns', color='gray')
        plt.plot(full_index, lstm_pred_full, label='LSTM Prediction', color='purple')
        plt.plot(arima_index, arima_pred_full, label='Auto ARIMA Prediction', color='blue')
        plt.axvline(x=full_index[split_index], color='red', linestyle='--', label='Train/Test Split')
        plt.title('Sales Log Return Forecast: Auto ARIMA vs LSTM')
        plt.xlabel("Date")
        plt.ylabel("Log Return")
        plt.legend()
        plt.grid(True)
        plt.show()

    def print_metrics(self, arima_result, lstm_result):
        print("\n=== MODEL METRICS (Log Returns) ===")
        print(f"Auto ARIMA Train MSE: {arima_result['train_mse']:.6f}, Test MSE: {arima_result['test_mse']:.6f}")
        print(f"LSTM Train MSE: {lstm_result['train_mse']:.6f}, Test MSE: {lstm_result['test_mse']:.6f}")


# === USAGE ===
if __name__ == "__main__":
    # LSTM weight initialisation and batch shuffling are random. Seed Python, NumPy and
    # TensorFlow up front so metrics and plots are reproducible under Restart & Run All.
    from tensorflow.keras.utils import set_random_seed
    set_random_seed(42)

    fm = FinancialModeling(file_path='shampoo.csv')  # Ensure this file exists in the same folder
    fm.download_data()

    # Stationarity check and correlation structure of the log returns.
    print("ADF Test on Log Returns:", fm.adf_test())
    fm.plot_acf_pacf()

    # Fit both models.
    arima_result = fm.train_auto_arima()
    X_train, y_train, X_test, y_test = fm.prepare_lstm_data()
    lstm_result = fm.train_lstm(X_train, y_train, X_test, y_test)

    # Compare them numerically and visually.
    fm.print_metrics(arima_result, lstm_result)
    fm.plot_predictions(arima_result, lstm_result)


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject