# Data Collection and PreProcessing

## Data Source

In [1]:
import os
import yfinance as yf
import pandas as pd  # Ensure pandas is imported for MultiIndex handling

# Define stock tickers and date range
start_date = "2014-08-01"
end_date = "2016-11-30"
tickers = ['IBM', 'AAPL', 'META', 'GOOGL']

# Create a folder named 'datasets' if it doesn't exist
output_folder = "datasets"
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Download data for each stock and save with standardized column names
for ticker in tickers:
    # Download the stock data
    stock_data = yf.download(ticker, start=start_date, end=end_date)
    
    # Check if columns are a MultiIndex and flatten if necessary
    if isinstance(stock_data.columns, pd.MultiIndex):
        stock_data.columns = ['_'.join(col).strip().lower() for col in stock_data.columns]
    else:
        stock_data.columns = [col.replace(' ', '_').lower() for col in stock_data.columns]
    
    # Print the first few rows of the data
    print(f"\nData for {ticker}:")
    print(stock_data.head())
    
    # Save to CSV in the 'datasets' folder
    file_path = os.path.join(output_folder, f'{ticker}_stock.csv')
    stock_data.to_csv(file_path, index=True)

    print(f"Saved {ticker} data to {file_path}")


[*********************100%***********************]  1 of 1 completed



Data for IBM:
            adj close_ibm   close_ibm    high_ibm     low_ibm    open_ibm  \
Date                                                                        
2014-08-01     115.999725  180.831741  183.078400  180.554489  182.122375   
2014-08-04     116.300232  181.300186  181.596558  180.305923  181.022949   
2014-08-05     114.742531  178.871887  180.879547  178.240921  180.449326   
2014-08-06     114.724030  177.791580  178.661575  176.328873  177.208420   
2014-08-07     113.693802  176.195023  178.470367  175.506699  178.432129   

            volume_ibm  
Date                    
2014-08-01     5419431  
2014-08-04     2223691  
2014-08-05     3460063  
2014-08-06     4023962  
2014-08-07     2833196  
Saved IBM data to datasets/IBM_stock.csv


[*********************100%***********************]  1 of 1 completed



Data for AAPL:
            adj close_aapl  close_aapl  high_aapl   low_aapl  open_aapl  \
Date                                                                      
2014-08-01       21.209681   24.032499  24.155001  23.702499  23.725000   
2014-08-04       21.090542   23.897499  24.145000  23.792500  24.092501   
2014-08-05       20.986837   23.780001  23.920000  23.590000  23.840000   
2014-08-06       20.951538   23.740000  23.870001  23.677500  23.687500   
2014-08-07       20.949320   23.620001  23.987499  23.525000  23.732500   

            volume_aapl  
Date                     
2014-08-01    194044000  
2014-08-04    159832000  
2014-08-05    223732000  
2014-08-06    154232000  
2014-08-07    186844000  
Saved AAPL data to datasets/AAPL_stock.csv


[*********************100%***********************]  1 of 1 completed



Data for META:
            adj close_meta  close_meta  high_meta   low_meta  open_meta  \
Date                                                                      
2014-08-01       72.142792   72.360001  73.220001  71.550003  72.220001   
2014-08-04       73.289337   73.510002  73.879997  72.360001  72.360001   
2014-08-05       72.471794   72.690002  73.589996  72.180000  73.199997   
2014-08-06       72.252457   72.470001  73.720001  71.790001  72.019997   
2014-08-07       72.950356   73.169998  74.000000  72.699997  73.000000   

            volume_meta  
Date                     
2014-08-01     43535000  
2014-08-04     30777000  
2014-08-05     34986000  
2014-08-06     30986000  
2014-08-07     38141000  
Saved META data to datasets/META_stock.csv


[*********************100%***********************]  1 of 1 completed


Data for GOOGL:
            adj close_googl  close_googl  high_googl  low_googl  open_googl  \
Date                                                                          
2014-08-01        28.609159    28.680000   29.171499  28.514999     28.9275   
2014-08-04        29.041588    29.113501   29.191000  28.613001     28.8255   
2014-08-05        28.586216    28.657000   29.010000  28.515499     28.9690   
2014-08-06        28.653551    28.724501   28.931999  28.372499     28.4750   
2014-08-07        28.519880    28.590500   28.915501  28.471500     28.8025   

            volume_googl  
Date                      
2014-08-01      44266000  
2014-08-04      30388000  
2014-08-05      32876000  
2014-08-06      26456000  
2014-08-07      23260000  
Saved GOOGL data to datasets/GOOGL_stock.csv





## Data Cleaning
+ Check for missing values
+ identification of outliers and replacement using IQR
+ Datatype conversion of all columns

In [7]:
import pandas as pd
import numpy as np
import os

# Define the tickers and input/output paths
tickers = ['IBM', 'AAPL', 'META', 'GOOGL']
input_folder = "datasets"
output_folder = "datasets"

# Function to clean and validate stock data
def clean_stock_data(ticker):
    # Read the input CSV file
    input_path = os.path.join(input_folder, f'{ticker}_stock.csv')
    df = pd.read_csv(input_path, parse_dates=['Date'], index_col='Date')
    
    print(f"\nCleaning data for {ticker}:")
    
    # 1. Check for missing values
    print("\nMissing Values:")
    print(df.isnull().sum())
    
    # 2. Identify and handle outliers using Interquartile Range (IQR) method
    def remove_outliers(column):
        Q1 = column.quantile(0.25)
        Q3 = column.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        return column[(column >= lower_bound) & (column <= upper_bound)]
    
    # Apply outlier removal to numeric columns
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    df_cleaned = df.copy()
    
    print("\nOutliers Detection:")
    for col in numeric_columns:
        original_count = len(df)
        cleaned_series = remove_outliers(df[col])
        removed_count = original_count - len(cleaned_series)
        print(f"{col}: {removed_count} outliers removed ({removed_count/original_count*100:.2f}%)")
        df_cleaned.loc[cleaned_series.index, col] = cleaned_series
    
    # 3. Convert columns to appropriate data types
    # Ensure all numeric columns are float
    for col in numeric_columns:
        df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce')
    
    # 4. Additional data validations
    print("\nData Type Validation:")
    print(df_cleaned.dtypes)
    
    # 5. Check for any remaining extreme values
    print("\nValue Ranges:")
    for col in numeric_columns:
        print(f"{col}: Min = {df_cleaned[col].min()}, Max = {df_cleaned[col].max()}")
    
    # 6. Save cleaned data
    output_path = os.path.join(output_folder, f'{ticker}_stock_cleaned.csv')
    df_cleaned.to_csv(output_path, index=True)
    print(f"\nCleaned data saved to {output_path}")
    
    return df_cleaned

# Process each stock
cleaned_datasets = {}
for ticker in tickers:
    cleaned_datasets[ticker] = clean_stock_data(ticker)


Cleaning data for IBM:

Missing Values:
adj close_ibm    0
close_ibm        0
high_ibm         0
low_ibm          0
open_ibm         0
volume_ibm       0
dtype: int64

Outliers Detection:
adj close_ibm: 12 outliers removed (2.04%)
close_ibm: 65 outliers removed (11.05%)
high_ibm: 62 outliers removed (10.54%)
low_ibm: 62 outliers removed (10.54%)
open_ibm: 64 outliers removed (10.88%)
volume_ibm: 42 outliers removed (7.14%)

Data Type Validation:
adj close_ibm    float64
close_ibm        float64
high_ibm         float64
low_ibm          float64
open_ibm         float64
volume_ibm         int64
dtype: object

Value Ranges:
adj close_ibm: Min = 76.3611068725586, Max = 119.67767333984376
close_ibm: Min = 112.66730499267578, Max = 185.46844482421875
high_ibm: Min = 114.397705078125, Max = 186.42446899414065
low_ibm: Min = 111.7590789794922, Max = 184.69407653808597
open_ibm: Min = 113.25048065185548, Max = 185.98471069335935
volume_ibm: Min = 1480927, Max = 24493659

Cleaned data saved to 

In [8]:
cleaned_datasets

{'IBM':             adj close_ibm   close_ibm    high_ibm     low_ibm    open_ibm  \
 Date                                                                        
 2014-08-01     115.999725  180.831741  183.078400  180.554489  182.122375   
 2014-08-04     116.300232  181.300186  181.596558  180.305923  181.022949   
 2014-08-05     114.742531  178.871887  180.879547  178.240921  180.449326   
 2014-08-06     114.724030  177.791580  178.661575  176.328873  177.208420   
 2014-08-07     113.693802  176.195023  178.470367  175.506699  178.432129   
 ...                   ...         ...         ...         ...         ...   
 2016-11-22     108.312408  155.516251  155.831741  154.827911  155.831741   
 2016-11-23     107.852951  154.856598  155.238998  154.263855  154.818359   
 2016-11-25     108.625343  155.965576  156.013382  154.713196  154.713196   
 2016-11-28     109.544167  157.284897  157.418732  155.544937  156.022949   
 2016-11-29     108.885002  156.338425  157.179733  155.8

## Stationarity Check - ADF

In [9]:
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller
import os
import matplotlib.pyplot as plt

# Define the tickers
tickers = ['IBM', 'AAPL', 'META', 'GOOGL']
input_folder = "datasets"
output_folder = "datasets"

# Function to perform stationarity test
def check_stationarity(series, ticker, column):
    # Perform Augmented Dickey-Fuller test
    result = adfuller(series.dropna())
    
    print(f'\nStationarity Test Results for {ticker} - {column}:')
    print(f'ADF Statistic: {result[0]}')
    print(f'p-value: {result[1]}')
    print('Critical Values:')
    for key, value in result[4].items():
        print(f'\t{key}: {value}')
    
    # Determine stationarity
    alpha = 0.05
    is_stationary = result[1] <= alpha
    print(f'\nIs the series stationary? {is_stationary}')
    
    return is_stationary

# Function to difference the series
def difference_series(series):
    # First-order differencing
    return series.diff().dropna()

# Function to process each stock
def process_stock_stationarity(ticker):
    # Read the cleaned CSV file
    input_path = os.path.join(input_folder, f'{ticker}_stock_cleaned.csv')
    df = pd.read_csv(input_path, parse_dates=['Date'], index_col='Date')
    
    # Print column names for debugging
    print(f"\nColumns for {ticker}:")
    print(df.columns.tolist())
    
    # Select the 'close' column (now with more robust selection)
    close_col = [col for col in df.columns if 'close' in col.lower()][0]
    print(f"\nSelected close column for {ticker}: {close_col}")
    
    series = df[close_col]
    
    # Check original series stationarity
    original_stationary = check_stationarity(series, ticker, 'Original Series')
    
    # If not stationary, apply differencing
    if not original_stationary:
        # Perform first-order differencing
        differenced_series = difference_series(series)
        
        # Check stationarity of differenced series
        differenced_stationary = check_stationarity(differenced_series, ticker, 'Differenced Series')
        
        # Plot original and differenced series
        plt.figure(figsize=(12,6))
        plt.subplot(2,1,1)
        series.plot(title=f'{ticker} - Original Close Price')
        plt.subplot(2,1,2)
        differenced_series.plot(title=f'{ticker} - Differenced Close Price')
        plt.tight_layout()
        plt.savefig(os.path.join(output_folder, f'{ticker}_stationarity_plot.png'))
        plt.close()
        
        # Create a new dataframe with differenced data
        df_differenced = df.copy()
        df_differenced[close_col] = np.nan
        df_differenced.loc[differenced_series.index, close_col] = differenced_series
        
        # Save differenced data
        output_path = os.path.join(output_folder, f'{ticker}_stock_differenced.csv')
        df_differenced.to_csv(output_path, index=True)
        print(f'\nDifferenced data saved to {output_path}')
    
    return df

# Process each stock
processed_datasets = {}
for ticker in tickers:
    processed_datasets[ticker] = process_stock_stationarity(ticker)

print("\nStationarity check and differencing complete.")


Columns for IBM:
['adj close_ibm', 'close_ibm', 'high_ibm', 'low_ibm', 'open_ibm', 'volume_ibm']

Selected close column for IBM: adj close_ibm

Stationarity Test Results for IBM - Original Series:
ADF Statistic: -2.088857165914344
p-value: 0.24901873259676288
Critical Values:
	1%: -3.4415393130846725
	5%: -2.866476335860869
	10%: -2.5693989358590006

Is the series stationary? False

Stationarity Test Results for IBM - Differenced Series:
ADF Statistic: -18.298326253396013
p-value: 2.2894563935365948e-30
Critical Values:
	1%: -3.4415777369651717
	5%: -2.866493255736561
	10%: -2.569407951640003

Is the series stationary? True

Differenced data saved to datasets/IBM_stock_differenced.csv

Columns for AAPL:
['adj close_aapl', 'close_aapl', 'high_aapl', 'low_aapl', 'open_aapl', 'volume_aapl']

Selected close column for AAPL: adj close_aapl

Stationarity Test Results for AAPL - Original Series:
ADF Statistic: -2.3001852568428305
p-value: 0.1719082294583959
Critical Values:
	1%: -3.441539313

# ARIMA model for short term dependecies

## ARIMA Model setup
- ACF and PACF to identify values for the AR and MA
- Buildign and training the ARIMA model on the series

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import os

# Define the tickers
tickers = ['IBM', 'AAPL', 'META', 'GOOGL']
input_folder = "datasets"
output_folder = "datasets"

# Function to create train-test split
def create_train_test_split(series, test_size=0.2):
    # Calculate split index
    split_index = int(len(series) * (1 - test_size))
    
    # Split the series
    train = series[:split_index]
    test = series[split_index:]
    
    print("\nDataset Split:")
    print(f"Total samples: {len(series)}")
    print(f"Training samples: {len(train)}")
    print(f"Testing samples: {len(test)}")
    
    return train, test

# Function to plot actual vs predicted
def plot_actual_vs_predicted(test_index, actual, predicted, ticker):
    plt.figure(figsize=(12,6))
    plt.plot(test_index, actual, label='Actual', color='blue')
    plt.plot(test_index, predicted, label='Predicted', color='red', linestyle='--')
    plt.title(f'{ticker} - Actual vs Predicted Close Prices')
    plt.xlabel('Date')
    plt.ylabel('Close Price')
    plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(output_folder, f'{ticker}_actual_vs_predicted.png'))
    plt.close()

# Function to build and evaluate ARIMA model
def build_arima_model(ticker):
    # Read the cleaned CSV file
    input_path = os.path.join(input_folder, f'{ticker}_stock_cleaned.csv')
    df = pd.read_csv(input_path, parse_dates=['Date'], index_col='Date')
    
    # Select the close column
    close_col = [col for col in df.columns if 'close' in col.lower()][0]
    series = df[close_col]
    
    # Create train-test split
    train, test = create_train_test_split(series)
    
    # Determine best ARIMA parameters using grid search
    best_params = None
    best_aic = float('inf')
    
    # Try different ARIMA parameter combinations
    p_range = range(0, 3)
    d_range = range(0, 2)
    q_range = range(0, 3)
    
    for p in p_range:
        for d in d_range:
            for q in q_range:
                try:
                    # Fit ARIMA model
                    model = ARIMA(train, order=(p,d,q))
                    model_fit = model.fit()
                    
                    # Compare AIC
                    if model_fit.aic < best_aic:
                        best_aic = model_fit.aic
                        best_params = (p,d,q)
                except Exception as e:
                    continue
    
    print(f"\n{ticker} - Best ARIMA Parameters: {best_params}")
    print(f"Best AIC Score: {best_aic}")
    
    # Fit the best ARIMA model
    final_model = ARIMA(train, order=best_params)
    final_model_fit = final_model.fit()
    
    # Generate forecast for test set
    forecast = final_model_fit.forecast(steps=len(test))
    
    # Calculate error metrics
    mse = mean_squared_error(test, forecast)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(test, forecast)
    r2 = r2_score(test, forecast)
    
    # Plot actual vs predicted
    plot_actual_vs_predicted(test.index, test, forecast, ticker)
    
    # Print model performance
    print(f"\n{ticker} Model Performance:")
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"R-squared (R2) Score: {r2}")
    
    # Save model summary
    with open(os.path.join(output_folder, f'{ticker}_model_summary.txt'), 'w') as f:
        f.write(str(final_model_fit.summary()))
    
    return {
        'model': final_model_fit,
        'params': best_params,
        'forecast': forecast,
        'metrics': {
            'mse': mse,
            'rmse': rmse,
            'mae': mae,
            'r2': r2
        }
    }

# Process each stock
arima_results = {}
for ticker in tickers:
    arima_results[ticker] = build_arima_model(ticker)

print("\nARIMA model analysis complete for all stocks.")


Dataset Split:
Total samples: 588
Training samples: 470
Testing samples: 118


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-invertible starting MA parameters found.'
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates


IBM - Best ARIMA Parameters: (1, 1, 1)
Best AIC Score: 1571.9464363791135

IBM Model Performance:
Mean Squared Error (MSE): 29.69730525649693
Root Mean Squared Error (RMSE): 5.44952339718777
Mean Absolute Error (MAE): 4.7506048446024565
R-squared (R2) Score: -2.196134373124859

Dataset Split:
Total samples: 588
Training samples: 470
Testing samples: 118


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._


AAPL - Best ARIMA Parameters: (0, 1, 0)
Best AIC Score: 479.9032925661302


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(



AAPL Model Performance:
Mean Squared Error (MSE): 8.469795729221808
Root Mean Squared Error (RMSE): 2.910291347824442
Mean Absolute Error (MAE): 2.5088387667122536
R-squared (R2) Score: -1.9689248285995613

Dataset Split:
Total samples: 588
Training samples: 470
Testing samples: 118


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-invertible starting MA parameters found.'
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-invertible starting MA parameters found.'
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, fr


META - Best ARIMA Parameters: (2, 1, 1)
Best AIC Score: 1806.8538055281686

META Model Performance:
Mean Squared Error (MSE): 89.96668167024048
Root Mean Squared Error (RMSE): 9.485076787788303
Mean Absolute Error (MAE): 8.160380611169659
R-squared (R2) Score: -1.8675464608748569

Dataset Split:
Total samples: 588
Training samples: 470
Testing samples: 118


  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-invertible starting MA parameters found.'
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-invertible starting MA parameters found.'
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq


GOOGL - Best ARIMA Parameters: (2, 1, 2)
Best AIC Score: 704.5948979076053

GOOGL Model Performance:
Mean Squared Error (MSE): 9.4458711566516
Root Mean Squared Error (RMSE): 3.0734135999978265
Mean Absolute Error (MAE): 2.805579094288549
R-squared (R2) Score: -1.7205058293201034

ARIMA model analysis complete for all stocks.


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import os

# Define the tickers
tickers = ['IBM', 'AAPL', 'META', 'GOOGL']
input_folder = "datasets"
output_folder = "datasets"

# Function to create train-test split
def create_train_test_split(series, test_size=0.2):
    # Calculate split index
    split_index = int(len(series) * (1 - test_size))
    
    # Split the series
    train = series[:split_index]
    test = series[split_index:]
    
    print("\nDataset Split:")
    print(f"Total samples: {len(series)}")
    print(f"Training samples: {len(train)}")
    print(f"Testing samples: {len(test)}")
    
    return train, test

# Function to plot actual vs predicted
def plot_actual_vs_predicted(test_index, actual, predicted, ticker):
    plt.figure(figsize=(12,6))
    plt.plot(test_index, actual, label='Actual', color='blue')
    plt.plot(test_index, predicted, label='Predicted', color='red', linestyle='--')
    plt.title(f'{ticker} - Actual vs Predicted Close Prices')
    plt.xlabel('Date')
    plt.ylabel('Close Price')
    plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(output_folder, f'{ticker}_actual_vs_predicted.png'))
    plt.close()

# Function to build and evaluate ARIMA model
def build_arima_model(ticker):
    # Read the cleaned CSV file
    input_path = os.path.join(input_folder, f'{ticker}_stock_differenced.csv')
    df = pd.read_csv(input_path, parse_dates=['Date'], index_col='Date')
    
    # Select the close column
    close_col = [col for col in df.columns if 'close' in col.lower()][0]
    series = df[close_col]
    
    # Create train-test split
    train, test = create_train_test_split(series)
    
    # Determine best ARIMA parameters using grid search
    best_params = None
    best_aic = float('inf')
    
    # Try different ARIMA parameter combinations
    p_range = range(0, 3)
    d = 1      # determined from previous step - adf test, we differenced only once to attain stationarity
    q_range = range(0, 3)
    
    for p in p_range:
          for q in q_range:
              try:
                  # Fit ARIMA model
                  model = ARIMA(train, order=(p,d,q))
                  model_fit = model.fit()
                  
                  # Compare AIC
                  if model_fit.aic < best_aic:
                      best_aic = model_fit.aic
                      best_params = (p,d,q)
              except Exception as e:
                  continue
    
    print(f"\n{ticker} - Best ARIMA Parameters: {best_params}")
    print(f"Best AIC Score: {best_aic}")
    
    # Fit the best ARIMA model
    final_model = ARIMA(train, order=best_params)
    final_model_fit = final_model.fit()
    
    # Generate forecast for test set
    forecast = final_model_fit.forecast(steps=len(test))
    
    # Calculate error metrics
    mse = mean_squared_error(test, forecast)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(test, forecast)
    r2 = r2_score(test, forecast)
    
    # Plot actual vs predicted
    plot_actual_vs_predicted(test.index, test, forecast, ticker)
    
    # Print model performance
    print(f"\n{ticker} Model Performance:")
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"R-squared (R2) Score: {r2}")
    
    # Save model summary
    with open(os.path.join(output_folder, f'{ticker}_model_summary.txt'), 'w') as f:
        f.write(str(final_model_fit.summary()))
    
    return {
        'model': final_model_fit,
        'params': best_params,
        'forecast': forecast,
        'metrics': {
            'mse': mse,
            'rmse': rmse,
            'mae': mae,
            'r2': r2
        }
    }

# Process each stock
arima_results = {}
for ticker in tickers:
    arima_results[ticker] = build_arima_model(ticker)

print("\nARIMA model analysis complete for all stocks.")


Dataset Split:
Total samples: 588
Training samples: 470
Testing samples: 118


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)



IBM - Best ARIMA Parameters: (1, 1, 2)
Best AIC Score: 1593.0089349885402


  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)



IBM Model Performance:
Mean Squared Error (MSE): 1.1919548959947048
Root Mean Squared Error (RMSE): 1.0917668688848847
Mean Absolute Error (MAE): 0.7766880399858505
R-squared (R2) Score: -0.005194060872696804

Dataset Split:
Total samples: 588
Training samples: 470
Testing samples: 118


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'



AAPL - Best ARIMA Parameters: (0, 1, 1)
Best AIC Score: 503.55429320159806

AAPL Model Performance:
Mean Squared Error (MSE): 0.0968685540198588
Root Mean Squared Error (RMSE): 0.3112371347057719
Mean Absolute Error (MAE): 0.21505634814479382
R-squared (R2) Score: -0.013804534917558842

Dataset Split:
Total samples: 588
Training samples: 470
Testing samples: 118


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-invertible starting MA parameters found.'
  self._init_dates(d


META - Best ARIMA Parameters: (0, 1, 1)
Best AIC Score: 1831.682708176842

META Model Performance:
Mean Squared Error (MSE): 2.2741985401280878
Root Mean Squared Error (RMSE): 1.5080446081360086
Mean Absolute Error (MAE): 1.0754063493114412
R-squared (R2) Score: -0.00039452430844133524

Dataset Split:
Total samples: 588
Training samples: 470
Testing samples: 118


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._in


GOOGL - Best ARIMA Parameters: (0, 1, 1)
Best AIC Score: 733.9616871016435

GOOGL Model Performance:
Mean Squared Error (MSE): 0.17236297238608253
Root Mean Squared Error (RMSE): 0.415166198511009
Mean Absolute Error (MAE): 0.29804776378267145
R-squared (R2) Score: -0.0003261940377481398

ARIMA model analysis complete for all stocks.


# LSTM

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import os
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Define the tickers
tickers = ['IBM', 'AAPL', 'META', 'GOOGL']
input_folder = "datasets"
output_folder = "datasets"

# Function to create train-test split
def create_train_test_split(series, test_size=0.2):
    # Calculate split index
    split_index = int(len(series) * (1 - test_size))
    
    # Split the series
    train = series[:split_index]
    test = series[split_index:]
    
    print("\nDataset Split:")
    print(f"Total samples: {len(series)}")
    print(f"Training samples: {len(train)}")
    print(f"Testing samples: {len(test)}")
    
    return train, test

# Function to plot actual vs predicted
def plot_actual_vs_predicted(test_index, actual, predicted, ticker):
    plt.figure(figsize=(12,6))
    plt.plot(test_index, actual, label='Actual', color='blue')
    plt.plot(test_index, predicted, label='Predicted', color='red', linestyle='--')
    plt.title(f'{ticker} - Actual vs Predicted Close Prices')
    plt.xlabel('Date')
    plt.ylabel('Close Price')
    plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(output_folder, f'{ticker}_actual_vs_predicted.png'))
    plt.close()

# Function to prepare data for LSTM
def prepare_data_for_lstm(series, window_size=60):
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_data = scaler.fit_transform(series.values.reshape(-1, 1))
    
    X, y = [], []
    for i in range(window_size, len(scaled_data)):
        X.append(scaled_data[i-window_size:i, 0])
        y.append(scaled_data[i, 0])
    
    X, y = np.array(X), np.array(y)
    X = np.reshape(X, (X.shape[0], X.shape[1], 1))
    
    return X, y, scaler

# Function to build and evaluate LSTM model
def build_lstm_model(ticker, window_size=60, epochs=50, batch_size=32):
    # Read the cleaned CSV file
    input_path = os.path.join(input_folder, f'{ticker}_stock_cleaned.csv')
    df = pd.read_csv(input_path, parse_dates=['Date'], index_col='Date')
    
    # Select the close column
    close_col = [col for col in df.columns if 'close' in col.lower()][0]
    series = df[close_col]
    
    # Create train-test split
    train, test = create_train_test_split(series)
    
    # Prepare data for LSTM
    X_train, y_train, scaler = prepare_data_for_lstm(train, window_size)
    X_test, y_test, _ = prepare_data_for_lstm(test, window_size)
    
    # Build LSTM model
    model = Sequential()
    model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], 1)))
    model.add(Dropout(0.2))
    model.add(LSTM(units=50, return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(units=25))
    model.add(Dense(units=1))
    
    model.compile(optimizer='adam', loss='mean_squared_error')
    
    # Train the model
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)
    
    # Generate predictions
    predictions = model.predict(X_test)
    predictions = scaler.inverse_transform(predictions)
    
    # Calculate error metrics
    mse = mean_squared_error(test[window_size:], predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(test[window_size:], predictions)
    r2 = r2_score(test[window_size:], predictions)
    
    # Plot actual vs predicted
    plot_actual_vs_predicted(test.index[window_size:], test[window_size:], predictions, ticker)
    
    # Print model performance
    print(f"\n{ticker} Model Performance:")
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"R-squared (R2) Score: {r2}")
    
    # Save model summary
    with open(os.path.join(output_folder, f'{ticker}_model_summary.txt'), 'w') as f:
        f.write(str(model.summary()))
    
    return {
        'model': model,
        'predictions': predictions,
        'metrics': {
            'mse': mse,
            'rmse': rmse,
            'mae': mae,
            'r2': r2
        }
    }

# Process each stock
lstm_results = {}
for ticker in tickers:
    lstm_results[ticker] = build_lstm_model(ticker)

print("\nLSTM model analysis complete for all stocks.")

2024-12-01 18:26:33.669263: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-01 18:26:33.678063: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733057793.686915    6079 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733057793.689724    6079 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-01 18:26:33.698726: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr


Dataset Split:
Total samples: 588
Training samples: 470
Testing samples: 118


I0000 00:00:1733057794.959357    6079 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9407 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4070, pci bus id: 0000:01:00.0, compute capability: 8.9
  super().__init__(**kwargs)


Epoch 1/50


I0000 00:00:1733057796.705963    7646 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - loss: 0.0956
Epoch 2/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0142
Epoch 3/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0093
Epoch 4/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0075
Epoch 5/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0070
Epoch 6/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0061
Epoch 7/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0055
Epoch 8/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0053
Epoch 9/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.0051
Epoch 10/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0048
Epoch 11/50
[1m13/1


Dataset Split:
Total samples: 588
Training samples: 470
Testing samples: 118
Epoch 1/50


  super().__init__(**kwargs)


[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.2118  
Epoch 2/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0230
Epoch 3/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0191
Epoch 4/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0159
Epoch 5/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.0130
Epoch 6/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0110
Epoch 7/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0119
Epoch 8/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0100
Epoch 9/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0110
Epoch 10/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0106
Epoch 11/50
[1m13


Dataset Split:
Total samples: 588
Training samples: 470
Testing samples: 118
Epoch 1/50


  super().__init__(**kwargs)


[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.1160  
Epoch 2/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0162
Epoch 3/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0111
Epoch 4/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0085
Epoch 5/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0083
Epoch 6/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0063
Epoch 7/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0085
Epoch 8/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0089
Epoch 9/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0079
Epoch 10/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0075
Epoch 11/50
[1m13


Dataset Split:
Total samples: 588
Training samples: 470
Testing samples: 118
Epoch 1/50


  super().__init__(**kwargs)


[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.1550
Epoch 2/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0230
Epoch 3/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0146
Epoch 4/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.0089
Epoch 5/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.0083
Epoch 6/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.0087
Epoch 7/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.0099
Epoch 8/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.0088
Epoch 9/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.0081
Epoch 10/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.0082
Epoch 11/50
[1m13/1


LSTM model analysis complete for all stocks.


# Combined Models

##  ARIMA Residuals Fed into LSTM 

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import os
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.initializers import GlorotUniform

# Define the tickers
tickers = ['IBM', 'AAPL', 'META', 'GOOGL']
input_folder = "datasets"
output_folder = "datasets"

# Function to create train-test split
def create_train_test_split(series, test_size=0.2):
    # Calculate split index
    split_index = int(len(series) * (1 - test_size))
    
    # Split the series
    train = series[:split_index]
    test = series[split_index:]
    
    print("\nDataset Split:")
    print(f"Total samples: {len(series)}")
    print(f"Training samples: {len(train)}")
    print(f"Testing samples: {len(test)}")
    
    return train, test

# Function to plot actual vs predicted
def plot_actual_vs_predicted(test_index, actual, predicted, ticker):
    plt.figure(figsize=(12,6))
    plt.plot(test_index, actual, label='Actual', color='blue')
    plt.plot(test_index, predicted, label='Predicted', color='red', linestyle='--')
    plt.title(f'{ticker} - Actual vs Predicted Close Prices')
    plt.xlabel('Date')
    plt.ylabel('Close Price')
    plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(output_folder, f'{ticker}_actual_vs_predicted.png'))
    plt.close()

# Function to prepare data for LSTM
def prepare_data_for_lstm(series, window_size=60):
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_data = scaler.fit_transform(series.reshape(-1, 1))
    
    X, y = [], []
    for i in range(window_size, len(scaled_data)):
        X.append(scaled_data[i-window_size:i, 0])
        y.append(scaled_data[i, 0])
    
    X, y = np.array(X), np.array(y)
    X = np.reshape(X, (X.shape[0], X.shape[1], 1))
    
    return X, y, scaler

# Function to build and evaluate hybrid ARIMA-LSTM model
def build_hybrid_model(ticker, window_size=60, epochs=50, batch_size=32):
    # Read the cleaned CSV file
    input_path = os.path.join(input_folder, f'{ticker}_stock_cleaned.csv')
    df = pd.read_csv(input_path, parse_dates=['Date'], index_col='Date')
    
    # Select the close column
    close_col = [col for col in df.columns if 'close' in col.lower()][0]
    series = df[close_col]
    
    # Create train-test split
    train, test = create_train_test_split(series)
    
    # Fit ARIMA model
    arima_model = ARIMA(train, order=(1, 1, 1))
    arima_fit = arima_model.fit()
    
    # Generate ARIMA predictions
    arima_predictions = arima_fit.forecast(steps=len(test))
    
    # Check for NaN values in ARIMA predictions and handle them
    if np.isnan(arima_predictions).any():
        arima_predictions = np.nan_to_num(arima_predictions, nan=0.0)
    
    # Calculate ARIMA residuals
    arima_residuals = test - arima_predictions
    
    # Check for NaN values in ARIMA residuals and handle them
    if np.isnan(arima_residuals).any():
        arima_residuals = np.nan_to_num(arima_residuals, nan=0.0)
    
    # Prepare data for LSTM using ARIMA residuals
    X_train, y_train, scaler = prepare_data_for_lstm(arima_residuals, window_size)
    X_test, y_test, _ = prepare_data_for_lstm(arima_residuals, window_size)
    
    # Check for NaN values in training data
    if np.isnan(X_train).any() or np.isnan(y_train).any():
        raise ValueError("Training data contains NaN values.")
    
    # Build LSTM model
    model = Sequential()
    model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], 1), kernel_initializer=GlorotUniform()))
    model.add(Dropout(0.2))
    model.add(LSTM(units=50, return_sequences=False, kernel_initializer=GlorotUniform()))
    model.add(Dropout(0.2))
    model.add(Dense(units=25, kernel_initializer=GlorotUniform()))
    model.add(Dense(units=1, kernel_initializer=GlorotUniform()))
    
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mean_squared_error')
    
    # Train the model
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1)
    
    # Generate predictions
    lstm_predictions = model.predict(X_test)
    lstm_predictions = scaler.inverse_transform(lstm_predictions)
    
    # Check for NaN values in LSTM predictions and handle them
    if np.isnan(lstm_predictions).any():
        lstm_predictions = np.nan_to_num(lstm_predictions, nan=0.0)
    
    # Ensure LSTM predictions match the length of ARIMA predictions
    if len(lstm_predictions) > len(arima_predictions):
        lstm_predictions = lstm_predictions[:len(arima_predictions)]
    elif len(lstm_predictions) < len(arima_predictions):
        lstm_predictions = np.pad(lstm_predictions, (0, len(arima_predictions) - len(lstm_predictions)), 'edge')
    
    # Combine ARIMA and LSTM predictions
    combined_predictions = arima_predictions + lstm_predictions.flatten()
    
    # Calculate error metrics
    mse = mean_squared_error(test, combined_predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(test, combined_predictions)
    r2 = r2_score(test, combined_predictions)
    
    # Plot actual vs predicted
    plot_actual_vs_predicted(test.index, test, combined_predictions, ticker)
    
    # Print model performance
    print(f"\n{ticker} Model Performance:")
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"R-squared (R2) Score: {r2}")
    
    # Save model summary
    with open(os.path.join(output_folder, f'{ticker}_model_summary.txt'), 'w') as f:
        f.write(str(model.summary()))
    
    return {
        'arima_model': arima_fit,
        'lstm_model': model,
        'predictions': combined_predictions,
        'metrics': {
            'mse': mse,
            'rmse': rmse,
            'mae': mae,
            'r2': r2
        }
    }

# Process each stock
hybrid_results = {}
for ticker in tickers:
    hybrid_results[ticker] = build_hybrid_model(ticker)

print("\nHybrid ARIMA-LSTM model analysis complete for all stocks.")


Dataset Split:
Total samples: 588
Training samples: 470
Testing samples: 118
Epoch 1/50


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  super().__init__(**kwargs)


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 0.0000e+00
Epoch 2/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0000e+00
Epoch 3/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0000e+00
Epoch 4/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0000e+00
Epoch 5/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0000e+00
Epoch 6/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0000e+00
Epoch 7/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0000e+00 
Epoch 8/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0000e+00
Epoch 9/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0000e+00
Epoch 10/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.


Dataset Split:
Total samples: 588
Training samples: 470
Testing samples: 118
Epoch 1/50


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  super().__init__(**kwargs)


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 0.0000e+00  
Epoch 2/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0000e+00 
Epoch 3/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0000e+00
Epoch 4/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0000e+00 
Epoch 5/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0000e+00
Epoch 6/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0000e+00 
Epoch 7/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0000e+00
Epoch 8/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.0000e+00 
Epoch 9/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0000e+00
Epoch 10/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 


Dataset Split:
Total samples: 588
Training samples: 470
Testing samples: 118
Epoch 1/50


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  super().__init__(**kwargs)


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 0.0000e+00  
Epoch 2/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0000e+00
Epoch 3/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0000e+00
Epoch 4/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0000e+00
Epoch 5/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0000e+00 
Epoch 6/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.0000e+00 
Epoch 7/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0000e+00 
Epoch 8/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0000e+00
Epoch 9/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.0000e+00 
Epoch 10/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0


Dataset Split:
Total samples: 588
Training samples: 470
Testing samples: 118
Epoch 1/50


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  super().__init__(**kwargs)


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 0.0000e+00  
Epoch 2/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0000e+00 
Epoch 3/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.0000e+00 
Epoch 4/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.0000e+00 
Epoch 5/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0000e+00 
Epoch 6/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0000e+00
Epoch 7/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.0000e+00 
Epoch 8/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0000e+00
Epoch 9/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0000e+00 
Epoch 10/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0


Hybrid ARIMA-LSTM model analysis complete for all stocks.
