# Data Collection and PreProcessing

## Data Source

In [5]:
import os
import yfinance as yf
import pandas as pd  # Ensure pandas is imported for MultiIndex handling

# Define stock tickers and date range
start_date = "2014-08-01"
end_date = "2016-11-30"
tickers = ['IBM', 'AAPL', 'META', 'GOOGL']

# Create a folder named 'datasets' if it doesn't exist
output_folder = "datasets"
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Download data for each stock and save with standardized column names
for ticker in tickers:
    # Download the stock data
    stock_data = yf.download(ticker, start=start_date, end=end_date)
    
    # Check if columns are a MultiIndex and flatten if necessary
    if isinstance(stock_data.columns, pd.MultiIndex):
        stock_data.columns = ['_'.join(col).strip().lower() for col in stock_data.columns]
    else:
        stock_data.columns = [col.replace(' ', '_').lower() for col in stock_data.columns]
    
    # Print the first few rows of the data
    print(f"\nData for {ticker}:")
    print(stock_data.head())
    
    # Save to CSV in the 'datasets' folder
    file_path = os.path.join(output_folder, f'{ticker}_stock.csv')
    stock_data.to_csv(file_path, index=True)

    print(f"Saved {ticker} data to {file_path}")


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Data for IBM:
            adj close_ibm   close_ibm    high_ibm     low_ibm    open_ibm  \
Date                                                                        
2014-08-01     115.999710  180.831741  183.078400  180.554489  182.122375   
2014-08-04     116.300247  181.300186  181.596558  180.305923  181.022949   
2014-08-05     114.742561  178.871887  180.879547  178.240921  180.449326   
2014-08-06     114.724030  177.791580  178.661575  176.328873  177.208420   
2014-08-07     113.693825  176.195023  178.470367  175.506699  178.432129   

            volume_ibm  
Date                    
2014-08-01     5419431  
2014-08-04     2223691  
2014-08-05     3460063  
2014-08-06     4023962  
2014-08-07     2833196  
Saved IBM data to datasets/IBM_stock.csv

Data for AAPL:
            adj close_aapl  close_aapl  high_aapl   low_aapl  open_aapl  \
Date                                                                      
2014-08-01       21.209684   24.032499  24.155001  23.702499  2




## Data Cleaning
+ Check for missing values
+ identification of outliers and replacement using IQR
+ Datatype conversion of all columns

In [6]:
import pandas as pd
import numpy as np
import os

# Define the tickers and input/output paths
tickers = ['IBM', 'AAPL', 'META', 'GOOGL']
input_folder = "datasets"
output_folder = "datasets"

# Function to clean and validate stock data
def clean_stock_data(ticker):
    # Read the input CSV file
    input_path = os.path.join(input_folder, f'{ticker}_stock.csv')
    df = pd.read_csv(input_path, parse_dates=['Date'], index_col='Date')
    
    print(f"\nCleaning data for {ticker}:")
    
    # 1. Check for missing values
    print("\nMissing Values:")
    print(df.isnull().sum())
    
    # 2. Identify and handle outliers using Interquartile Range (IQR) method
    def remove_outliers(column):
        Q1 = column.quantile(0.25)
        Q3 = column.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        return column[(column >= lower_bound) & (column <= upper_bound)]
    
    # Apply outlier removal to numeric columns
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    df_cleaned = df.copy()
    
    print("\nOutliers Detection:")
    for col in numeric_columns:
        original_count = len(df)
        cleaned_series = remove_outliers(df[col])
        removed_count = original_count - len(cleaned_series)
        print(f"{col}: {removed_count} outliers removed ({removed_count/original_count*100:.2f}%)")
        df_cleaned.loc[cleaned_series.index, col] = cleaned_series
    
    # 3. Convert columns to appropriate data types
    # Ensure all numeric columns are float
    for col in numeric_columns:
        df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce')
    
    # 4. Additional data validations
    print("\nData Type Validation:")
    print(df_cleaned.dtypes)
    
    # 5. Check for any remaining extreme values
    print("\nValue Ranges:")
    for col in numeric_columns:
        print(f"{col}: Min = {df_cleaned[col].min()}, Max = {df_cleaned[col].max()}")
    
    # 6. Save cleaned data
    output_path = os.path.join(output_folder, f'{ticker}_stock_cleaned.csv')
    df_cleaned.to_csv(output_path, index=True)
    print(f"\nCleaned data saved to {output_path}")
    
    return df_cleaned

# Process each stock
cleaned_datasets = {}
for ticker in tickers:
    cleaned_datasets[ticker] = clean_stock_data(ticker)


Cleaning data for IBM:

Missing Values:
adj close_ibm    0
close_ibm        0
high_ibm         0
low_ibm          0
open_ibm         0
volume_ibm       0
dtype: int64

Outliers Detection:
adj close_ibm: 12 outliers removed (2.04%)
close_ibm: 65 outliers removed (11.05%)
high_ibm: 62 outliers removed (10.54%)
low_ibm: 62 outliers removed (10.54%)
open_ibm: 64 outliers removed (10.88%)
volume_ibm: 42 outliers removed (7.14%)

Data Type Validation:
adj close_ibm    float64
close_ibm        float64
high_ibm         float64
low_ibm          float64
open_ibm         float64
volume_ibm         int64
dtype: object

Value Ranges:
adj close_ibm: Min = 76.3611068725586, Max = 119.67767333984376
close_ibm: Min = 112.66730499267578, Max = 185.46844482421875
high_ibm: Min = 114.397705078125, Max = 186.42446899414065
low_ibm: Min = 111.7590789794922, Max = 184.69407653808597
open_ibm: Min = 113.25048065185548, Max = 185.98471069335935
volume_ibm: Min = 1480927, Max = 24493659

Cleaned data saved to 

In [7]:
cleaned_datasets

{'IBM':             adj close_ibm   close_ibm    high_ibm     low_ibm    open_ibm  \
 Date                                                                        
 2014-08-01     115.999710  180.831741  183.078400  180.554489  182.122375   
 2014-08-04     116.300247  181.300186  181.596558  180.305923  181.022949   
 2014-08-05     114.742561  178.871887  180.879547  178.240921  180.449326   
 2014-08-06     114.724030  177.791580  178.661575  176.328873  177.208420   
 2014-08-07     113.693825  176.195023  178.470367  175.506699  178.432129   
 ...                   ...         ...         ...         ...         ...   
 2016-11-22     108.312378  155.516251  155.831741  154.827911  155.831741   
 2016-11-23     107.852951  154.856598  155.238998  154.263855  154.818359   
 2016-11-25     108.625320  155.965576  156.013382  154.713196  154.713196   
 2016-11-28     109.544197  157.284897  157.418732  155.544937  156.022949   
 2016-11-29     108.885033  156.338425  157.179733  155.8

## Stationarity Check - ADF

In [9]:
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller
import os
import matplotlib.pyplot as plt

# Define the tickers
tickers = ['IBM', 'AAPL', 'META', 'GOOGL']
input_folder = "datasets"
output_folder = "datasets"

# Function to perform stationarity test
def check_stationarity(series, ticker, column):
    # Perform Augmented Dickey-Fuller test
    result = adfuller(series.dropna())
    
    print(f'\nStationarity Test Results for {ticker} - {column}:')
    print(f'ADF Statistic: {result[0]}')
    print(f'p-value: {result[1]}')
    print('Critical Values:')
    for key, value in result[4].items():
        print(f'\t{key}: {value}')
    
    # Determine stationarity
    alpha = 0.05
    is_stationary = result[1] <= alpha
    print(f'\nIs the series stationary? {is_stationary}')
    
    return is_stationary

# Function to difference the series
def difference_series(series):
    # First-order differencing
    return series.diff().dropna()

# Function to process each stock
def process_stock_stationarity(ticker):
    # Read the cleaned CSV file
    input_path = os.path.join(input_folder, f'{ticker}_stock_cleaned.csv')
    df = pd.read_csv(input_path, parse_dates=['Date'], index_col='Date')
    
    # Print column names for debugging
    print(f"\nColumns for {ticker}:")
    print(df.columns.tolist())
    
    # Select the 'close' column (now with more robust selection)
    close_col = [col for col in df.columns if 'close' in col.lower()][0]
    print(f"\nSelected close column for {ticker}: {close_col}")
    
    series = df[close_col]
    
    # Check original series stationarity
    original_stationary = check_stationarity(series, ticker, 'Original Series')
    
    # If not stationary, apply differencing
    if not original_stationary:
        # Perform first-order differencing
        differenced_series = difference_series(series)
        
        # Check stationarity of differenced series
        differenced_stationary = check_stationarity(differenced_series, ticker, 'Differenced Series')
        
        # Plot original and differenced series
        plt.figure(figsize=(12,6))
        plt.subplot(2,1,1)
        series.plot(title=f'{ticker} - Original Close Price')
        plt.subplot(2,1,2)
        differenced_series.plot(title=f'{ticker} - Differenced Close Price')
        plt.tight_layout()
        plt.savefig(os.path.join(output_folder, f'{ticker}_stationarity_plot.png'))
        plt.close()
        
        # Create a new dataframe with differenced data
        df_differenced = df.copy()
        df_differenced[close_col] = np.nan
        df_differenced.loc[differenced_series.index, close_col] = differenced_series
        
        # Save differenced data
        output_path = os.path.join(output_folder, f'{ticker}_stock_differenced.csv')
        df_differenced.to_csv(output_path, index=True)
        print(f'\nDifferenced data saved to {output_path}')
    
    return df

# Process each stock
processed_datasets = {}
for ticker in tickers:
    processed_datasets[ticker] = process_stock_stationarity(ticker)

print("\nStationarity check and differencing complete.")


Columns for IBM:
['adj close_ibm', 'close_ibm', 'high_ibm', 'low_ibm', 'open_ibm', 'volume_ibm']

Selected close column for IBM: adj close_ibm

Stationarity Test Results for IBM - Original Series:
ADF Statistic: -2.0888544788509225
p-value: 0.2490198244177042
Critical Values:
	1%: -3.4415393130846725
	5%: -2.866476335860869
	10%: -2.5693989358590006

Is the series stationary? False

Stationarity Test Results for IBM - Differenced Series:
ADF Statistic: -18.298316729782872
p-value: 2.2894665756125447e-30
Critical Values:
	1%: -3.4415777369651717
	5%: -2.866493255736561
	10%: -2.569407951640003

Is the series stationary? True

Differenced data saved to datasets/IBM_stock_differenced.csv

Columns for AAPL:
['adj close_aapl', 'close_aapl', 'high_aapl', 'low_aapl', 'open_aapl', 'volume_aapl']

Selected close column for AAPL: adj close_aapl

Stationarity Test Results for AAPL - Original Series:
ADF Statistic: -2.3001851167711216
p-value: 0.1719082746237861
Critical Values:
	1%: -3.441539313

# ARIMA model for short term dependecies

## ARIMA Model setup
- ACF and PACF to identify values for the AR and MA
- Buildign and training the ARIMA model on the series

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import os

# Define the tickers
tickers = ['IBM', 'AAPL', 'META', 'GOOGL']
input_folder = "datasets"
output_folder = "datasets"

# Function to create train-test split
def create_train_test_split(series, test_size=0.2):
    # Calculate split index
    split_index = int(len(series) * (1 - test_size))
    
    # Split the series
    train = series[:split_index]
    test = series[split_index:]
    
    print("\nDataset Split:")
    print(f"Total samples: {len(series)}")
    print(f"Training samples: {len(train)}")
    print(f"Testing samples: {len(test)}")
    
    return train, test

# Function to plot actual vs predicted
def plot_actual_vs_predicted(test_index, actual, predicted, ticker):
    plt.figure(figsize=(12,6))
    plt.plot(test_index, actual, label='Actual', color='blue')
    plt.plot(test_index, predicted, label='Predicted', color='red', linestyle='--')
    plt.title(f'{ticker} - Actual vs Predicted Close Prices')
    plt.xlabel('Date')
    plt.ylabel('Close Price')
    plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(output_folder, f'{ticker}_actual_vs_predicted.png'))
    plt.close()

# Function to build and evaluate ARIMA model
def build_arima_model(ticker):
    # Read the cleaned CSV file
    input_path = os.path.join(input_folder, f'{ticker}_stock_cleaned.csv')
    df = pd.read_csv(input_path, parse_dates=['Date'], index_col='Date')
    
    # Select the close column
    close_col = [col for col in df.columns if 'close' in col.lower()][0]
    series = df[close_col]
    
    # Create train-test split
    train, test = create_train_test_split(series)
    
    # Determine best ARIMA parameters using grid search
    best_params = None
    best_aic = float('inf')
    
    # Try different ARIMA parameter combinations
    p_range = range(0, 3)
    d_range = range(0, 2)
    q_range = range(0, 3)
    
    for p in p_range:
        for d in d_range:
            for q in q_range:
                try:
                    # Fit ARIMA model
                    model = ARIMA(train, order=(p,d,q))
                    model_fit = model.fit()
                    
                    # Compare AIC
                    if model_fit.aic < best_aic:
                        best_aic = model_fit.aic
                        best_params = (p,d,q)
                except Exception as e:
                    continue
    
    print(f"\n{ticker} - Best ARIMA Parameters: {best_params}")
    print(f"Best AIC Score: {best_aic}")
    
    # Fit the best ARIMA model
    final_model = ARIMA(train, order=best_params)
    final_model_fit = final_model.fit()
    
    # Generate forecast for test set
    forecast = final_model_fit.forecast(steps=len(test))
    
    # Calculate error metrics
    mse = mean_squared_error(test, forecast)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(test, forecast)
    r2 = r2_score(test, forecast)
    
    # Plot actual vs predicted
    plot_actual_vs_predicted(test.index, test, forecast, ticker)
    
    # Print model performance
    print(f"\n{ticker} Model Performance:")
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"R-squared (R2) Score: {r2}")
    
    # Save model summary
    with open(os.path.join(output_folder, f'{ticker}_model_summary.txt'), 'w') as f:
        f.write(str(final_model_fit.summary()))
    
    return {
        'model': final_model_fit,
        'params': best_params,
        'forecast': forecast,
        'metrics': {
            'mse': mse,
            'rmse': rmse,
            'mae': mae,
            'r2': r2
        }
    }

# Process each stock
arima_results = {}
for ticker in tickers:
    arima_results[ticker] = build_arima_model(ticker)

print("\nARIMA model analysis complete for all stocks.")


Dataset Split:
Total samples: 588
Training samples: 470
Testing samples: 118


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-invertible starting MA parameters found.'
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates


IBM - Best ARIMA Parameters: (1, 1, 1)
Best AIC Score: 1571.9453289273692

IBM Model Performance:
Mean Squared Error (MSE): 29.69601463220938
Root Mean Squared Error (RMSE): 5.4494049796477215
Mean Absolute Error (MAE): 4.750473696512522
R-squared (R2) Score: -2.1959948071494932

Dataset Split:
Total samples: 588
Training samples: 470
Testing samples: 118


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._


AAPL - Best ARIMA Parameters: (0, 1, 0)
Best AIC Score: 479.9034903204182

AAPL Model Performance:
Mean Squared Error (MSE): 8.46981421518479
Root Mean Squared Error (RMSE): 2.9102945237870324
Mean Absolute Error (MAE): 2.5088415145874023
R-squared (R2) Score: -1.968931685259522

Dataset Split:
Total samples: 588
Training samples: 470
Testing samples: 118


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-invertible starting MA parameters found.'
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates


META - Best ARIMA Parameters: (2, 1, 1)
Best AIC Score: 1806.8540852314432

META Model Performance:
Mean Squared Error (MSE): 89.96670843081557
Root Mean Squared Error (RMSE): 9.485078198455486
Mean Absolute Error (MAE): 8.160383350407557
R-squared (R2) Score: -1.867547431053266

Dataset Split:
Total samples: 588
Training samples: 470
Testing samples: 118


  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-invertible starting MA parameters found.'
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-invertible starting MA parameters found.'
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq


GOOGL - Best ARIMA Parameters: (2, 1, 2)
Best AIC Score: 704.5948980019842


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(



GOOGL Model Performance:
Mean Squared Error (MSE): 9.445814092453746
Root Mean Squared Error (RMSE): 3.0734043164630562
Mean Absolute Error (MAE): 2.805570839042206
R-squared (R2) Score: -1.7204893942575952

ARIMA model analysis complete for all stocks.


# LSTM

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import os
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Define the tickers
tickers = ['IBM', 'AAPL', 'META', 'GOOGL']
input_folder = "datasets"
output_folder = "datasets"

# Function to create train-test split
def create_train_test_split(series, test_size=0.2):
    # Calculate split index
    split_index = int(len(series) * (1 - test_size))
    
    # Split the series
    train = series[:split_index]
    test = series[split_index:]
    
    print("\nDataset Split:")
    print(f"Total samples: {len(series)}")
    print(f"Training samples: {len(train)}")
    print(f"Testing samples: {len(test)}")
    
    return train, test

# Function to plot actual vs predicted
def plot_actual_vs_predicted(test_index, actual, predicted, ticker):
    plt.figure(figsize=(12,6))
    plt.plot(test_index, actual, label='Actual', color='blue')
    plt.plot(test_index, predicted, label='Predicted', color='red', linestyle='--')
    plt.title(f'{ticker} - Actual vs Predicted Close Prices')
    plt.xlabel('Date')
    plt.ylabel('Close Price')
    plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(output_folder, f'{ticker}_actual_vs_predicted.png'))
    plt.close()

# Function to prepare data for LSTM
def prepare_data_for_lstm(series, window_size=60):
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_data = scaler.fit_transform(series.values.reshape(-1, 1))
    
    X, y = [], []
    for i in range(window_size, len(scaled_data)):
        X.append(scaled_data[i-window_size:i, 0])
        y.append(scaled_data[i, 0])
    
    X, y = np.array(X), np.array(y)
    X = np.reshape(X, (X.shape[0], X.shape[1], 1))
    
    return X, y, scaler

# Function to build and evaluate LSTM model
def build_lstm_model(ticker, window_size=60, epochs=50, batch_size=32):
    # Read the cleaned CSV file
    input_path = os.path.join(input_folder, f'{ticker}_stock_cleaned.csv')
    df = pd.read_csv(input_path, parse_dates=['Date'], index_col='Date')
    
    # Select the close column
    close_col = [col for col in df.columns if 'close' in col.lower()][0]
    series = df[close_col]
    
    # Create train-test split
    train, test = create_train_test_split(series)
    
    # Prepare data for LSTM
    X_train, y_train, scaler = prepare_data_for_lstm(train, window_size)
    X_test, y_test, _ = prepare_data_for_lstm(test, window_size)
    
    # Build LSTM model
    model = Sequential()
    model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], 1)))
    model.add(Dropout(0.2))
    model.add(LSTM(units=50, return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(units=25))
    model.add(Dense(units=1))
    
    model.compile(optimizer='adam', loss='mean_squared_error')
    
    # Train the model
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)
    
    # Generate predictions
    predictions = model.predict(X_test)
    predictions = scaler.inverse_transform(predictions)
    
    # Calculate error metrics
    mse = mean_squared_error(test[window_size:], predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(test[window_size:], predictions)
    r2 = r2_score(test[window_size:], predictions)
    
    # Plot actual vs predicted
    plot_actual_vs_predicted(test.index[window_size:], test[window_size:], predictions, ticker)
    
    # Print model performance
    print(f"\n{ticker} Model Performance:")
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"R-squared (R2) Score: {r2}")
    
    # Save model summary
    with open(os.path.join(output_folder, f'{ticker}_model_summary.txt'), 'w') as f:
        f.write(str(model.summary()))
    
    return {
        'model': model,
        'predictions': predictions,
        'metrics': {
            'mse': mse,
            'rmse': rmse,
            'mae': mae,
            'r2': r2
        }
    }

# Process each stock
lstm_results = {}
for ticker in tickers:
    lstm_results[ticker] = build_lstm_model(ticker)

print("\nLSTM model analysis complete for all stocks.")

ModuleNotFoundError: No module named 'tensorflow'