# In [None]:
#  Compare each stock's performance against the S&P 500 using historical data.
#  Plot actual and predicted values for each stock along with the S&P 500 index values.

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yfinance as yf
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

# Stocks to model, and the benchmark index they are plotted against
tickers = [
    'NVDA',
    'MSFT',
    'AMD',
    'AAPL',
    'AMZN',
]
index_ticker = '^GSPC'  # S&P 500

# Start and end of the history window handed to yf.download
start_date, end_date = '2014-01-01', '2024-01-01'

# Function to create lag features
def create_lag_features(df, lags):
    """Build lagged-close and trailing rolling-window features.

    The work is done on a copy, so the frame passed in by the caller is left
    untouched.

    Args:
        df: DataFrame with a numeric 'Close' column. Rows are assumed to be in
            chronological order, since the features are built with ``shift``.
        lags: Number of lag columns to create ('lag_1' ... 'lag_<lags>').

    Returns:
        A new DataFrame holding the original columns plus the lag columns,
        'rolling_mean_5' and 'rolling_std_5', with every row that contains a
        NaN dropped.
    """
    out = df.copy()
    close = out['Close']
    for lag in range(1, lags + 1):
        out[f'lag_{lag}'] = close.shift(lag)

    # The rolling statistics are computed on the close shifted by one row, so
    # they summarise the five closes *before* the current row. An unshifted
    # window would contain the current close, i.e. the very value a model
    # trained on these features is asked to predict.
    prior_close = close.shift(1)
    out['rolling_mean_5'] = prior_close.rolling(window=5).mean()
    out['rolling_std_5'] = prior_close.rolling(window=5).std()
    return out.dropna()

# Function to normalize data
def normalize_data(df):
    """Min-max scale ``df`` into the range [0, 1].

    Args:
        df: A pandas Series or DataFrame of numeric values. A DataFrame is
            scaled column by column.

    Returns:
        An object of the same shape as ``df`` where the minimum maps to 0 and
        the maximum to 1. A constant Series (or constant DataFrame column) has
        no range to scale by and maps to 0 instead of NaN.
    """
    lowest = df.min()
    span = df.max() - lowest
    # A zero range would turn the division into 0/0 (NaN). Dividing by 1
    # instead leaves the already-zero numerator as 0.
    if isinstance(span, pd.Series):
        span = span.mask(span == 0, 1)
    elif span == 0:
        span = 1
    return (df - lowest) / span

# Function to fit and predict using Random Forest and XGBoost
def fit_and_predict(df, sp500_df, ticker):
    """Tune, evaluate and plot Random Forest and XGBoost forecasts for one stock.

    Lag features are built from ``df``, the most recent 20% of rows are held
    out as the test set, both models are grid-searched on the older rows, and
    their test-set predictions are plotted next to the actual closes and the
    S&P 500 series. MSE, MAE and R² are then printed for each model.

    Args:
        df: DataFrame with a 'Close' column for the stock. Rows are assumed to
            be in chronological order with dates as the index.
        sp500_df: DataFrame with a 'Close' column for the S&P 500. It is only
            read; the caller's frame is not modified.
        ticker: Symbol used in the plot title and in the metric labels.

    Returns:
        None. The function shows a figure and prints six metric lines.
    """
    # Create lag features
    df = create_lag_features(df, 7)

    # Define features and target
    features = [f'lag_{i}' for i in range(1, 8)] + ['rolling_mean_5', 'rolling_std_5']
    X = df[features]
    y = df['Close']

    # Split chronologically first, then fit the scaler on the training rows
    # only, so no statistic of the held-out period reaches the models.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Hyperparameter tuning and prediction for Random Forest
    rf_params = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10]
    }
    y_pred_rf = _tune_and_predict(
        RandomForestRegressor(random_state=42), rf_params, X_train, y_train, X_test
    )

    # Hyperparameter tuning and prediction for XGBoost
    xgb_params = {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 6, 9],
        'learning_rate': [0.01, 0.1, 0.2]
    }
    y_pred_xgb = _tune_and_predict(
        XGBRegressor(objective='reg:squarederror', random_state=42),
        xgb_params, X_train, y_train, X_test
    )

    # Put the actual and predicted prices on ONE common scale: the min/max of
    # the actual test closes. Scaling each series by its own min/max would
    # stretch every curve to [0, 1] independently and make the error metrics
    # compare values that are no longer in the same units.
    actual_min = y_test.min()
    actual_span = y_test.max() - actual_min
    if actual_span == 0:
        actual_span = 1.0  # flat test window: avoid 0/0
    actual_scaled = (y_test - actual_min) / actual_span
    rf_scaled = (pd.Series(y_pred_rf, index=y_test.index) - actual_min) / actual_span
    xgb_scaled = (pd.Series(y_pred_xgb, index=y_test.index) - actual_min) / actual_span

    # Normalise the index into a local Series; the caller's frame stays as is.
    n_test = len(y_test)
    sp500_tail = normalize_data(sp500_df['Close']).iloc[-n_test:]

    # Plot actual vs predicted with S&P 500 index
    plt.figure(figsize=(12, 6))
    plt.plot(y_test.index, actual_scaled, label='Actual', marker='o')
    plt.plot(y_test.index, rf_scaled, label='Random Forest Predicted', linestyle='--')
    plt.plot(y_test.index, xgb_scaled, label='XGBoost Predicted', linestyle='--')
    plt.plot(sp500_tail.index, sp500_tail, label='S&P 500 Index', linestyle=':')
    plt.title(f'{ticker} Stock Prediction vs S&P 500')
    plt.xlabel('Date')
    plt.ylabel('Normalized Close Price')
    plt.legend()
    plt.grid(True)
    plt.show()

    # Print evaluation metrics (in units of the actual test-window range)
    metrics = (
        ('MSE', mean_squared_error),
        ('MAE', mean_absolute_error),
        ('R²', r2_score),
    )
    predictions = (('Random Forest', rf_scaled), ('XGBoost', xgb_scaled))
    for metric_name, metric in metrics:
        for model_name, predicted in predictions:
            print(f'{ticker} {model_name} {metric_name}: {metric(actual_scaled, predicted):.4f}')


def _tune_and_predict(estimator, param_grid, X_train, y_train, X_test):
    """Grid-search ``estimator`` on the training rows and predict ``X_test``.

    Folds come from TimeSeriesSplit, so each validation fold lies after the
    rows it was trained on; a plain K-fold on ordered rows would validate some
    candidates on data older than their training data.
    """
    search = GridSearchCV(
        estimator,
        param_grid,
        cv=TimeSeriesSplit(n_splits=3),
        n_jobs=-1,
        scoring='neg_mean_squared_error',
    )
    search.fit(X_train, y_train)
    return search.best_estimator_.predict(X_test)

# Download historical data for the stocks and S&P 500.
# auto_adjust=False is passed explicitly because the code below reads the
# 'Adj Close' column, which is only present when yfinance is not asked to
# fold the adjustment into 'Close'.
data = yf.download(tickers, start=start_date, end=end_date, auto_adjust=False)
sp500_close = yf.download(
    index_ticker, start=start_date, end=end_date, auto_adjust=False
)['Adj Close']
# A single-ticker download can come back as a Series or as a one-column
# DataFrame (depends on the yfinance version); reduce both to a Series so the
# result always ends up in a column literally named 'Close'.
if isinstance(sp500_close, pd.DataFrame):
    sp500_close = sp500_close.iloc[:, 0]
sp500_data = sp500_close.rename('Close').to_frame()

# Fit and predict for each stock
for ticker in tickers:
    print(f'\nProcessing {ticker}...\n')
    # One-column frame ('Close') that keeps the date index of the download
    df_ticker = data['Adj Close'][ticker].rename('Close').to_frame()
    fit_and_predict(df_ticker, sp500_data, ticker)