**S&P 500 comparison with other high-performing stocks**

In [None]:
import yfinance as yf
import pandas as pd

# Stocks to benchmark, and the index they are measured against
tickers = [
    'AAPL', 'GOOGL', 'GOOG', 'NVDA', 'AMZN',
    'DASH', 'MSFT', 'UBER', 'TSLA', 'AMD',
    'HPQ', 'DELL', 'META', 'SNOW', 'HOOD',
    'PTON', 'AFRM', 'ROKU', 'WISH', 'TOST',
]
index_ticker = '^GSPC'  # S&P 500

# Comparison window passed to the downloader as start/end
start_date, end_date = '2014-01-01', '2024-01-01'

# Function to calculate percentage change
def calculate_percentage_change(data):
    """Return the percentage change from the first to the last valid observation.

    Missing values are dropped before the endpoints are picked, so a series
    that starts (or ends) with NaN -- e.g. a stock that was not yet listed at
    the beginning of the download window -- is measured over the period for
    which it actually has data instead of yielding NaN.

    Parameters
    ----------
    data : pandas.Series
        Price series ordered from oldest to newest.

    Returns
    -------
    float
        ``(last - first) / first * 100``; NaN when there is no valid value.
    """
    valid = data.dropna()
    if len(valid) == 0:
        return float('nan')
    first, last = valid.iloc[0], valid.iloc[-1]
    return ((last - first) / first) * 100

# Download the historical data for the stocks.
# auto_adjust=False is passed explicitly so the 'Adj Close' column is returned
# regardless of the default used by the installed yfinance version.
data = yf.download(tickers, start=start_date, end=end_date, auto_adjust=False)['Adj Close']

# Download the historical data for the S&P 500 index.
# .squeeze() turns a one-column frame (returned by some yfinance versions for a
# single ticker) into a Series; it leaves an ordinary Series unchanged.
sp500_data = yf.download(index_ticker, start=start_date, end=end_date, auto_adjust=False)['Adj Close'].squeeze()

# Calculate the percentage change over the entire period for each stock
percentage_change = data.apply(calculate_percentage_change)

# Calculate the percentage change over the entire period for the S&P 500 index
sp500_change = float(calculate_percentage_change(sp500_data))

# A NaN change never satisfies `>` and would otherwise disappear silently,
# so report the affected tickers explicitly.
not_comparable = percentage_change[percentage_change.isna()].index.tolist()
if not_comparable:
    print(f"No percentage change could be computed for: {', '.join(not_comparable)}")

# Compare each stock's performance to the S&P 500
better_than_sp500 = percentage_change[percentage_change > sp500_change]

# Display the results
print("Stocks that performed better than the S&P 500 from 2014 to 2024:")
print(better_than_sp500)
print(f"\nS&P 500 Percentage Change: {sp500_change:.2f}%")

Stocks that performed better than the S&P 500 from 2014 to 2024:

Ticker

AAPL      1011.686759

AMD       3631.898782

AMZN       663.575139

GOOG       408.330600

GOOGL      401.474436

HPQ        222.939383

META       546.974967

MSFT      1101.757924

NVDA     13138.616066

TSLA      2383.144411

dtype: float64

S&P 500 Percentage Change: 160.36%

**Stock performance using processed dataset**

In [None]:
#  The stock's performance using processed data
#  Plot actual and predicted values for each stock

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from datetime import datetime

# Stocks to model in this cell
tickers = ['NVDA', 'MSFT', 'AMD', 'AAPL', 'TSLA']

# Every processed file sits in one folder and follows the same naming pattern,
# so the ticker -> path mapping is generated instead of spelled out.
processed_dir = r'C:\USD\Machine Learning - Fundamentals AAI 510\Final Team Project\processed\processed'
file_paths = {
    symbol: rf'{processed_dir}\processed_{symbol.lower()}.us.txt'
    for symbol in ('NVDA', 'MSFT', 'AMD', 'AAPL', 'AMZN', 'TSLA')
}

# Date range: fixed start, end is today's date formatted as YYYY-MM-DD
start_date = '2014-01-01'
end_date = f'{datetime.today():%Y-%m-%d}'

# Function to create lag features
def create_lag_features(df, lags):
    """Build lag, rolling and momentum features of ``Close`` from past values only.

    The rolling statistics and the momentum are computed on the previous
    row's ``Close`` (``shift(1)``). Computing them on the unshifted column
    would fold the current ``Close`` -- the value the models are asked to
    predict -- into the features, letting it be reconstructed from
    ``rolling_mean_5`` and the lag columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Close`` column, ordered oldest to newest. It is not
        modified; the features are added to a copy.
    lags : int
        Number of lag columns ``lag_1`` .. ``lag_<lags>`` to create.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the feature columns appended and every row that
        contains a NaN removed.
    """
    out = df.copy()
    close = out['Close']
    prev_close = close.shift(1)  # most recent value known before the current row

    for lag in range(1, lags + 1):
        out[f'lag_{lag}'] = close.shift(lag)

    for window in (5, 10, 20):
        rolling = prev_close.rolling(window=window)
        out[f'rolling_mean_{window}'] = rolling.mean()
        out[f'rolling_std_{window}'] = rolling.std()

    # Four-row price change ending at the previous row
    out['momentum'] = prev_close - close.shift(5)
    return out.dropna()

# Function to normalize data
def normalize_data(df):
    """Min-max scale ``df`` to the range [0, 1].

    A constant input (max equal to min) is mapped to zeros instead of
    dividing by zero and producing NaN.

    Parameters
    ----------
    df : pandas.Series or pandas.DataFrame
        Values to scale; a DataFrame is scaled column by column.

    Returns
    -------
    Same type as ``df``
        ``(df - min) / (max - min)``.
    """
    lowest = df.min()
    span = df.max() - lowest
    if isinstance(span, pd.Series):
        # DataFrame input: guard each constant column separately
        span = span.replace(0, 1)
    elif span == 0:
        span = 1
    return (df - lowest) / span

# Function to fit and predict using Random Forest and XGBoost
def fit_and_predict(df, ticker):
    """Tune, fit and evaluate Random Forest and XGBoost regressors on ``Close``.

    The frame is expanded with lag/rolling features and split chronologically
    (the last 20% of rows is held out). The feature scaler and both randomized
    hyperparameter searches see the training rows only. Actual and predicted
    test values are then min-max scaled with the *actual* test series' minimum
    and range, plotted, and scored with MSE, MAE and R².

    Parameters
    ----------
    df : pandas.DataFrame
        Price history with a ``Close`` column; its index is the plot's x-axis.
    ticker : str
        Label used in the plot title and in the printed metrics.

    Returns
    -------
    None
        Results are shown as a figure and printed.
    """
    # Create lag features
    df = create_lag_features(df, 7)

    # Define features and target
    features = [f'lag_{i}' for i in range(1, 8)] + ['rolling_mean_5', 'rolling_std_5', 'rolling_mean_10', 'rolling_std_10', 'rolling_mean_20', 'rolling_std_20', 'momentum']
    X = df[features]
    y = df['Close']

    # Chronological train/test split (no shuffling), done before scaling
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    # Fit the scaler on the training rows only so that test-period statistics
    # do not leak into the features the models are trained on.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Hyperparameter tuning for Random Forest
    rf_params = {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'bootstrap': [True, False]
    }
    rf_model = RandomForestRegressor(random_state=42)
    rf_random = RandomizedSearchCV(rf_model, rf_params, n_iter=20, cv=3, n_jobs=-1, random_state=42, scoring='neg_mean_squared_error')
    rf_random.fit(X_train, y_train)
    rf_best_model = rf_random.best_estimator_

    # Predict with Random Forest
    y_pred_rf = rf_best_model.predict(X_test)

    # Hyperparameter tuning for XGBoost
    xgb_params = {
        'n_estimators': [100, 200],
        'max_depth': [3, 6],
        'learning_rate': [0.01, 0.1],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
    xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)
    xgb_random = RandomizedSearchCV(xgb_model, xgb_params, n_iter=20, cv=3, n_jobs=-1, random_state=42, scoring='neg_mean_squared_error')
    xgb_random.fit(X_train, y_train)
    xgb_best_model = xgb_random.best_estimator_

    # Predict with XGBoost
    y_pred_xgb = xgb_best_model.predict(X_test)

    # Put actuals and predictions on ONE common scale (the actual test series'
    # min and range). Min-max scaling each series on its own would stretch every
    # curve to [0, 1] and hide any error in level or amplitude from the metrics.
    actual_min = y_test.min()
    actual_span = y_test.max() - actual_min
    if actual_span == 0:
        actual_span = 1.0
    test_index = y_test.index

    def to_actual_scale(values):
        return (pd.Series(values, index=test_index) - actual_min) / actual_span

    y_test = to_actual_scale(y_test)
    y_pred_rf = to_actual_scale(y_pred_rf)
    y_pred_xgb = to_actual_scale(y_pred_xgb)

    # Plot actual vs predicted
    plt.figure(figsize=(12, 6))
    plt.plot(test_index, y_test, label='Actual', marker='o')
    plt.plot(test_index, y_pred_rf, label='Random Forest Predicted', linestyle='--')
    plt.plot(test_index, y_pred_xgb, label='XGBoost Predicted', linestyle='--')
    plt.title(f'{ticker} Stock Prediction')
    plt.xlabel('Date')
    plt.ylabel('Normalized Close Price')
    plt.legend()
    plt.grid(True)
    plt.show()

    # Print evaluation metrics
    print(f'{ticker} Random Forest MSE: {mean_squared_error(y_test, y_pred_rf):.4f}')
    print(f'{ticker} XGBoost MSE: {mean_squared_error(y_test, y_pred_xgb):.4f}')
    print(f'{ticker} Random Forest MAE: {mean_absolute_error(y_test, y_pred_rf):.4f}')
    print(f'{ticker} XGBoost MAE: {mean_absolute_error(y_test, y_pred_xgb):.4f}')
    print(f'{ticker} Random Forest R²: {r2_score(y_test, y_pred_rf):.4f}')
    print(f'{ticker} XGBoost R²: {r2_score(y_test, y_pred_xgb):.4f}')

# Read each ticker's processed file, index it by date, and run the models on it
for ticker in tickers:
    print(f'\nProcessing {ticker}...\n')
    file_path = file_paths[ticker]
    raw_prices = pd.read_csv(file_path)
    # Parse 'Date' in place (same column position), then promote it to the index
    df_ticker = raw_prices.assign(Date=pd.to_datetime(raw_prices['Date'])).set_index('Date')
    fit_and_predict(df_ticker, ticker)

**Stock performance using historical data against the S&P 500**

In [None]:
#  Compare each stock's performance against the S&P 500 using historical data.
#  Plot actual and predicted values for each stock along with the S&P 500 index values

import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Stocks to model and the benchmark index
tickers = ['NVDA', 'MSFT', 'AMD', 'AAPL', 'AMZN']
index_ticker = '^GSPC'  # S&P 500

# Download window passed to the downloader as start/end
start_date, end_date = '2014-01-01', '2024-01-01'

# Function to create lag features
def create_lag_features(df, lags):
    """Build lag and 5-row rolling features of ``Close`` from past values only.

    The rolling mean/std are computed on the previous row's ``Close``
    (``shift(1)``). Computing them on the unshifted column would fold the
    current ``Close`` -- the value the models are asked to predict -- into the
    features, letting it be reconstructed from ``rolling_mean_5`` and the lags.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Close`` column, ordered oldest to newest. It is not
        modified; the features are added to a copy.
    lags : int
        Number of lag columns ``lag_1`` .. ``lag_<lags>`` to create.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the feature columns appended and every row that
        contains a NaN removed.
    """
    out = df.copy()
    close = out['Close']

    for lag in range(1, lags + 1):
        out[f'lag_{lag}'] = close.shift(lag)

    # Window ends at the previous row, so the target is never inside it
    past_window = close.shift(1).rolling(window=5)
    out['rolling_mean_5'] = past_window.mean()
    out['rolling_std_5'] = past_window.std()
    return out.dropna()

# Function to normalize data
def normalize_data(df):
    """Min-max scale ``df`` to the range [0, 1].

    A constant input (max equal to min) is mapped to zeros instead of
    dividing by zero and producing NaN.

    Parameters
    ----------
    df : pandas.Series or pandas.DataFrame
        Values to scale; a DataFrame is scaled column by column.

    Returns
    -------
    Same type as ``df``
        ``(df - min) / (max - min)``.
    """
    lowest = df.min()
    span = df.max() - lowest
    if isinstance(span, pd.Series):
        # DataFrame input: guard each constant column separately
        span = span.replace(0, 1)
    elif span == 0:
        span = 1
    return (df - lowest) / span

# Function to fit and predict using Random Forest and XGBoost
def fit_and_predict(df, sp500_df, ticker):
    """Tune, fit and evaluate Random Forest and XGBoost regressors, plotted against the S&P 500.

    The frame is expanded with lag/rolling features and split chronologically
    (the last 20% of rows is held out). The feature scaler and both grid
    searches see the training rows only. Actual and predicted test values are
    min-max scaled with the *actual* test series' minimum and range, plotted
    together with the index, and scored with MSE, MAE and R².

    Parameters
    ----------
    df : pandas.DataFrame
        Price history of the stock with a ``Close`` column; its index is the
        plot's x-axis.
    sp500_df : pandas.DataFrame
        Index history with a ``Close`` column. It is only read, never modified,
        so the same frame can be passed for every ticker.
    ticker : str
        Label used in the plot title and in the printed metrics.

    Returns
    -------
    None
        Results are shown as a figure and printed.
    """
    # Create lag features
    df = create_lag_features(df, 7)

    # Define features and target
    features = [f'lag_{i}' for i in range(1, 8)] + ['rolling_mean_5', 'rolling_std_5']
    X = df[features]
    y = df['Close']

    # Chronological train/test split (no shuffling), done before scaling
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    # Fit the scaler on the training rows only so that test-period statistics
    # do not leak into the features the models are trained on.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Hyperparameter tuning for Random Forest
    rf_params = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10]
    }
    rf_model = RandomForestRegressor(random_state=42)
    rf_grid = GridSearchCV(rf_model, rf_params, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')
    rf_grid.fit(X_train, y_train)
    rf_best_model = rf_grid.best_estimator_

    # Predict with Random Forest
    y_pred_rf = rf_best_model.predict(X_test)

    # Hyperparameter tuning for XGBoost
    xgb_params = {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 6, 9],
        'learning_rate': [0.01, 0.1, 0.2]
    }
    xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)
    xgb_grid = GridSearchCV(xgb_model, xgb_params, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')
    xgb_grid.fit(X_train, y_train)
    xgb_best_model = xgb_grid.best_estimator_

    # Predict with XGBoost
    y_pred_xgb = xgb_best_model.predict(X_test)

    # Put actuals and predictions on ONE common scale (the actual test series'
    # min and range). Min-max scaling each series on its own would stretch every
    # curve to [0, 1] and hide any error in level or amplitude from the metrics.
    actual_min = y_test.min()
    actual_span = y_test.max() - actual_min
    if actual_span == 0:
        actual_span = 1.0
    test_index = y_test.index

    def to_actual_scale(values):
        return (pd.Series(values, index=test_index) - actual_min) / actual_span

    y_test = to_actual_scale(y_test)
    y_pred_rf = to_actual_scale(y_pred_rf)
    y_pred_xgb = to_actual_scale(y_pred_xgb)

    # Scale the index over the same trailing window as the test set, on a local
    # copy: the caller's frame is shared between tickers and must stay as-is.
    sp500_window = normalize_data(sp500_df['Close'].iloc[-len(y_test):])

    # Plot actual vs predicted with S&P 500 index
    plt.figure(figsize=(12, 6))
    plt.plot(test_index, y_test, label='Actual', marker='o')
    plt.plot(test_index, y_pred_rf, label='Random Forest Predicted', linestyle='--')
    plt.plot(test_index, y_pred_xgb, label='XGBoost Predicted', linestyle='--')
    plt.plot(sp500_window.index, sp500_window, label='S&P 500 Index', linestyle=':')
    plt.title(f'{ticker} Stock Prediction vs S&P 500')
    plt.xlabel('Date')
    plt.ylabel('Normalized Close Price')
    plt.legend()
    plt.grid(True)
    plt.show()

    # Print evaluation metrics
    print(f'{ticker} Random Forest MSE: {mean_squared_error(y_test, y_pred_rf):.4f}')
    print(f'{ticker} XGBoost MSE: {mean_squared_error(y_test, y_pred_xgb):.4f}')
    print(f'{ticker} Random Forest MAE: {mean_absolute_error(y_test, y_pred_rf):.4f}')
    print(f'{ticker} XGBoost MAE: {mean_absolute_error(y_test, y_pred_xgb):.4f}')
    print(f'{ticker} Random Forest R²: {r2_score(y_test, y_pred_rf):.4f}')
    print(f'{ticker} XGBoost R²: {r2_score(y_test, y_pred_xgb):.4f}')

# Download historical data for the stocks and S&P 500.
# auto_adjust=False is passed explicitly so the 'Adj Close' column is returned
# regardless of the default used by the installed yfinance version.
data = yf.download(tickers, start=start_date, end=end_date, auto_adjust=False)

# .squeeze() turns a one-column frame (returned by some yfinance versions for a
# single ticker) into a Series; the result is a date-indexed frame with a single
# 'Close' column either way.
sp500_close = yf.download(index_ticker, start=start_date, end=end_date, auto_adjust=False)['Adj Close'].squeeze()
sp500_data = sp500_close.rename('Close').to_frame()

# Fit and predict for each stock
for ticker in tickers:
    print(f'\nProcessing {ticker}...\n')
    # Date-indexed frame holding the ticker's adjusted close as 'Close'
    df_ticker = data['Adj Close'][ticker].rename('Close').to_frame()
    fit_and_predict(df_ticker, sp500_data, ticker)

**LSTM Model**

**Target Stocks**

Apple - AAPL

Google (Alphabet Inc.) - GOOGL (for Class A shares)

Nvidia - NVDA

Amazon - AMZN

Microsoft - MSFT

AMD (Advanced Micro Devices) - AMD

HP (HP Inc., not to be confused with Hewlett Packard Enterprise) - HPQ

QUALCOMM Incorporated - QCOM

Salesforce, Inc. - CRM

Cisco Systems, Inc. - CSCO

Oracle Corporation - ORCL

In [None]:
import pandas as pd

# Read the combined stocks CSV into memory
df = pd.read_csv(filepath_or_buffer='/content/combined_stocks.csv')

# Show the first five rows as a quick sanity check
print(df.head(n=5))


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import yfinance as yf

# Load the CSV file into a DataFrame
df = pd.read_csv('/content/combined_stocks.csv')

# Convert the 'Date' column to datetime type
df['Date'] = pd.to_datetime(df['Date'])

# Filtering data for the relevant years
df = df[(df['Date'] < '2019-01-01')]

# Window sizes, in weeks
n_past = 4   # weeks of history fed to the LSTM for each prediction
n_test = 52  # most recent weeks held out for testing

# Download S&P 500 data for the period covered by the held-out weeks. The
# window is derived from the last date in the data so that the index curve
# lines up with the test period whatever the CSV contains.
last_date = df['Date'].max()
sp500_start = (last_date - pd.Timedelta(weeks=n_test + n_past)).strftime('%Y-%m-%d')
sp500_end = (last_date + pd.Timedelta(days=1)).strftime('%Y-%m-%d')  # 'end' is exclusive
# .squeeze() turns a one-column frame (returned by some yfinance versions for a
# single ticker) into a Series; it leaves an ordinary Series unchanged.
sp500_close = yf.download('^GSPC', start=sp500_start, end=sp500_end)['Close'].squeeze()
index_data = pd.DataFrame({'Close': sp500_close, 'Percent_Return': sp500_close.pct_change()})

# Resample to weekly data and sum percent returns
index_data = index_data.resample('W').sum()

# List of tickers to process
tickers = ['NVDA', 'MSFT', 'AMD', 'AAPL', 'AMZN']

results = {}

for ticker in tickers:
    print(f"Processing {ticker}")
    # Case-insensitive match, consistent with the ARIMA cell of this notebook
    ticker_rows = df[df['Ticker'].str.upper() == ticker.upper()]

    # Daily returns summed per week. Only the return column is kept, so the
    # weekly sum never touches prices or the non-numeric 'Ticker' column.
    daily_returns = ticker_rows.set_index('Date')['Close'].pct_change()
    df_ticker = daily_returns.resample('W').sum().to_frame('Percent_Return')

    returns = df_ticker['Percent_Return'].dropna().values.reshape(-1, 1)
    train_end = len(returns) - n_test
    if train_end <= n_past:
        print(f"Not enough weekly data for {ticker}; skipping")
        continue

    # Prepare data for LSTM: fit the scaler on the training weeks only so the
    # held-out year does not influence the scaling.
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(returns[:train_end])
    scaled_data = scaler.transform(returns)

    # Create the training data: each sample is n_past consecutive weeks, the
    # target is the following week; the last n_test weeks are reserved.
    X_train = np.array([scaled_data[i - n_past:i, 0] for i in range(n_past, train_end)])
    y_train = scaled_data[n_past:train_end, 0]
    X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))

    # Build the LSTM model
    model = Sequential()
    model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], 1)))
    model.add(Dropout(0.2))
    model.add(LSTM(units=50, return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(1))

    # Compile and fit the model
    model.compile(optimizer='adam', loss='mean_squared_error')
    model.fit(X_train, y_train, epochs=20, batch_size=32)

    # Predicting values: the held-out weeks plus the n_past weeks before them
    test_data = scaled_data[-(n_past + n_test):]
    X_test = np.array([test_data[i - n_past:i, 0] for i in range(n_past, len(test_data))])
    X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))
    predicted_returns = model.predict(X_test)
    predicted_returns = scaler.inverse_transform(predicted_returns)

    # Actual percent returns for the held-out weeks
    test_index = df_ticker.index[-n_test:]
    actual_returns = df_ticker['Percent_Return'].tail(n_test).values

    # Plotting the results
    plt.figure(figsize=(10, 6))
    plt.plot(test_index, actual_returns, color='blue', label='Actual Percent Returns')
    plt.plot(test_index, predicted_returns[:, 0], color='red', label='Predicted Percent Returns')

    # Include S&P 500 percent returns for the same weeks
    sp500_returns = index_data['Percent_Return'].reindex(test_index).dropna()
    plt.plot(sp500_returns.index, sp500_returns.values, color='green', label='S&P 500 Percent Returns')

    # The end date in the title is the label of the last weekly bin in the data
    plt.title(f'Weekly Percent Return Prediction for {ticker} vs S&P 500 (Ending {test_index[-1].date()})')
    plt.xlabel('Date')
    plt.ylabel('Percent Return')
    plt.legend()
    plt.show()

    # Store results
    results[ticker] = predicted_returns

    # Calculate statistics
    mse = mean_squared_error(actual_returns, predicted_returns[:, 0])
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(actual_returns, predicted_returns[:, 0])
    # MAPE is undefined where the actual return is 0, so those weeks are left out
    nonzero = actual_returns != 0
    if nonzero.any():
        mape = np.mean(np.abs((actual_returns[nonzero] - predicted_returns[nonzero, 0]) / actual_returns[nonzero])) * 100
    else:
        mape = float('nan')
    r2 = r2_score(actual_returns, predicted_returns[:, 0])

    # Print statistics
    print(f"Metrics for {ticker}: MSE: {mse}, RMSE: {rmse}, MAE: {mae}, MAPE: {mape}%, R^2: {r2}")

**ARIMA Model** (not used in presentation)

In [None]:
import pandas as pd

# Read the combined stocks CSV into memory
df = pd.read_csv(filepath_or_buffer='/content/combined_stocks.csv')

# Show the first five rows as a quick sanity check
print(df.head(n=5))

In [None]:
import matplotlib.pyplot as plt

# Make sure 'Date' holds datetimes before it is compared against the window bounds
df['Date'] = pd.to_datetime(df['Date'])

# Analysis window; both bounds are inclusive
start_date, end_date = '2008-01-01', '2018-01-01'

# Keep only the rows inside the window. The explicit copy means the column
# added below goes onto an independent frame rather than a view of df.
in_window = df['Date'].between(start_date, end_date)
selected_data = df.loc[in_window].copy()

# Day-over-day percentage change of Close, computed separately for each ticker
close_by_ticker = selected_data.groupby('Ticker')['Close']
selected_data['Daily_Return'] = close_by_ticker.pct_change()

# Wide layout: one row per date, one column of daily returns per ticker
pivot_df = selected_data.pivot(index='Date', columns='Ticker', values='Daily_Return')

# Preview the first five rows of the filtered data
print(selected_data.head())


In [None]:
# Draw every ticker's daily-return series on one shared set of axes
fig, ax = plt.subplots(figsize=(14, 7))
for column, returns in pivot_df.items():
    ax.plot(pivot_df.index, returns, label=column)

ax.set_title('Daily Returns of Different Stocks')
ax.set_xlabel('Date')
ax.set_ylabel('Daily Return')
ax.legend(title='Ticker')
ax.grid(True)
plt.show()

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf


# Make sure 'Date' is a datetime column.
# `df` is deliberately left untouched here: nothing below reads it, and moving
# its 'Date' column into the index in place would make the earlier cells that
# access df['Date'] fail when they are run again.
selected_data['Date'] = pd.to_datetime(selected_data['Date'])

# List of unique tickers
tickers = selected_data['Ticker'].unique()

# Loop through each ticker and plot ACF and PACF
for ticker in tickers:
    data = selected_data.loc[selected_data['Ticker'] == ticker, 'Close']

    # plot_acf / plot_pacf open a figure of their own unless they are handed an
    # Axes; passing `ax` applies the figsize and avoids a stray empty figure.
    fig, ax = plt.subplots(figsize=(12, 6))
    plot_acf(data, lags=50, title=f'Autocorrelation Function for {ticker}', ax=ax)
    plt.show()

    fig, ax = plt.subplots(figsize=(12, 6))
    plot_pacf(data, lags=50, title=f'Partial Autocorrelation Function for {ticker}', ax=ax)
    plt.show()


In [None]:
from statsmodels.tsa.arima.model import ARIMA


# Index selected_data by daily periods. Both steps are guarded so that running
# this cell a second time is a no-op instead of failing on the missing 'Date'
# column.
if 'Date' in selected_data.columns:
    selected_data['Date'] = pd.to_datetime(selected_data['Date'])
    selected_data.set_index('Date', inplace=True)
if not isinstance(selected_data.index, pd.PeriodIndex):
    selected_data.index = pd.DatetimeIndex(selected_data.index).to_period('D')  # Assuming daily data

# List of tickers
tickers = ['aapl', 'googl', 'nvda', 'amzn', 'msft', 'amd', 'hpq', 'qcom', 'crm', 'csco', 'orcl']

# Iterate through each ticker
for ticker in tickers:
    # Case-insensitive match; .copy() so that re-indexing the series for
    # plotting further down cannot touch selected_data.
    ticker_mask = selected_data['Ticker'].str.lower() == ticker.lower()
    closing_prices = selected_data.loc[ticker_mask, 'Close'].copy()
    if closing_prices.empty:
        print(f'No rows for {ticker.upper()} in the selected date range; skipping')
        continue

    # Fit an ARIMA model (potentially adjust p, d, q based on data characteristics)
    model = ARIMA(closing_prices, order=(1, 1, 1))
    model_fit = model.fit()

    # Forecasting 365*2 steps ahead
    forecast = model_fit.get_forecast(steps=365*2)
    mean_forecast = forecast.predicted_mean
    confidence_intervals = forecast.conf_int()

    # Convert PeriodIndex to DateTimeIndex for plotting
    closing_prices.index = closing_prices.index.to_timestamp()
    mean_forecast.index = mean_forecast.index.to_timestamp()

    # Plotting the results
    plt.figure(figsize=(10, 5))
    plt.plot(closing_prices.index, closing_prices, label='Actual')
    plt.plot(mean_forecast.index, mean_forecast, color='red', label='Forecast')
    plt.fill_between(mean_forecast.index,
                     confidence_intervals['lower Close'],
                     confidence_intervals['upper Close'], color='pink')
    plt.title(f'{ticker.upper()} Stock Price Forecast')
    plt.xlabel('Date')
    plt.ylabel('Price')
    plt.legend()
    plt.show()