In [None]:
# Evaluate closing-price predictions for each stock using the processed data files.
# For every ticker, plot the actual values against the Random Forest and XGBoost predictions.

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from datetime import datetime

# Tickers to model, in the order they are processed.
tickers = ['NVDA', 'MSFT', 'AMD', 'AAPL', 'TSLA']

# Processed price file for each symbol. All files sit in one folder and follow
# the 'processed_<symbol>.us.txt' naming pattern, so the mapping is generated.
# NOTE: 'AMZN' has a path entry but is not listed in `tickers`.
_PROCESSED_DIR = r'C:\USD\Machine Learning - Fundamentals AAI 510\Final Team Project\processed\processed'
file_paths = {
    symbol: rf'{_PROCESSED_DIR}\processed_{symbol.lower()}.us.txt'
    for symbol in ('NVDA', 'MSFT', 'AMD', 'AAPL', 'AMZN', 'TSLA')
}

# Date range as 'YYYY-MM-DD' strings: a fixed start through today's date.
# NOTE: neither value is referenced by the code visible in this cell.
start_date = '2014-01-01'
end_date = format(datetime.today(), '%Y-%m-%d')

# Function to create lag features
def create_lag_features(df, lags):
    """Build lagged, rolling and momentum features from the ``Close`` column.

    Every feature on a given row is computed only from closes on *earlier*
    rows, so the features can be used to predict that row's ``Close`` without
    leaking the target into the inputs. The caller's frame is not modified.

    Args:
        df: DataFrame with a numeric ``Close`` column. Rows are assumed to be
            in chronological order, since the features rely on row position.
        lags: Number of lag columns to create (``lag_1`` .. ``lag_<lags>``).

    Returns:
        A new DataFrame holding the original columns plus ``lag_*``,
        ``rolling_mean_{5,10,20}``, ``rolling_std_{5,10,20}`` and
        ``momentum``, with every row that contains a NaN removed (this drops
        the warm-up rows at the start of the series).

    Raises:
        KeyError: If ``df`` has no ``Close`` column.
    """
    # Work on a copy so the caller's DataFrame does not silently gain columns.
    out = df.copy()
    close = out['Close']

    # Rolling statistics and momentum are taken over the previous close so the
    # window for row t ends at t-1 and never contains the value being predicted.
    previous_close = close.shift(1)

    for lag in range(1, lags + 1):
        out[f'lag_{lag}'] = close.shift(lag)

    for window in (5, 10, 20):
        trailing = previous_close.rolling(window=window)
        out[f'rolling_mean_{window}'] = trailing.mean()
        out[f'rolling_std_{window}'] = trailing.std()

    # Four-step price change, measured up to (and including) the previous row.
    out['momentum'] = previous_close - previous_close.shift(4)
    return out.dropna()

# Function to normalize data
def normalize_data(df):
    """Min-max scale a Series or DataFrame into the range [0, 1].

    Args:
        df: Numeric pandas Series, or DataFrame (scaled column by column).

    Returns:
        An object of the same shape where the minimum maps to 0.0 and the
        maximum to 1.0. Constant input (max == min) maps to 0.0 rather than
        the NaN that a 0/0 division would produce.
    """
    lowest = df.min()
    span = df.max() - lowest

    # Replace a zero range with 1 so constant data yields 0.0 instead of NaN.
    if isinstance(span, pd.Series):
        # DataFrame input: one span per column.
        span = span.mask(span == 0, 1)
    elif span == 0:
        span = 1

    return (df - lowest) / span

# Helper: random-search one estimator and predict the test rows with the best model
def _tune_and_predict(estimator, param_grid, X_train, y_train, X_test, cv):
    """Tune ``estimator`` over ``param_grid`` and return its test predictions.

    Runs a 20-iteration ``RandomizedSearchCV`` scored by negative MSE on the
    training data, then predicts ``X_test`` with the refitted best estimator.
    """
    search = RandomizedSearchCV(
        estimator,
        param_grid,
        n_iter=20,
        cv=cv,
        n_jobs=-1,
        random_state=42,
        scoring='neg_mean_squared_error',
    )
    search.fit(X_train, y_train)
    return search.best_estimator_.predict(X_test)


# Function to fit and predict using Random Forest and XGBoost
def fit_and_predict(df, ticker):
    """Fit Random Forest and XGBoost on one stock, then plot and score them.

    The frame is expanded with the features from ``create_lag_features``, the
    last 20% of rows are held out as the test set, both models are tuned on
    the earlier rows, and their test predictions are plotted and scored
    against the actual close.

    Args:
        df: DataFrame with a ``Close`` column; its index is used as the plot's
            x-axis (the caller in this file passes a date index).
        ticker: Symbol used in the plot title and the printed metric labels.

    Returns:
        None. Shows a matplotlib figure and prints MSE, MAE and R² per model.
    """
    # Local import keeps this block self-contained; the file's top-level
    # sklearn import does not bring in TimeSeriesSplit.
    from sklearn.model_selection import TimeSeriesSplit

    # Create lag features
    df = create_lag_features(df, 7)

    # Define features and target
    features = [f'lag_{i}' for i in range(1, 8)] + [
        'rolling_mean_5', 'rolling_std_5',
        'rolling_mean_10', 'rolling_std_10',
        'rolling_mean_20', 'rolling_std_20',
        'momentum',
    ]
    X = df[features]
    y = df['Close']

    # Chronological split (no shuffling): the final 20% of rows are the test set.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    # Fit the scaler on the training rows only, so statistics of the held-out
    # period never influence the data the models are trained on.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Forward-chaining folds: each validation fold comes after its training
    # fold, so tuning never validates on rows earlier than the training rows.
    cv = TimeSeriesSplit(n_splits=3)

    # Hyperparameter tuning and prediction for Random Forest
    rf_params = {
        'n_estimators': [100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'bootstrap': [True, False]
    }
    y_pred_rf = _tune_and_predict(
        RandomForestRegressor(random_state=42), rf_params, X_train, y_train, X_test, cv
    )

    # Hyperparameter tuning and prediction for XGBoost
    xgb_params = {
        'n_estimators': [100, 200],
        'max_depth': [3, 6],
        'learning_rate': [0.01, 0.1],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
    y_pred_xgb = _tune_and_predict(
        XGBRegressor(objective='reg:squarederror', random_state=42),
        xgb_params, X_train, y_train, X_test, cv
    )

    # Normalize actual and predicted values with ONE shared scale (the actual
    # test-set min/max). Scaling each series by its own min/max would stretch
    # every curve to [0, 1] independently and hide offset or scale errors.
    low = y_test.min()
    span = y_test.max() - low
    if span == 0:
        span = 1  # constant test series: avoid dividing by zero

    y_test_norm = (y_test - low) / span
    y_pred_rf = (pd.Series(y_pred_rf, index=y_test.index) - low) / span
    y_pred_xgb = (pd.Series(y_pred_xgb, index=y_test.index) - low) / span

    # Plot actual vs predicted
    plt.figure(figsize=(12, 6))
    plt.plot(y_test_norm.index, y_test_norm, label='Actual', marker='o')
    plt.plot(y_test_norm.index, y_pred_rf, label='Random Forest Predicted', linestyle='--')
    plt.plot(y_test_norm.index, y_pred_xgb, label='XGBoost Predicted', linestyle='--')
    plt.title(f'{ticker} Stock Prediction')
    plt.xlabel('Date')
    plt.ylabel('Normalized Close Price')
    plt.legend()
    plt.grid(True)
    plt.show()

    # Print evaluation metrics (computed on the shared normalized scale)
    print(f'{ticker} Random Forest MSE: {mean_squared_error(y_test_norm, y_pred_rf):.4f}')
    print(f'{ticker} XGBoost MSE: {mean_squared_error(y_test_norm, y_pred_xgb):.4f}')
    print(f'{ticker} Random Forest MAE: {mean_absolute_error(y_test_norm, y_pred_rf):.4f}')
    print(f'{ticker} XGBoost MAE: {mean_absolute_error(y_test_norm, y_pred_xgb):.4f}')
    print(f'{ticker} Random Forest R²: {r2_score(y_test_norm, y_pred_rf):.4f}')
    print(f'{ticker} XGBoost R²: {r2_score(y_test_norm, y_pred_xgb):.4f}')

# Run the full pipeline once per configured ticker: read its processed file,
# index the rows by parsed date, then fit, plot and score the models.
for ticker in tickers:
    print(f'\nProcessing {ticker}...\n')
    file_path = file_paths[ticker]
    df_ticker = (
        pd.read_csv(file_path)
        # Convert the 'Date' column to datetimes, then make it the index.
        .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
        .set_index('Date')
    )
    fit_and_predict(df_ticker, ticker)