# Imports

In [1]:
import pandas as pd
import os
import numpy as np
import re
import plotly.express as px
import plotly.graph_objects as go
from dash import Dash, html, dcc, Input, Output
import plotly.io as pio
import dash_bootstrap_components as dbc
from plotly.subplots import make_subplots
import random
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
# from sklearn.metrics import mean_squared_error# deprecated
from sklearn.metrics import root_mean_squared_error,make_scorer# alternative
# also import MAPE and relative_error
from sklearn.metrics import mean_absolute_percentage_error


from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
import xgboost as xgb
from datetime import timedelta

def extract_date_features(df):
    """Return a date-sorted copy of ``df`` with calendar feature columns added.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``'Date'`` column that ``pd.to_datetime`` can
        parse. The input frame itself is left untouched.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` ordered by ascending ``'Date'`` (original index
        labels are preserved), with ``'Date'`` converted to datetime and the
        integer columns ``year``, ``month``, ``day``, ``day_of_week``
        (pandas convention: Monday=0 ... Sunday=6) and ``is_weekend``
        (1 when ``day_of_week`` is 5 or 6, else 0) added.
    """
    df = df.copy()
    df['Date'] = pd.to_datetime(df['Date'])
    # sort_values returns a new frame instead of sorting in place, so the
    # result has to be re-bound; otherwise the rows keep their input order.
    # A stable sort keeps rows sharing the same date in their original order.
    df = df.sort_values(by='Date', kind='stable')
    df['year'] = df['Date'].dt.year
    df['month'] = df['Date'].dt.month
    df['day'] = df['Date'].dt.day
    df['day_of_week'] = df['Date'].dt.dayofweek
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
    # 'Date' is deliberately kept in the returned frame.
    return df

# Reading CSVs

In [None]:
# Location of the processed stock-price CSV, relative to the working directory.
stock_prices_path = os.path.join(
    '..', 'data', 'processed', 'stock_prices', 'processed_stock_prices.csv'
)
# Load the prices and convert the 'Date' column to datetime in one expression.
df_stock_prices = pd.read_csv(stock_prices_path).assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)


# Stocks Predictions using XGBoost

In [None]:
# Numerical predictor columns, grouped by kind so a whole group can be
# switched off by removing it from the concatenation below.
_same_day_price_features = ['Open', 'High', 'Low', 'Close']
_calendar_features = ['year', 'month', 'day', 'day_of_week', 'is_weekend']
_lag_features = [
    'Open_Lag1', 'Open_Lag3', 'Open_Lag7',
    'Close_Lag1', 'Close_Lag3', 'Close_Lag7',
    'High_Lag1', 'High_Lag3', 'High_Lag7',
    'Low_Lag1', 'Low_Lag3', 'Low_Lag7',
    'Volume_Lag1', 'Volume_Lag3', 'Volume_Lag7',
]
_moving_average_features = [
    'Open_MA3', 'Open_MA7',
    'Close_MA3', 'Close_MA7',
    'High_MA3', 'High_MA7',
    'Low_MA3', 'Low_MA7',
    'Volume_MA3', 'Volume_MA7',
]
_insider_features = [
    'insider_TransactionValue_MA7',
    'insider_TRANS_PRICEPERSHARE_Lag7',
    'insider_TRANS_SHARES_Lag7',
    'insider_TransactionValue_MA21',
    'insider_TRANS_PRICEPERSHARE_Lag21',
    'insider_TRANS_SHARES_Lag21',
]

# Maps each supported target column to the numerical features used to predict it.
valid_targets = {
    'Volume': (
        _same_day_price_features
        + _calendar_features
        + _lag_features
        + _moving_average_features
        + _insider_features
    ),
}

# Hyperparameter search space for XGBoost. Every key carries the
# ``regressor__`` prefix so it addresses the pipeline step named 'regressor'.
param_grid = {
    # Number of boosting rounds, i.e. trees in the ensemble. Large values
    # cost more compute and increase the risk of overfitting.
    'regressor__n_estimators': [10, 50, 100, 300],
    # Learning rate ("eta"): how strongly each tree moves the prediction.
    # Small values train more conservatively and usually need more trees,
    # but help against overfitting.
    'regressor__learning_rate': [0.01, 0.05],
    # Upper bound on the depth of each individual tree.
    'regressor__max_depth': [2, 4, 8, 16],
    # Fraction of the training rows sampled to fit each tree.
    'regressor__subsample': [0.5, 0.7, 0.9],
    # Fraction of the feature columns sampled for each tree.
    'regressor__colsample_bytree': [0.6, 0.8],
}




# --- Hyperparameter search for one symbol over a fixed date window ---
symbol = 'AAPL'
date_start = '2014-01-01'
date_end = '2017-12-31'

# Restrict the price table to the chosen symbol and (inclusive) date range.
data = df_stock_prices[
    (df_stock_prices['SYMBOL'] == symbol) &
    (df_stock_prices['Date'] >= date_start) &
    (df_stock_prices['Date'] <= date_end)
].copy()
# TimeSeriesSplit slices rows by position, so the rows must be in
# chronological order. Sort explicitly here instead of relying on the CSV
# order; the stable sort keeps same-date rows in their original order.
data = extract_date_features(data).sort_values(by='Date', kind='stable')

target = 'Volume'
numerical_features = valid_targets[target]
categorical_features = ['SYMBOL', 'Exists in Insiders', 'InsiderTransactionInLast7Days', 'InsiderTransactionInLast21Days']

X = data.drop(target, axis=1)
y = data[target]

# Only the columns listed in the two feature groups reach the model; the
# ColumnTransformer's default remainder behaviour drops every other column
# of X (including 'Date').
preprocessor = ColumnTransformer(
    transformers=[
        # Numerical columns: mean-impute missing values, then standardise.
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numerical_features),
        # Categorical columns: fill gaps with the literal 'missing', then
        # one-hot encode; categories unseen during fit are ignored.
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ]
)

# Define the model with the XGBoost regressor
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=0)

# Use a pipeline so preprocessing is re-fitted inside every CV fold
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', model)])

# Walk-forward validation: each fold trains on earlier rows and validates on
# the rows that follow them.
tscv = TimeSeriesSplit(n_splits=5)

# Both metrics are errors (lower is better); greater_is_better=False makes the
# scorer return the negated value so GridSearchCV can still maximise it.
# The metric functions are passed directly - no wrapper lambda is needed.
rmse_scorer = make_scorer(root_mean_squared_error, greater_is_better=False)
mean_absolute_percentage_error_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

# The search is scored with MAPE; pass rmse_scorer instead to rank by RMSE.
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=tscv, n_jobs=-1, verbose=1, scoring=mean_absolute_percentage_error_scorer)

# Fit the grid search with walk-forward validation
grid_search.fit(X, y)

# Display best hyperparameters and score. Scores are negated errors, so the
# sign is flipped back before printing.
print("Best hyperparameters found:", grid_search.best_params_)
print("Best CV score:", -grid_search.best_score_)  # CV stands for cross validation

# NOTE: despite their names, these two hold EVERY candidate configuration and
# its mean CV score; only `top_3_configs_sorted` below is cut down to three.
top_3_configs = grid_search.cv_results_['params']
top_3_scores = grid_search.cv_results_['mean_test_score']

# Pair each score with its configuration and keep the three best. Scores are
# negated errors, so the largest value is the best. Sorting on the score alone
# avoids comparing the parameter dicts when two scores tie.
top_3_configs_sorted = sorted(zip(top_3_scores, top_3_configs), key=lambda pair: pair[0], reverse=True)[:3]

# Display top 3 configurations
for idx, (score, config) in enumerate(top_3_configs_sorted, 1):
    print(f"Rank {idx}: {config} with score: {-score:.2e}")










In [None]:
# Apple stock analysis for the entire year of 2014
    # Improved volume RMSE from 2.77e7 (linear regression) to 2.70e7 using XGBoost without lag features and moving averages
    # Further improved volume RMSE from 2.70e7 to 2.35e7 using XGBoost with lag features, moving averages and insider-transaction features

# Apple stock analysis for the period from 2014 to 2017
    # Improved volume RMSE from 2.14e7 (linear regression) to 2.13e7 using XGBoost without lag features and moving averages
    # Further improved volume RMSE from 2.13e7 to 1.18e7 using XGBoost with lag features, moving averages and insider-transaction features

# Fitting 5 folds for each of 192 candidates, totalling 960 fits
# Best hyperparameters found: {'regressor__colsample_bytree': 0.8, 'regressor__learning_rate': 0.05, 'regressor__max_depth': 4, 'regressor__n_estimators': 300, 'regressor__subsample': 0.9}
# Best CV score: 11835991.344843375 (RMSE)
# Rank 1: {'regressor__colsample_bytree': 0.8, 'regressor__learning_rate': 0.05, 'regressor__max_depth': 4, 'regressor__n_estimators': 300, 'regressor__subsample': 0.9} with score: 1.18e+07
# Rank 2: {'regressor__colsample_bytree': 0.8, 'regressor__learning_rate': 0.05, 'regressor__max_depth': 4, 'regressor__n_estimators': 300, 'regressor__subsample': 0.7} with score: 1.21e+07
# Rank 3: {'regressor__colsample_bytree': 0.8, 'regressor__learning_rate': 0.05, 'regressor__max_depth': 4, 'regressor__n_estimators': 100, 'regressor__subsample': 0.9} with score: 1.21e+07

# Best hyperparameters found: {'regressor__colsample_bytree': 0.8, 'regressor__learning_rate': 0.05, 'regressor__max_depth': 4, 'regressor__n_estimators': 300, 'regressor__subsample': 0.9}
# Best CV score: 0.2149187219244552 (MAPE)
# Rank 1: {'regressor__colsample_bytree': 0.8, 'regressor__learning_rate': 0.05, 'regressor__max_depth': 4, 'regressor__n_estimators': 300, 'regressor__subsample': 0.9} with score: 2.15e-01
# Rank 2: {'regressor__colsample_bytree': 0.8, 'regressor__learning_rate': 0.05, 'regressor__max_depth': 4, 'regressor__n_estimators': 300, 'regressor__subsample': 0.7} with score: 2.32e-01
# Rank 3: {'regressor__colsample_bytree': 0.6, 'regressor__learning_rate': 0.05, 'regressor__max_depth': 4, 'regressor__n_estimators': 300, 'regressor__subsample': 0.9} with score: 2.32e-01




# --- Final fit with the tuned hyperparameters and hold-out evaluation ---
symbol = 'AAPL'
date_start = '2014-01-01'
date_end = '2017-12-31'

# Restrict the price table to the chosen symbol and (inclusive) date range.
data = df_stock_prices[
    (df_stock_prices['SYMBOL'] == symbol) &
    (df_stock_prices['Date'] >= date_start) &
    (df_stock_prices['Date'] <= date_end)
].copy()
# The train/test split below is positional ("last 20% of the rows"), so the
# rows must be in chronological order for the hold-out set to really be the
# most recent period. Sort explicitly instead of relying on the CSV order.
data = extract_date_features(data).sort_values(by='Date', kind='stable')

target = 'Volume'
numerical_features = valid_targets[target]
categorical_features = ['SYMBOL', 'Exists in Insiders', 'InsiderTransactionInLast7Days', 'InsiderTransactionInLast21Days']

X = data.drop(target, axis=1)
y = data[target]

# Only the columns listed in the two feature groups reach the model; the
# ColumnTransformer's default remainder behaviour drops every other column.
preprocessor = ColumnTransformer(
    transformers=[
        # Numerical columns: mean-impute missing values, then standardise.
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numerical_features),
        # Categorical columns: fill gaps with the literal 'missing', then
        # one-hot encode; categories unseen during fit are ignored.
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ]
)

# Best hyperparameters, copied from the grid-search output. The keys keep the
# 'regressor__' prefix so they can be applied to the pipeline as-is.
best_params = {
    'regressor__colsample_bytree': 0.8,
    'regressor__learning_rate': 0.05,
    'regressor__max_depth': 4,
    'regressor__n_estimators': 300,
    'regressor__subsample': 0.9
}

model = xgb.XGBRegressor(objective='reg:squarederror', random_state=0)

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', model)])
# Route every prefixed parameter to the 'regressor' step in one call instead
# of unpacking each dictionary entry by hand.
pipeline.set_params(**best_params)

# Chronological hold-out: first 80% of the rows for training, last 20% for testing.
split_index = int(len(X) * 0.8)
X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

# Fit the pipeline
pipeline.fit(X_train, y_train)
# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate
rmse = root_mean_squared_error(y_test, y_pred)
print(f"RMSE on the last 20% data: {rmse:.2e}")
# calculate R2 score
r2 = r2_score(y_test, y_pred)
print(f"R2 score on the last 20% data: {r2:.2f}")

# Example prediction
# NOTE(review): this 2017 window lies inside the 2014-2017 range used above,
# so it likely overlaps rows the model was trained on; treat these
# predictions as a usage example, not as an out-of-sample evaluation.
symbol = 'AAPL'
date_start = '2017-01-01'
date_end = '2017-12-31'

new_data = df_stock_prices[
    (df_stock_prices['SYMBOL'] == symbol) &
    (df_stock_prices['Date'] >= date_start) &
    (df_stock_prices['Date'] <= date_end)
].copy()
new_data = extract_date_features(new_data)
X_new = new_data.drop('Volume', axis=1)
predictions = pipeline.predict(X_new)







