# Question 1 
In the lecture we implemented a few ML models, one of them using TensorFlow, for predicting tomorrow's price of the MTR stock (0066.HK).

This question asks you to implement yet another ML model using any regression method
available in the scikit-learn library, excluding Linear Regression and Neural Networks, on
the same set of stock prices (i.e., 0066.HK between "2010-01-01" and "2020-06-30").

You should submit a Jupyter notebook that includes the
three ML models (i.e., the TensorFlow implementation from the lectures and your
implementations using sklearn), and compare their accuracy in predicting the
price of 0066.HK for the period between "2021-01-01" and "2021-04-30".

## Requirements
1. numpy
2. pandas
3. matplotlib
4. scikit-learn
5. tensorflow
6. akshare

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt

%pip install akshare
import akshare as ak # for getting stock data

In [None]:
# Download the daily price history used for fitting and for evaluation.
# The two extra regressors imported here are compared against gradient
# boosting further down.
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR


# Ticker plus the two date windows: model fitting vs. out-of-sample check.
symbol = "00066"
start, end = "2010-01-01", "2020-06-30"
predict_start, predict_end = "2021-01-01", "2021-04-30"

# adjust='' requests prices without adjustment.
stock_train = ak.stock_hk_hist(
    symbol=symbol,
    start_date=start,
    end_date=end,
    adjust='',
)
stock_predict_actual = ak.stock_hk_hist(
    symbol=symbol,
    start_date=predict_start,
    end_date=predict_end,
    adjust='',
)

# Columns of the returned frames, as listed by the original author:
#   日期 (date) / 开盘 (open) / 收盘 (close) / 最高 (high) / 最低 (low) /
#   成交量 (volume) / 成交额 (turnover amount) / 振幅 (amplitude) /
#   涨跌幅 (change %) / 涨跌额 (change amount) / 换手率 (turnover rate)
# The closing price ('收盘') is the quantity being predicted.

# add lag and rolling features
def create_features(data, lag_days=3, roll_days=3):
    """Return a copy of *data* extended with lagged and rolling close features.

    All engineered columns are derived from closing prices ('收盘') of
    *earlier* rows only, so none of them contains the current row's close
    (the value the models are asked to predict).

    Args:
        data: DataFrame with a numeric '收盘' (close) column, one row per
            trading day in chronological order.
        lag_days: Number of lag columns to add (`lag_1` .. `lag_<lag_days>`),
            where `lag_k` is the close k rows earlier.
        roll_days: Window length, in rows, of the rolling mean and standard
            deviation taken over the preceding closes.

    Returns:
        A new DataFrame with the extra columns `lag_*`, `rolling_mean` and
        `rolling_std`; rows containing any NaN (the warm-up rows at the
        start, plus rows with missing values in any other column) are
        removed. The input frame is left unmodified.
    """
    # Work on a copy so the caller's frame is not silently altered.
    features = data.copy()
    close = features['收盘']

    for lag in range(1, lag_days + 1):
        features[f'lag_{lag}'] = close.shift(lag)

    # Shift by one row before rolling: a window that includes the current
    # close would leak the prediction target into the features.
    previous_close = close.shift(1)
    features['rolling_mean'] = previous_close.rolling(window=roll_days).mean()
    features['rolling_std'] = previous_close.rolling(window=roll_days).std()

    return features.dropna()

# Add the lag / rolling columns to both the training and evaluation frames.
stock_train = create_features(stock_train)
stock_predict_actual = create_features(stock_predict_actual)

# Target: the closing price ('收盘') of each remaining row.
y_train = stock_train['收盘']
y_test = stock_predict_actual['收盘']

# Use only the engineered columns as predictors. The other numeric columns
# of the same bar (open/high/low/volume/change ...) describe the very day
# whose close is being predicted, so feeding them to the model leaks the
# target (e.g. change amount + previous close gives the close exactly).
# NOTE(review): the rolling_* columns are only leak-free if create_features
# computes them from earlier closes - verify.
feature_columns = [
    column for column in stock_train.columns
    if str(column).startswith(('lag_', 'rolling_'))
]
X_train = stock_train[feature_columns]
X_test = stock_predict_actual[feature_columns]

# Candidate regressors to compare. A fixed random_state makes the two tree
# ensembles give the same result on every run, so the comparison is
# reproducible; SVR takes no random_state.
models = {
    'GradientBoosting': GradientBoostingRegressor(random_state=42),
    'RandomForest': RandomForestRegressor(random_state=42),
    'SVR': SVR()
}

# Hyper-parameter grids, keyed like `models`. The 'model__' prefix addresses
# the pipeline step named 'model'.
param_grids = {
    'GradientBoosting': {
        'model__n_estimators': [100, 200],
        'model__learning_rate': [0.01, 0.05, 0.1],
        'model__max_depth': [3, 5, 7]
    },
    'RandomForest': {
        'model__n_estimators': [100, 200],
        'model__max_depth': [5, 10, None],
        'model__min_samples_split': [2, 5, 10]
    },
    # gamma has no effect on the linear kernel, so it is searched for 'rbf'
    # only; crossing it with 'linear' would fit identical candidates twice.
    'SVR': [
        {
            'model__kernel': ['rbf'],
            'model__C': [0.1, 1, 10],
            'model__gamma': ['scale', 'auto']
        },
        {
            'model__kernel': ['linear'],
            'model__C': [0.1, 1, 10]
        }
    ]
}

# Cross-validate with TimeSeriesSplit (5 splits) so folds respect time order.
tscv = TimeSeriesSplit(n_splits=5)

# Tuned estimators and their evaluation-period scores, keyed by model name.
best_models, results = {}, {}

for model_name, model in models.items():
    print(f"Training {model_name}...")

    # Standardise the features, then fit the regressor; keeping the scaler
    # inside the pipeline means it is fitted together with the model.
    pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('model', model)])

    # Exhaustive search over this model's grid, scored by negative MSE.
    search_options = dict(
        param_grid=param_grids[model_name],
        cv=tscv,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    grid_search = GridSearchCV(pipeline, **search_options)
    grid_search.fit(X_train, y_train)

    # Keep the refitted winner and score it on the evaluation period.
    best_models[model_name] = grid_search.best_estimator_
    y_pred = best_models[model_name].predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)

    results[model_name] = {
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse,
        'Best Params': grid_search.best_params_,
    }

    # Report the chosen hyper-parameters and the resulting errors.
    print(f"{model_name} - Best Params: {grid_search.best_params_}")
    print(f"{model_name} - Optimized MAE: {mae}")
    print(f"{model_name} - Optimized MSE: {mse}")
    print(f"{model_name} - Optimized RMSE: {rmse}")

In [None]:
# Visualise actual vs. predicted close, one subplot per tuned model.
plt.figure(figsize=(18, 6))
plt.suptitle(f"Stock Price Prediction for {symbol}.HK")

# (model key, colour of its prediction line), in left-to-right panel order.
panels = [
    ('GradientBoosting', 'red'),
    ('RandomForest', 'yellow'),
    ('SVR', 'green'),
]

for position, (name, line_color) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.plot(y_test.index, y_test, label='Actual', color='blue')
    plt.plot(y_test.index, best_models[name].predict(X_test), label='Predicted', color=line_color)
    plt.title(name)
    plt.legend()

plt.show()

In [None]:
# Bar chart of MAE, MSE and RMSE for every model.
# One row per model; columns are the keys stored in `results`.
metrics_df = pd.DataFrame(results).T

# Each results entry also holds a 'Best Params' dict, which leaves every
# column of the transposed frame with object dtype. Pick the numeric metrics
# explicitly and cast them to float instead of relying on the plotting code
# to infer dtypes and drop the non-numeric column.
metric_columns = ['MAE', 'MSE', 'RMSE']
metrics_df[metric_columns].astype(float).plot(kind='bar', figsize=(12, 6))
plt.ylabel("Error")
plt.title("Model Comparison - Error Metrics")
plt.show()