In [28]:
import pandas as pd
import datetime
import logging
import requests
from requests.exceptions import ConnectionError
from bs4 import BeautifulSoup
import dotenv
import os
from pprint import pprint
import json

In [29]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 50)

In [30]:
dotenv.load_dotenv('../.env')
# os.getenv("AV_API_KEY")

True

In [38]:
url = f'https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY&symbol=IBM&interval=5min&apikey={os.getenv("AV_API_KEY")}'
r = requests.get(url)
data = r.json()

In [39]:
data

{'Information': 'Thank you for using Alpha Vantage! Our standard API rate limit is 25 requests per day. Please subscribe to any of the premium plans at https://www.alphavantage.co/premium/ to instantly remove all daily rate limits.'}

In [32]:
with open('../data/raw/stock_data.json', 'w') as file:
    json.dump(data, file)

In [33]:
pprint(data.get('Time Series (5min)'))

None


In [34]:
df = pd.DataFrame(data.get("Time Series (5min)"))
df

In [35]:
df_transposed = df.T
df_transposed

In [36]:
df_transposed.columns = ['open', 'high', 'low', 'close', 'volume']

ValueError: Length mismatch: Expected axis has 0 elements, new values have 5 elements

In [None]:
df_transposed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 2024-04-29 19:55:00 to 2024-04-29 11:15:00
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   open    100 non-null    object
 1   high    100 non-null    object
 2   low     100 non-null    object
 3   close   100 non-null    object
 4   volume  100 non-null    object
dtypes: object(5)
memory usage: 8.8+ KB


In [None]:
df_transposed[['open', 'high', 'low', 'close', 'volume']] = df_transposed[['open', 'high', 'low', 'close', 'volume']].astype(float)

In [None]:
df_transposed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 2024-04-29 19:55:00 to 2024-04-29 11:15:00
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   open    100 non-null    float64
 1   high    100 non-null    float64
 2   low     100 non-null    float64
 3   close   100 non-null    float64
 4   volume  100 non-null    float64
dtypes: float64(5)
memory usage: 8.8+ KB


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X = df_transposed.drop('close', axis=1)
y = df_transposed['close']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from xgboost import XGBRegressor

In [None]:
xgb = XGBRegressor()
xgb.fit(X_train, y_train)

In [None]:
xgb.predict(X_test)

array([166.88089, 166.39665, 166.8022 , 166.39645, 166.77872, 167.36183,
       167.1997 , 166.85292, 167.24858, 167.14357, 167.32582, 167.30098,
       166.83296, 167.34976, 166.56825, 167.25104, 166.57237, 166.5786 ,
       167.14293, 167.12738], dtype=float32)

In [None]:
xgb.score(X_test, y_test)

0.8710905947981256

In [None]:
import optuna
from sklearn.metrics import mean_squared_error, accuracy_score
import warnings
warnings.filterwarnings('ignore')

def objective(trial):
    # Define hyperparameters to be tuned
    param = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
        'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0)
    }

    if param['booster'] == 'gbtree' or param['booster'] == 'dart':
        param['max_depth'] = trial.suggest_int('max_depth', 1, 9)
        param['eta'] = trial.suggest_loguniform('eta', 1e-8, 1.0)
        param['gamma'] = trial.suggest_loguniform('gamma', 1e-8, 1.0)
        param['grow_policy'] = trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide'])

    if param['booster'] == 'dart':
        param['sample_type'] = trial.suggest_categorical('sample_type', ['uniform', 'weighted'])
        param['normalize_type'] = trial.suggest_categorical('normalize_type', ['tree', 'forest'])
        param['rate_drop'] = trial.suggest_loguniform('rate_drop', 1e-8, 1.0)
        param['skip_drop'] = trial.suggest_loguniform('skip_drop', 1e-8, 1.0)

    # Training the model
    model = XGBRegressor(**param, random_state=42)
    model.fit(X_train, y_train)

    # Predict on validation set and calculate RMSE
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)

    return rmse


In [None]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)
best_params = study.best_params
best_rmse = study.best_value

[I 2024-04-30 12:43:18,391] A new study created in memory with name: no-name-3e2ca323-0b32-4901-8f4b-da09f284e881
[I 2024-04-30 12:43:18,638] Trial 0 finished with value: 0.32204010701851526 and parameters: {'booster': 'gbtree', 'lambda': 8.521490898993901e-08, 'alpha': 3.241767511915741e-07, 'max_depth': 8, 'eta': 1.2841041749959345e-07, 'gamma': 2.0477254911151958e-05, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 0.32204010701851526.
[I 2024-04-30 12:43:18,905] Trial 1 finished with value: 0.3175567024390942 and parameters: {'booster': 'dart', 'lambda': 0.5320881848562823, 'alpha': 0.040426276675886746, 'max_depth': 9, 'eta': 0.00018175568567685047, 'gamma': 1.0086058005547998e-05, 'grow_policy': 'lossguide', 'sample_type': 'weighted', 'normalize_type': 'tree', 'rate_drop': 3.6758729328729314e-07, 'skip_drop': 8.796090674182637e-08}. Best is trial 1 with value: 0.3175567024390942.
[I 2024-04-30 12:43:18,944] Trial 2 finished with value: 0.12496198026204522 and parameters:

In [None]:
xgb_tuned = XGBRegressor(**best_params, random_state=42)
xgb_tuned.fit(X_train, y_train) 

In [None]:
xgb_tuned.score(X_test, y_test)

0.9161013940287901

In [None]:
xgb_tuned.predict(X_test)

array([166.85878, 166.50278, 166.74881, 166.5053 , 166.84102, 167.30414,
       167.20015, 166.74771, 167.24855, 167.1722 , 167.30414, 167.3015 ,
       166.85274, 167.30414, 166.58148, 167.25291, 166.59279, 166.58351,
       167.1914 , 167.1875 ], dtype=float32)

In [None]:
xgb_tuned.predict(df_transposed.drop('close', axis=1).tail(1))

array([166.9673], dtype=float32)