1. Write a function to load and process a dataset with multiple features with the following
requirements:
- a. This function will allow you to specify the start date and the end date for the whole dataset as inputs.
- b. This function will allow you to deal with the NaN issue in the data.
- c. This function will also allow you to use different methods to split the data into train/test data; e.g. you can split it according to some specified ratio of train/test and you can specify to split it by date or randomly.
- d. This function will have the option to allow you to store the downloaded data on your local machine for future uses and to load the data locally to save time.
- e. This function will also allow you to have an option to scale your feature columns and store the scalers in a data structure to allow future access to these scalers

In [153]:
# Libraries
import os
import pandas as pd
import yfinance as yf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [154]:
def load_process_data(ticker, start_date, end_date, handle_nan=True, split_method='ratio', scale=True, split_date='2021-06-01'):
    DATA_DIR = 'stock_data'
    test_size = 0.2
    feature_columns = ["Adj Close", "Volume", "Open", "High", "Low"]

    # Ensure the data directory exists
    if not os.path.exists(DATA_DIR):
        os.makedirs(DATA_DIR)

    # Define the file path and load or download data
    file_path = os.path.join(DATA_DIR, f'{ticker}_{start_date}_{end_date}.csv')
    if os.path.exists(file_path):
        print(f"Loading data from {file_path}")
        data = pd.read_csv(file_path, index_col='Date', parse_dates=True)
    else:
        print(f"Downloading data {ticker} from Yahoo finance")
        data = yf.download(ticker, start=start_date, end=end_date)
        data.to_csv(file_path)
        print(f"Data saved to {file_path}")

    # Handle NaN values
    if handle_nan:
        data.dropna(inplace=True)
    else:
        data.fillna(data.mean(), inplace=True)

    # Split data according to the specified method
    result = {}
    if split_method == 'ratio':
        print('Splitting by ratio')
        train_samples = int((1 - test_size) * len(data))
        result['data_train'] = data[:train_samples]
        result['data_test'] = data[train_samples:]
    elif split_method == 'date':
        print(f'Splitting by date: {split_date}')
        split_date = pd.to_datetime(split_date)
        result['data_train'] = data[data.index <= split_date]
        result['data_test'] = data[data.index > split_date]
    elif split_method == 'random':
        print('Randomly splitting with shuffling')
        result['data_train'], result['data_test'] = train_test_split(data, test_size=test_size, shuffle=True)
    else:
        raise ValueError("Invalid split_method. Choose 'ratio', 'date', or 'random'.")

    # Scale feature columns if required
    if scale:
        scaler = MinMaxScaler(feature_range=(0, 1))
        result['data_train'][feature_columns] = scaler.fit_transform(result['data_train'][feature_columns])
        result['data_test'][feature_columns] = scaler.transform(result['data_test'][feature_columns])
        result['scaler'] = scaler

    return result

In [161]:
# Example usage of load_process_data

ticker = 'AAPL'
start_date = '2020-01-01'
end_date = '2021-01-01'
split_date = '2020-06-01'

# Example 1: Splitting by ratio and scaling the data
print("Example 1: Splitting by ratio and scaling the data")
result_ratio = load_process_data(
    ticker=ticker,
    start_date=start_date,
    end_date=end_date,
    handle_nan=True,
    split_method='ratio',
    scale=True
)
print("Training data sample (ratio split):")
print(result_ratio['data_train'].head())
print("\nTest data sample (ratio split):")
print(result_ratio['data_test'].head())
print("\nScaler parameters (ratio split):")
print(result_ratio['scaler'].data_min_, result_ratio['scaler'].data_max_)

# Example 2: Splitting by date
print("\nExample 2: Splitting by date")
result_date = load_process_data(
    ticker=ticker,
    start_date=start_date,
    end_date=end_date,
    handle_nan=True,
    split_method='date',
    split_date=split_date,
    scale=False
)
print("Training data sample (date split):")
print(result_date['data_train'].head())
print("\nTest data sample (date split):")
print(result_date['data_test'].head())

# Example 3: Random split with scaling
print("\nExample 3: Random split with scaling")
result_random = load_process_data(
    ticker=ticker,
    start_date=start_date,
    end_date=end_date,
    handle_nan=True,
    split_method='random',
    scale=True
)
print("Training data sample (random split):")
print(result_random['data_train'].head())
print("\nTest data sample (random split):")
print(result_random['data_test'].head())
print("\nScaler parameters (random split):")
print(result_random['scaler'].data_min_, result_random['scaler'].data_max_)


Example 1: Splitting by ratio and scaling the data
Downloading data AAPL from Yahoo finance


[*********************100%%**********************]  1 of 1 completed

Data saved to stock_data/AAPL_2020-01-01_2021-01-01.csv
Splitting by ratio
Training data sample (ratio split):
                Open      High       Low      Close  Adj Close    Volume
Date                                                                    
2020-01-02  0.211493  0.222930  0.266809  75.087502   0.239121  0.159837
2020-01-03  0.214317  0.222868  0.271041  74.357498   0.229866  0.191137
2020-01-06  0.203891  0.220951  0.258925  74.949997   0.237378  0.110491
2020-01-07  0.222664  0.223858  0.274208  74.597504   0.232909  0.083022
2020-01-08  0.214348  0.234803  0.273174  75.797501   0.248122  0.150018

Test data sample (ratio split):
                Open      High       Low       Close  Adj Close    Volume
Date                                                                     
2020-10-20  0.734517  0.765012  0.807438  117.510002   0.787209  0.127917
2020-10-21  0.740350  0.761672  0.818035  116.870003   0.779039  0.028385
2020-10-22  0.750031  0.753386  0.793997  115.750




In [155]:
TICKER = 'AMZN'

START_DATE = '2020-01-01'

END_DATE = '2023-01-01'

In [156]:
data = load_process_data(TICKER, START_DATE, END_DATE)

Loading data from stock_data/AMZN_2020-01-01_2023-01-01.csv
Splitting by ratio


In [159]:
handle_nan=True
split_method='date'
scale=False
split_date='2021-06-01'

In [150]:
data = load_process_data(TICKER, START_DATE, END_DATE, dealNaN, split_by_ratio, split_date, scale)

Loading data from stock_data/AMZN_2020-01-01_2023-01-01.csv


ValueError: Invalid split_method. Choose 'ratio', 'date', or 'random'.

In [151]:
handle_nan = False
split_by_ratio = False
split_by_date = False
split_by_randomly = True
scale = True
split_date = '06-01-2021'

In [152]:
data = load_process_data(TICKER, START_DATE, END_DATE, dealNaN, split_by_ratio, split_date, scale)

Loading data from stock_data/AMZN_2020-01-01_2023-01-01.csv
Split by date with specified date
Training samples: 356
Test samples: 400
