## Import Required Libraries

We import the libraries this notebook uses to build the backtest datasets: standard-library utilities, pandas and NumPy, and the dataset helpers from the project's `Backtester.helpers` module.

In [1]:
import warnings
import os
from datetime import datetime, timedelta
import json
import pandas as pd
import numpy as np

from Backtester.helpers import create_stratified_datasets,prepare_folders,create_resampled_datasets

#sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'AssetPricingPortfolio')))

warnings.filterwarnings('ignore')

# Define settings, load universe data and prepare resampled datasets for backtest

In [2]:
# Backtest configuration.
# Name of the asset universe; it is used further down to build the path of the
# universe's data folder.
universe_name = "selection3"

# Settings for building the backtest datasets. Durations are expressed in days.
# This dict is written to datasets_settings.json further down, so every value
# must stay JSON-serializable.
datasets_settings = dict(
    lookback_periods=6 * 21 + 1,  # 127 days
    backtest_duration=2 * 252,  # 504 days
    num_datasets=100,  # how many resampled datasets to create
    random_seed=12,  # seed for reproducibility; use None otherwise
    num_assets=53,  # assets per resampled dataset (only used by 'full_resample')
    datasets_randomization_method='full_resample',  # or 'stratified'
    assets_per_group=3,  # only used by stratified sampling
    group_by='asset_class',  # only used by stratified sampling
)


In [3]:
# Validate the randomization method before doing any work. An unrecognized
# value would otherwise skip both branches below and only surface later as a
# NameError on `datasets_info`, after prepare_folders has already run.
randomization_method = datasets_settings['datasets_randomization_method']
supported_methods = ('stratified', 'full_resample')
if randomization_method not in supported_methods:
    raise ValueError(
        f"Unknown datasets_randomization_method {randomization_method!r}; "
        f"expected one of {supported_methods}"
    )

# Fetch the universe metadata, the universe price data and the settings of
# this test run (test_settings carries 'test_folder_path', used when saving).
# NOTE(review): judging by the recorded output, prepare_folders also creates
# the test folder on disk - confirm against Backtester.helpers.
universe_info_df, universe_data, test_settings = prepare_folders(
    universe_name,
    datasets_settings['backtest_duration'],
    datasets_settings['lookback_periods'],
    datasets_settings['num_datasets'],
    datasets_settings['random_seed'],
)

# Build the datasets with the selected method. Both helpers return a pair:
# per-dataset info (a mapping) and the list of dataset DataFrames.
if randomization_method == 'stratified':
    # Uses the universe metadata plus the group_by / assets_per_group settings.
    datasets_info, my_dataset_list = create_stratified_datasets(
        master_dataset=universe_data,
        universe_info_df=universe_info_df,
        assets_per_group=datasets_settings['assets_per_group'],
        group_by=datasets_settings['group_by'],
        backtest_duration=datasets_settings['backtest_duration'],
        num_datasets=datasets_settings['num_datasets'],
        random_seed=datasets_settings['random_seed'],
    )
elif randomization_method == 'full_resample':
    # Uses only the universe data and the num_assets setting.
    datasets_info, my_dataset_list = create_resampled_datasets(
        master_dataset=universe_data,
        num_assets=datasets_settings['num_assets'],
        backtest_duration=datasets_settings['backtest_duration'],
        num_datasets=datasets_settings['num_datasets'],
        random_seed=datasets_settings['random_seed'],
    )


# Resolve the two folders used from here on: the test folder reported by
# prepare_folders, and the universe's data folder under the current working
# directory.
test_folder_path = test_settings['test_folder_path']
universe_folder_path = os.path.join(os.getcwd(), 'data', universe_name)

# Reload the universe data from its CSV file. The file is read with a two-row
# header, so the columns come back as a MultiIndex.
universe_csv_path = os.path.join(universe_folder_path, "universe_data.csv")
universe_data = pd.read_csv(universe_csv_path, header=[0, 1], index_col=0, parse_dates=True)

# Report how many assets were loaded; with MultiIndex columns the asset names
# are the unique values of the first column level.
if not isinstance(universe_data.columns, pd.MultiIndex):
    print(f"Downloaded data for {len(universe_data.columns)} assets")
else:
    asset_names = universe_data.columns.get_level_values(0).unique()
    print(f"Downloaded data for {len(asset_names)} assets")

# Report the first and last dates covered by the universe data.
first_date = universe_data.index.min().strftime('%Y-%m-%d')
last_date = universe_data.index.max().strftime('%Y-%m-%d')
print(f"Universe data date range: {first_date} to {last_date}")


# json cannot serialize DataFrames, so build a copy of datasets_info in which
# any 'assets' DataFrame is replaced by its dict form. Entries are
# shallow-copied so that datasets_info itself is left unmodified.
serializable_datasets_info = {}
for dataset_key, dataset_meta in datasets_info.items():
    json_ready_meta = dataset_meta.copy()
    assets = json_ready_meta.get('assets')
    if isinstance(assets, pd.DataFrame):
        json_ready_meta['assets'] = assets.to_dict()
    serializable_datasets_info[dataset_key] = json_ready_meta

# Write the converted info into the test folder.
datasets_info_path = os.path.join(test_folder_path, 'datasets_info.json')
with open(datasets_info_path, 'w') as f:
    json.dump(serializable_datasets_info, f, indent=4)

# Write every dataset to <test folder>/datasets/dataset_<n>.csv, numbering the
# files from 1 in list order.
datasets_folder_path = os.path.join(test_folder_path, 'datasets')
os.makedirs(datasets_folder_path, exist_ok=True)
for dataset_number, dataset in enumerate(my_dataset_list, start=1):
    dataset_csv_path = os.path.join(datasets_folder_path, f'dataset_{dataset_number}.csv')
    dataset.to_csv(dataset_csv_path)

# Store both settings dicts as JSON files next to the datasets folder.
settings_to_save = (
    ('test_settings.json', test_settings),
    ('datasets_settings.json', datasets_settings),
)
for settings_filename, settings_payload in settings_to_save:
    with open(os.path.join(test_folder_path, settings_filename), 'w') as f:
        json.dump(settings_payload, f, indent=4)

Downloaded data for 53 assets
Universe data date range: 2012-10-24 to 2025-10-31
Created folder: c:\my-git\DataScience-novaIMS\APPM-individual\data\selection3\test-1
Downloaded data for 53 assets
Universe data date range: 2012-10-24 to 2025-10-31
