In [29]:
# import neccessary library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer 
import os 

In [30]:
# Avoid data leakage to test data
# Impute missing data in train and test datasets using KNNImputer.
def impute_data(train, test):
    imputer = KNNImputer(n_neighbors=5)

    # Fit the imputer on the training data
    imputer.fit(train)

    # Transform both training and testing data
    train_imputed = imputer.transform(train)
    test_imputed = imputer.transform(test)

    return train_imputed, test_imputed

# get the feature data from data folder
def load_index_data(start_date, test_start_dates, end_date, rebal_period):
    # Load index data, impute missing values and return train and test datasets
    index_data_dir = os.path.join(os.getcwd(), 'data', 'feature_data')
    result_train_df = pd.DataFrame()
    result_test_df = pd.DataFrame()
    
    index_data_name_file = {
        'CBOE DJIA Volatility Index':'CBOE DJIA Volatility Index - Close.xlsx',
        'CBOE NASDAQ Volatility Index': 'CBOE NASDAQ Volatility Index - Close.xlsx',
        'CBOE S&P100 Volatility Index': 'CBOE S&P100 Volatility Index - Close.xlsx',
        'CBOE S&P500 Volatility Index': 'CBOE S&P500 Volatility Index - Close.xlsx'
    }
    
    for ind_name in index_data_name_file.keys():
        input = pd.read_excel(os.path.join(index_data_dir, index_data_name_file[ind_name]), index_col='Date')
        input = input.sort_index(ascending=False)
        
        # Calculate monthly percentage changes to enhance the stationarity of the data
        # Fill the first with 0
        index_df = input.iloc[:, 0].resample(rebal_period).first().pct_change().fillna(0)
        
         # Split the data into train and test based on the provided dates
        train_data = index_df.loc[start_date:test_start_dates]
        test_data = index_df.loc[test_start_dates:end_date]
        
        train_imputed, test_imputed = impute_data(train_data.values.reshape(-1, 1), test_data.values.reshape(-1, 1))
        
        # Convert back to Series
        train_imputed_series = pd.Series(train_imputed.flatten(), index=train_data.index)
        test_imputed_series = pd.Series(test_imputed.flatten(), index=test_data.index)
        
        # Rename the series for clarity
        train_imputed_series.rename(ind_name, inplace=True)
        test_imputed_series.rename(ind_name, inplace=True)

        result_train_df = pd.concat([result_train_df, train_imputed_series], axis=1)
        result_test_df = pd.concat([result_test_df, test_imputed_series], axis=1)
        
    
    result_train_df.index = pd.to_datetime(result_train_df.index)
    result_test_df.index = pd.to_datetime(result_test_df.index)

    return result_train_df, result_test_df

In [32]:
# Define general parameters
train_start_date = '2007-01-01'
test_start_date = '2017-01-01'
test_end_date = '2022-12-31'
rebal_period = '1M'
train_data, test_data = load_index_data(train_start_date, test_start_date, test_end_date, rebal_period)

In [33]:
train_data

Unnamed: 0,CBOE DJIA Volatility Index,CBOE NASDAQ Volatility Index,CBOE S&P100 Volatility Index,CBOE S&P500 Volatility Index
2007-01-31,0.000000,0.000000,0.000000,0.000000
2007-02-28,-0.167095,-0.022286,-0.139153,-0.143688
2007-03-31,0.543210,0.257160,0.627510,0.534433
2007-04-30,-0.111333,-0.135286,-0.149291,-0.081542
2007-05-31,-0.065266,-0.089247,-0.067440,-0.070200
...,...,...,...,...
2016-08-31,-0.119215,-0.137255,-0.198561,-0.157752
2016-09-30,0.023089,0.059917,0.108618,0.083601
2016-10-31,0.101167,0.030539,0.090688,0.006677
2016-11-30,0.293993,0.302648,0.432814,0.367723


In [37]:
test_data

Unnamed: 0,CBOE DJIA Volatility Index,CBOE NASDAQ Volatility Index,CBOE S&P100 Volatility Index,CBOE S&P500 Volatility Index
2017-01-31,-0.039648,-0.065192,-0.160371,-0.086709
2017-02-28,-0.087920,-0.184932,-0.080645,-0.080934
2017-03-31,0.040235,-0.014515,0.078486,0.061812
2017-04-30,-0.066076,-0.041085,-0.105308,-0.012759
2017-05-31,-0.110440,-0.013743,-0.126316,-0.183360
...,...,...,...,...
2022-08-31,0.015541,-0.161074,0.000000,-0.144569
2022-09-30,0.173615,0.122087,0.000000,0.119089
2022-10-31,0.167716,0.102294,0.000000,0.177621
2022-11-30,-0.065075,-0.100394,0.000000,-0.142525


In [None]:
# get target index data
def index_cum_data_loading(start_date, end_date, rebal_period):
    index_data_dir = os.path.join(os.getcwd(), 'data', 'index_data')
    result_df = pd.DataFrame()
    
    index_data_name_file = {
        'msci_world':'MSCI World.xlsx'
    }
    
    for ind_name in index_data_name_file.keys():
        #load_index_data(self.start_date, self.end_date, index_data_dir, index_data_name_file[ind_name], ind_name)
        input = pd.read_excel(os.path.join(index_data_dir, index_data_name_file[ind_name]), index_col='Date')
        input = input.sort_index(ascending=False)
        reb_index = input.iloc[:, 0].resample(rebal_period).first()
        #calculate percentage change on month level
        index_df = reb_index / reb_index.iloc[0]
        index_df = index_df.fillna(1)
        index_df.rename(ind_name, inplace=True)
        index_df = index_df.loc[start_date:end_date]
        result_df = pd.concat([result_df, index_df], axis=1).fillna(0)
        
    result_df.index = pd.to_datetime(result_df.index)
    return result_df

In [35]:
def plot_target_index(word_index):
    ax = word_index.plot()
    ax.get_yaxis().set_major_formatter(plt.FuncFormatter(lambda x, loc: "{:}".format(float(x))))
    plt.xlabel('Date')
    plt.ylabel('Cumulative P&L')
    plt.title('Cumulative Profit and Loss')
    plt.show()

In [36]:
# loading the target index
word_index = index_cum_data_loading(start_date, end_date, rebal_period)
# rename the target index
word_index.columns = ['msci_world_cum']
plot_target_index(word_index)

NameError: name 'start_date' is not defined

In [None]:
market_data = data.join(word_index, how='right')
market_data

In [None]:
market_data.corr()