In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from zipfile import ZipFile
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('selected_factor_and_return.csv')
df.set_index(['date'], inplace=True)

if "Unnamed: 0" in df.columns:
  df.drop("Unnamed: 0", axis=1, inplace=True)

df.head()


Unnamed: 0_level_0,Ticker,Size,Liquidity,cpv_1mo,mr_1yr,mr_1w,daily_return
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2005-03-02,4151.T,0.603338,-0.370407,,,-1.381918,-0.007344
2005-03-03,4151.T,0.603338,-0.370407,0.426358,,-0.462423,0.016029
2005-03-04,4151.T,0.603338,-0.370407,-1.005234,,-1.477961,-0.001213
2005-03-07,4151.T,0.603338,-0.370407,-0.731422,,-0.398283,0.001215
2005-03-08,4151.T,0.603338,-0.370407,-1.081762,,-1.433256,0.002427


In [3]:
# get list of features and target column
features_cols = df.columns[1:-1]
target_col = df.columns[-1]

In [4]:
target = df[target_col]
has_nan = target.isnull().values.any()
print("Contains NaN:", has_nan)

has_inf = target.isin([np.inf, -np.inf]).values.any()
print("Contains infinite values:", has_inf)

Contains NaN: False
Contains infinite values: False


In [5]:
df=df.dropna(subset=['Size', 'Liquidity', 'cpv_1mo', 'mr_1yr', 'mr_1w'], how='all')
df.fillna(0, inplace=True)
data_1d = df.copy()


In [6]:
train_end_1 = data_1d.index.unique()[int(len(data_1d.index.unique())*0.4)]
val_end_1 = data_1d.index.unique()[int(len(data_1d.index.unique())*0.45)]
test_end_1 = data_1d.index.unique()[int(len(data_1d.index.unique())*0.5)]

train_start_2 = data_1d.index.unique()[int(len(data_1d.index.unique())*0.25)]
train_end_2 = data_1d.index.unique()[int(len(data_1d.index.unique())*0.65)]
val_end_2 = data_1d.index.unique()[int(len(data_1d.index.unique())*0.7)]
test_end_2 = data_1d.index.unique()[int(len(data_1d.index.unique())*0.75)]

train_start_3 = data_1d.index.unique()[int(len(data_1d.index.unique())*0.5)]
train_end_3 = data_1d.index.unique()[int(len(data_1d.index.unique())*0.9)]
val_end_3 = data_1d.index.unique()[int(len(data_1d.index.unique())*0.95)]


In [7]:
train_data_1 = data_1d[data_1d.index <= train_end_1]
val_data_1 = data_1d[(data_1d.index > train_end_1) & (data_1d.index <= val_end_1)]
test_data_1 = data_1d[(data_1d.index > val_end_1) & (data_1d.index <= test_end_1)]

train_data_2 = data_1d[(data_1d.index > train_start_2) & (data_1d.index <= train_end_2) ]
val_data_2 = data_1d[(data_1d.index > train_end_2) & (data_1d.index <= val_end_2)]
test_data_2 = data_1d[(data_1d.index > val_end_2) & (data_1d.index <= test_end_2)]

train_data_3 = data_1d[(data_1d.index > train_start_3) & (data_1d.index <= train_end_3) ]
val_data_3 = data_1d[(data_1d.index > train_end_3) & (data_1d.index <= val_end_3)]
test_data_3 = data_1d[(data_1d.index > val_end_3)]



In [8]:
# Loop through the groups 1 to 3
for i in range(1, 4):
    # Process training data
    globals()[f'y_train_{i}'] = globals()[f'train_data_{i}'][target_col].copy()
    globals()[f'X_train_{i}'] = globals()[f'train_data_{i}'][features_cols].copy()

    # Process validation data
    globals()[f'y_val_{i}'] = globals()[f'val_data_{i}'][target_col].copy()
    globals()[f'X_val_{i}'] = globals()[f'val_data_{i}'][features_cols].copy()

    # Process testing data
    globals()[f'y_test_{i}'] = globals()[f'test_data_{i}'][target_col].copy()
    globals()[f'X_test_{i}'] = globals()[f'test_data_{i}'][features_cols].copy()


X_train_1.info()


<class 'pandas.core.frame.DataFrame'>
Index: 52439 entries, 2005-03-02 to 2012-09-20
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Size       52439 non-null  float64
 1   Liquidity  52439 non-null  float64
 2   cpv_1mo    52439 non-null  float64
 3   mr_1yr     52439 non-null  float64
 4   mr_1w      52439 non-null  float64
dtypes: float64(5)
memory usage: 2.4+ MB


The stock returns in the following two days are split to two target variables.

In [9]:
import xgboost as xgb
from xgboost import plot_importance, plot_tree
from sklearn.model_selection import train_test_split, GridSearchCV


In [10]:
%%time

parameters = {
    'n_estimators': [200,400],
    'learning_rate': [0.001, 0.005],
    'max_depth': [8, 10],
    'gamma': [0.001,  0.01],
    'random_state': [42]
}


CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 5.01 µs


In [11]:
from sklearn.metrics import mean_squared_error

In [12]:

for i in range(1, 4):
    # Constructing the eval_set with validation data
    eval_set = [(globals()[f'X_train_{i}'], globals()[f'y_train_{i}']), (globals()[f'X_val_{i}'], globals()[f'y_val_{i}'])]

    # Initialize the model
    model = xgb.XGBRegressor(objective='reg:squarederror', verbose=False)
    clf = GridSearchCV(model, parameters, verbose=1)

    # Fit the model
    clf.fit(globals()[f'X_train_{i}'], globals()[f'y_train_{i}'])

    print(f'Round {i}')
    print(f'Best params: {clf.best_params_}')
    print(f'Best validation score = {clf.best_score_}')

    # Fitting the model with the best parameters
    model = xgb.XGBRegressor(**clf.best_params_, objective='reg:squarederror')
    model.fit(globals()[f'X_train_{i}'], globals()[f'y_train_{i}'], eval_set=eval_set, verbose=False)

    # Plot feature importance
    fig, ax = plt.subplots(figsize=(10, 8))
    xgb.plot_importance(model, ax=ax)
    plt.savefig(f'XGB figure importance/feature_importance_{i}.png') 
    plt.close()  # Close the plot to free up memory


    # Make predictions on the test set
    y_pred = model.predict(globals()[f'X_test_{i}'])
    
    test_date = globals()[f'y_test_{i}'].index
    test_ticker = globals()[f'test_data_{i}'].Ticker
    pred_date_df = pd.DataFrame(test_date.to_list()).rename(columns={0: 'date'})
    pred_ticker_df = pd.DataFrame(test_ticker).reset_index(drop=True)
    y_pred_df = pd.DataFrame(y_pred).rename(columns={0: 'prediction'})
    pred_df = pd.concat([pred_date_df, pred_ticker_df, y_pred_df], axis=1)

    # calculate ic
    pred_pivot = pred_df.pivot(index='date', columns='Ticker', values='prediction').fillna(0)
    obs_pivot = globals()[f'test_data_{i}'][['Ticker','daily_return']].reset_index().pivot(index='date', columns='Ticker', values='daily_return').fillna(0)  

    ic = pred_pivot.corrwith(obs_pivot, axis=1).mean()
    print(f"Predicted cross-section IC: {ic:.4f}")
    
    mse = mean_squared_error(globals()[f'y_test_{i}'], y_pred)
    print(f'mean_squared_error = {mse}')
    
    
    # save final round predict
    pred_df.to_csv(f'backtest/predict data/XGB_rolling_test_3_round_{i}.csv', index=False)

    valid_date = globals()[f'y_val_{i}'].index
    valid_ticker = globals()[f'val_data_{i}'].Ticker
    valid_y_pred = model.predict(globals()[f'X_val_{i}'])
    valid_pred_date_df = pd.DataFrame(valid_date.to_list()).rename(columns={0: 'date'})
    valid_pred_ticker_df = pd.DataFrame(valid_ticker).reset_index(drop=True)
    valid_y_pred_df = pd.DataFrame(valid_y_pred).rename(columns={0: 'prediction'})
    valid_pred_df = pd.concat([valid_pred_date_df, valid_pred_ticker_df, valid_y_pred_df], axis=1)

    valid_pred_df.to_csv(f'backtest/predict data/XGB_rolling_val_3_round_{i}.csv', index=False)

print("Completed processing all groups.")

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Round 1
Best params: {'gamma': 0.001, 'learning_rate': 0.001, 'max_depth': 8, 'n_estimators': 200, 'random_state': 42}
Best validation score = 0.0010135258932433943
Predicted cross-section IC: -0.0242
mean_squared_error = 0.0003154396565919581
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Round 2
Best params: {'gamma': 0.001, 'learning_rate': 0.005, 'max_depth': 8, 'n_estimators': 400, 'random_state': 42}
Best validation score = 0.002939120274534002
Predicted cross-section IC: -0.0034
mean_squared_error = 0.0004685664087873538
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Round 3
Best params: {'gamma': 0.001, 'learning_rate': 0.005, 'max_depth': 10, 'n_estimators': 400, 'random_state': 42}
Best validation score = 0.0049158610237910235
Predicted cross-section IC: 0.0076
mean_squared_error = 0.00032395473866185027
Completed processing all groups.
