# Empirical analysis of model predictions

### Load modules

In [1]:
# Imports
import pandas as pd
import numpy as np

### Define Functions

In [2]:
# Initialize dataframes used for aggregation and evaluation
def initialize_evaluation_dataframes():
    """
    Build the empty containers that collect the evaluation results of all models.

    :return: tuple of eight empty dataframes in the order r2_scores, equal_weight, value_weight, buy_and_hold,
             ew_winner_perform, vw_winner_perform, ew_wml_perform, vw_wml_perform
    """
    def _empty_indexed_by(index_name, value_columns=()):
        # Empty frame whose (empty) index carries the name `index_name`
        return pd.DataFrame(columns=[index_name, *value_columns]).set_index(index_name)

    # R2-scores (index: model name)
    r2_scores = _empty_indexed_by('model', ['r2_score'])

    # Portfolio metrics (completely empty, columns are added per model later on)
    equal_weight, value_weight, buy_and_hold = (pd.DataFrame() for _ in range(3))

    # Performance metrics (index: name of the performance metric)
    ew_winner_perform, vw_winner_perform, ew_wml_perform, vw_wml_perform = (
        _empty_indexed_by('performance_metrics') for _ in range(4)
    )

    return (
        r2_scores,
        equal_weight,
        value_weight,
        buy_and_hold,
        ew_winner_perform,
        vw_winner_perform,
        ew_wml_perform,
        vw_wml_perform,
    )


def initialize_plotting_dataframes():
    """
    Build the empty dataframes that collect the return series used for plotting the results.

    :return: tuple of four empty dataframes indexed by 'YM' in the order
             ew_plot_long, vw_plot_long, ew_plot_long_short, vw_plot_long_short
    """
    # Every plotting frame starts out identical: no columns, empty index named 'YM'
    ew_plot_long, vw_plot_long, ew_plot_long_short, vw_plot_long_short = (
        pd.DataFrame(columns=['YM']).set_index('YM') for _ in range(4)
    )

    return ew_plot_long, vw_plot_long, ew_plot_long_short, vw_plot_long_short


# General analysis
def calculate_r2_score(y_true, y_pred):
    """
    Compute the custom r2-score, whose denominator is the raw (not demeaned) sum of squared true returns.

    :param y_true: array of true returns
    :param y_pred: array of predicted returns
    :return: custom r2 score, i.e. 1 - sum((y_true - y_pred)^2) / sum(y_true^2)
    """
    squared_errors = np.power(y_true - y_pred, 2)
    squared_returns = np.power(y_true, 2)
    return 1 - squared_errors.sum() / squared_returns.sum()


# Machine learning portfolios
# Default portfolio labels (quintiles)
label = [1, 2, 3, 4, 5]


def create_empty_dataframes(label):
    """
    Build the empty dataframes that collect the monthly returns of the machine learning portfolios.

    :param label: list with the labels of the resulting portfolios (quintiles); one column is created per label
    :return: tuple (ew_pf_ret_true, ew_pf_ret_pred, vw_pf_ret_true, vw_pf_ret_pred, bh_pf_ret); every frame is
             empty and indexed by 'YM', the buy and hold frame has the columns 'bh_ew' and 'bh_vw'
    """
    def _indexed_by_month(columns):
        # 'YM' is listed as a column first and then moved into the index
        return pd.DataFrame(columns=columns).set_index('YM')

    portfolio_columns = label + ['YM']

    # One frame per combination of weighting scheme (ew/vw) and return type (true/pred)
    ew_pf_ret_true, ew_pf_ret_pred, vw_pf_ret_true, vw_pf_ret_pred = (
        _indexed_by_month(portfolio_columns) for _ in range(4)
    )

    # Buy and hold benchmark (equal and value weighted)
    bh_pf_ret = _indexed_by_month(['bh_ew', 'bh_vw', 'YM'])

    return ew_pf_ret_true, ew_pf_ret_pred, vw_pf_ret_true, vw_pf_ret_pred, bh_pf_ret


def create_monthly_returns_and_weights(returns, month, ew_pf_ret_true, ew_pf_ret_pred, vw_pf_ret_true, vw_pf_ret_pred, bh_pf_ret):
    """
    This function creates the portfolio returns of one month and appends them (in place) to the respective
    aggregated dataframes. Furthermore, it returns the weights used for the construction of the machine
    learning portfolios.

    The monthly cross-section is copied before any helper column is added, so the caller's `returns` frame is
    never modified. All group-bys keep empty label categories (observed=False), so every aggregated dataframe
    receives exactly one value per label, independent of the default of the installed pandas version.

    :param returns: dataframe with predicted and true returns; `returns.loc[month]` must yield the stocks of that
                    month with a 'permno' index level and the columns 'y_pred', 'y_true' and 'mve_m'
    :param month: value of the current month
    :param ew_pf_ret_true: aggregated df with true equal weighted portfolio returns for each label
    :param ew_pf_ret_pred: aggregated df with predicted equal weighted portfolio returns for each label
    :param vw_pf_ret_true: aggregated df with true value weighted portfolio returns for each label
    :param vw_pf_ret_pred: aggregated df with predicted value weighted portfolio returns for each label
    :param bh_pf_ret: aggregated df with portfolio returns for buy and hold (columns 'bh_ew' and 'bh_vw')
    :return: two dataframes indexed by ('YM', 'permno') with the columns ['label_ew', 'ew_weight'] and
             ['label_vw', 'vw_weight'] respectively
    """
    # Initialize (explicit copy: the helper columns below must not touch the caller's frame)
    df = returns.loc[month].copy()
    label = list(ew_pf_ret_true.columns)

    # Create & assign labels
    # Equal weight: equally populated buckets of the prediction rank
    df['label_ew'] = pd.qcut(df['y_pred'].rank(method='first'), q=len(label), labels=label)
    # Value weight: equal-width buckets of the cumulative market cap of the stocks sorted by prediction
    df = df.sort_values(by=['y_pred'])
    df['mve_m_cumsum'] = df.mve_m.cumsum()  # Helping column for labeling value weight
    df['label_vw'] = pd.cut(df['mve_m_cumsum'], bins=len(label), labels=label)
    df = df.reset_index()
    df = df.set_index(['permno', 'label_ew', 'label_vw'])

    # Create supporting dataframes
    df_ew = pd.DataFrame()
    df_vw = pd.DataFrame()

    # Create equal and value weights (stock count / market cap of the own portfolio as denominator)
    df['const'] = 1
    df_ew['ew_sum'] = df.groupby('label_ew', observed=False)['const'].sum()
    df = df.join(df_ew, on=['label_ew'])
    df_vw['vw_sum'] = df.groupby('label_vw', observed=False)['mve_m'].sum()
    df = df.join(df_vw, on=['label_vw'])
    df['ew_weight'] = df['const'] / df['ew_sum']
    df['vw_weight'] = df['mve_m'] / df['vw_sum']

    # Create weights for buy-and-hold (all stocks of the month form one portfolio)
    n_stocks = df.shape[0]
    mve_m_sum = df['mve_m'].sum()
    df['bh_ew_weight'] = df['const'] / n_stocks
    df['bh_vw_weight'] = df['mve_m'] / mve_m_sum

    # Create dataframe with equal weights
    df_ew_weights = df.reset_index()
    df_ew_weights = df_ew_weights.loc[:, ['permno', 'label_ew', 'ew_weight']]
    df_ew_weights['YM'] = month
    df_ew_weights = df_ew_weights.set_index(['YM', 'permno'])

    # Create dataframe with value weights
    df_vw_weights = df.reset_index()
    df_vw_weights = df_vw_weights.loc[:, ['permno', 'label_vw', 'vw_weight']]
    df_vw_weights['YM'] = month
    df_vw_weights = df_vw_weights.set_index(['YM', 'permno'])

    # Create weighted returns
    df['ew_ret_true'] = df['ew_weight'] * df['y_true']
    df['ew_ret_pred'] = df['ew_weight'] * df['y_pred']
    df['vw_ret_true'] = df['vw_weight'] * df['y_true']
    df['vw_ret_pred'] = df['vw_weight'] * df['y_pred']

    # Create portfolio returns (log of 1 + weighted return sum; groups come back in label order)
    ew_pf_ret_true.loc[month] = (np.log(1+df.groupby('label_ew', observed=False)['ew_ret_true'].sum())).values
    ew_pf_ret_pred.loc[month] = (np.log(1+df.groupby('label_ew', observed=False)['ew_ret_pred'].sum())).values
    vw_pf_ret_true.loc[month] = (np.log(1+df.groupby('label_vw', observed=False)['vw_ret_true'].sum())).values
    vw_pf_ret_pred.loc[month] = (np.log(1+df.groupby('label_vw', observed=False)['vw_ret_pred'].sum())).values

    # Create bh portfolio returns
    bh_pf_ret.loc[month, 'bh_ew'] = np.log(1+np.sum(df['bh_ew_weight'] * df['y_true']))
    bh_pf_ret.loc[month, 'bh_vw'] = np.log(1+np.sum(df['bh_vw_weight'] * df['y_true']))

    return df_ew_weights, df_vw_weights


def create_winner_minus_loser(ew_pf_ret_true, ew_pf_ret_pred, vw_pf_ret_true, vw_pf_ret_pred):
    """
    Add a winner minus loser column ('wml') to each input dataframe in place.

    The 'wml' column is the last column minus the first column of the respective dataframe at call time.

    :param ew_pf_ret_true: aggregated df with true equal weighted portfolio returns for each label
    :param ew_pf_ret_pred: aggregated df with predicted equal weighted portfolio returns for each label
    :param vw_pf_ret_true: aggregated df with true value weighted portfolio returns for each label
    :param vw_pf_ret_pred: aggregated df with predicted value weighted portfolio returns for each label
    """
    for pf_returns in (ew_pf_ret_true, ew_pf_ret_pred, vw_pf_ret_true, vw_pf_ret_pred):
        winner = pf_returns.iloc[:, -1]
        loser = pf_returns.iloc[:, 0]
        pf_returns['wml'] = winner - loser


def create_aggregated_portfolio_metrics(model, equal_weight, value_weight, ew_pf_ret_true, ew_pf_ret_pred, vw_pf_ret_true, vw_pf_ret_pred):
    """
    Add the aggregated portfolio metrics of one model (in place) to the equal and value weighted evaluation
    dataframes: average monthly predicted return, average monthly realized return, standard deviation of the
    realized returns (all in percent) and the annualized sharpe ratio.

    :param model: name of the model (used as prefix of the created columns)
    :param equal_weight: evaluation dataframe for equal weights
    :param value_weight: evaluation dataframe for value weights
    :param ew_pf_ret_true: aggregated df with true equal weighted portfolio returns for each label
    :param ew_pf_ret_pred: aggregated df with predicted equal weighted portfolio returns for each label
    :param vw_pf_ret_true: aggregated df with true value weighted portfolio returns for each label
    :param vw_pf_ret_pred: aggregated df with predicted value weighted portfolio returns for each label
    """
    # Number of months, taken from the equal weighted frame and used for both weighting schemes
    num_month = ew_pf_ret_true.shape[0]

    # Names of the columns created for this model
    pred_col = '{}_pred'.format(model)
    true_col = '{}_true'.format(model)
    std_col = '{}_std'.format(model)
    sr_col = '{}_SR'.format(model)

    weighting_schemes = (
        (equal_weight, ew_pf_ret_pred, ew_pf_ret_true),
        (value_weight, vw_pf_ret_pred, vw_pf_ret_true),
    )
    for metrics, pf_ret_pred, pf_ret_true in weighting_schemes:
        metrics[pred_col] = pf_ret_pred.sum() / num_month * 100  # avg monthly predicted returns
        metrics[true_col] = pf_ret_true.sum() / num_month * 100  # avg monthly realized returns
        metrics[std_col] = pf_ret_true.std() * 100  # std of monthly realized returns
        metrics[sr_col] = (metrics[true_col] * 12) / (metrics[std_col] * np.sqrt(12))  # annualized sharpe ratio


def calculate_turnover(df_weights, predictions, quintile='High'):
    """
    Compute the average monthly turnover (in percent) of the High or Low quintile portfolio. The weighting scheme
    is detected from the weight column ('ew_weight' or 'vw_weight') found in `df_weights`.

    :param df_weights: dataframe with weights and respective label of stocks for every month
    :param predictions: dataframe with true monthly returns ('y_true') for every stock
    :param quintile: ['High', 'Low'] string to specify whether to select High or Low quintile
    :return: average monthly turnover in percent; None (after printing a message) if no weight column is found
             or the quintile is neither 'High' nor 'Low'
    """
    # Combine portfolio weights and true monthly returns
    merged = df_weights.join(predictions)

    # Map the scheme specific column names to the generic names 'weight' and 'label'
    if 'ew_weight' in merged.columns:
        generic_names = {'ew_weight': 'weight', 'label_ew': 'label'}
    elif 'vw_weight' in merged.columns:
        generic_names = {'vw_weight': 'weight', 'label_vw': 'label'}
    else:
        print('no appropriate weight column found')
        return
    merged = merged.rename(columns=generic_names)

    # Determine the label of the requested quintile
    if quintile == 'High':
        wanted_label = merged.label.max()
    elif quintile == 'Low':
        wanted_label = merged.label.min()
    else:
        print('no appropriate quintile entered')
        return
    members = merged[merged.label == wanted_label].reset_index()

    # Weight x gross return of every portfolio member
    members['w_ret'] = members.weight * (1 + members.y_true)

    # Weights in month t (stocks outside the portfolio get weight 0)
    weights_t = members.pivot(index='YM', columns='permno', values='weight').fillna(0)

    # Weights in month t+1 (the last month has no successor and becomes NaN)
    weights_next = weights_t.shift(-1)

    # w_i,t * (1+r_i,t+1), rescaled so that every month sums to one
    grown = members.pivot(index='YM', columns='permno', values='w_ret').fillna(0)
    drifted = grown.div(grown.sum(axis=1), axis=0)

    # Turnover formula: sum(sum(abs( w_i,t+1 - (w_i,t * (1+r_i,t+1))/sum(w_i,t * (1+r_i,t+1)) ))) / T
    n_transitions = weights_next.shape[0] - 1
    turnover = weights_next.sub(drifted).abs().sum().sum() / n_transitions

    # Monthly turnover in percent
    return turnover * 100


def calculate_max_dd(df_true_pf_returns, portfolio_type):
    """
    Compute the maximum drawdown (in percent) of a portfolio from its monthly log returns.

    :param df_true_pf_returns: dataframe with true monthly portfolio returns
    :param portfolio_type: string to specify if it's a 'winner' (long only) or 'wml' portfolio (long-short);
                           'winner' reads the second to last column, 'wml' the column named 'wml'
    :return: maximum drawdown in percent; None (after printing a message) for any other portfolio type
    """
    # Pick the monthly return series of the requested portfolio
    if portfolio_type == 'winner':
        monthly_returns = df_true_pf_returns.iloc[:, -2]
    elif portfolio_type == 'wml':
        monthly_returns = df_true_pf_returns.loc[:, 'wml']
    else:
        print('wrong input')
        return

    # Cumulative log returns and their running peak
    cum_log_returns = monthly_returns.cumsum()
    running_peak = cum_log_returns.cummax()

    # Largest gap between the running peak and the cumulative return
    drawdowns = running_peak - cum_log_returns
    return drawdowns.max() * 100


def calculate_alpha(true_pf_returns, df_bm_returns):
    """
    Compute the monthly jensen's alpha (in percent) of a portfolio relative to a benchmark and its t-statistic.

    :param true_pf_returns: monthly true returns of a machine learning portfolio
    :param df_bm_returns: monthly returns of the benchmark
    :return: alpha, t-statistic(alpha)
    """
    # Arrange both return series as rows of one float array (row 0: portfolio, row 1: benchmark)
    paired = np.stack((true_pf_returns, df_bm_returns), axis=1)
    observations = paired.T.astype('float64')

    # Beta = cov(portfolio, benchmark) / var(benchmark), both taken from the biased covariance matrix
    cov_matrix = np.cov(observations, bias=True)
    beta = cov_matrix[0, 1] / cov_matrix[1, 1]

    # Time-series of portfolio returns net of the beta scaled benchmark returns
    df_alpha = true_pf_returns - beta * df_bm_returns
    mean_alpha = df_alpha.mean()
    n_obs = df_alpha.shape[0]

    # Monthly jensen's alpha in percent
    alpha = mean_alpha * 100

    # t = (mean * sqrt(n)) / std of the time-series
    t_test = mean_alpha * np.sqrt(n_obs) / df_alpha.std()

    return alpha, t_test

### R2-score, portfolio metrics, and performance metrics for predictions of each machine learning method

In [3]:
# Define models
#models = ['OLS-3', 'RF_trad', 'NN1_trad', 'NN2_trad', 'NN3_trad', 'RF_esg', 'NN1_esg', 'NN2_esg', 'NN3_esg']
models = [
    'OLS-3',
    'RF_trad', 'RF_esg',
    'NN1_trad', 'NN1_esg',
    'NN2_trad', 'NN2_esg',
    'NN3_trad', 'NN3_esg',
]

In [4]:
# Initialize dataframes
# Aggregation / evaluation containers (filled model by model)
(
    r2_scores,
    equal_weight,
    value_weight,
    buy_and_hold,
    ew_winner_perform,
    vw_winner_perform,
    ew_wml_perform,
    vw_wml_perform,
) = initialize_evaluation_dataframes()

# Return series used for plotting
(
    ew_plot_long,
    vw_plot_long,
    ew_plot_long_short,
    vw_plot_long_short,
) = initialize_plotting_dataframes()

In [5]:
# Get relevant monthly firm characteristics raw data
chars = (
    pd.read_parquet('C:/Users/rafae/Documents/HSG/Master Thesis/Data/Final/data07_model_input.parquet')
    .sort_values(by=['YM', 'permno'])
    # Filter for testing years
    .set_index('year')
    .loc[['2019', '2020', '2021']]
    # Represent the month as 'YYYY-MM' string and index by month and stock
    .assign(YM=lambda frame: frame['YM'].dt.strftime('%Y-%m'))
    .reset_index()
    .set_index(['YM', 'permno'])
    # Select relevant columns (market cap and the return used as y_true)
    .loc[:, ['mve_m', 'ret_ex']]
    .rename(columns={'ret_ex': 'y_true'})
)
chars

Unnamed: 0_level_0,Unnamed: 1_level_0,mve_m,y_true
YM,permno,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-01,10026,2.899156e+06,0.004225
2019-01,10104,1.802714e+08,0.036026
2019-01,10107,8.012092e+08,0.075381
2019-01,10138,2.224993e+07,0.072777
2019-01,10145,1.063276e+08,0.076596
...,...,...,...
2021-12,93304,1.695898e+06,-0.096386
2021-12,93373,2.065325e+05,-0.019481
2021-12,93374,8.587073e+06,-0.047552
2021-12,93423,3.661156e+06,-0.072569


In [6]:
# Iteration over models
# NOTE: 'OLS-3' has to be the first entry of `models`, because the alpha of every other model is calculated
# against the OLS-3 return series that are stored in the plotting dataframes at the end of its iteration.
for model in models:
    # Get monthly predictions
    pred = pd.read_csv(r'results/{}_predictions.csv'.format(model), index_col=['YM', 'permno'])

    # Append predictions to true returns and market cap
    pred = pd.concat([chars, pred], axis=1)

    # R2-score
    # Calculate r2 score
    r2 = calculate_r2_score(pred.y_true, pred.y_pred)
    r2_scores.loc[model] = r2

    # Portfolio metrics
    # Create empty dataframes for machine learning portfolios
    ew_pf_ret_true, ew_pf_ret_pred, vw_pf_ret_true, vw_pf_ret_pred, bh_pf_ret = create_empty_dataframes(label)

    # Iterate over months (machine learning portfolios)
    ew_weights = []
    vw_weights = []
    months = list(pred.index.get_level_values('YM').unique().astype(str))
    for month in months:
        # Create monthly machine learning portfolio returns and weights
        df_ew_weights, df_vw_weights = create_monthly_returns_and_weights(pred, month, ew_pf_ret_true, ew_pf_ret_pred, vw_pf_ret_true, vw_pf_ret_pred, bh_pf_ret)
        ew_weights.append(df_ew_weights)
        vw_weights.append(df_vw_weights)
    ew_weights = pd.concat(ew_weights)
    vw_weights = pd.concat(vw_weights)

    # Create winner minus loser portfolios
    create_winner_minus_loser(ew_pf_ret_true, ew_pf_ret_pred, vw_pf_ret_true, vw_pf_ret_pred)

    # Create aggregated portfolio metrics
    create_aggregated_portfolio_metrics(model, equal_weight, value_weight, ew_pf_ret_true, ew_pf_ret_pred, vw_pf_ret_true, vw_pf_ret_pred)

    # Performance metrics
    # Calculate maximum monthly drawdowns
    ew_winner_perform.loc['max_dd', model] = calculate_max_dd(ew_pf_ret_true, 'winner')
    vw_winner_perform.loc['max_dd', model] = calculate_max_dd(vw_pf_ret_true, 'winner')
    ew_wml_perform.loc['max_dd', model] = calculate_max_dd(ew_pf_ret_true, 'wml')
    vw_wml_perform.loc['max_dd', model] = calculate_max_dd(vw_pf_ret_true, 'wml')

    # Calculate maximum 1 month loss
    # (the value weighted winner has to be read from the value weighted returns, not the equal weighted ones)
    ew_winner_perform.loc['max_1m_loss', model] = ew_pf_ret_true.loc[:, max(label)].min() * (-100)
    vw_winner_perform.loc['max_1m_loss', model] = vw_pf_ret_true.loc[:, max(label)].min() * (-100)
    ew_wml_perform.loc['max_1m_loss', model] = ew_pf_ret_true.loc[:, 'wml'].min() * (-100)
    vw_wml_perform.loc['max_1m_loss', model] = vw_pf_ret_true.loc[:, 'wml'].min() * (-100)

    # Calculate avg monthly turnover (winner and wml portfolio)
    # The High quintile turnover is needed for both the winner and the wml portfolio, so it is calculated once
    ew_turnover_high = calculate_turnover(ew_weights, pred, quintile='High')
    vw_turnover_high = calculate_turnover(vw_weights, pred, quintile='High')
    ew_winner_perform.loc['turnover', model] = ew_turnover_high
    vw_winner_perform.loc['turnover', model] = vw_turnover_high
    ew_wml_perform.loc['turnover', model] = ew_turnover_high + calculate_turnover(ew_weights, pred, quintile='Low')
    vw_wml_perform.loc['turnover', model] = vw_turnover_high + calculate_turnover(vw_weights, pred, quintile='Low')

    # Assign monthly mean return and annualized SR to the performance metrics
    ew_winner_perform.loc['SR_(p.a.)', model], ew_winner_perform.loc['mean_return', model] = equal_weight.loc[max(label), '{}_SR'.format(model)], equal_weight.loc[max(label), '{}_true'.format(model)]
    vw_winner_perform.loc['SR_(p.a.)', model], vw_winner_perform.loc['mean_return', model] = value_weight.loc[max(label), '{}_SR'.format(model)], value_weight.loc[max(label), '{}_true'.format(model)]
    ew_wml_perform.loc['SR_(p.a.)', model], ew_wml_perform.loc['mean_return', model] = equal_weight.loc['wml', '{}_SR'.format(model)], equal_weight.loc['wml', '{}_true'.format(model)]
    vw_wml_perform.loc['SR_(p.a.)', model], vw_wml_perform.loc['mean_return', model] = value_weight.loc['wml', '{}_SR'.format(model)], value_weight.loc['wml', '{}_true'.format(model)]

    # Calculate Performance Metrics (relative to OLS-3 benchmark)
    if (model != 'OLS-3'):
        # Calculate alpha and t-stats
        ew_winner_perform.loc['alpha', model], ew_winner_perform.loc['t-stat(a)', model] = calculate_alpha(ew_pf_ret_true.loc[:, max(label)], ew_plot_long['OLS-3_long'])
        vw_winner_perform.loc['alpha', model], vw_winner_perform.loc['t-stat(a)', model] = calculate_alpha(vw_pf_ret_true.loc[:, max(label)], vw_plot_long['OLS-3_long'])
        ew_wml_perform.loc['alpha', model], ew_wml_perform.loc['t-stat(a)', model] = calculate_alpha(ew_pf_ret_true.loc[:, 'wml'], ew_plot_long_short['OLS-3'])
        vw_wml_perform.loc['alpha', model], vw_wml_perform.loc['t-stat(a)', model] = calculate_alpha(vw_pf_ret_true.loc[:, 'wml'], vw_plot_long_short['OLS-3'])

    # Generate Dataframes for plotting
    ew_plot_long['{}_long'.format(model)], ew_plot_long['{}_short'.format(model)] = ew_pf_ret_true.loc[:, max(label)], ew_pf_ret_true.loc[:, min(label)]
    vw_plot_long['{}_long'.format(model)], vw_plot_long['{}_short'.format(model)] = vw_pf_ret_true.loc[:, max(label)], vw_pf_ret_true.loc[:, min(label)]
    ew_plot_long_short['{}'.format(model)] = ew_pf_ret_true.loc[:, 'wml']
    vw_plot_long_short['{}'.format(model)] = vw_pf_ret_true.loc[:, 'wml']
    
# Create buy and hold portfolio metrics
# Buy and hold: aggregated metric plus the monthly series for the plots
# (bh_pf_ret is the frame left over from the last model iteration)
buy_and_hold['buy_and_hold'] = bh_pf_ret.sum()
ew_plot_long['buy_and_hold'] = bh_pf_ret.bh_ew
vw_plot_long['buy_and_hold'] = bh_pf_ret.bh_vw

# Save results (one excel file per dataframe, written in the listed order)
_exports = [
    (r2_scores, r'empirical_analysis/r2_scores.xlsx'),
    (equal_weight, r'empirical_analysis/ew_portfolio_metrics.xlsx'),
    (value_weight, r'empirical_analysis/vw_portfolio_metrics.xlsx'),
    (buy_and_hold, r'empirical_analysis/bh_portfolio_metrics.xlsx'),
    (bh_pf_ret, r'empirical_analysis/bh_pf_returns.xlsx'),
    (ew_winner_perform, r'empirical_analysis/ew_performance_winner.xlsx'),
    (vw_winner_perform, r'empirical_analysis/vw_performance_winner.xlsx'),
    (ew_wml_perform, r'empirical_analysis/ew_performance_wml.xlsx'),
    (vw_wml_perform, r'empirical_analysis/vw_performance_wml.xlsx'),
    (ew_plot_long, r'empirical_analysis/ew_plot_long.xlsx'),
    (vw_plot_long, r'empirical_analysis/vw_plot_long.xlsx'),
    (ew_plot_long_short, r'empirical_analysis/ew_plot_long_short.xlsx'),
    (vw_plot_long_short, r'empirical_analysis/vw_plot_long_short.xlsx'),
]
for _frame, _target in _exports:
    _frame.to_excel(_target)