In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
import pickle

In [5]:
# Load the three inputs shared by the cells below.

# Main dataset: the cells below read its 'date', 'permno', 'target' and
# per-period feature columns. 'date' is parsed to datetimes so it can be
# compared against pandas Timestamps.
data = pd.read_csv('../../data/definitive_dataset.csv')
data = data.assign(date=pd.to_datetime(data['date']))

# Selected features: a pickled container that is iterated by period and indexed
# by the period's Timestamp further down.
# NOTE: pickle.load can execute arbitrary code -- only open trusted files.
with open('../../data/selected_features.pkl', 'rb') as features_file:
    selected_features = pickle.load(features_file)

# Price table: read below through its 'date', 'permno' and 'prc_adj' columns.
price_data = pd.read_csv('../../data/non_diff_adjusted_price_data.csv')
price_data = price_data.assign(date=pd.to_datetime(price_data['date']))

In [None]:
# NO NEED TO RUN THIS CELL
#
# Exploratory walkthrough: for every period in `selected_features`, load the
# matching pickled model, predict on the first trading day two years after the
# period start, and compare the predictions with the price change up to the
# last trading day within the following 90 calendar days.

separator = '-----------------------------------------------------------------------------------------------------------------'

# Trading calendar taken from the main dataset. It does not change between
# periods, so it is built once; a DatetimeIndex gives hashed membership tests.
trading_dates = pd.DatetimeIndex(data['date'].unique())
last_known_date = trading_dates.max()
one_day = pd.DateOffset(days=1)

for period in selected_features:
    start_date = pd.to_datetime(period)
    training_features = selected_features[start_date]

    # import the model with pickle
    # NOTE: pickle.load can execute arbitrary code -- only open trusted files.
    n_estimators = 500
    start_date_str = start_date.strftime('%Y-%m-%d')
    model_name = f'xg_boost_{start_date_str}_{n_estimators}_correct_dataset.pkl'
    try:
        with open(f'../../models/final_models/{model_name}', 'rb') as f:
            model = pickle.load(f)
    except FileNotFoundError:
        print(f'Model {model_name} not found')
        print('')
        print(separator)
        continue

    # Print basic information:
    print(f'Period: {period}\n')
    print(f'Training features:\n{training_features}')

    # Find the first trading day on or after (period start + 2 years).
    # The search is bounded by the last date in `data`; without the bound the
    # loop would never terminate for a period whose window ends past the data.
    window_end = start_date + pd.DateOffset(years=2)
    first_trading_date = window_end
    while first_trading_date <= last_known_date and first_trading_date not in trading_dates:
        first_trading_date += one_day
    if first_trading_date not in trading_dates:
        print(f'No trading date on or after {window_end} in data')
        print('')
        print(separator)
        continue

    print(f'First trading date: {first_trading_date}')

    prediction_rows = data[data['date'] == first_trading_date]
    X_prediction = prediction_rows[training_features].copy()

    assert len(X_prediction) > 0, 'No data to predict'
    permnos_first_trading_day = prediction_rows['permno']
    # Map the -1 label to 0 so it can be compared with `y_pred` below.
    target = prediction_rows['target'].replace(-1, 0)

    # Find the last trading day on or before (first trading day + 90 days).
    # Walking backward always terminates: `first_trading_date` is a trading day.
    return_date = first_trading_date + pd.DateOffset(days=90)
    while return_date not in trading_dates:
        return_date -= one_day

    print(f'Return date: {return_date}')

    # Prices on the entry day and on the return day, keyed by permno.
    trading_day_price = (
        price_data.loc[price_data['date'] == first_trading_date, ['permno', 'prc_adj']]
        .rename(columns={'prc_adj': 'trading_day_price'})
    )
    return_price_df = (
        price_data.loc[price_data['date'] == return_date, ['permno', 'prc_adj']]
        .rename(columns={'permno': 'permnos', 'prc_adj': 'price_return'})
    )

    # find the predictions
    y_pred_proba = model.predict_proba(X_prediction)
    y_pred = model.predict(X_prediction)

    # check if prediction is correct
    correct_prediction = y_pred == target

    # put the predictions in a dataframe
    predictions_df = pd.DataFrame({
        'permno': permnos_first_trading_day,
        '0_probability': y_pred_proba[:, 0],
        '1_probability': y_pred_proba[:, 1],
        'prediction': y_pred,
        'correct_prediction': correct_prediction
    })
    predictions_df = pd.merge(predictions_df, trading_day_price, on='permno', how='inner')

    # attach the return-day price of the same permno
    merged_df = pd.merge(predictions_df, return_price_df, left_on='permno', right_on='permnos')

    # calculate the return over the holding window, in percent
    merged_df['return(%)'] = (merged_df['price_return'] / merged_df['trading_day_price'] - 1)*100

    # Sanity check to see if everything lines up
    # NOTE(review): this reads 'target' from `price_data`; confirm that table
    # really carries a 'target' column (the labels used above come from `data`).
    merged_df_2 = pd.merge(merged_df, price_data[price_data['date'] == first_trading_date][['permno', 'target']], left_on='permno', right_on='permno')
    display(merged_df_2)

    n = 15
    # rows with the n largest / n smallest class-1 probabilities
    top_n = merged_df['1_probability'].nlargest(n)
    bottom_n = merged_df['1_probability'].nsmallest(n)

    print(f'Top {n} long predictions:')
    display(merged_df.loc[top_n.index])
    print('-'*58)
    print(f'Top {n} short predictions:')
    display(merged_df.loc[bottom_n.index])

    # gain is usually the most reliable metric to evaluate the importance of the features
    xgb.plot_importance(model, importance_type='gain')
    plt.show()

    print('')
    print(separator)

In [121]:
def _snap_to_trading_date(candidate, trading_dates, forward=True):
    """Step `candidate` one calendar day at a time until it is a trading date.

    Parameters
    ----------
    candidate : pd.Timestamp
        Date to start searching from (returned unchanged if already a trading date).
    trading_dates : pd.DatetimeIndex
        The known trading dates.
    forward : bool
        Search toward later dates when True, toward earlier dates when False.

    Returns
    -------
    pd.Timestamp or None
        The first trading date met in the chosen direction, or None when the
        search runs past the end of `trading_dates` (or the calendar is empty).
    """
    if len(trading_dates) == 0:
        return None
    one_day = pd.DateOffset(days=1)
    if forward:
        limit = trading_dates.max()
        while candidate <= limit:
            if candidate in trading_dates:
                return candidate
            candidate += one_day
    else:
        limit = trading_dates.min()
        while candidate >= limit:
            if candidate in trading_dates:
                return candidate
            candidate -= one_day
    return None


# define a function to predict the future stock direction, for each model it makes predictions every 3 months for the next 2 years.
# It should then return the top n stock it has the most confidence in both long and short direction for each period
def get_predictions(period, n_estimators=500, short=True, number_of_stocks=15,
                    n_periods=8, holding_days=90):
    """Collect the most confident long (and optionally short) picks of one model.

    The model pickled as ``xg_boost_{period:%Y-%m-%d}_{n_estimators}_correct_dataset.pkl``
    is loaded and applied to the rows of the module-level ``data`` frame on
    successive rebalance dates. The first rebalance date is the first trading
    date on or after ``period + 2 years``; each following one is the trading
    date nearest to ``holding_days`` calendar days later, searching backward
    after even-numbered steps and forward after odd-numbered ones.

    Parameters
    ----------
    period : str or datetime-like
        Start of the model's period; must be a key of ``selected_features``
        once converted with ``pd.to_datetime``.
    n_estimators : int
        Only used to build the model file name.
    short : bool
        When True, also return the lowest-probability picks.
    number_of_stocks : int
        Maximum number of picks per side and rebalance date.
    n_periods : int
        Number of rebalance dates to evaluate (8 by default).
    holding_days : int
        Calendar days between rebalance dates (90 by default).

    Returns
    -------
    dict, or (dict, dict) when ``short`` is True
        Keys are ``f'{start_date}_{i}'``; each value holds ``'permno'`` and
        ``'permno_probabilities'`` arrays. Long picks keep only class-1
        probabilities >= 0.5, short picks only those <= 0.5.
        ``None`` (or ``(None, None)``) is returned when the model file is
        missing or no trading date exists on/after the first rebalance date.
        If the trading calendar ends before ``n_periods`` dates are reached,
        the periods collected so far are returned.
    """
    start_date = pd.to_datetime(period)
    training_features = selected_features[start_date]

    # Value handed back when nothing can be predicted, shaped like the normal return.
    nothing = (None, None) if short else None

    # import the model with pickle
    # NOTE: pickle.load can execute arbitrary code -- only open trusted files.
    start_date_str = start_date.strftime('%Y-%m-%d')
    model_name = f'xg_boost_{start_date_str}_{n_estimators}_correct_dataset.pkl'
    try:
        with open(f'../../models/final_models/{model_name}', 'rb') as f:
            model = pickle.load(f)
    except FileNotFoundError:
        print(f'Model {model_name} not found')
        print('')
        return nothing

    # Trading calendar, built once; a DatetimeIndex gives hashed membership tests.
    trading_dates = pd.DatetimeIndex(data['date'].unique())

    # find the first trading day on or after the end of the two-year window
    window_end = start_date + pd.DateOffset(years=2)
    first_trading_date = _snap_to_trading_date(window_end, trading_dates, forward=True)
    if first_trading_date is None:
        print(f'No trading date on or after {window_end} in data')
        print('')
        return nothing
    print(first_trading_date)

    output_dict = {}
    output_dict_short = {}
    for i in range(n_periods):
        day_rows = data[data['date'] == first_trading_date]
        X_prediction = day_rows[training_features]
        assert len(X_prediction) > 0, 'No data to predict'

        # Class-1 probability per row, indexed like the rows it was predicted from.
        proba_up = pd.Series(model.predict_proba(X_prediction)[:, 1], index=day_rows.index)
        permnos = day_rows['permno']

        # most confident longs: highest probabilities, dropping anything below 0.5
        top_n = proba_up.nlargest(number_of_stocks)
        top_n = top_n[top_n >= 0.5]
        output_dict[f'{start_date}_{i}'] = {
            'permno': permnos.loc[top_n.index].values,
            'permno_probabilities': top_n.values
        }

        if short:
            # most confident shorts: lowest probabilities, dropping anything above 0.5
            bottom_n = proba_up.nsmallest(number_of_stocks)
            bottom_n = bottom_n[bottom_n <= 0.5]
            output_dict_short[f'{start_date}_{i}'] = {
                'permno': permnos.loc[bottom_n.index].values,
                'permno_probabilities': bottom_n.values
            }

        # The next rebalance date is only needed if another period follows.
        if i == n_periods - 1:
            break

        # move to next period: alternate the search direction (backward after
        # even steps, forward after odd steps), as the rebalance schedule did
        # before the search was bounded.
        next_date = _snap_to_trading_date(
            first_trading_date + pd.DateOffset(days=holding_days),
            trading_dates,
            forward=(i % 2 == 1),
        )
        if next_date is None or next_date <= first_trading_date:
            # Calendar exhausted: stop instead of looping forever or
            # re-predicting the same date.
            print(f'No trading date after {first_trading_date} in data; stopping after {i + 1} period(s)')
            break
        first_trading_date = next_date

    if short:
        return output_dict, output_dict_short
    return output_dict


In [123]:
# Run get_predictions once per period and pool the results into two dicts:
# one for the long picks, one for the short picks.

all_predictions, all_short_predictions = {}, {}
for period in selected_features:
    predictions, short_predictions = get_predictions(
        period,
        n_estimators=500,
        short=True,
        number_of_stocks=10,
    )
    # get_predictions hands back None when it has nothing for this period.
    if predictions is None:
        continue
    all_predictions.update(predictions)
    all_short_predictions.update(short_predictions)

2010-01-04 00:00:00
2012-01-03 00:00:00
2014-01-02 00:00:00
Model xg_boost_2014-01-01_500_correct_dataset.pkl not found

Model xg_boost_2016-01-01_500_correct_dataset.pkl not found

2020-01-02 00:00:00


In [113]:
# Show, per '<period>_<step>' key, the permnos picked on the long side.
for key, picks in all_predictions.items():
    print(key, picks['permno'], sep='\n')

2008-01-01 00:00:00_0
[89509]
2008-01-01 00:00:00_1
[78877]
2008-01-01 00:00:00_2
[90720]
2008-01-01 00:00:00_3
[66384]
2008-01-01 00:00:00_4
[91233]
2008-01-01 00:00:00_5
[87070]
2008-01-01 00:00:00_6
[87070]
2008-01-01 00:00:00_7
[87070]
2010-01-01 00:00:00_0
[82196]
2010-01-01 00:00:00_1
[91937]
2010-01-01 00:00:00_2
[57665]
2010-01-01 00:00:00_3
[80303]
2010-01-01 00:00:00_4
[89301]
2010-01-01 00:00:00_5
[91391]
2010-01-01 00:00:00_6
[90547]
2010-01-01 00:00:00_7
[90547]
2012-01-01 00:00:00_0
[76185]
2012-01-01 00:00:00_1
[86288]
2012-01-01 00:00:00_2
[66325]
2012-01-01 00:00:00_3
[90720]
2012-01-01 00:00:00_4
[27422]
2012-01-01 00:00:00_5
[90162]
2012-01-01 00:00:00_6
[92772]
2012-01-01 00:00:00_7
[11786]
2018-01-01 00:00:00_0
[16816]
2018-01-01 00:00:00_1
[36397]
2018-01-01 00:00:00_2
[36397]
2018-01-01 00:00:00_3
[36397]
2018-01-01 00:00:00_4
[49373]
2018-01-01 00:00:00_5
[90162]
2018-01-01 00:00:00_6
[90162]
2018-01-01 00:00:00_7
[90162]


In [120]:
# Show, per '<period>_<step>' key, the permnos picked on the short side
# together with the class-1 probabilities that selected them.
for key, picks in all_short_predictions.items():
    print(key, picks['permno'], picks['permno_probabilities'], sep='\n')

2008-01-01 00:00:00_0
[79588]
[0.00274739]
2008-01-01 00:00:00_1
[54181]
[0.00209514]
2008-01-01 00:00:00_2
[86964]
[0.01728031]
2008-01-01 00:00:00_3
[65875]
[0.00069632]
2008-01-01 00:00:00_4
[49656]
[0.00061968]
2008-01-01 00:00:00_5
[90199]
[0.00076272]
2008-01-01 00:00:00_6
[86783]
[0.00143962]
2008-01-01 00:00:00_7
[83111]
[2.3427709e-05]
2010-01-01 00:00:00_0
[77481]
[0.00236557]
2010-01-01 00:00:00_1
[90215]
[0.0014678]
2010-01-01 00:00:00_2
[90215]
[0.00036638]
2010-01-01 00:00:00_3
[76614]
[0.00055705]
2010-01-01 00:00:00_4
[90215]
[0.00135231]
2010-01-01 00:00:00_5
[18729]
[0.00070091]
2010-01-01 00:00:00_6
[58246]
[0.00113966]
2010-01-01 00:00:00_7
[79906]
[9.1062706e-05]
2012-01-01 00:00:00_0
[32707]
[0.00192199]
2012-01-01 00:00:00_1
[67467]
[0.00104772]
2012-01-01 00:00:00_2
[14295]
[0.00196575]
2012-01-01 00:00:00_3
[85035]
[0.00191758]
2012-01-01 00:00:00_4
[85035]
[0.00047514]
2012-01-01 00:00:00_5
[38156]
[0.00036869]
2012-01-01 00:00:00_6
[38156]
[0.00047293]
2012-0