# Demo


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
from datetime import datetime

from ast import literal_eval

from scipy.cluster import hierarchy
from scipy.spatial import distance

from sklearn.base import clone
from sklearn.preprocessing import StandardScaler,Normalizer
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import r2_score,mean_absolute_error

import glob, os 

import warnings
warnings.simplefilter(action='ignore')

def make_walkforward_model(features,outcome,algo=LinearRegression()):
    recalc_dates = features.resample('Y').mean().index.values[:-1]
    
    ## Train models
    models = pd.Series(index=recalc_dates)
    for date in recalc_dates:    
        X_train = features.loc[slice(None,date)]
#         X_train = features.loc[slice(date-pd.Timedelta('365 days'),date)]
        
        y_train = outcome.loc[slice(None,date)]
#         y_train = outcome.loc[slice(date-pd.Timedelta('365 days'),date)]
#         print(f'Train with data prior to: {date} ({y_train.count()} obs)')
        
        model = clone(algo)
        model.fit(X_train,y_train)
        models.loc[date] = model

    begin_dates = models.index
    end_dates = models.index[1:].append(pd.to_datetime(['2099-12-31']))

    ## Generate OUT OF SAMPLE walk-forward predictions
    predictions = pd.Series(index=features.index)
    for i,model in enumerate(models): #loop thru each models object in collection
#         print(f'Using model trained on {begin_dates[i]}, Predict from: {begin_dates[i]} to: {end_dates[i]}')
        X = features.loc[slice(begin_dates[i],end_dates[i])]
        p = pd.Series(model.predict(X),index=X.index)
        predictions.loc[X.index] = p
    
    return models,predictions

def calc_scorecard(y_pred,y_true):
    
    def make_df(y_pred,y_true):
        y_pred.name = 'y_pred'
        y_true.name = 'y_true'

        df = pd.concat([y_pred,y_true],axis=1).dropna()

        df['sign_pred'] = df.y_pred.apply(np.sign)
        df['sign_true'] = df.y_true.apply(np.sign)
        df['is_correct'] = 0
        df.loc[df.sign_pred * df.sign_true > 0 ,'is_correct'] = 1 # only registers 1 when prediction was made AND it was correct
        df['is_incorrect'] = 0
        df.loc[df.sign_pred * df.sign_true < 0,'is_incorrect'] = 1 # only registers 1 when prediction was made AND it was wrong
        df['is_predicted'] = df.is_correct + df.is_incorrect
        df['result'] = df.sign_pred * df.y_true 
        return df
    
    df = make_df(y_pred,y_true)
    
    scorecard = pd.Series()
    
    # building block metrics
    scorecard.loc['RSQ'] = r2_score(df.y_true,df.y_pred)
    scorecard.loc['MAE'] = mean_absolute_error(df.y_true,df.y_pred)
    scorecard.loc['directional_accuracy'] = df.is_correct.sum()*1. / (df.is_predicted.sum()*1.)*100
    scorecard.loc['edge'] = df.result.mean()
    scorecard.loc['noise'] = df.y_pred.diff().abs().mean()
    # derived metrics
    
    scorecard.loc['edge_long'] = df[df.sign_pred == 1].result.mean()  - df.y_true.mean()
    scorecard.loc['edge_short'] = df[df.sign_pred == -1].result.mean()  - df.y_true.mean()

    scorecard.loc['edge_win'] = df[df.is_correct == 1].result.mean()  - df.y_true.mean()
    scorecard.loc['edge_lose'] = df[df.is_incorrect == 1].result.mean()  - df.y_true.mean()

    
    scorecard.loc['edge_to_noise'] = scorecard.loc['edge'] / scorecard.loc['noise']
    scorecard.loc['edge_to_mae'] = scorecard.loc['edge'] / scorecard.loc['MAE']

    
    return scorecard    


def prepare_Xy(X_raw,y_raw):
    ''' Utility function to drop any samples without both valid X and y values'''
    Xy = X_raw.join(y_raw).replace({np.inf:None,-np.inf:None}).dropna()
    X = Xy.iloc[:,:-1]
    y = Xy.iloc[:,-1]
    return X,y



In [2]:
def get_features(stockCode, lag = 2, horizon = 20, corrThres = 0.1):
    
    address = "stock/" + stockCode + ".csv"
    col = ['trade_date','close','vol']
    stock1 = pd.read_csv(address)[col]
    stock1['trade_date'] = stock1['trade_date'].astype('str').apply(lambda x: datetime.strptime(x, "%Y%m%d"))
    stock1 = stock1.set_index('trade_date').sort_index().fillna(method='ffill')

    ### Price Feature Matrix

    # pct change
    stock1['past_ret_1'] = stock1.close.pct_change(1) # past day's returns
    stock1['past_ret_5'] = stock1.close.pct_change(5) # past week's returns
    stock1['past_ret_10'] = stock1.close.pct_change(10) # past two weeks' returns
    stock1['past_ret_20'] = stock1.close.pct_change(20) # past month's (approx) returns
    stock1['logVol'] = stock1.vol.apply(np.log)
    stock1['past_logVol_1'] = stock1.logVol.pct_change(1) # past day's change of log volume
    stock1['past_logVol_5'] = stock1.logVol.pct_change(5) # past week's change of log volume
    stock1['past_logVol_10'] = stock1.logVol.pct_change(10) # past two weeks' change of log volume
    stock1['past_logVol_20'] = stock1.logVol.pct_change(20) # past month's change of log volume

    # rolling mean or std
    stock1['roll_Close_Mean_200'] = stock1['close'].rolling(window=200, min_periods=20).mean().shift(1)
    stock1['roll_Close_Std_200'] = stock1['close'].rolling(window=200, min_periods=20).std().shift(1)
    stock1['roll_LogVol_Mean_200'] = stock1['logVol'].rolling(window=200, min_periods=20).mean().shift(1)
    stock1['roll_LogVol_Std_200'] = stock1['logVol'].rolling(window=200, min_periods=20).std().shift(1)

    # z-score
    stock1['zscore_price'] = (stock1['close'] - stock1['roll_Close_Mean_200'] ) / stock1['roll_Close_Std_200'] 
    stock1['zscore_logVol'] = (stock1['logVol'] - stock1['roll_LogVol_Mean_200'] ) / stock1['roll_LogVol_Std_200'] 

    # change sign
    stock1['sign_LogVol_1'] = stock1['past_logVol_1'].apply(np.sign)
    stock1['sign_Ret_1'] = stock1['past_ret_1'].apply(np.sign)

    ### Outcome Feature

    out_stock1 = pd.DataFrame(index = stock1.index)
    
    ### Assume T+lag enter the market
    out_stock1['outcome'] = stock1.close.pct_change(horizon).shift(-(lag+horizon)) # returns

    #   dropna
    out_stock1 = out_stock1.dropna()

    ### News Feature Matrix

    news = pd.read_csv('dummy_matrix.csv').drop('Unnamed: 0', axis =1)
    news['date'] =  pd.to_datetime(news['date'], format='%Y-%m-%d')

    # The keywords of news
    news_keywords = pd.read_csv('news_sent_kw_tag.csv')[['date', 'keywords']]

    # The keywords of the companys
    stock_keywords = pd.read_csv('stock_keywords.csv')
    stock_keywords['keywords'] = stock_keywords['keywords'].apply(literal_eval)
    keyword1 = stock_keywords[stock_keywords['code'] == stockCode ]['keywords'].iloc[0]

    # Find those relevant news for this compnay

    news_keywords['relevant'] = news_keywords['keywords'].apply(lambda x: any([word in x for word in keyword1]))

    ### Relevant News
    news = news.iloc[news_keywords[news_keywords['relevant']].index]
    news['date'] = pd.to_datetime(news['date'], format='%Y-%m-%d')

    ### Summarise the News on that date
    groupbyNews = news.groupby('date').agg({'lv1_tag_国际': 'sum', 'lv1_tag_时事': 'sum',
                                            'lv1_tag_社会': 'sum', 'lv1_tag_财经': 'sum',
                                            'sentiment_pos': 'mean',
                                            'dayofweek_1': 'mean', 'dayofweek_2': 'mean', 'dayofweek_3': 'mean',
                                            'dayofweek_4': 'mean', 'dayofweek_5': 'mean', 'dayofweek_6': 'mean',
                                            'quarter_2': 'mean', 'quarter_3': 'mean', 'quarter_4': 'mean',
                                            'month_2': 'mean', 'month_3': 'mean', 'month_4': 'mean', 'month_5': 'mean',
                                            'month_6': 'mean', 'month_7': 'mean', 'month_8': 'mean', 'month_9': 'mean',
                                            'month_10': 'mean', 'month_11': 'mean','month_12': 'mean',})

    # The preditive feature matrix
        # if the news is not released in a trading day, then ignore

    feature1 = groupbyNews.join(stock1, how = 'inner')

    ### Standardize predictive features
    # maybe problematic here, consider other schemes
#     std_scaler = StandardScaler()
#     features_scaled = std_scaler.fit_transform(feature1.dropna()) 
    features_scaled = feature1.dropna()

    df = pd.DataFrame(features_scaled,index=feature1.dropna().index)
    df.columns = feature1.dropna().columns

    ### Find the indices that both feature matrix and the outcome matrix share
    index = df.index.intersection(out_stock1['outcome'].index)
    out = out_stock1['outcome'].loc[index]
    df = df.loc[index]
    
    ### Feature Selection
    
    corr = df.corrwith(out)
    selected_features = corr[abs(corr)>corrThres].sort_values(ascending=False).index.tolist()
    
    return selected_features


def get_prediction(stockCode, lag = 2, horizon = 20, model = 'linear'):
    
    address = "stock/" + stockCode + ".csv"
    col = ['trade_date','close','vol']
    stock1 = pd.read_csv(address)[col]
    stock1['trade_date'] = stock1['trade_date'].astype('str').apply(lambda x: datetime.strptime(x, "%Y%m%d"))
    stock1 = stock1.set_index('trade_date').sort_index().fillna(method='ffill')

    ### Price Feature Matrix

    # pct change
    stock1['past_ret_1'] = stock1.close.pct_change(1) # past day's returns
    stock1['past_ret_5'] = stock1.close.pct_change(5) # past week's returns
    stock1['past_ret_10'] = stock1.close.pct_change(10) # past two weeks' returns
    stock1['past_ret_20'] = stock1.close.pct_change(20) # past month's (approx) returns
    stock1['logVol'] = stock1.vol.apply(np.log)
    stock1['past_logVol_1'] = stock1.logVol.pct_change(1) # past day's change of log volume
    stock1['past_logVol_5'] = stock1.logVol.pct_change(5) # past week's change of log volume
    stock1['past_logVol_10'] = stock1.logVol.pct_change(10) # past two weeks' change of log volume
    stock1['past_logVol_20'] = stock1.logVol.pct_change(20) # past month's change of log volume

    # rolling mean or std
    stock1['roll_Close_Mean_200'] = stock1['close'].rolling(window=200, min_periods=20).mean().shift(1)
    stock1['roll_Close_Std_200'] = stock1['close'].rolling(window=200, min_periods=20).std().shift(1)
    stock1['roll_LogVol_Mean_200'] = stock1['logVol'].rolling(window=200, min_periods=20).mean().shift(1)
    stock1['roll_LogVol_Std_200'] = stock1['logVol'].rolling(window=200, min_periods=20).std().shift(1)

    # z-score
    stock1['zscore_price'] = (stock1['close'] - stock1['roll_Close_Mean_200'] ) / stock1['roll_Close_Std_200'] 
    stock1['zscore_logVol'] = (stock1['logVol'] - stock1['roll_LogVol_Mean_200'] ) / stock1['roll_LogVol_Std_200'] 

    # change sign
    stock1['sign_LogVol_1'] = stock1['past_logVol_1'].apply(np.sign)
    stock1['sign_Ret_1'] = stock1['past_ret_1'].apply(np.sign)

    ### Outcome Feature

    out_stock1 = pd.DataFrame(index = stock1.index)
    
    ### Assume T+lag enter the market
    out_stock1['outcome'] = stock1.close.pct_change(horizon).shift(-(lag+horizon)) # returns

    #   dropna
    out_stock1 = out_stock1.dropna()

    ### News Feature Matrix

    news = pd.read_csv('dummy_matrix.csv').drop('Unnamed: 0', axis =1)
    news['date'] =  pd.to_datetime(news['date'], format='%Y-%m-%d')

    # The keywords of news
    news_keywords = pd.read_csv('news_sent_kw_tag.csv')[['date', 'keywords']]

    # The keywords of the companys
    stock_keywords = pd.read_csv('stock_keywords.csv')
    stock_keywords['keywords'] = stock_keywords['keywords'].apply(literal_eval)
    keyword1 = stock_keywords[stock_keywords['code'] == stockCode ]['keywords'].iloc[0]

    # Find those relevant news for this compnay

    news_keywords['relevant'] = news_keywords['keywords'].apply(lambda x: any([word in x for word in keyword1]))

    ### Relevant News
    news = news.iloc[news_keywords[news_keywords['relevant']].index]
    news['date'] = pd.to_datetime(news['date'], format='%Y-%m-%d')

    ### Summarise the News on that date
    groupbyNews = news.groupby('date').agg({'lv1_tag_国际': 'sum', 'lv1_tag_时事': 'sum',
                                            'lv1_tag_社会': 'sum', 'lv1_tag_财经': 'sum',
                                            'sentiment_pos': 'mean',
                                            'dayofweek_1': 'mean', 'dayofweek_2': 'mean', 'dayofweek_3': 'mean',
                                            'dayofweek_4': 'mean', 'dayofweek_5': 'mean', 'dayofweek_6': 'mean',
                                            'quarter_2': 'mean', 'quarter_3': 'mean', 'quarter_4': 'mean',
                                            'month_2': 'mean', 'month_3': 'mean', 'month_4': 'mean', 'month_5': 'mean',
                                            'month_6': 'mean', 'month_7': 'mean', 'month_8': 'mean', 'month_9': 'mean',
                                            'month_10': 'mean', 'month_11': 'mean','month_12': 'mean',})

    # The preditive feature matrix
        # if the news is not released in a trading day, then ignore

    feature1 = groupbyNews.join(stock1, how = 'inner')

    ### Standardize predictive features
    # maybe problematic here, consider other schemes
#     std_scaler = StandardScaler()
#     features_scaled = std_scaler.fit_transform(feature1.dropna()) 
    features_scaled = feature1.dropna()

    df = pd.DataFrame(features_scaled,index=feature1.dropna().index)
    df.columns = feature1.dropna().columns

    ### Find the indices that both feature matrix and the outcome matrix share
    index = df.index.intersection(out_stock1['outcome'].index)
    out = out_stock1['outcome'].loc[index]
    df = df.loc[index]
    
    ### Feature Selection
    corrThres = 0.1
    
    corr = df.corrwith(out)
    selected_features = corr[abs(corr)>corrThres].sort_values(ascending=False).index.tolist()


    X = df[selected_features]
    y = out
    
    if model == "linear":
        models,preds = make_walkforward_model(X,y,algo=LinearRegression())
    elif model == 'tree':
        models,preds = make_walkforward_model(X,y,algo=ExtraTreesRegressor())
    elif model == 'ensemble':
        linear_models, linear_preds = make_walkforward_model(X,y,algo=LinearRegression())
        tree_models, tree_preds = make_walkforward_model(X,y,algo=ExtraTreesRegressor())
        X_ens, y_ens = prepare_Xy(X_raw=pd.concat([linear_preds.rename('linear'),tree_preds.rename('tree')],
                                              axis=1),y_raw=y)
        models,preds = make_walkforward_model(X_ens,y_ens,algo=LassoCV(positive=True))
        preds = preds.rename('ensemble')
    
    return preds
 
def getBacktest(stockCode, preds, lag = 2, shortSell = True, tCostRate = 0.0002):
    
    address = "stock/" + stockCode + ".csv"
    col = ['trade_date','close','change','pct_chg']
    stock = pd.read_csv(address)[col]          
    stock['trade_date'] = stock['trade_date'].astype('str').apply(lambda x: datetime.strptime(x, "%Y%m%d"))
    
    # originally `pct_chg` and 'change' are compared to the previous day
    # it is changed to `pct_chg` and 'change' of tomorrow if I enter today
    stock['pct_chg'] = stock['pct_chg'].shift(1)
    stock['change'] = stock['change'].shift(1)    
    stock = stock.set_index('trade_date').sort_index().fillna(method='ffill')
    
    stock = stock.join(np.sign(preds.shift(lag)).rename('y_pred'), how = 'left').fillna(0)
    start = preds[~preds.isna()].index[0]
    end = preds[~preds.isna()].index[-1]
    stock = stock.loc[start:end]

    stock['position'] = 0
        
    if shortSell:
    
        for i in range(len(stock)):
            if stock['y_pred'].iloc[i] * stock['position'].iloc[i-1] > 0:
                stock['position'].iloc[i] = stock['position'].iloc[i-1] + stock['y_pred'].iloc[i]
            elif stock['y_pred'].iloc[i] * stock['position'].iloc[i-1] == 0:
                if stock['position'].iloc[i-1] == 0 and stock['y_pred'].iloc[i] != 0:
                    stock['position'].iloc[i] = stock['y_pred'].iloc[i]
                else:
                    stock['position'].iloc[i] = stock['position'].iloc[i-1] 
            elif stock['y_pred'].iloc[i] * stock['position'].iloc[i-1] < 0:
                stock['position'].iloc[i] = 0
    else:
        for i in range(len(stock)):
            if stock['y_pred'].iloc[i] > 0:
                stock['position'].iloc[i] = stock['position'].iloc[i-1] + 1
            elif stock['y_pred'].iloc[i] == 0:
                stock['position'].iloc[i] = stock['y_pred'].iloc[i]
            elif stock['y_pred'].iloc[i-1] < 0:
                stock['position'].iloc[i] = 0


    stock['pos_chg'] = stock['position'].diff().fillna(0)
    stock['transaction'] = -stock['pos_chg'] * stock['close']
    stock['tCost'] = abs(stock['transaction'] * tCostRate)
    stock['cumTCost'] = stock['tCost'].cumsum()
    stock['dollarReturn'] = stock['position'] * stock['change']
    stock['cumDollarReturn'] = stock['dollarReturn'].cumsum()
    stock['dollarReturnNetTCost'] = stock['dollarReturn'] - stock['tCost']
    stock['cumDollarReturnNetTCost'] = stock['dollarReturnNetTCost'].cumsum()
    
    return stock

def SSE50_dollar_neutral(DollarTransaction, startDate, endDate, tCostRate = 0.002):
    
    totalDollarTransaction = DollarTransaction.sum(axis = 1)
    
    SSE50 = pd.read_csv('index/SSE50.csv')
    col = ['trade_date','close','change','pct_chg']
    SSE50 = SSE50[col]
    SSE50['trade_date'] = SSE50['trade_date'].astype('str').apply(lambda x: datetime.strptime(x, "%Y%m%d"))
    SSE50['change'] = SSE50['change'].shift(1)
    SSE50 = SSE50.set_index('trade_date').sort_index().fillna(method='ffill')
    SSE50 = SSE50.loc[startDate:endDate]
    SSE50['transaction_dollar'] = -totalDollarTransaction
    SSE50['fraction'] = SSE50['transaction_dollar'] / SSE50['close']
    SSE50['position'] = SSE50['fraction'].cumsum()
    SSE50['dollarReturn'] = SSE50['position'] * SSE50['change']
    SSE50['cumDollarReturn'] = SSE50['dollarReturn'].cumsum()
    SSE50['tCost'] = abs(SSE50['transaction_dollar'] * tCostRate)
    SSE50['dollarReturnNetTCost'] = SSE50['dollarReturn'] - SSE50['tCost']
    SSE50['cumDollarReturnNetTCost'] = SSE50['dollarReturnNetTCost'].cumsum()
    
    return SSE50

In [3]:
fut220Tree = [i for i in pd.read_csv('fut_lag2_return_20_result.csv').sort_values('Tree_directional_accuracy', ascending = False).head(3).stock]
# fut220Tree
# fig, ax = plt.subplots(1, 3, sharex='col', sharey='row',figsize = (50,10))
# i = 0
# for stockCode in fut220Tree:
#     preds = get_prediction(stockCode, lag = 2, horizon = 20, model = 'linear')
#     result = getBacktest(stockCode, preds, lag = 2, shortSell = True, tCostRate = 0.0002)
    
#     ax[i].plot(result[['cumDollarReturn', 'cumDollarReturnNetTCost','cumTCost']])
#     ax[i].set_title(stockCode, fontdict = {'fontsize': 30} )
#     ax[i].legend(('cumDollarReturn', 'cumDollarReturnNetTCost','cumTCost'),loc='upper right')
#     i += 1

def port_dollar_neutral_bactest(components, lag = 2, horizon = 20, shortSell = False, model = 'linear'):
    
    
    dollarReturn = []
    dollarReturnNetTCost = []    
    cumDollarReturn = []
    cumDollarReturnNetTCost = []
    transaction = []
    tCost = []

    for stockCode in components:
        preds = get_prediction(stockCode, lag = lag, horizon = horizon, model = model)
        result = getBacktest(stockCode, preds, lag = lag, shortSell = shortSell, tCostRate = 0.0002)
        
        dollarReturn.append(result['dollarReturn'].rename(stockCode))
        dollarReturnNetTCost.append(result['dollarReturnNetTCost'].rename(stockCode))

        
        cumDollarReturn.append(result['cumDollarReturn'].rename(stockCode))
        cumDollarReturnNetTCost.append(result['cumDollarReturnNetTCost'].rename(stockCode))
        
        transaction.append(result['transaction'].rename(stockCode))
        tCost.append(result['tCost'].rename(stockCode))
        


    dollarReturn = pd.concat(dollarReturn, axis=1, join='outer').fillna(method = 'ffill')
    dollarReturnNetTCost = pd.concat(dollarReturnNetTCost, axis=1, join='outer').fillna(method = 'ffill')    
    
    cumDollarReturn = pd.concat(cumDollarReturn, axis=1, join='outer').fillna(method = 'ffill')
    cumDollarReturnNetTCost = pd.concat(cumDollarReturnNetTCost, axis=1, join='outer').fillna(method = 'ffill')
    
    transaction = pd.concat(transaction, axis=1, join='outer').fillna(method = 'ffill')
    tCost = pd.concat(tCost, axis=1, join='outer').fillna(method = 'ffill')

    SSE50 = SSE50_dollar_neutral(transaction, cumDollarReturn.index[0], cumDollarReturn.index[-1])

    dollarReturn['SSE50'] = SSE50['dollarReturn']
    dollarReturnNetTCost['SSE50'] = SSE50['dollarReturnNetTCost']
    cumDollarReturn['SSE50'] = SSE50['cumDollarReturn']
    cumDollarReturnNetTCost['SSE50'] = SSE50['cumDollarReturnNetTCost']
    tCost['SSE50'] = SSE50['tCost']


    dollarReturn['Portfolio'] = dollarReturn.sum(axis = 1)
    dollarReturnNetTCost['Portfolio'] = dollarReturnNetTCost.sum(axis = 1)
    
    cumDollarReturn['Portfolio'] = cumDollarReturn.sum(axis = 1)
    cumDollarReturnNetTCost['Portfolio'] = cumDollarReturnNetTCost.sum(axis = 1)
    
    tCost['Portfolio'] = tCost.sum(axis = 1)
    tCost['Cum.TCost'] = tCost['Portfolio'].cumsum()

    
#     dollarReturn.plot(figsize = (30,10))
#     plt.xlabel('Year', fontsize=16);
#     plt.ylabel('Dollar', fontsize=18);
#     plt.title('Dollar Return', fontsize=25);
    
    cumDollarReturn.plot(figsize = (30,10))
    plt.xlabel('Year', fontsize=16);
    plt.ylabel('Dollar', fontsize=18);
    plt.title('Cumulative Dollar Return', fontsize=25);
    
    cumDollarReturnNetTCost.plot(figsize = (30,10))
    plt.xlabel('Year', fontsize=16);
    plt.ylabel('Dollar', fontsize=18);
    plt.title('Cumulative Dollar Return Net T-Cost', fontsize=25);
    
    tCost.plot(figsize = (30,10), secondary_y = ['Cum.TCost'])
    plt.xlabel('Year', fontsize=18);
    plt.ylabel('Dollar', fontsize=18);
    plt.title('Transaction Cost', fontsize=25);
    
    
    MMD_end = np.argmax(np.maximum.accumulate(cumDollarReturnNetTCost['Portfolio']) - cumDollarReturnNetTCost['Portfolio']) # end of the period
    MMD_start = np.argmax(cumDollarReturnNetTCost['Portfolio'].loc[:MMD_end]) # start of period
    
    print("************************************************************************************")
    print("**************************  Statistics of the Portfolio  ***************************")
    print("************************************************************************************")
    print("Average Dollar Earned before T-cost per day holding position: ", '\t\t${:.2f}'.format(dollarReturn['Portfolio'].replace(0, np.nan).mean(skipna=True)))
    print("Total Dollar Earned before T-cost: ", '\t\t\t\t\t${:.2f}'.format(cumDollarReturn['Portfolio'].iloc[-1]))
    print("Average Dollar Return after T-cost per day holding position: ", '\t\t${:.2f}'.format(dollarReturnNetTCost['Portfolio'].replace(0, np.nan).mean(skipna=True)))
    print("Total Dollar Earned after T-cost: ", '\t\t\t\t\t${:.2f}'.format(cumDollarReturnNetTCost['Portfolio'].iloc[-1]))
    print("Maximum Daily Dollar Earned after T-cost: ", '\t\t\t\t${:.2f}'.format(max(dollarReturnNetTCost['Portfolio'])))
    print("Minimum Daily Dollar Earned after T-cost: ", '\t\t\t\t${:.2f}'.format(min(dollarReturnNetTCost['Portfolio'])))
    print("Maximum Drawdown: ", '\t\t\t\t\t\t\t{:.2%}'.format((cumDollarReturnNetTCost['Portfolio'].loc[MMD_end] - cumDollarReturnNetTCost['Portfolio'].loc[MMD_start] )/cumDollarReturnNetTCost['Portfolio'].loc[MMD_start]))
    print("Maximum Drawdown Period: ", "\t\t\t\t\t", MMD_start.date(), "-", MMD_end.date())
    

In [4]:
# port_dollar_neutral_bactest(fut220Tree, lag = 2, horizon = 20, shortSell = False, model = 'ensemble')

In [5]:
FTSEA50 = ['000002.SZ', '600104.SH', '601229.SH', '600018.SH', '600030.SH',
            '601998.SH', '601766.SH', '601390.SH', '601800.SH', '601628.SH',
            '601601.SH', '601318.SH', '601668.SH', '600028.SH', '601857.SH',
            '601088.SH', '600050.SH', '601186.SH', '601988.SH', '000858.SZ',
            '601328.SH', '000725.SZ', '600887.SH', '600048.SH', '601818.SH',
            '601166.SH', '601288.SH', '002027.SZ', '601169.SH', '601688.SH',
            '601211.SH', '600019.SH', '601398.SH', '000001.SZ', '000776.SZ',
            '601238.SH', '601939.SH', '600276.SH', '001979.SZ', '600036.SH',
            '601336.SH', '002594.SZ', '600016.SH', '002304.SZ', '600000.SH',
            '002415.SZ', '600837.SH', '000333.SZ', '600519.SH', '600900.SH']



In [11]:
withNews = []

NewsFeature = ['lv1_tag_国际', 'lv1_tag_时事', 'lv1_tag_社会', 'lv1_tag_财经', 'sentiment_pos']


for lag in [1,2]:
    for horzn in [1,5,10,20]:
        print('Scheme: lag_',lag, '_horizon_',horzn)
        for stockCode in FTSEA50:
            features = get_features(stockCode.replace('.',''), lag = 2, horizon = 20, corrThres = 0.1)
            if any([feat in features for feat in NewsFeature]):
                withNews = withNews + [stockCode]
            print(stockCode,":", "; ".join(features))
        print("")


Scheme: lag_ 1 _horizon_ 1
000002.SZ : month_11; roll_LogVol_Std_200; quarter_4; roll_LogVol_Mean_200; close; past_ret_20; past_ret_10; roll_Close_Std_200
600104.SH : month_9; month_3; month_10; month_12; past_logVol_20; roll_Close_Std_200; month_7; zscore_logVol; logVol; vol
601229.SH : month_5; roll_Close_Std_200; month_4; month_12; month_8; logVol; sentiment_pos; month_11; past_ret_20; quarter_3; close; month_6; zscore_price; month_7
600018.SH : month_8; quarter_3; lv1_tag_社会; month_12; quarter_2; lv1_tag_国际; quarter_4; sentiment_pos; vol; logVol; roll_LogVol_Std_200; roll_LogVol_Mean_200; roll_Close_Std_200; roll_Close_Mean_200; close
600030.SH : quarter_4; month_11; month_10; past_ret_10; zscore_logVol; past_ret_5; month_5; roll_LogVol_Mean_200; roll_Close_Std_200; quarter_2; close; roll_Close_Mean_200
601998.SH : month_10; quarter_4; month_11; roll_Close_Std_200; month_4; quarter_3; month_7; zscore_price; roll_Close_Mean_200; close
601766.SH : month_10; month_3; zscore_logVol; zs

600900.SH : zscore_logVol; logVol; zscore_price; vol; roll_LogVol_Std_200; past_logVol_20; month_4; past_ret_20; month_3; close; roll_Close_Mean_200; month_12

Scheme: lag_ 1 _horizon_ 5
000002.SZ : month_11; roll_LogVol_Std_200; quarter_4; roll_LogVol_Mean_200; close; past_ret_20; past_ret_10; roll_Close_Std_200
600104.SH : month_9; month_3; month_10; month_12; past_logVol_20; roll_Close_Std_200; month_7; zscore_logVol; logVol; vol
601229.SH : month_5; roll_Close_Std_200; month_4; month_12; month_8; logVol; sentiment_pos; month_11; past_ret_20; quarter_3; close; month_6; zscore_price; month_7
600018.SH : month_8; quarter_3; lv1_tag_社会; month_12; quarter_2; lv1_tag_国际; quarter_4; sentiment_pos; vol; logVol; roll_LogVol_Std_200; roll_LogVol_Mean_200; roll_Close_Std_200; roll_Close_Mean_200; close
600030.SH : quarter_4; month_11; month_10; past_ret_10; zscore_logVol; past_ret_5; month_5; roll_LogVol_Mean_200; roll_Close_Std_200; quarter_2; close; roll_Close_Mean_200
601998.SH : month_10;

600519.SH : month_2; past_ret_20; past_ret_10; sentiment_pos; month_10; past_ret_5; zscore_price; month_5; dayofweek_1; month_9; past_logVol_1; logVol; vol; roll_LogVol_Std_200; month_8; month_7; month_6; quarter_3; roll_LogVol_Mean_200
600900.SH : zscore_logVol; logVol; zscore_price; vol; roll_LogVol_Std_200; past_logVol_20; month_4; past_ret_20; month_3; close; roll_Close_Mean_200; month_12

Scheme: lag_ 1 _horizon_ 10
000002.SZ : month_11; roll_LogVol_Std_200; quarter_4; roll_LogVol_Mean_200; close; past_ret_20; past_ret_10; roll_Close_Std_200
600104.SH : month_9; month_3; month_10; month_12; past_logVol_20; roll_Close_Std_200; month_7; zscore_logVol; logVol; vol
601229.SH : month_5; roll_Close_Std_200; month_4; month_12; month_8; logVol; sentiment_pos; month_11; past_ret_20; quarter_3; close; month_6; zscore_price; month_7
600018.SH : month_8; quarter_3; lv1_tag_社会; month_12; quarter_2; lv1_tag_国际; quarter_4; sentiment_pos; vol; logVol; roll_LogVol_Std_200; roll_LogVol_Mean_200; ro

000333.SZ : quarter_4; roll_Close_Std_200; month_11; month_5; roll_LogVol_Std_200; month_10; logVol; month_8; past_ret_20; quarter_2; roll_Close_Mean_200; close; month_4
600519.SH : month_2; past_ret_20; past_ret_10; sentiment_pos; month_10; past_ret_5; zscore_price; month_5; dayofweek_1; month_9; past_logVol_1; logVol; vol; roll_LogVol_Std_200; month_8; month_7; month_6; quarter_3; roll_LogVol_Mean_200
600900.SH : zscore_logVol; logVol; zscore_price; vol; roll_LogVol_Std_200; past_logVol_20; month_4; past_ret_20; month_3; close; roll_Close_Mean_200; month_12

Scheme: lag_ 1 _horizon_ 20
000002.SZ : month_11; roll_LogVol_Std_200; quarter_4; roll_LogVol_Mean_200; close; past_ret_20; past_ret_10; roll_Close_Std_200
600104.SH : month_9; month_3; month_10; month_12; past_logVol_20; roll_Close_Std_200; month_7; zscore_logVol; logVol; vol
601229.SH : month_5; roll_Close_Std_200; month_4; month_12; month_8; logVol; sentiment_pos; month_11; past_ret_20; quarter_3; close; month_6; zscore_price;

600837.SH : month_11; zscore_price; quarter_4; month_10; zscore_logVol; past_ret_10; past_ret_5; past_logVol_20; past_ret_20; month_3; month_12; month_5; roll_Close_Std_200; quarter_2; close; roll_Close_Mean_200
000333.SZ : quarter_4; roll_Close_Std_200; month_11; month_5; roll_LogVol_Std_200; month_10; logVol; month_8; past_ret_20; quarter_2; roll_Close_Mean_200; close; month_4
600519.SH : month_2; past_ret_20; past_ret_10; sentiment_pos; month_10; past_ret_5; zscore_price; month_5; dayofweek_1; month_9; past_logVol_1; logVol; vol; roll_LogVol_Std_200; month_8; month_7; month_6; quarter_3; roll_LogVol_Mean_200
600900.SH : zscore_logVol; logVol; zscore_price; vol; roll_LogVol_Std_200; past_logVol_20; month_4; past_ret_20; month_3; close; roll_Close_Mean_200; month_12

Scheme: lag_ 2 _horizon_ 1
000002.SZ : month_11; roll_LogVol_Std_200; quarter_4; roll_LogVol_Mean_200; close; past_ret_20; past_ret_10; roll_Close_Std_200
600104.SH : month_9; month_3; month_10; month_12; past_logVol_20; 

002415.SZ : zscore_logVol; month_2; roll_Close_Std_200; quarter_2; month_6; roll_Close_Mean_200; close
600837.SH : month_11; zscore_price; quarter_4; month_10; zscore_logVol; past_ret_10; past_ret_5; past_logVol_20; past_ret_20; month_3; month_12; month_5; roll_Close_Std_200; quarter_2; close; roll_Close_Mean_200
000333.SZ : quarter_4; roll_Close_Std_200; month_11; month_5; roll_LogVol_Std_200; month_10; logVol; month_8; past_ret_20; quarter_2; roll_Close_Mean_200; close; month_4
600519.SH : month_2; past_ret_20; past_ret_10; sentiment_pos; month_10; past_ret_5; zscore_price; month_5; dayofweek_1; month_9; past_logVol_1; logVol; vol; roll_LogVol_Std_200; month_8; month_7; month_6; quarter_3; roll_LogVol_Mean_200
600900.SH : zscore_logVol; logVol; zscore_price; vol; roll_LogVol_Std_200; past_logVol_20; month_4; past_ret_20; month_3; close; roll_Close_Mean_200; month_12

Scheme: lag_ 2 _horizon_ 5
000002.SZ : month_11; roll_LogVol_Std_200; quarter_4; roll_LogVol_Mean_200; close; past_ret

600000.SH : quarter_4; month_8; roll_LogVol_Mean_200; logVol; month_11; month_10; quarter_3; zscore_price; vol; month_4; month_6; roll_Close_Mean_200; close; month_5; quarter_2
002415.SZ : zscore_logVol; month_2; roll_Close_Std_200; quarter_2; month_6; roll_Close_Mean_200; close
600837.SH : month_11; zscore_price; quarter_4; month_10; zscore_logVol; past_ret_10; past_ret_5; past_logVol_20; past_ret_20; month_3; month_12; month_5; roll_Close_Std_200; quarter_2; close; roll_Close_Mean_200
000333.SZ : quarter_4; roll_Close_Std_200; month_11; month_5; roll_LogVol_Std_200; month_10; logVol; month_8; past_ret_20; quarter_2; roll_Close_Mean_200; close; month_4
600519.SH : month_2; past_ret_20; past_ret_10; sentiment_pos; month_10; past_ret_5; zscore_price; month_5; dayofweek_1; month_9; past_logVol_1; logVol; vol; roll_LogVol_Std_200; month_8; month_7; month_6; quarter_3; roll_LogVol_Mean_200
600900.SH : zscore_logVol; logVol; zscore_price; vol; roll_LogVol_Std_200; past_logVol_20; month_4; p

002304.SZ : month_12; sentiment_pos; month_2; dayofweek_3; dayofweek_1; quarter_2; close; month_6; zscore_logVol; vol; logVol; roll_Close_Mean_200; roll_Close_Std_200
600000.SH : quarter_4; month_8; roll_LogVol_Mean_200; logVol; month_11; month_10; quarter_3; zscore_price; vol; month_4; month_6; roll_Close_Mean_200; close; month_5; quarter_2
002415.SZ : zscore_logVol; month_2; roll_Close_Std_200; quarter_2; month_6; roll_Close_Mean_200; close
600837.SH : month_11; zscore_price; quarter_4; month_10; zscore_logVol; past_ret_10; past_ret_5; past_logVol_20; past_ret_20; month_3; month_12; month_5; roll_Close_Std_200; quarter_2; close; roll_Close_Mean_200
000333.SZ : quarter_4; roll_Close_Std_200; month_11; month_5; roll_LogVol_Std_200; month_10; logVol; month_8; past_ret_20; quarter_2; roll_Close_Mean_200; close; month_4
600519.SH : month_2; past_ret_20; past_ret_10; sentiment_pos; month_10; past_ret_5; zscore_price; month_5; dayofweek_1; month_9; past_logVol_1; logVol; vol; roll_LogVol_St

600016.SH : month_11; zscore_logVol; quarter_4; month_5; month_6; quarter_2; close; roll_Close_Mean_200
002304.SZ : month_12; sentiment_pos; month_2; dayofweek_3; dayofweek_1; quarter_2; close; month_6; zscore_logVol; vol; logVol; roll_Close_Mean_200; roll_Close_Std_200
600000.SH : quarter_4; month_8; roll_LogVol_Mean_200; logVol; month_11; month_10; quarter_3; zscore_price; vol; month_4; month_6; roll_Close_Mean_200; close; month_5; quarter_2
002415.SZ : zscore_logVol; month_2; roll_Close_Std_200; quarter_2; month_6; roll_Close_Mean_200; close
600837.SH : month_11; zscore_price; quarter_4; month_10; zscore_logVol; past_ret_10; past_ret_5; past_logVol_20; past_ret_20; month_3; month_12; month_5; roll_Close_Std_200; quarter_2; close; roll_Close_Mean_200
000333.SZ : quarter_4; roll_Close_Std_200; month_11; month_5; roll_LogVol_Std_200; month_10; logVol; month_8; past_ret_20; quarter_2; roll_Close_Mean_200; close; month_4
600519.SH : month_2; past_ret_20; past_ret_10; sentiment_pos; month

In [15]:
set(withNews)

{'000858.SZ',
 '002304.SZ',
 '002594.SZ',
 '600018.SH',
 '600048.SH',
 '600050.SH',
 '600519.SH',
 '600887.SH',
 '601229.SH'}