### Import Libraries and Data

In [106]:
import FinancialMachineLearning as fml
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [107]:
# Load the raw KOSPI sample; the first CSV column becomes the index.
df = pd.read_csv('./Data/kospi_sample_2020.csv', index_col = 0)
# Pass the frame through the project helper (its exact transformation is not visible here).
df = fml.getDataFrame(df)
# Outlier mask computed on the price column, handed over as an (n, 1) array.
# presumably a boolean mask where True marks an outlier — confirm against fml.madOutlier
mad = fml.madOutlier(df.price.values.reshape(-1, 1))
# Keep only the rows that the mask does not flag.
df = df.loc[~mad]
df

Dollar Bar Sampling

In [108]:
# Sampling threshold handed to fml.BarSampling together with the 'dv' column name.
# presumably 'dv' is dollar volume, i.e. one bar per 1,000,000 traded — TODO confirm
dollar_M = 1000000
dollar_df = fml.BarSampling(df, 'dv', dollar_M)
# Collapse duplicated index labels, keeping the first row for each label.
dollar_df = dollar_df.groupby(level = 0).first()
dollar_df

Fractional Differencing

In [109]:
import statsmodels.api as sm

# ADF diagnostics of the fractionally differenced series for d = 0.0, 0.1, ..., 2.0.
# One row per d: ADF statistic, p-value, lags, observations, 5% critical value and
# the correlation between the differenced series and the undifferenced one.
cols = ['adfStat','pVal','lags','nObs','95% conf', 'corr']
out = pd.DataFrame(columns = cols)

# Hourly log prices: last observation of every hour, hours without data dropped.
# This series does not depend on d, so it is built once instead of on every iteration
# (assumes fml.fracDiff does not modify its input — confirm).
df1 = np.log(dollar_df).resample('1H').last().dropna()

for d in np.linspace(0, 2, 21):
    try:
        df2 = fml.fracDiff(df1, d, thres = 1e-5)
        # Correlation is measured only on the rows that survive the differencing.
        corr = np.corrcoef(df1.loc[df2.index, 'price'], df2['price'])[0,1]
        adf = sm.tsa.stattools.adfuller(df2['price'], maxlag = 1, regression = 'c', autolag = None)
        out.loc[d] = list(adf[:4]) + [adf[4]['5%']] + [corr]
    except Exception as e:
        # Best effort: report the order that failed and carry on with the next one.
        print(f'd: {d.round(1)}, error: {e}')
        continue

In [110]:
# ADF statistic and correlation per differencing order, with the mean 5% critical
# value drawn as a dotted reference line.
f,ax = plt.subplots()
out[['adfStat', 'corr']].plot(ax = ax, marker = 'X')
ax.axhline(out['95% conf'].mean(), lw = 1, color = 'r', ls = 'dotted')
# The scan that filled `out` calls fml.fracDiff with thres = 1e-5; the title states that value.
ax.set_title('min d with thresh = 1e-5')
ax.set_xlabel('d values')
ax.set_ylabel('adf stat')
display(out)

In [111]:
# Rows of the ADF table whose p-value is at most 0.05.
stationary = out[out.pVal <= 0.05]
if stationary.empty:
    # Fail with an explicit message instead of an opaque positional IndexError.
    raise ValueError('no tested d yields an ADF p-value <= 0.05; widen the d grid or check the inputs')
# Rows were inserted in ascending d, so the first qualifying row carries the smallest d.
min_ffd = stationary.iloc[0].name
print("The min. D that makes the fracdiff stationary", min_ffd.round(1))

In [112]:
# Fractionally difference the dollar bars with the order selected from the ADF table.
# NOTE(review): min_ffd was selected on hourly log prices through fml.fracDiff, while this
# call feeds the raw dollar_df to fml.fracDiff_FFD — confirm the mismatch is intended.
dfx2 = fml.fracDiff_FFD(dollar_df, min_ffd, thres = 1e-5)
dfx2

CUSUM Filtering

In [113]:
# Filter threshold: twice the standard deviation of the first column of dfx2.
threshold = dfx2.std().iat[0] * 2
# Event timestamps produced by the project's filter on the differenced price.
tEvents = fml.getTEvents(dfx2.price, h = threshold)
tEvents

In [114]:
# Dollar-bar price at each event timestamp (label lookup; raises if a label is missing).
dollar_feature = dollar_df.price.loc[tEvents]
dollar_feature

In [115]:
# Fractionally differenced rows at each event timestamp.
frac_diff_feature = dfx2.loc[tEvents]
frac_diff_feature

In [116]:
# Two-column feature frame: raw price ('dollar') and differenced price ('frac_diff_dollar').
# NOTE(review): drop_duplicates() compares the column VALUES, so rows at different
# timestamps with an identical value pair are dropped too — confirm that is intended.
feature_Mat = (pd.DataFrame().assign(dollar = dollar_feature, frac_diff_dollar = frac_diff_feature.price).drop_duplicates().dropna())
feature_Mat

Volatility

In [117]:
# Volatility estimate of the event prices from the project helper.
# presumably span = 50 is an exponentially weighted span — confirm in fml.getDailyVolatility
dailyVol = fml.getDailyVolatility(feature_Mat.dollar, span = 50)
dailyVol

In [118]:
# Quick visual check of the volatility series.
dailyVol.plot()

Vertical Barrier

In [119]:
# Vertical (time) barrier for every event, built by the project helper with numDays = 5.
t1 = fml.addVerticalBarrier(tEvents, feature_Mat.dollar, numDays = 5)
t1

In [120]:
import platform
from multiprocessing import cpu_count

# Number of worker processes handed to the fml multiprocessing helpers.
# Windows is pinned to a single worker; elsewhere one core is left free, but never
# fewer than one worker (cpu_count() - 1 is 0 on a single-core machine).
# `cpus` must not be reassigned after this branch, or the Windows setting is lost.
if platform.system() == 'Windows':
    cpus = 1
else:
    cpus = max(1, cpu_count() - 1)

# Arguments for fml.getEvents: barrier multipliers, per-event target (twice the
# volatility estimate) and the minimum target for an event to be kept.
# presumably ptsl = [profit-taking, stop-loss] multipliers — confirm in fml.getEvents
ptsl = [1,1]
target = dailyVol * 2
minRet = 0.001
events = fml.getEvents(feature_Mat.dollar, tEvents, ptsl, target, minRet, cpus, t1 = t1)

In [121]:
# Inspect the object returned by fml.getEvents.
events

Concurrent Events

In [122]:
# Concurrency count per bar, computed in parallel over the event index.
numCoEvents = fml.mpPandasObj(fml.getConcurrentBar, ('molecule', events.index), cpus, closeIdx = feature_Mat.index, t1 = events['t1'])
# Where an index label occurs more than once, keep its last occurrence.
numCoEvents = numCoEvents.loc[~numCoEvents.index.duplicated(keep = 'last')]
# Align to the feature index; bars with no count get 0.
numCoEvents = numCoEvents.reindex(feature_Mat.index).fillna(0)
# NOTE: `out` is rebound here — from this point it no longer holds the ADF table.
out = pd.DataFrame()
# Per-event value from fml.getAvgLabelUniq, stored as column 'tW'.
out['tW'] = fml.mpPandasObj(fml.getAvgLabelUniq, ('molecule', events.index), cpus, t1 = events['t1'], numCoEvents = numCoEvents)
out

In [123]:
# Concurrency (left axis) against volatility (right axis) on one figure.
# The calls rely on pyplot's "current figure/axes" state, so their order matters.
plt.figure(figsize = (15,5))
plt.xlabel('Time', fontfamily = 'Serif', fontsize = 15)

plt.style.use('default')
ax1 = numCoEvents.plot(color = 'blue', grid = True, label = 'Concurrency Events')
# secondary_y = True draws the volatility against a second y-axis.
ax2 = dailyVol.plot(color = 'red', grid = True, secondary_y = True, label = 'daily volatility')

# One legend per axis, placed in different corners (loc codes 1 and 2).
ax1.legend(loc = 1)
ax2.legend(loc = 2)

In [124]:
# Join volatility and concurrency on their common index and z-score both columns.
coEvents = numCoEvents.to_frame()
ewmasd = dailyVol.to_frame()
combine = ewmasd.join(coEvents, how = 'inner')
combine.columns = ['ewmasd', 'coEvents']
combine['ewmasd_std'] = (combine['ewmasd']-combine['ewmasd'].mean())/combine['ewmasd'].std()
combine['coEvents_std'] = (combine['coEvents']-combine['coEvents'].mean())/combine['coEvents'].std()

# Scatter of standardized concurrency (x) against standardized volatility (y).
sns.set_style('whitegrid')
plt.scatter(combine['coEvents_std'], combine['ewmasd_std'], marker = '.', alpha = 0.5)
# Each reference line marks the mean of the variable plotted on ITS axis:
# the horizontal line belongs to the y variable, the vertical line to the x variable.
plt.axhline(y = combine['ewmasd_std'].mean(), c='r', ls='--')
plt.axvline(x = combine['coEvents_std'].mean(), c='g', ls='--')
plt.show()

In [125]:
# Same scatter on the unstandardized columns: concurrency (x) against volatility (y).
plt.scatter(combine['coEvents'], combine['ewmasd'], marker = '.', alpha = 0.5)
# Fixed reference lines at y = 0.03 and x = 10.
# NOTE(review): the source gives no rationale for these two constants — document or derive them.
plt.axhline(y = 0.03, c='r', ls='--')
plt.axvline(x = 10, c='g', ls='--')
plt.show()

Weights

In [126]:
# Per-event sample weight from fml.mpSampleW, computed in parallel over the event index.
out['w'] = fml.mpPandasObj(fml.mpSampleW, ('molecule', events.index),
                           cpus, t1 = events['t1'], numCoEvents = numCoEvents, close = feature_Mat.dollar)
# Rescale so the weights sum to the number of rows in `out`.
out['w'] *= out.shape[0] / out['w'].sum()
out

In [127]:
def getExTimeDecay(tW, clfLastW = 1.,exponent = 1):
    """Turn cumulative values of ``tW`` into linear time-decay factors.

    ``tW`` is sorted by index and accumulated; the result is
    ``const + slope * cumsum`` with negative entries replaced by 0.

    Parameters
    ----------
    tW : pandas.Series
        Values to accumulate (sorted by index before the running sum).
    clfLastW : float, default 1.
        Selects the slope formula: for ``clfLastW >= 0`` the slope is
        ``((1 - clfLastW) / total) ** exponent``, otherwise
        ``(1 / ((clfLastW + 1) * total)) ** exponent``, where ``total`` is
        the last cumulative value.
    exponent : float, default 1
        Power applied to the raw slope.

    Returns
    -------
    pandas.Series
        Decay factors indexed like the sorted input.

    Notes
    -----
    Prints the rounded intercept and slope as a side effect.
    """
    running = tW.sort_index().cumsum()
    total = running.iloc[-1]

    # Raw slope before the exponent; the branch depends on the sign of clfLastW.
    raw_slope = (1. - clfLastW) / total if clfLastW >= 0 else 1. / ((clfLastW + 1) * total)
    slope = raw_slope ** exponent

    # Intercept chosen so that the last factor equals 1 before clipping.
    const = 1. - slope * total
    decay = const + slope * running
    decay[decay < 0] = 0

    print(round(const, 4), round(slope, 4))
    return decay

# Compare decay curves for several clfLastW values on a two-panel figure.
f,ax=plt.subplots(2,figsize=(10,7))
# Candidate clfLastW values and one line style per candidate (zipped pairwise).
fs = [1,.75,.5,0,-.25,-.5]
ls = ['-','-.','--',':','--','-.']
for lstW, l in zip(fs,ls):
    decayFactor = getExTimeDecay(out['tW'].dropna(),
                                 clfLastW = lstW,
                                 exponent = 0.75) # experiment by changing exponent
    # Top panel: sample weight times decay factor, plotted against position (index reset).
    ((out['w'].dropna()*decayFactor).reset_index(drop=True)
     .plot(ax=ax[0],alpha=0.5))
    # Bottom panel: the decay factor alone, aligned to the index of the non-null weights.
    s = (pd.Series(1,index = out['w'].dropna().index) * decayFactor)
    s.plot(ax=ax[1], ls=l, label=str(lstW))
# After the loop `s` holds the series of the LAST candidate only (clfLastW = -.5).
ax[1].legend(loc='center left', bbox_to_anchor=(1, 0.5))

In [128]:
# Decay series left over from the final iteration of the plotting loop.
s

In [129]:
# Attach the columns of `out` ('tW', 'w') and `target` to the features; drop rows with any NaN.
# NOTE: this rebinds feature_Mat, so re-running the cell joins onto the already-joined frame.
# presumably `target` is a named Series here (join needs a column name) — confirm
feature_Mat = feature_Mat.join(out, how = 'left').join(target, how = 'left').dropna()
feature_Mat

In [130]:
# feature_Mat['w'] = feature_Mat['w'] * s

RSI

In [131]:
# Compute RSI
def calculate_rsi(prices, window):
    """Relative Strength Index based on simple rolling means.

    Parameters
    ----------
    prices : pandas.Series
        Price series.
    window : int
        Length of the rolling window used for the average gain and loss.

    Returns
    -------
    pandas.Series
        ``100 - 100 / (1 + avg_gain / avg_loss)``; NaN until a full window
        of price changes is available.
    """
    step = prices.diff()

    # Positive moves count as gains, negative moves (as magnitudes) as losses.
    mean_gain = step.clip(lower=0).rolling(window=window).mean()
    mean_loss = step.clip(upper=0).abs().rolling(window=window).mean()

    return 100 - (100 / (1 + mean_gain / mean_loss))


def get_rsi(data, window=14, column='dollar'):
    """Compute the RSI of one column of ``data`` and return it on ``data``'s index.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the price column.
    window : int, default 14
        Rolling window length forwarded to ``calculate_rsi``.
    column : str, default 'dollar'
        Name of the price column to use. The default keeps the previous
        behaviour for existing callers.

    Returns
    -------
    pandas.Series
        RSI values, indexed like ``data``.
    """
    # Only the price column is needed. It is re-indexed positionally for the
    # computation, which avoids copying the whole frame and cannot clash with an
    # existing column name the way DataFrame.reset_index() can.
    prices = data[column].reset_index(drop=True)
    rsi = calculate_rsi(prices, window)
    # Put the values back on the caller's index.
    return pd.Series(data=rsi.values, index=data.index)

In [132]:
# Work on a copy so feature_Mat itself is left untouched.
data = feature_Mat.copy()
# One RSI column per window length, named rsi_<window>.
rsi_windows = [5, 13, 35]
for w in rsi_windows:
    rsi = get_rsi(data, window = w).squeeze()
    data[f'rsi_{w}'] = rsi

In [133]:
# Inspect the frame after the RSI columns were added.
data

Autocorrelation

In [134]:
# Log return between consecutive rows of the 'dollar' price.
data['log_ret'] = np.log(data['dollar']).diff()

window_autocorr = 50
# Rolling autocorrelation of the log returns over full windows only, one column per
# lag, added in the order 1, 3, 5 (columns autocorr_1, autocorr_3, autocorr_5).
for lag in (1, 3, 5):
    rolling_ret = data['log_ret'].rolling(window=window_autocorr,
                                          min_periods=window_autocorr, center=False)
    # The lag is bound as a default argument so each lambda uses its own value.
    data[f'autocorr_{lag}'] = rolling_ret.apply(lambda x, k=lag: x.autocorr(lag=k), raw=False)

In [135]:
# Inspect the frame after the return and autocorrelation columns were added.
data

labeling

In [136]:
# Labels for the events, derived from the event table and the 'dollar' price.
labels = fml.getBins(events, feature_Mat.dollar)
# presumably 0.05 is the minimum class share below which a label class is dropped — confirm
clean_labels = fml.dropLabels(labels, 0.05)
clean_labels

feature matrix

In [137]:
from multiprocessing import Pool
import test

if __name__ == "__main__":
    # test.main_mp is called directly with the vertical barriers. No Pool is opened
    # here: a pool created around this call was never handed to it, so it only
    # spawned and tore down idle worker processes.
    # NOTE(review): the local module name `test` also names a standard-library
    # package; consider renaming the module to avoid import ambiguity.
    seqUs = test.main_mp(t1)

In [138]:
# First entry of the column-wise (or element-wise) mean of seqUs.
# NOTE(review): `[0]` is a label lookup unless the index is non-integer — if a positional
# lookup is meant, `.iloc[0]` is the unambiguous form. Confirm the type of seqUs.
avg_uniqueness = seqUs.mean()[0]
# Mean of the 'tW' column; used later as max_samples for the bagging classifiers.
avgU1 = out['tW'].mean()

#### Primary Model

In [139]:
# NOTE: `target` is rebound here — it previously held dailyVol * 2, now it holds the labels.
target = clean_labels.bin
target

In [140]:
# Append the label column to the feature frame and drop rows with any NaN.
# The label lands in the LAST column, which the X / y split below relies on.
data = data.join(target, how = 'left').dropna()
data

### Secondary Model
Use the primary model's predictions.

In [141]:
# Features: every column except the last; label: the last column.
# NOTE(review): X therefore still contains 'tW' and 'w' (joined in from `out`) and the
# volatility target column. 'w' is also passed as sample_weight later; using label-span
# derived quantities as model inputs can leak information — confirm this is intended.
X = data.iloc[:,:-1]
y = data.iloc[:,-1]

In [142]:
from sklearn.model_selection import train_test_split

# Split data
# shuffle = False keeps row order: the first 70% of rows train, the last 30% test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, shuffle = False)

#### RandomForest Classifier

In [143]:
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, BaggingClassifier
# `scipy.interp` was only a deprecated alias of `numpy.interp` and is missing from
# current SciPy releases; importing the NumPy function keeps the name `interp` available.
from numpy import interp

# Random forest used for the cross-validated score below and for the later fit.
rf = RandomForestClassifier(n_estimators = 1000, criterion = "entropy", bootstrap = True,
                                n_jobs=1, random_state = 42, class_weight = 'balanced_subsample', oob_score = False)

# Unshuffled 5-fold splitter handed to the project's CV routine.
cv_gen0 = KFold(n_splits = 5, shuffle = False)

# Cross-validated negative log-loss, weighted by the 'w' column of the training frame.
# presumably pctEmbargo = 0.1 embargoes 10% of the observations after each test fold — confirm in fml.cvScore
score = fml.cvScore(rf, X_train, y_train, sample_weight = X_train['w'], scoring = 'neg_log_loss', cv = None, cvGen = cv_gen0, pctEmbargo = 0.1)
print('rf_clf Mean CV score: {0:.6f}\nCV Variance: {1:.6f}'.format(score.mean(), score.var()))

In [144]:
# Unshuffled stratified 5-fold splitter and a fresh forest with the same settings as `rf`
# (n_jobs is left at its default here), passed to the project's cross-validation plot.
skf = StratifiedKFold(n_splits = 5, shuffle = False)
classifier = RandomForestClassifier(n_estimators = 1000, criterion = 'entropy', bootstrap = True,
                                    class_weight = 'balanced_subsample', random_state = 42, oob_score = False)
fml.crossValPlot(skf, classifier, X_train, y_train)

In [145]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc

# Fit the forest on the training split. `fit` is the fitted `rf` itself.
# NOTE(review): no sample_weight is passed here although the CV score used X_train['w'] — confirm.
fit = rf.fit(X_train, y_train)

# Probability of the second class in rf.classes_ and hard predictions on the test split.
y_pred_rf = fit.predict_proba(X_test)[:, 1]
y_pred = fit.predict(X_test)
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)
print(classification_report(y_test, y_pred, target_names = ['no trade',' trade']))

# ROC curve with the chance diagonal for reference.
plt.figure(figsize = (9,6))
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_rf, tpr_rf, label = 'RandomForest Classifier')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()

Feature Importance

In [146]:
title = 'Feature Importance:'
figsize = (15, 5)

# Importance per feature of the fitted forest, sorted ascending so the most important
# feature ends up at the top of the horizontal bar chart. Built in one chain: a single
# sort, no in-place mutation.
feat_imp = (
    pd.DataFrame({'Importance': rf.feature_importances_, 'feature': X.columns})
    .sort_values(by='Importance')
    .set_index('feature', drop=True)
)

feat_imp.plot.barh(title=title, figsize=figsize)
# Reference line at 1 / number of features, i.e. the share under equal importance.
plt.axvline(1. / feat_imp.shape[0], linewidth = 1, color='r', linestyle='dotted')
plt.xlabel('Feature Importance Score')
plt.show()

#### DecisionTree & Bagging Classifier

In [147]:
n_estimator = 1000
# 'sqrt' is what max_features = 'auto' meant for a decision-tree classifier; 'auto'
# is deprecated and rejected by newer scikit-learn releases.
dt = DecisionTreeClassifier(criterion = 'entropy', max_features = 'sqrt', class_weight = 'balanced')
# Bagged trees; each estimator is trained on a fraction `avgU1` of the samples.
# NOTE(review): newer scikit-learn renames `base_estimator` to `estimator`; it is left
# as is here because it depends on the installed version.
bc = BaggingClassifier(base_estimator = dt, n_estimators = n_estimator,
                       max_samples = avgU1, max_features = 1., random_state = 42)

In [148]:
# Unshuffled 5-fold splitter handed to the project's CV routine.
cv_gen0 = KFold(n_splits = 5, shuffle = False)

# Cross-validated negative log-loss of the bagged decision trees, weighted by 'w'.
score = fml.cvScore(bc, X_train, y_train, sample_weight = X_train['w'], scoring = 'neg_log_loss', cv = None, cvGen = cv_gen0, pctEmbargo = 0.1)
# The label names the model that was actually scored (the bagging classifier `bc`).
print('bc_clf Mean CV score: {0:.6f}\nCV Variance: {1:.6f}'.format(score.mean(), score.var()))

In [149]:
# Unshuffled stratified 5-fold splitter and a fresh bagging ensemble with the same
# settings as `bc`, passed to the project's cross-validation plot.
skf = StratifiedKFold(n_splits = 5, shuffle = False)
classifier = BaggingClassifier(base_estimator = dt, n_estimators = n_estimator,
                               max_samples = avgU1, max_features = 1., random_state = 42)
fml.crossValPlot(skf, classifier, X_train, y_train)

In [150]:
# Fit the bagging ensemble before evaluating it. Without this, `fit` still refers to
# the fitted random forest and every "Bagging" number below would describe that model.
fit = bc.fit(X_train, y_train)

# Probability of the second class in bc.classes_ and hard predictions on the test split.
y_pred_bc = fit.predict_proba(X_test)[:, 1]
y_pred = fit.predict(X_test)
fpr_bc, tpr_bc, _ = roc_curve(y_test, y_pred_bc)
print(classification_report(y_test, y_pred, target_names = ['no trade',' trade']))

# ROC curve with the chance diagonal for reference.
plt.figure(figsize = (9,6))
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_bc, tpr_bc, label = 'Bagging Classifier')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()

#### RandomForest & Bagging Classifier

In [151]:
n_estimator = 1000
# Single-tree forest without bootstrapping, used as the base estimator of the bagging ensemble.
rf2 = RandomForestClassifier(n_estimators = 1, criterion = 'entropy', bootstrap = False, class_weight = 'balanced_subsample')
# Each bagged estimator is trained on a fraction `avgU1` of the samples.
bc2 = BaggingClassifier(base_estimator = rf2, n_estimators = n_estimator,
                        max_samples = avgU1, max_features = 1., random_state = 42)

# NOTE: rebinds `fit`; from here on it refers to the fitted bc2.
fit = bc2.fit(X_train, y_train)

In [152]:
# Unshuffled 5-fold splitter handed to the project's CV routine.
cv_gen0 = KFold(n_splits = 5, shuffle = False)

# Cross-validated negative log-loss of the bagged single-tree forests, weighted by 'w'.
score = fml.cvScore(bc2, X_train, y_train, sample_weight = X_train['w'], scoring = 'neg_log_loss', cv = None, cvGen = cv_gen0, pctEmbargo = 0.1)
# The label names the model that was actually scored (the bagging classifier `bc2`).
print('bc2_clf Mean CV score: {0:.6f}\nCV Variance: {1:.6f}'.format(score.mean(), score.var()))

In [153]:
# Unshuffled stratified 5-fold splitter and a fresh bagging ensemble with the same
# settings as `bc2`, passed to the project's cross-validation plot.
skf = StratifiedKFold(n_splits = 5, shuffle = False)
classifier = BaggingClassifier(base_estimator = rf2, n_estimators = n_estimator,
                               max_samples = avgU1, max_features = 1., random_state = 42)
fml.crossValPlot(skf, classifier, X_train, y_train)

In [154]:
# Probability of the second class and hard predictions from the model bound to `fit`.
y_pred_bc2 = fit.predict_proba(X_test)[:, 1]
y_pred = fit.predict(X_test)
# The ROC curve uses THIS model's scores (y_pred_bc2), not the random forest's.
fpr_bc2, tpr_bc2, _ = roc_curve(y_test, y_pred_bc2)
print(classification_report(y_test, y_pred, target_names = ['no trade',' trade']))

# ROC curve with the chance diagonal for reference.
plt.figure(figsize = (9,6))
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_bc2, tpr_bc2, label = 'Bagging Classifier')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()

In [155]:
# Random-forest test probabilities and the test labels, both on the test index.
yprf = pd.Series(y_pred_rf, index = X_test.index)
ytest = pd.Series(y_test, index = X_test.index)

In [156]:
# Assemble the evaluation table column by column; column 0 is the predicted probability.
yt = pd.DataFrame(yprf)
yt = pd.concat([yt, X_test['log_ret']], axis = 1)
yt = pd.concat([yt, ytest], axis = 1)
# Bet size: (p - 1/2) / sqrt(p * (1 - p)).
# NOTE(review): the denominator is 0 when p is exactly 0 or 1, giving +/-inf — consider clipping p.
z = (yt[0] - 1 / 2) / (yt[0] * (1 - yt[0])) ** 0.5
# Min-max rescaling of the probabilities to [0, 1].
cali_pred = (yprf - yprf.min()) / (yprf.max() - yprf.min())
yt = pd.concat([yt, z], axis = 1)
# Profit: bet size times the log return of the same row.
yt = pd.concat([yt, z * X_test['log_ret']], axis = 1)
yt = pd.concat([yt, cali_pred], axis = 1)
yt.columns = ['pred_prob','log_ret','bin','bet_size','profit','cali_pred']
print(yt)

In [157]:
fig, ax = plt.subplots(figsize=(10, 5))

# Rows whose label equals 1; the linear fit of profit on rescaled probability uses only these.
is_bin1 = yt['bin'] == 1.0
x_fit = yt.loc[is_bin1, 'cali_pred']
y_fit = yt.loc[is_bin1, 'profit']

coefficients = np.polyfit(x_fit, y_fit, 1)
regression_line = np.poly1d(coefficients)
x_range = np.linspace(np.min(x_fit), np.max(x_fit), 100)
y_range = regression_line(x_range)
ax.plot(x_range, y_range, color='red')

sns.scatterplot(data = yt, x = 'cali_pred', y="profit", hue='bin', palette='flare', ax=ax)
plt.show()

# Cumulative profit of rows with label 1 and rescaled probability >= 0.5, selected with
# one aligned boolean mask instead of chained indexing with a mask of a different length.
# NOTE(review): 'bin' holds the test labels, so this selection uses information that is
# not available at decision time — confirm this is meant as a diagnostic only.
selected = is_bin1 & (yt['cali_pred'] >= 0.5)
yt.loc[selected, 'profit'].cumsum().plot(label = 'Machine Learning Model', figsize = (10,5))
yt['log_ret'].cumsum().plot(label = 'KOSPI200 future')
plt.legend()

In [158]:
# Evaluation table for the probabilities stored in y_pred_bc.
yprf = pd.Series(y_pred_bc, index = X_test.index)
ytest = pd.Series(y_test, index = X_test.index)
# Bet size: (p - 1/2) / sqrt(p * (1 - p)).
# NOTE(review): the denominator is 0 when p is exactly 0 or 1 — consider clipping p.
z = (yprf - 1 / 2) / (yprf * (1 - yprf)) ** 0.5
# Min-max rescaling of the probabilities to [0, 1].
cali_pred = (yprf - yprf.min()) / (yprf.max() - yprf.min())
# All columns share the test index, so one concat builds the table in a single step.
yt = pd.concat([yprf, X_test['log_ret'], ytest, z, z * X_test['log_ret'], cali_pred], axis = 1)
yt.columns = ['pred_prob','log_ret','bin','bet_size','profit','cali_pred']

fig, ax = plt.subplots(figsize=(10, 5))

# Rows whose label equals 1; the linear fit of profit on rescaled probability uses only these.
is_bin1 = yt['bin'] == 1.0
x_fit = yt.loc[is_bin1, 'cali_pred']
y_fit = yt.loc[is_bin1, 'profit']

coefficients = np.polyfit(x_fit, y_fit, 1)
regression_line = np.poly1d(coefficients)
x_range = np.linspace(np.min(x_fit), np.max(x_fit), 100)
y_range = regression_line(x_range)
ax.plot(x_range, y_range, color='red')

sns.scatterplot(data=yt, x='cali_pred', y="profit", hue='bin', palette='flare', ax=ax)
plt.show()

# Cumulative profit of rows with label 1 and rescaled probability >= 0.5, selected with
# one aligned boolean mask instead of chained indexing with a mask of a different length.
# NOTE(review): 'bin' holds the test labels, so this selection uses information that is
# not available at decision time — confirm this is meant as a diagnostic only.
selected = is_bin1 & (yt['cali_pred'] >= 0.5)
yt.loc[selected, 'profit'].cumsum().plot(label='Machine Learning Model', figsize=(10, 5))
yt['log_ret'].cumsum().plot(label='KOSPI200 future')
plt.legend()

In [159]:
# Evaluation table for the probabilities stored in y_pred_bc2.
yprf = pd.Series(y_pred_bc2, index = X_test.index)
ytest = pd.Series(y_test, index = X_test.index)
# Bet size: (p - 1/2) / sqrt(p * (1 - p)).
# NOTE(review): the denominator is 0 when p is exactly 0 or 1 — consider clipping p.
z = (yprf - 1 / 2) / (yprf * (1 - yprf)) ** 0.5
# Min-max rescaling of the probabilities to [0, 1].
cali_pred = (yprf - yprf.min()) / (yprf.max() - yprf.min())
# All columns share the test index, so one concat builds the table in a single step.
yt = pd.concat([yprf, X_test['log_ret'], ytest, z, z * X_test['log_ret'], cali_pred], axis = 1)
yt.columns = ['pred_prob','log_ret','bin','bet_size','profit','cali_pred']

fig, ax = plt.subplots(figsize=(10, 5))

# Rows whose label equals 1; the linear fit of profit on rescaled probability uses only these.
is_bin1 = yt['bin'] == 1.0
x_fit = yt.loc[is_bin1, 'cali_pred']
y_fit = yt.loc[is_bin1, 'profit']

coefficients = np.polyfit(x_fit, y_fit, 1)
regression_line = np.poly1d(coefficients)
x_range = np.linspace(np.min(x_fit), np.max(x_fit), 100)
y_range = regression_line(x_range)
ax.plot(x_range, y_range, color='red')

sns.scatterplot(data=yt, x='cali_pred', y="profit", hue='bin', palette='flare', ax=ax)
plt.show()

# Cumulative profit of rows with label 1 and rescaled probability >= 0.3, selected with
# one aligned boolean mask instead of chained indexing with a mask of a different length.
# NOTE(review): the cut-off here is 0.3 while the other two models use 0.5 — confirm.
# NOTE(review): 'bin' holds the test labels, so this selection uses information that is
# not available at decision time — confirm this is meant as a diagnostic only.
selected = is_bin1 & (yt['cali_pred'] >= 0.3)
yt.loc[selected, 'profit'].cumsum().plot(label='Machine Learning Model', figsize=(10, 5))
yt['log_ret'].cumsum().plot(label='KOSPI200 future')
plt.legend()