In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from numpy.ma.extras import average
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV, RFE
from sklearn.model_selection import TimeSeriesSplit

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
import warnings
import json
import optuna 
from sklearn.preprocessing import LabelEncoder


### OWN FUNCTIONS 
from model_functions import *

## Main Model: Voting Classifier

In [6]:
aapl_with_features = pd.read_csv('../data/all_data/all_AAPL_data.csv')
googl_with_features = pd.read_csv('../data/all_data/all_GOOGL_data.csv')
msft_with_features = pd.read_csv('../data/all_data/all_MSFT_data.csv')

In [7]:
tickers = ['AAPL', 'GOOGL', 'MSFT']
data_dict = {
    'AAPL': aapl_with_features,
    'GOOGL': googl_with_features,
    'MSFT': msft_with_features
}
color_dict = {
     'AAPL': 'grey',
    'GOOGL': 'yellow',
    'MSFT': 'green'
}

statistics = ['accuracy', 'precision_weighted', 'recall_weighted', 'roc_auc_ovr_weighted']
#statistics = ['accuracy']

In [8]:
def get_tbm_target(df, ticker, horizon=5, pt_sl=[1,1.5]):
    df = df.copy()
    close = df[f'Close_{ticker}']
    
    log_ret = np.log(close / close.shift(1))
    volatility = log_ret.rolling(window=20).std()
    
    targets = pd.Series(index=df.index, dtype=float)
    
    for i in range(len(df) - horizon):
        price_start = close.iloc[i]
        current_vol = volatility.iloc[i] ### dynamic barrier for each day
        
        upper_barrier = price_start * (1 + current_vol * pt_sl[0])
        lower_barrier = price_start * (1 - current_vol * pt_sl[1])
        
        future_prices = close.iloc[i+1 : i+ 1 + horizon]
        
        targets.iloc[i] = 0
        
        for price_future in future_prices:
            if price_future >= upper_barrier:
                targets.iloc[i] = 1 # profit taking hit
                break
            elif price_future <= lower_barrier:
                targets.iloc[i] = -1 # stop loss hit
                break
    df['Target'] = targets
    return df.dropna(subset=['Target'])

In [9]:
print(msft_with_features.columns[:47])
def get_target(input_df, ticker):
    df = input_df.copy()
    df['Target'] = (df[f'Close_{ticker}'].shift(-1) > df[f'Close_{ticker}']).astype(int)
    df.dropna(inplace=True)
    return df

Index(['index', 'DATE', 'Close_MSFT', 'High_MSFT', 'Low_MSFT', 'Open_MSFT',
       'Volume_MSFT', 'RSI_14', 'MACD_12_26_9', 'MACDh_12_26_9',
       'MACDs_12_26_9', 'ATRr_14', 'BBL_20_2.0_2.0', 'BBM_20_2.0_2.0',
       'BBU_20_2.0_2.0', 'BBB_20_2.0_2.0', 'BBP_20_2.0_2.0', 'Target',
       'FEDFUNDS', 'DGS10', 'CPI', 'Dollar_idx', 'USEPUINDXD', 'Close_VIX',
       'High_VIX', 'Low_VIX', 'Open_VIX', 'VIX_percent', 'Is_Panic', 'Is_Calm',
       'Is_Uncertain', 'Close_AAPL', 'High_AAPL', 'Low_AAPL', 'Open_AAPL',
       'Volume_AAPL', 'Close_GOOGL', 'High_GOOGL', 'Low_GOOGL', 'Open_GOOGL',
       'Volume_GOOGL', 'log_return', 'cusum', 'cusum_pos', 'cusum_neg',
       'anomaly_raw', 'is_anomaly'],
      dtype='object')


### SEARCHING FOR BEST FEATURES BY USING RFECV

In [15]:
def best_features(data_dict, tickers):
    warnings.filterwarnings('ignore')
    feature_dict = {}
    le = LabelEncoder()

    for i, share in enumerate(tickers):
        feature_dict[share] = {}
        df = get_tbm_target(data_dict[share], share)
        features = [col for col in df.columns if col not in ['Target', 'index', 'DATE', f'Close_{share}', f'Open_{share}', f'High_{share}', f'Low_{share}', f'Volume_{share}']]

        to_remove = [f'Volume_{share}_lag1', f'Volume_{share}_lag2', f'Volume_{share}_lag3', f'Volume_{share}_lag5',
                         f'RSI_14_lag1', 'RSI_14_lag2', 'RSI_14_lag3', 'RSI_14_lag5', 'log_return_lag1', 'log_return_lag2',
                         'log_return_lag3', 'log_return_lag5']

        '''if share == 'AAPL':
            additional_to_remove = ['rolling_max_20', 'rolling_max_20', 'dist_to_max_20', 'dist_to_min_20', 'rolling_max_60', 'rolling_max_60', 'dist_to_max_60', 'dist_to_min_60']
            to_remove += additional_to_remove
            '''

        features = [f for f in features if f not in to_remove]

        X = df[features]
        y = df['Target']
        y_encoded = le.fit_transform(y)

        model_judge_rf = RandomForestClassifier(n_estimators=100, max_depth=4, random_state=42, n_jobs=-1)

        # Second model to check
        model_judge_xgb = XGBClassifier(
            n_estimators=50,
            max_depth=4,
            learning_rate=0.1,
            n_jobs=-1,
            random_state=42,
            eval_metric='logloss',
            
            use_label_encoder=False
        )

        ### test on different models for best acc for each company
        '''if share != 'MSFT':
            model_judge = model_judge_rf
        else:
            model_judge = model_judge_xgb
            '''

        model_judge = model_judge_xgb

        cv_split = TimeSeriesSplit(n_splits=5)
        min_feats = 15 if share == 'AAPL' else 10
        rfecv = RFECV(
                estimator=model_judge,
                min_features_to_select=min_feats,
                step=1,
                cv=cv_split,
                scoring='f1_weighted',
                n_jobs=-1,)

        rfecv.fit(X, y_encoded)

        print(f"Optimal features numer by RFECV : {rfecv.n_features_}")
        selected_features = [f for f, s in zip(features, rfecv.support_) if s]

        X_refined = X[selected_features]
        desired_features = min(len(selected_features), 20)
        rfe_final = RFE(
                    estimator=model_judge,
                    n_features_to_select=desired_features,
                    step=1
                )

        rfe_final.fit(X_refined, y_encoded)
        final_features = [f for f, s in zip(selected_features, rfe_final.support_) if s]

        selected_features = final_features
        print(f"Winner features: ({len(selected_features)}):")
        print(selected_features)
        feature_dict[share] = selected_features

    with open("../selected_features/feature_dict.json", "w") as f:
        json.dump(feature_dict, f, indent=4)

    return feature_dict

In [18]:
bf = best_features(data_dict, tickers) 
print(bf)

Optimal features numer by RFECV : 34
Winner features: (15):
['MACD_12_26_9', 'MACDh_12_26_9', 'MACDs_12_26_9', 'ATRr_14', 'BBL_20_2.0_2.0', 'BBM_20_2.0_2.0', 'BBB_20_2.0_2.0', 'FEDFUNDS', 'DGS10', 'CPI', 'Dollar_idx', 'Low_VIX', 'Open_VIX', 'Close_GOOGL', 'cusum_neg']
Optimal features numer by RFECV : 27
Winner features: (27):
['MACDh_12_26_9', 'MACDs_12_26_9', 'BBL_20_2.0_2.0', 'BBM_20_2.0_2.0', 'BBU_20_2.0_2.0', 'BBB_20_2.0_2.0', 'BBP_20_2.0_2.0', 'FEDFUNDS', 'DGS10', 'CPI', 'Dollar_idx', 'Close_VIX', 'High_VIX', 'Low_VIX', 'Open_VIX', 'Is_Panic', 'Close_MSFT', 'High_MSFT', 'Low_MSFT', 'Open_MSFT', 'Close_AAPL', 'High_AAPL', 'Low_AAPL', 'Open_AAPL', 'cusum', 'cusum_pos', 'cusum_neg']
Optimal features numer by RFECV : 30
Winner features: (15):
['RSI_14', 'BBL_20_2.0_2.0', 'BBU_20_2.0_2.0', 'BBB_20_2.0_2.0', 'BBP_20_2.0_2.0', 'CPI', 'Dollar_idx', 'Close_VIX', 'High_VIX', 'Close_AAPL', 'High_AAPL', 'Open_AAPL', 'High_GOOGL', 'Open_GOOGL', 'cusum_pos']
{'AAPL': ['MACD_12_26_9', 'MACDh_12

In [None]:
def model_train(tickers, feature_dict):
    result_dict = {}

    for i, share in enumerate(tickers):
        df = get_tbm_target(data_dict, share)
        features = [col for col in df.columns if col not in ['Target', 'index', 'DATE']]

        X = df[features]
        y = df['Target']
        result_dict = {}

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)

        selected_cols = feature_dict[share]

        rf_pipeline = Pipeline([
                ('scaler', StandardScaler()),
                ('rf',
                 RandomForestClassifier(n_estimators=100, max_depth=3, min_samples_leaf=10, random_state=42, n_jobs=-1))
            ])

        svm_pipeline = Pipeline([
                ('scaler', StandardScaler()),
                ('svc', SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42)),
            ])

        xgb_pipeline = Pipeline([
                ('scaler', StandardScaler()),
                ('xgb',
                 XGBClassifier(n_estimators=50, max_depth=4, learning_rate=0.1, eval_metric='logloss', random_state=42,
                               n_jobs=-1)),
            ])

        main_model = VotingClassifier(
                estimators=[('xgb', xgb_pipeline), ('svc', svm_pipeline), ('rf', rf_pipeline), ],
                voting='soft')

        main_model.fit(X_train[selected_cols], y_train)

        y_pred = main_model.predict(X_test[selected_cols])
        acc = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_pred)
        print('*' * 40)
        print(f"Results for {share}:")
        print("Accuracy:", acc)
        print("Precision:", precision)
        print("Recall:", recall)
        print("ROC AUC score:", roc_auc)
        result_dict[share] = [acc, precision, recall, roc_auc]
    return result_dict


In [None]:
result_dict = model_train(tickers, feature_dict=bf)
print(result_dict)

In [None]:
for share in tickers:
    df_to_save = pd.DataFrame.from_dict(
        result_dict[share], 
        orient='index',
        columns=['accuracy', 'precision', 'recall', 'roc_auc'])
    print('*'*10, f'Results', '*'*10)
    print(df_to_save)
    df_to_save.to_csv(f'../models_results/main_model_results.csv')

In [None]:
rf_results = pd.read_csv('../models_results/RF_results.csv')
svm_results = pd.read_csv('../models_results/SVM_results.csv')
xgb_results = pd.read_csv('../models_results/XGB_results.csv')
lr_results = pd.read_csv('../models_results/LR_results.csv')
bs1_results = pd.read_csv('../models_results/bs1_results.csv')
bs2_results = pd.read_csv('../models_results/bs2_results.csv')
bs3_results = pd.read_csv('../models_results/bs3_results.csv')
main_model_acc = pd.read_csv('../models_results/main_model_results_by_accuracy.csv')
main_model_precision = pd.read_csv('../models_results/main_model_results_by_precision.csv')
main_model_recall = pd.read_csv('../models_results/main_model_results_by_recall.csv')
main_model_roc_auc = pd.read_csv('../models_results/main_model_results_by_roc_auc.csv')

bs1_results['Model'] = 'Based on yesterday'
bs2_results['Model'] = 'Always rise'
bs3_results['Model'] = 'Based on SMA'
rf_results['Model'] = 'Random Forest'
svm_results['Model'] = 'SVM'
xgb_results['Model'] = 'XGBoost'
lr_results['Model'] = 'Logistic Regression'
main_model_acc['Model'] = 'Ensemble (Main)'
main_model_precision['Model'] = 'Ensemble (Main)'
main_model_recall['Model'] = 'Ensemble (Main)'
main_model_roc_auc['Model'] = 'Ensemble (Main)'

all_dfs = [bs1_results, bs2_results, bs3_results, rf_results, svm_results, xgb_results, lr_results, main_model_acc]

fig, axes = plt.subplots(4, 3, figsize=(20, 60))

combined_df = pd.concat(all_dfs)
    
if 'Unnamed: 0' in combined_df.columns:
    combined_df = combined_df.rename(columns={'Unnamed: 0': 'Ticker'})
    
combined_df = combined_df.sort_values(by=['Ticker'], ascending=[True, False])
    
combined_df = combined_df.reset_index(drop=True)
    
print('*' * 40)
print(combined_df)
    
for i, t in enumerate(combined_df['Ticker'].unique()):
    subsets = combined_df[combined_df['Ticker'] == t]
    subsets = subsets.sort_values(by='accuracy', ascending=False)
        
    y_min = subsets.min() - 0.03
    y_max = subsets.max() + 0.03
        
    axes[i].bar(subsets['Model'], subsets, color=color_dict[t])
    axes[i].set_title(f'Share: {t}, prediction:', size=20)
    axes[i].set_ylim(y_min, ymax=y_max)
    axes[i].grid(axis='x', alpha=0.5, linestyle='--')
    axes[i].tick_params(axis='x', labelsize=15, labelrotation=45)
plt.tight_layout()
plt.show()



