In [73]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from numpy.ma.extras import average
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV, RFE
from sklearn.model_selection import TimeSeriesSplit

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
import warnings
import json
import optuna 
from sklearn.preprocessing import LabelEncoder


### OWN FUNCTIONS 
from model_functions import *

## Main Model: Voting Classifier

In [74]:
aapl_with_features = pd.read_csv('../data/all_data/all_AAPL_data.csv')
googl_with_features = pd.read_csv('../data/all_data/all_GOOGL_data.csv')
msft_with_features = pd.read_csv('../data/all_data/all_MSFT_data.csv')

In [75]:
tickers = ['AAPL', 'GOOGL', 'MSFT']
data_dict = {
    'AAPL': aapl_with_features,
    'GOOGL': googl_with_features,
    'MSFT': msft_with_features
}
color_dict = {
     'AAPL': 'grey',
    'GOOGL': 'yellow',
    'MSFT': 'green'
}

statistics = ['accuracy', 'precision_weighted', 'recall_weighted', 'roc_auc_ovr_weighted']
#statistics = ['accuracy']

In [76]:
def get_tbm_target(df, ticker, horizon=5, pt_sl=[1.3,1]):
    df = df.copy()
    close = df[f'Close_{ticker}']
    
    log_ret = np.log(close / close.shift(1))
    volatility = log_ret.rolling(window=20).std()
    
    targets = pd.Series(index=df.index, dtype=float)
    
    for i in range(len(df) - horizon):
        price_start = close.iloc[i]
        current_vol = volatility.iloc[i] ### dynamic barrier for each day
        
        upper_barrier = price_start * (1 + current_vol * pt_sl[0])
        lower_barrier = price_start * (1 - current_vol * pt_sl[1])
        
        future_prices = close.iloc[i+1 : i+ 1 + horizon]
        
        targets.iloc[i] = 0
        
        for price_future in future_prices:
            if price_future >= upper_barrier:
                targets.iloc[i] = 1 # profit taking hit
                break
            elif price_future <= lower_barrier:
                targets.iloc[i] = -1 # stop loss hit
                break
    df['Target'] = targets
    return df.dropna(subset=['Target'])

In [77]:
def get_target(input_df, ticker):
    df = input_df.copy()
    df['Target'] = (df[f'Close_{ticker}'].shift(-1) > df[f'Close_{ticker}']).astype(int)
    df.dropna(inplace=True)
    return df

### SEARCHING FOR BEST FEATURES BY USING RFECV

In [78]:
def best_features(data_dict, tickers, pred_type):
    warnings.filterwarnings('ignore')
    feature_dict = {}
    le = LabelEncoder()

    for i, share in enumerate(tickers):
        feature_dict[share] = {}
        df = pred_type(data_dict[share], share)
        
        banned_keywords = [
            'Open_', 'High_', 'Low_', 'Close_', 'Volume_',
            'rolling_max', 'rolling_min',
            'CPI', 'Dollar_idx', 'FEDFUNDS', 
            'BBM_', 'BBU_', 'BBL_',
            'MACD_', 'MACDs_' 
        ]
        
        cols_to_ban = []
        for col in df.columns:
            if any(ban in col for ban in banned_keywords):
                if 'change' in col: continue
                if 'diff' in col: continue
                if 'MACDh' in col: continue
                
                cols_to_ban.append(col)
        
        safe_cols = [c for c in df.columns if c not in cols_to_ban]
        df = df[safe_cols]
        
        features = [col for col in df.columns if col not in ['Target', 'index', 'DATE', f'Close_{share}', f'Open_{share}', f'High_{share}', f'Low_{share}', f'Volume_{share}']]

        to_remove = [f'Volume_{share}_lag1', f'Volume_{share}_lag2', f'Volume_{share}_lag3', f'Volume_{share}_lag5',
                         f'RSI_14_lag1', 'RSI_14_lag2', 'RSI_14_lag3', 'RSI_14_lag5', 'log_return_lag1', 'log_return_lag2',
                         'log_return_lag3', 'log_return_lag5']

        '''if share == 'AAPL':
            additional_to_remove = ['rolling_max_20', 'rolling_max_20', 'dist_to_max_20', 'dist_to_min_20', 'rolling_max_60', 'rolling_max_60', 'dist_to_max_60', 'dist_to_min_60']
            to_remove += additional_to_remove
            '''

        features = [f for f in features if f not in to_remove]

        X = df[features]
        y = df['Target']
        y_encoded = le.fit_transform(y)

        model_judge_rf = RandomForestClassifier(n_estimators=100, max_depth=4, random_state=42, n_jobs=-1)

        # Second model to check
        model_judge_xgb = XGBClassifier(
            n_estimators=50,
            max_depth=4,
            learning_rate=0.1,
            n_jobs=-1,
            random_state=42,
            eval_metric='logloss',
            
            use_label_encoder=False
        )

        ### test on different models for best acc for each company
        '''if share != 'MSFT':
            model_judge = model_judge_rf
        else:
            df = pred_type(data_dict[share], share)
            df = df.iloc[20:]
            model_judge = model_judge_xgb
            '''

        model_judge = model_judge_rf

        cv_split = TimeSeriesSplit(n_splits=5)
        min_feats = 15 if share == 'AAPL' else 10
        rfecv = RFECV(
                estimator=model_judge,
                min_features_to_select=min_feats,
                step=1,
                cv=cv_split,
                scoring='precision_weighted',
                n_jobs=-1,)

        rfecv.fit(X, y_encoded)

        print(f"Optimal features numer by RFECV : {rfecv.n_features_}")
        selected_features = [f for f, s in zip(features, rfecv.support_) if s]

        X_refined = X[selected_features]
        desired_features = min(len(selected_features), 20)
        rfe_final = RFE(
                    estimator=model_judge,
                    n_features_to_select=desired_features,
                    step=1
                )

        rfe_final.fit(X_refined, y_encoded)
        final_features = [f for f, s in zip(selected_features, rfe_final.support_) if s]

        selected_features = final_features
        must_have_features = ['FinBERT_MA7', 'DGS10', 'Dollar_idx']
        
        for feature in must_have_features:
            if feature in X.columns:
                if feature not in selected_features:
                    selected_features.append(feature)
                    
        print(f"Winner features: ({len(selected_features)}):")
        print(selected_features)
        feature_dict[share] = selected_features

    return feature_dict

In [79]:
warnings.filterwarnings('ignore')
tbm = best_features(data_dict, tickers, pred_type=get_tbm_target) 
with open('../selected_features/feature_dict_tbm.json', 'w') as f:
    json.dump(tbm, f, indent=4)   
print(tbm)

  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


Optimal features numer by RFECV : 20
Winner features: (21):
['RSI_14', 'MACDh_12_26_9', 'ATRr_14', 'BBB_20_2.0_2.0', 'BBP_20_2.0_2.0', 'DGS10', 'RSI_14_MSFT', 'MACDh_12_26_9_MSFT', 'BBB_20_2.0_2.0_MSFT', 'BBP_20_2.0_2.0_MSFT', 'RSI_14_GOOGL', 'ATRr_14_GOOGL', 'BBB_20_2.0_2.0_GOOGL', 'cusum', 'cusum_pos', 'cusum_neg', 'dist_to_max_20', 'dist_to_min_20', 'dist_to_max_60', 'dist_to_min_60', 'FinBERT_MA7']


  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


Optimal features numer by RFECV : 38
Winner features: (21):
['RSI_14', 'MACDh_12_26_9', 'ATRr_14', 'BBB_20_2.0_2.0', 'BBP_20_2.0_2.0', 'DGS10', 'RSI_14_MSFT', 'MACDh_12_26_9_MSFT', 'ATRr_14_MSFT', 'BBB_20_2.0_2.0_MSFT', 'BBP_20_2.0_2.0_MSFT', 'MACDh_12_26_9_AAPL', 'ATRr_14_AAPL', 'BBB_20_2.0_2.0_AAPL', 'cusum', 'cusum_pos', 'cusum_neg', 'dist_to_min_20', 'dist_to_min_60', 'Dist_to_SMA200_GOOGL', 'FinBERT_MA7']


  return _ForkingPickler.loads(res)


Optimal features numer by RFECV : 15
Winner features: (17):
['RSI_14', 'MACDh_12_26_9', 'ATRr_14', 'BBB_20_2.0_2.0', 'BBP_20_2.0_2.0', 'MACDh_12_26_9_AAPL', 'ATRr_14_AAPL', 'BBB_20_2.0_2.0_AAPL', 'BBP_20_2.0_2.0_AAPL', 'cusum', 'cusum_pos', 'cusum_neg', 'dist_to_min_20', 'dist_to_max_60', 'dist_to_min_60', 'FinBERT_MA7', 'DGS10']
{'AAPL': ['RSI_14', 'MACDh_12_26_9', 'ATRr_14', 'BBB_20_2.0_2.0', 'BBP_20_2.0_2.0', 'DGS10', 'RSI_14_MSFT', 'MACDh_12_26_9_MSFT', 'BBB_20_2.0_2.0_MSFT', 'BBP_20_2.0_2.0_MSFT', 'RSI_14_GOOGL', 'ATRr_14_GOOGL', 'BBB_20_2.0_2.0_GOOGL', 'cusum', 'cusum_pos', 'cusum_neg', 'dist_to_max_20', 'dist_to_min_20', 'dist_to_max_60', 'dist_to_min_60', 'FinBERT_MA7'], 'GOOGL': ['RSI_14', 'MACDh_12_26_9', 'ATRr_14', 'BBB_20_2.0_2.0', 'BBP_20_2.0_2.0', 'DGS10', 'RSI_14_MSFT', 'MACDh_12_26_9_MSFT', 'ATRr_14_MSFT', 'BBB_20_2.0_2.0_MSFT', 'BBP_20_2.0_2.0_MSFT', 'MACDh_12_26_9_AAPL', 'ATRr_14_AAPL', 'BBB_20_2.0_2.0_AAPL', 'cusum', 'cusum_pos', 'cusum_neg', 'dist_to_min_20', 'dist_

In [80]:
warnings.filterwarnings('ignore')
binary = best_features(data_dict, tickers, pred_type=get_target) 
with open('../selected_features/feature_dict_binary.json', 'w') as f:
    json.dump(binary, f, indent=4)   
print(binary)

Optimal features numer by RFECV : 33
Winner features: (22):
['RSI_14', 'BBB_20_2.0_2.0', 'BBP_20_2.0_2.0', 'VIX_percent', 'RSI_14_MSFT', 'MACDh_12_26_9_MSFT', 'BBB_20_2.0_2.0_MSFT', 'RSI_14_GOOGL', 'MACDh_12_26_9_GOOGL', 'BBB_20_2.0_2.0_GOOGL', 'BBP_20_2.0_2.0_GOOGL', 'Dollar_idx_change', 'VIX_change', 'log_return', 'cusum', 'cusum_pos', 'dist_to_max_20', 'dist_to_min_20', 'dist_to_max_60', 'Dist_to_SMA200_AAPL', 'FinBERT_MA7', 'DGS10']
Optimal features numer by RFECV : 29
Winner features: (21):
['MACDh_12_26_9', 'BBB_20_2.0_2.0', 'DGS10', 'USEPUINDXD', 'VIX_percent', 'RSI_14_MSFT', 'ATRr_14_MSFT', 'BBB_20_2.0_2.0_MSFT', 'BBP_20_2.0_2.0_MSFT', 'RSI_14_AAPL', 'MACDh_12_26_9_AAPL', 'ATRr_14_AAPL', 'BBB_20_2.0_2.0_AAPL', 'Dollar_idx_change', 'VIX_change', 'log_return', 'cusum', 'cusum_pos', 'cusum_neg', 'dist_to_min_20', 'FinBERT_MA7']
Optimal features numer by RFECV : 23
Winner features: (22):
['RSI_14', 'BBB_20_2.0_2.0', 'USEPUINDXD', 'VIX_percent', 'RSI_14_AAPL', 'ATRr_14_AAPL', 'BBB_2