In [99]:
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

In [100]:
def add_log_returns(df: pd.DataFrame, price_col: str = "close") -> pd.DataFrame:
    df = df.copy()
    df["log_return"] = np.log(df[price_col] / df[price_col].shift(1))
    return df

In [101]:
def add_cusum(df: pd.DataFrame, col: str = "log_return") -> pd.DataFrame:
    df = df.copy()
    mean = df[col].mean()
    df["cusum"] = (df[col] - mean).cumsum()
    return df


def add_cusum_pos_neg(df: pd.DataFrame, col: str = "log_return") -> pd.DataFrame:
    df = df.copy()

    pos, neg = 0.0, 0.0
    pos_list, neg_list = [], []

    mean = df[col].mean()

    for x in df[col]:
        if np.isnan(x):
            pos_list.append(np.nan)
            neg_list.append(np.nan)
            continue

        pos = max(0.0, pos + x - mean)
        neg = min(0.0, neg + x - mean)

        pos_list.append(pos)
        neg_list.append(neg)

    df["cusum_pos"] = pos_list
    df["cusum_neg"] = neg_list
    return df

In [102]:
def add_anomaly_flag(
    df: pd.DataFrame,
    contamination: float = 0.01,
    random_state: int = 42
) -> pd.DataFrame:
    df = df.copy()

    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    excluded = {"log_return"}
    features = [c for c in numeric_cols if c not in excluded]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df[features].fillna(method="ffill").fillna(0))

    iso = IsolationForest(
        n_estimators=300,
        contamination=contamination,
        random_state=random_state,
        n_jobs=-1
    )

    df["anomaly_raw"] = iso.fit_predict(X_scaled)
    df["is_anomaly"] = (df["anomaly_raw"] == -1).astype(int)

    return df

In [103]:
def add_market_regime(df,share, slow_ma=50, fast_ma=50):
    df = df.copy()
    df[f'SMA_{slow_ma}_{share}'] = df[f'Close_{share}'].rolling(window=slow_ma).mean()
        
    df[f'SMA_{fast_ma}_{share}'] = df[f'Close_{share}'].rolling(window=fast_ma).mean()

    df[f'Regime_Bullish_{share}'] = (df[f'Close_{share}'].shift(1) > df[f'SMA_{slow_ma}_{share}'].shift(1)).astype(int)
        
    
    df[f'Dist_to_SMA200_{share}'] = (df[f'Close_{share}'].shift(1) / df[f'SMA_{slow_ma}_{share}'].shift(1)) - 1
    df = df.drop(columns=[f'SMA_{slow_ma}_{share}', f'SMA_{fast_ma}_{share}'])
        
    return df

In [104]:
def add_peak_valley_features(df,share, window=20):
    df[f'rolling_max_{window}_{share}'] = df[f'High_{share}'].rolling(window=window).max().shift(1)
    df[f'rolling_min_{window}_{share}'] = df[f'Low_{share}'].rolling(window=window).min().shift(1)
        
    df[f'dist_to_max_{window}'] = df[f'Close_{share}'] / df[f'rolling_max_{window}_{share}'] - 1
    df[f'dist_to_min_{window}'] = df[f'Close_{share}'] / df[f'rolling_min_{window}_{share}'] - 1
    
    return df

In [105]:
def add_lagged_features(df, ticker, lags=[1, 2]):
    df = df.copy()
    for col in [f'log_return', f'Volume_{ticker}', 'RSI_14']:
        if col in df.columns:
            for lag in lags:
                df[f'{col}_lag{lag}'] = df[col].shift(lag)
    
    return df

In [106]:
def stage2_feature_engineering(
    df: pd.DataFrame,
    price_col,
    ticker
) -> pd.DataFrame:


    df = df.sort_values("DATE").reset_index(drop=True)
    df['FEDFUNDS_diff'] = df['FEDFUNDS'].diff()
    df['CPI_change'] = df['CPI'].pct_change()
    df['Dollar_idx_change'] = df['Dollar_idx'].pct_change()
    df['VIX_change'] = df['Close_VIX'].pct_change()
    df = add_log_returns(df, price_col)
    df = add_cusum(df)
    df = add_cusum_pos_neg(df)
    df = add_anomaly_flag(df)
    df = add_peak_valley_features(df,share=ticker, window=20)
    df = add_peak_valley_features(df,share=ticker, window=60)
    df = add_market_regime(df, share=ticker)
    df = add_lagged_features(df, ticker=ticker)

    df.dropna(inplace=True)

    return df

In [107]:
aapl_with_features = pd.read_csv('../data/all_data/all_AAPL_data.csv')
googl_with_features = pd.read_csv('../data/all_data/all_GOOGL_data.csv')
msft_with_features = pd.read_csv('../data/all_data/all_MSFT_data.csv')
tickers = ['AAPL', 'GOOGL', 'MSFT']

for df, share in zip([aapl_with_features, googl_with_features, msft_with_features], tickers):
    print(df.columns)
    df_features = stage2_feature_engineering(df, price_col=f"Close_{share}", ticker=share)
    df_features.to_csv(f"../data/all_data/all_{share}_data.csv", index=False)


Index(['DATE', 'Close_AAPL', 'High_AAPL', 'Low_AAPL', 'Open_AAPL',
       'Volume_AAPL', 'RSI_14', 'MACD_12_26_9', 'MACDh_12_26_9',
       'MACDs_12_26_9', 'ATRr_14', 'BBL_20_2.0_2.0', 'BBM_20_2.0_2.0',
       'BBU_20_2.0_2.0', 'BBB_20_2.0_2.0', 'BBP_20_2.0_2.0', 'FinBERT_Score',
       'FinBERT_MA7', 'FEDFUNDS', 'DGS10', 'CPI', 'Dollar_idx', 'USEPUINDXD',
       'Close_VIX', 'High_VIX', 'Low_VIX', 'Open_VIX', 'VIX_percent',
       'Is_Panic', 'Is_Calm', 'Is_Uncertain', 'Close_SP500', 'High_SP500',
       'Low_SP500', 'Open_SP500', 'Volume_SP500', 'Close_MSFT_MSFT',
       'High_MSFT_MSFT', 'Low_MSFT_MSFT', 'Open_MSFT_MSFT', 'Volume_MSFT_MSFT',
       'RSI_14_MSFT', 'MACD_12_26_9_MSFT', 'MACDh_12_26_9_MSFT',
       'MACDs_12_26_9_MSFT', 'ATRr_14_MSFT', 'BBL_20_2.0_2.0_MSFT',
       'BBM_20_2.0_2.0_MSFT', 'BBU_20_2.0_2.0_MSFT', 'BBB_20_2.0_2.0_MSFT',
       'BBP_20_2.0_2.0_MSFT', 'Close_GOOGL_GOOGL', 'High_GOOGL_GOOGL',
       'Low_GOOGL_GOOGL', 'Open_GOOGL_GOOGL', 'Volume_GOOGL_GOOGL',

  X_scaled = scaler.fit_transform(df[features].fillna(method="ffill").fillna(0))


Index(['DATE', 'Close_GOOGL', 'High_GOOGL', 'Low_GOOGL', 'Open_GOOGL',
       'Volume_GOOGL', 'RSI_14', 'MACD_12_26_9', 'MACDh_12_26_9',
       'MACDs_12_26_9', 'ATRr_14', 'BBL_20_2.0_2.0', 'BBM_20_2.0_2.0',
       'BBU_20_2.0_2.0', 'BBB_20_2.0_2.0', 'BBP_20_2.0_2.0', 'FinBERT_Score',
       'FinBERT_MA7', 'FEDFUNDS', 'DGS10', 'CPI', 'Dollar_idx', 'USEPUINDXD',
       'Close_VIX', 'High_VIX', 'Low_VIX', 'Open_VIX', 'VIX_percent',
       'Is_Panic', 'Is_Calm', 'Is_Uncertain', 'Close_SP500', 'High_SP500',
       'Low_SP500', 'Open_SP500', 'Volume_SP500', 'Close_MSFT_MSFT',
       'High_MSFT_MSFT', 'Low_MSFT_MSFT', 'Open_MSFT_MSFT', 'Volume_MSFT_MSFT',
       'RSI_14_MSFT', 'MACD_12_26_9_MSFT', 'MACDh_12_26_9_MSFT',
       'MACDs_12_26_9_MSFT', 'ATRr_14_MSFT', 'BBL_20_2.0_2.0_MSFT',
       'BBM_20_2.0_2.0_MSFT', 'BBU_20_2.0_2.0_MSFT', 'BBB_20_2.0_2.0_MSFT',
       'BBP_20_2.0_2.0_MSFT', 'Close_AAPL_AAPL', 'High_AAPL_AAPL',
       'Low_AAPL_AAPL', 'Open_AAPL_AAPL', 'Volume_AAPL_AAPL', 'RSI

  X_scaled = scaler.fit_transform(df[features].fillna(method="ffill").fillna(0))


Index(['DATE', 'Close_MSFT', 'High_MSFT', 'Low_MSFT', 'Open_MSFT',
       'Volume_MSFT', 'RSI_14', 'MACD_12_26_9', 'MACDh_12_26_9',
       'MACDs_12_26_9', 'ATRr_14', 'BBL_20_2.0_2.0', 'BBM_20_2.0_2.0',
       'BBU_20_2.0_2.0', 'BBB_20_2.0_2.0', 'BBP_20_2.0_2.0', 'FinBERT_Score',
       'FinBERT_MA7', 'FEDFUNDS', 'DGS10', 'CPI', 'Dollar_idx', 'USEPUINDXD',
       'Close_VIX', 'High_VIX', 'Low_VIX', 'Open_VIX', 'VIX_percent',
       'Is_Panic', 'Is_Calm', 'Is_Uncertain', 'Close_SP500', 'High_SP500',
       'Low_SP500', 'Open_SP500', 'Volume_SP500', 'Close_AAPL_AAPL',
       'High_AAPL_AAPL', 'Low_AAPL_AAPL', 'Open_AAPL_AAPL', 'Volume_AAPL_AAPL',
       'RSI_14_AAPL', 'MACD_12_26_9_AAPL', 'MACDh_12_26_9_AAPL',
       'MACDs_12_26_9_AAPL', 'ATRr_14_AAPL', 'BBL_20_2.0_2.0_AAPL',
       'BBM_20_2.0_2.0_AAPL', 'BBU_20_2.0_2.0_AAPL', 'BBB_20_2.0_2.0_AAPL',
       'BBP_20_2.0_2.0_AAPL', 'Close_GOOGL_GOOGL', 'High_GOOGL_GOOGL',
       'Low_GOOGL_GOOGL', 'Open_GOOGL_GOOGL', 'Volume_GOOGL_GOOGL',

  X_scaled = scaler.fit_transform(df[features].fillna(method="ffill").fillna(0))


In [108]:

aapl_with_features = pd.read_csv('../data/all_data/all_AAPL_data.csv')
print(aapl_with_features.head())

         DATE  Close_AAPL  High_AAPL  Low_AAPL  Open_AAPL   Volume_AAPL  \
0  2006-03-31    1.881938   1.898433  1.845349   1.884037  1.390651e+09   
1  2006-04-03    1.881038   1.907731  1.866643   1.896934  8.153572e+08   
2  2006-04-04    1.878939   1.923025  1.877739   1.909530  8.157912e+08   
3  2006-04-05    1.834552   1.866043  1.830953   1.862443  9.319240e+08   
4  2006-04-06    2.015697   2.015697  1.923924   1.940719  2.233409e+09   

      RSI_14  MACD_12_26_9  MACDh_12_26_9  MACDs_12_26_9  ...  dist_to_max_60  \
0  44.480125     -0.066158       0.000783      -0.066941  ...       -0.273727   
1  44.399561     -0.060089       0.005481      -0.065570  ...       -0.274074   
2  44.198314     -0.054817       0.008602      -0.063420  ...       -0.274884   
3  40.064486     -0.053603       0.007853      -0.061456  ...       -0.292014   
4  57.524557     -0.037590       0.019093      -0.056683  ...       -0.222107   

   dist_to_min_60  Regime_Bullish_AAPL  Dist_to_SMA200_AAPL  l