In [10]:
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

In [11]:
def add_log_returns(df: pd.DataFrame, price_col: str = "close") -> pd.DataFrame:
    df = df.copy()
    df["log_return"] = np.log(df[price_col] / df[price_col].shift(1))
    return df

In [12]:
def add_cusum(df: pd.DataFrame, col: str = "log_return") -> pd.DataFrame:
    df = df.copy()
    mean = df[col].mean()
    df["cusum"] = (df[col] - mean).cumsum()
    return df


def add_cusum_pos_neg(df: pd.DataFrame, col: str = "log_return") -> pd.DataFrame:
    df = df.copy()

    pos, neg = 0.0, 0.0
    pos_list, neg_list = [], []

    mean = df[col].mean()

    for x in df[col]:
        if np.isnan(x):
            pos_list.append(np.nan)
            neg_list.append(np.nan)
            continue

        pos = max(0.0, pos + x - mean)
        neg = min(0.0, neg + x - mean)

        pos_list.append(pos)
        neg_list.append(neg)

    df["cusum_pos"] = pos_list
    df["cusum_neg"] = neg_list
    return df

In [13]:
def add_anomaly_flag(
    df: pd.DataFrame,
    contamination: float = 0.01,
    random_state: int = 42
) -> pd.DataFrame:
    df = df.copy()

    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    excluded = {"log_return"}
    features = [c for c in numeric_cols if c not in excluded]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df[features].fillna(method="ffill").fillna(0))

    iso = IsolationForest(
        n_estimators=300,
        contamination=contamination,
        random_state=random_state,
        n_jobs=-1
    )

    df["anomaly_raw"] = iso.fit_predict(X_scaled)
    df["is_anomaly"] = (df["anomaly_raw"] == -1).astype(int)

    return df

In [14]:
def add_market_regime(df,share, slow_ma=50, fast_ma=50):
    df = df.copy()
    df[f'SMA_{slow_ma}_{share}'] = df[f'Close_{share}'].rolling(window=slow_ma).mean()
        
    df[f'SMA_{fast_ma}_{share}'] = df[f'Close_{share}'].rolling(window=fast_ma).mean()

    df[f'Regime_Bullish_{share}'] = (df[f'Close_{share}'].shift(1) > df[f'SMA_{slow_ma}_{share}'].shift(1)).astype(int)
        
    
    df[f'Dist_to_SMA200_{share}'] = (df[f'Close_{share}'].shift(1) / df[f'SMA_{slow_ma}_{share}'].shift(1)) - 1
    df = df.drop(columns=[f'SMA_{slow_ma}_{share}', f'SMA_{fast_ma}_{share}'])
        
    return df

In [15]:
def add_peak_valley_features(df,share, window=20):
    df[f'rolling_max_{window}_{share}'] = df[f'High_{share}'].rolling(window=window).max().shift(1)
    df[f'rolling_min_{window}_{share}'] = df[f'Low_{share}'].rolling(window=window).min().shift(1)
        
        # Jak daleko jesteśmy od szczytu/dołka
        #df[f'dist_to_max_{window}'] = df[f'Close_{share}'] / df[f'rolling_max_{window}'] - 1
        #df[f'dist_to_min_{window}'] = df[f'Close_{share}'] / df[f'rolling_min_{window}'] - 1
    
    return df

In [16]:
def add_lagged_features(df, ticker, lags=[1, 2]):
    df = df.copy()
    for col in [f'log_return', f'Volume_{ticker}', 'RSI_14']:
        if col in df.columns:
            for lag in lags:
                df[f'{col}_lag{lag}'] = df[col].shift(lag)
    
    return df

In [17]:
def stage2_feature_engineering(
    df: pd.DataFrame,
    price_col,
    ticker
) -> pd.DataFrame:


    df = df.sort_values("DATE").reset_index(drop=True)

    df = add_log_returns(df, price_col)
    df = add_cusum(df)
    df = add_cusum_pos_neg(df)
    df = add_anomaly_flag(df)
    df = add_peak_valley_features(df,share=ticker, window=20)
    df = add_peak_valley_features(df,share=ticker, window=60)
    df = add_market_regime(df, share=ticker)
    df = add_lagged_features(df, ticker=ticker)

    df.dropna(inplace=True)

    return df

In [18]:

aapl_with_features = pd.read_csv('../data/all_data/all_AAPL_data.csv')
googl_with_features = pd.read_csv('../data/all_data/all_GOOGL_data.csv')
msft_with_features = pd.read_csv('../data/all_data/all_MSFT_data.csv')
tickers = ['AAPL', 'GOOGL', 'MSFT']

for df, share in zip([aapl_with_features, googl_with_features, msft_with_features], tickers):
    df_features = stage2_feature_engineering(df, price_col=f"Close_{share}", ticker=share)
    df_features.to_csv(f"../data/all_data/all_{share}_data.csv", index=False)


  X_scaled = scaler.fit_transform(df[features].fillna(method="ffill").fillna(0))
  X_scaled = scaler.fit_transform(df[features].fillna(method="ffill").fillna(0))
  X_scaled = scaler.fit_transform(df[features].fillna(method="ffill").fillna(0))


In [19]:

aapl_with_features = pd.read_csv('../data/all_data/all_AAPL_data.csv')
print(aapl_with_features.head())

    index        DATE  Close_AAPL  High_AAPL   Low_AAPL  Open_AAPL  \
0  2991.0  2011-11-22   11.066997  11.147072  10.974024  11.108683   
1  2992.0  2011-11-23   11.291923  11.334510  11.124872  11.127272   
2  2993.0  2011-11-25   11.006413  11.271833  11.003113  11.231945   
3  2994.0  2011-11-28   10.903844  11.131176  10.896346  11.049300   
4  2995.0  2011-11-29   11.280232  11.298227  11.106585  11.167166   

   Volume_AAPL     RSI_14  MACD_12_26_9  MACDh_12_26_9  ...  \
0  447980400.0  35.271163     -0.174584      -0.093165  ...   
1  409021200.0  41.335459     -0.180518      -0.079280  ...   
2  428271200.0  36.642621     -0.205886      -0.083718  ...   
3  254760800.0  35.100889     -0.231597      -0.087543  ...   
4  346413200.0  44.353542     -0.219077      -0.060018  ...   

   Regime_Bullish_AAPL  rolling_max_20_AAPL  rolling_min_20_AAPL  \
0                    0            12.276235            11.243041   
1                    0            12.276235            10.974024