In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

In [3]:
def add_log_returns(df: pd.DataFrame, price_col: str = "close") -> pd.DataFrame:
    df = df.copy()
    df["log_return"] = np.log(df[price_col] / df[price_col].shift(1))
    return df

In [4]:
def add_cusum(df: pd.DataFrame, col: str = "log_return") -> pd.DataFrame:
    df = df.copy()
    mean = df[col].mean()
    df["cusum"] = (df[col] - mean).cumsum()
    return df


def add_cusum_pos_neg(df: pd.DataFrame, col: str = "log_return") -> pd.DataFrame:
    df = df.copy()

    pos, neg = 0.0, 0.0
    pos_list, neg_list = [], []

    mean = df[col].mean()

    for x in df[col]:
        if np.isnan(x):
            pos_list.append(np.nan)
            neg_list.append(np.nan)
            continue

        pos = max(0.0, pos + x - mean)
        neg = min(0.0, neg + x - mean)

        pos_list.append(pos)
        neg_list.append(neg)

    df["cusum_pos"] = pos_list
    df["cusum_neg"] = neg_list
    return df

In [None]:
def add_anomaly_flag(
    df: pd.DataFrame,
    contamination: float = 0.01,
    random_state: int = 42
) -> pd.DataFrame:
    df = df.copy()

    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    excluded = {"log_return"}
    features = [c for c in numeric_cols if c not in excluded]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df[features].fillna(method="ffill").fillna(0))

    iso = IsolationForest(
        n_estimators=300,
        contamination=contamination,
        random_state=random_state,
        n_jobs=-1
    )

    df["anomaly_raw"] = iso.fit_predict(X_scaled)
    df["is_anomaly"] = (df["anomaly_raw"] == -1).astype(int)

    return df

In [10]:
def stage2_feature_engineering(
    df: pd.DataFrame,
    price_col
) -> pd.DataFrame:


    df = df.sort_values("DATE").reset_index(drop=True)

    df = add_log_returns(df, price_col)
    df = add_cusum(df)
    df = add_cusum_pos_neg(df)
    df = add_anomaly_flag(df)

    return df

In [13]:

aapl_with_features = pd.read_csv('../data/all_data/all_AAPL_data.csv')
googl_with_features = pd.read_csv('../data/all_data/all_GOOGL_data.csv')
msft_with_features = pd.read_csv('../data/all_data/all_MSFT_data.csv')
tickers = ['AAPL', 'GOOGL', 'MSFT']

for df, share in zip([aapl_with_features, googl_with_features, msft_with_features], tickers):
    df_features = stage2_feature_engineering(df, price_col=f"Close_{share}")
    df_features.to_csv(f"../data/all_data/all_{share}_data.csv", index=False)


  X_scaled = scaler.fit_transform(df[features].fillna(method="ffill").fillna(0))
  X_scaled = scaler.fit_transform(df[features].fillna(method="ffill").fillna(0))
  X_scaled = scaler.fit_transform(df[features].fillna(method="ffill").fillna(0))
