In [3]:
from pathlib import Path
import pandas as pd
import numpy as np

# Load the preprocessed MSFT data and order the rows by the "date" column
# in a single chained expression.
df = pd.read_csv(
    'd:/文心远/研究生/5040-Bootcamp/project/data/processed/MSFT_preprocessed.csv'
).sort_values("date")

In [4]:
def create_stock_features(df):
    """
    Add engineered stock-prediction features to ``df`` as new columns.

    The frame is modified in place and the same object is returned. Rows are
    used in their current order (every feature is a positional shift or
    rolling window), so the caller must sort chronologically beforehand.
    Reads the columns ``open``, ``high``, ``low``, ``close`` and ``volume``.

    Warm-up rows, where a window or shift is not yet fully populated, are
    filled with a neutral value (0, the raw series itself, or 50 for RSI)
    so that no feature column contains NaN because of the window length.

    Features included:
    1. Daily return (daily_return)
    2. Log return (log_return)
    3. Rolling volatility, 5-day (volatility_5d)
    4. Moving averages, 5 and 20 day (ma_5, ma_20)
    5. MA spread (ma_spread)
    6. Volume rolling mean, 5-day (vol_mean_5)
    7. High-Low spread (hl_spread)
    8. Close-Open difference (co_diff)
    9. RSI, 14-day (rsi)
    10. Momentum, 5-day (momentum_5)
    11. Log volume (log_volume)
    12. RSI moving average, 5-day (rsi_ma5)
    """

    # 1. Daily return
    df['daily_return'] = df['close'].pct_change().fillna(0)

    # 2. Log return
    df['log_return'] = np.log(df['close'] / df['close'].shift(1)).fillna(0)

    # 3. Rolling volatility (5-day)
    df['volatility_5d'] = df['daily_return'].rolling(5).std().fillna(0)

    # 4. Moving averages (5, 20); warm-up rows fall back to the close itself
    df['ma_5'] = df['close'].rolling(5).mean().fillna(df['close'])
    df['ma_20'] = df['close'].rolling(20).mean().fillna(df['close'])

    # 5. MA spread
    df['ma_spread'] = df['ma_5'] - df['ma_20']

    # 6. Volume rolling mean (5-day); warm-up rows fall back to raw volume
    df['vol_mean_5'] = df['volume'].rolling(5).mean().fillna(df['volume'])

    # 7. High-Low spread
    df['hl_spread'] = df['high'] - df['low']

    # 8. Close-Open difference
    df['co_diff'] = df['close'] - df['open']

    # 9. RSI (Relative Strength Index, 14-day)
    window = 14
    delta = df['close'].diff()
    # Keep gains/losses as Series carrying df's own index. Wrapping a bare
    # array in pd.Series() would attach a fresh 0..n-1 index, and the column
    # assignment below aligns by label: on a frame whose index is not exactly
    # 0..n-1 in row order the RSI values would land on the wrong rows or turn
    # into NaN. The first row's NaN delta counts as neither gain nor loss.
    gain = delta.where(delta > 0, 0.0)
    loss = (-delta).where(delta < 0, 0.0)
    roll_up = gain.rolling(window).mean()
    roll_down = loss.rolling(window).mean()
    # roll_down == 0 gives rs = inf and therefore RSI = 100; 0/0 gives NaN.
    rs = roll_up / roll_down
    # Warm-up rows and flat windows (0/0) get the neutral midpoint 50.
    df['rsi'] = (100 - (100 / (1 + rs))).fillna(50)

    # 10. Momentum (5-day)
    df['momentum_5'] = (df['close'] - df['close'].shift(5)).fillna(0)

    # 11. Log volume
    # NOTE(review): this is the log of the 5-day *mean* volume, not of the raw
    # daily volume, despite the column name. Kept as-is; confirm it is intended.
    df['log_volume'] = np.log(df['vol_mean_5'] + 1)

    # 12. RSI moving average (5-day trend of RSI); warm-up rows fall back to
    # the RSI itself, mirroring how ma_5 / ma_20 / vol_mean_5 are filled.
    df['rsi_ma5'] = df['rsi'].rolling(5).mean().fillna(df['rsi'])

    return df



## Feature Engineering Documentation

1. Daily Return (daily_return)
- Purpose: Measures the day-over-day percentage change in the closing price.
- Impact: Helps the model capture short-term price fluctuations.

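2. Log Return (log_return)
- Purpose: Natural logarithm of the ratio of each day's closing price to the previous day's closing price.
- Impact: Log returns add up across days and treat gains and losses more symmetrically than simple returns, giving the model a better-behaved measure of price change.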

3. 5-day Rolling Volatility (volatility_5d)
- Purpose: Standard deviation of daily returns over the past 5 days.
- Impact: Reflects recent market volatility, aiding prediction of price swings.

4. 5-day and 20-day Moving Averages (ma_5, ma_20)
- Purpose: Average closing price over the past 5 and 20 days.
- Impact: Shows short-term and mid-term trends, helping the model identify price direction.

5. MA Spread (ma_spread)
- Purpose: Difference between short-term and long-term moving averages.
- Impact: Indicates trend strength and potential acceleration or reversal in price.

6. 5-day Volume Rolling Mean (vol_mean_5)
- Purpose: Average trading volume over the past 5 days.
- Impact: Reflects market activity changes, signaling potential price movements.

7. High-Low Spread (hl_spread)
- Purpose: Difference between daily high and low prices.
- Impact: Measures daily price range, helping the model understand price instability.

8. Close-Open Difference (co_diff)
- Purpose: Difference between daily close and open prices.
- Impact: Captures daily price direction and magnitude, providing trend information.

9. RSI 14-day (rsi)
- Purpose: Relative Strength Index based on past 14 days (0-100).
- Impact: Indicates potential overbought or oversold conditions, useful for short-term trend prediction.

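10. 5-day Momentum (momentum_5)
- Purpose: Difference between current close and close 5 days ago.
- Impact: Measures short-term price momentum, helping assess continuation or reversal potential.

11. Log Volume (log_volume)
- Purpose: Natural logarithm of the 5-day volume rolling mean plus one.
- Impact: Compresses the wide range of trading volumes so that extreme-volume days do not dominate the feature scale.

12. 5-day RSI Moving Average (rsi_ma5)
- Purpose: Average of the 14-day RSI over the past 5 days.
- Impact: Smooths day-to-day RSI noise, showing the recent trend in overbought/oversold pressure.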

