In [1]:
import pandas as pd
import pandas_ta as ta 
import numpy as np

In [3]:
path = "Data Historical/AAPL_data.csv"
data = pd.read_csv(path)
data.dtypes

Date      object
Close     object
High      object
Low       object
Open      object
Volume    object
dtype: object

In [7]:
data

Unnamed: 0,Date,Close,High,Low,Open,Volume
0,,AAPL,AAPL,AAPL,AAPL,AAPL
1,1980-12-12,0.09872589260339737,0.09915511179359737,0.09872589260339737,0.09872589260339737,469033600
2,1980-12-15,0.09357532113790512,0.09400454041387873,0.09357532113790512,0.09400454041387873,175884800
3,1980-12-16,0.08670707792043686,0.087136291402555,0.08670707792043686,0.087136291402555,105728000
4,1980-12-17,0.08885318040847778,0.08928239973569656,0.08885318040847778,0.08885318040847778,86441600
...,...,...,...,...,...,...
11136,2025-02-18,244.47000122070312,245.17999267578125,241.83999633789062,244.14999389648438,48822500
11137,2025-02-19,244.8699951171875,246.00999450683594,243.16000366210938,244.66000366210938,32204200
11138,2025-02-20,245.8300018310547,246.77999877929688,244.2899932861328,244.94000244140625,32316900
11139,2025-02-21,245.5500030517578,248.69000244140625,245.22000122070312,245.9499969482422,53119400


In [8]:
numeric_cols = ['Close', 'High', 'Low', 'Open', 'Volume']
data[numeric_cols] = data[numeric_cols].apply(pd.to_numeric, errors='coerce')


In [10]:
# Step 2: Calculate RSI for 14, 30, and 200 periods
data['RSI_14'] = ta.rsi(data['Close'], length=14)
data['RSI_30'] = ta.rsi(data['Close'], length=30)
data['RSI_200'] = ta.rsi(data['Close'], length=200)

# Step 3: Calculate Momentum for 10 and 30 periods
data['MOM_10'] = ta.mom(data['Close'], length=10)
data['MOM_30'] = ta.mom(data['Close'], length=30)

# Step 4: Calculate MACD
macd = ta.macd(data['Close'], fast=12, slow=26, signal=9)
data = pd.concat([data, macd], axis=1)  # Add MACD columns to the dataframe

# Step 5: Calculate PROC (Price Rate of Change) for 9 periods
data['PROC_9'] = ta.roc(data['Close'], length=9)

# Step 6: Calculate EMA for 10, 30 periods
data['EMA_10'] = ta.ema(data['Close'], length=10)
data['EMA_30'] = ta.ema(data['Close'], length=30)

data['Pct_Change'] = data['Close'].pct_change()
data['Log_Returns'] = np.log(data['Close'] / data['Close'].shift(1))

data['Volatility_10'] = data['Close'].rolling(window=10).std()
data['Volatility_30'] = data['Close'].rolling(window=30).std()


# Step 7: Display the dataframe with new features
print(data[['Open', 'High', 'Low', 'Close', 
            'RSI_14', 'RSI_30', 'RSI_200', 
            'MOM_10', 'MOM_30', 
            'MACD_12_26_9', 'MACDh_12_26_9', 'MACDs_12_26_9', 
            'PROC_9', 
            'EMA_10', 'EMA_30']].tail())

             Open        High         Low       Close     RSI_14     RSI_30  \
11136  244.149994  245.179993  241.839996  244.470001  60.452968  55.156709   
11137  244.660004  246.009995  243.160004  244.869995  60.839364  55.370690   
11138  244.940002  246.779999  244.289993  245.830002  61.803953  55.893241   
11139  245.949997  248.690002  245.220001  245.550003  61.329462  55.696479   
11140  244.807999  248.860001  244.589996  247.554993  63.490952  56.822429   

         RSI_200     MOM_10    MOM_30  MACD_12_26_9  MACD_12_26_9  \
11136  54.320456  16.710419  0.887802      0.827891      0.827891   
11137  54.361388  12.325668  1.777267      1.410605      1.410605   
11138  54.459821  13.615311  1.099075      1.927654      1.927654   
11139  54.425412  12.586136  3.606003      2.288446      2.288446   
11140  54.631702  20.174988  5.121552      2.704981      2.704981   

       MACDh_12_26_9  MACDh_12_26_9  MACDs_12_26_9  MACDs_12_26_9    PROC_9  \
11136       1.967624       1.96

In [5]:
data.tail()

Unnamed: 0,Date,Close,High,Low,Open,Volume
11136,2025-02-18,244.47000122070312,245.17999267578125,241.83999633789065,244.1499938964844,48822500
11137,2025-02-19,244.8699951171875,246.00999450683597,243.16000366210935,244.66000366210935,32204200
11138,2025-02-20,245.8300018310547,246.77999877929688,244.2899932861328,244.94000244140625,32316900
11139,2025-02-21,245.5500030517578,248.69000244140625,245.22000122070312,245.9499969482422,53119400
11140,2025-02-24,247.55499267578125,248.8600006103516,244.58999633789065,244.80799865722656,24437621


In [11]:
# Step 3: Handle null values
# Drop rows with nulls in price data
data.dropna(subset=['Open', 'High', 'Low', 'Close'], inplace=True)

# Forward fill nulls in technical indicators
data.fillna(method='ffill', inplace=True)

# Drop any remaining nulls
data.dropna(inplace=True)

# Step 4: Display the cleaned dataframe
print(data.isnull().sum())  # Check for remaining nulls


Date             0
Close            0
High             0
Low              0
Open             0
Volume           0
RSI_14           0
RSI_30           0
RSI_200          0
MOM_10           0
MOM_30           0
MACD_12_26_9     0
MACDh_12_26_9    0
MACDs_12_26_9    0
PROC_9           0
EMA_10           0
EMA_30           0
Pct_Change       0
Log_Returns      0
Volatility_10    0
Volatility_30    0
MACD_12_26_9     0
MACDh_12_26_9    0
MACDs_12_26_9    0
dtype: int64


  data.fillna(method='ffill', inplace=True)


## Define target value

In [12]:
# Define the horizon (e.g., 5 day ahead)
horizon = 5
threshold = 0.02
# Calculate future price change
data['Future_Close'] = data['Close'].shift(-horizon)  # Shift close price forward
data['Price_Change'] = data['Future_Close'] - data['Close']

# Define buy/sell signals
data['Signal'] = np.where(data['Price_Change'] > threshold, 1, 
                         np.where(data['Price_Change'] < -threshold, 0, 2))
data.dropna(subset=['Signal'], inplace=True)  # Remove rows with insignificant moves

# Drop rows with NaN in the target variable (last `horizon` rows)
data.dropna(subset=['Future_Close'], inplace=True)

In [13]:
data.columns

Index(['Date', 'Close', 'High', 'Low', 'Open', 'Volume', 'RSI_14', 'RSI_30',
       'RSI_200', 'MOM_10', 'MOM_30', 'MACD_12_26_9', 'MACDh_12_26_9',
       'MACDs_12_26_9', 'PROC_9', 'EMA_10', 'EMA_30', 'Pct_Change',
       'Log_Returns', 'Volatility_10', 'Volatility_30', 'MACD_12_26_9',
       'MACDh_12_26_9', 'MACDs_12_26_9', 'Future_Close', 'Price_Change',
       'Signal'],
      dtype='object')

In [14]:
data.to_csv('final_data.csv', index = False)
