In [1]:
import pandas as pd
import pandas_ta as ta 
import numpy as np

In [2]:
# Read the AAPL historical price CSV, then list the dtype pandas inferred for
# each column so parsing problems are visible straight away.
path = "Data Historical/AAPL_data.csv"
data = pd.read_csv(filepath_or_buffer=path)
data.dtypes

Date      object
Close     object
High      object
Low       object
Open      object
Volume    object
dtype: object

In [3]:
# Preview the raw frame as loaded. In the stored output, row 0 holds the
# ticker label ("AAPL") in every price column instead of numbers.
data

Unnamed: 0,Date,Close,High,Low,Open,Volume
0,,AAPL,AAPL,AAPL,AAPL,AAPL
1,1980-12-12,0.09872590005397797,0.09915511927657,0.09872590005397798,0.09872590005397798,469033600
2,1980-12-15,0.09357534348964691,0.09400456286814542,0.09357534348964691,0.09400456286814542,175884800
3,1980-12-16,0.08670710772275925,0.08713632135240351,0.08670710772275925,0.08713632135240351,105728000
4,1980-12-17,0.08885317295789719,0.08928239224912475,0.08885317295789719,0.08885317295789719,86441600
...,...,...,...,...,...,...
11137,2025-02-19,244.8699951171875,246.00999450683594,243.16000366210938,244.66000366210938,32204200
11138,2025-02-20,245.8300018310547,246.77999877929688,244.2899932861328,244.94000244140625,32316900
11139,2025-02-21,245.5500030517578,248.69000244140625,245.22000122070312,245.9499969482422,53197400
11140,2025-02-24,247.10000610351562,248.86000061035156,244.4199981689453,244.92999267578125,51225800


In [4]:
# Cast the price and volume columns to numeric dtypes. With errors='coerce',
# any value that cannot be parsed as a number becomes NaN instead of raising.
numeric_cols = ['Close', 'High', 'Low', 'Open', 'Volume']
data[numeric_cols] = data[numeric_cols].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)


In [5]:
# Feature engineering: derive technical indicators and return/volatility
# features from the closing price. Columns are created in a fixed order
# because that order becomes the column order of the frame.

# Relative Strength Index over 14, 30 and 200 periods
data['RSI_14'] = ta.rsi(data['Close'], length=14)
data['RSI_30'] = ta.rsi(data['Close'], length=30)
data['RSI_200'] = ta.rsi(data['Close'], length=200)

# Momentum over 10 and 30 periods
data['MOM_10'] = ta.mom(data['Close'], length=10)
data['MOM_30'] = ta.mom(data['Close'], length=30)

# MACD (fast=12, slow=26, signal=9). ta.macd returns a separate frame whose
# columns are appended to `data`. Any MACD columns left over from a previous
# run of this cell are dropped first, so re-running the cell does not create
# duplicate column names. On a first run nothing is dropped.
macd = ta.macd(data['Close'], fast=12, slow=26, signal=9)
data = pd.concat(
    [data.drop(columns=macd.columns, errors='ignore'), macd],
    axis=1,
)

# Price Rate of Change over 9 periods
data['PROC_9'] = ta.roc(data['Close'], length=9)

# Exponential moving averages over 10 and 30 periods
data['EMA_10'] = ta.ema(data['Close'], length=10)
data['EMA_30'] = ta.ema(data['Close'], length=30)

# One-period simple and log returns of the close
data['Pct_Change'] = data['Close'].pct_change()
data['Log_Returns'] = np.log(data['Close'] / data['Close'].shift(1))

# Rolling standard deviation of the closing price itself (price level, not
# returns) over 10 and 30 rows
data['Volatility_10'] = data['Close'].rolling(window=10).std()
data['Volatility_30'] = data['Close'].rolling(window=30).std()

# Show the last rows of the price columns next to the new indicator columns
print(data[['Open', 'High', 'Low', 'Close',
            'RSI_14', 'RSI_30', 'RSI_200',
            'MOM_10', 'MOM_30',
            'MACD_12_26_9', 'MACDh_12_26_9', 'MACDs_12_26_9',
            'PROC_9',
            'EMA_10', 'EMA_30']].tail())

             Open        High         Low       Close     RSI_14     RSI_30  \
11137  244.660004  246.009995  243.160004  244.869995  60.839364  55.370690   
11138  244.940002  246.779999  244.289993  245.830002  61.803953  55.893241   
11139  245.949997  248.690002  245.220001  245.550003  61.329462  55.696479   
11140  244.929993  248.860001  244.419998  247.100006  63.021918  56.571969   
11141  248.000000  249.979996  244.910004  248.574997  64.609253  57.400667   

         RSI_200     MOM_10     MOM_30  MACD_12_26_9  MACDh_12_26_9  \
11137  54.361385  12.325668   1.777267      1.410605       2.040270   
11138  54.459818  13.615311   1.099075      1.927654       2.045855   
11139  54.425409  12.586136   3.606003      2.288446       1.925318   
11140  54.585050  19.720001   4.666565      2.668686       1.844446   
11141  54.736685  20.925003  11.985123      3.053845       1.783684   

       MACDs_12_26_9    PROC_9      EMA_10      EMA_30  
11137      -0.629665  5.449829  238.79258

In [6]:
# Inspect the most recent rows with every column, including the new features.
data.tail()

Unnamed: 0,Date,Close,High,Low,Open,Volume,RSI_14,RSI_30,RSI_200,MOM_10,...,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,PROC_9,EMA_10,EMA_30,Pct_Change,Log_Returns,Volatility_10,Volatility_30
11137,2025-02-19,244.869995,246.009995,243.160004,244.660004,32204200.0,60.839364,55.37069,54.361385,12.325668,...,1.410605,2.04027,-0.629665,5.449829,238.792587,236.645037,0.001636,0.001635,6.928542,7.001813
11138,2025-02-20,245.830002,246.779999,244.289993,244.940002,32316900.0,61.803953,55.893241,54.459818,13.615311,...,1.927654,2.045855,-0.118201,5.522803,240.072117,237.237615,0.00392,0.003913,7.316121,7.061546
11139,2025-02-21,245.550003,248.690002,245.220001,245.949997,53197400.0,61.329462,55.696479,54.425409,12.586136,...,2.288446,1.925318,0.363129,7.991027,241.068096,237.773898,-0.001139,-0.00114,7.457962,7.22639
11140,2025-02-24,247.100006,248.860001,244.419998,244.929993,51225800.0,63.021918,56.571969,54.58505,19.720001,...,2.668686,1.844446,0.82424,8.543823,242.164807,238.375583,0.006312,0.006293,6.556468,7.453356
11141,2025-02-25,248.574997,249.979996,244.910004,248.0,28102908.0,64.609253,57.400667,54.736685,20.925003,...,3.053845,1.783684,1.270161,6.858826,243.330296,239.03361,0.005969,0.005951,4.918177,7.878753


In [7]:
# Handle null values.
# Rows without a complete set of prices are unusable, so drop them first.
data.dropna(subset=['Open', 'High', 'Low', 'Close'], inplace=True)

# Forward fill remaining gaps with the last observed value in each column.
# DataFrame.ffill replaces the deprecated fillna(method='ffill'), which emits
# a FutureWarning on recent pandas versions.
data.ffill(inplace=True)

# Forward fill cannot populate NaNs that have no earlier value (the leading
# rows of each rolling/indicator column), so drop whatever is still null.
data.dropna(inplace=True)

# Verify the cleaned frame: every per-column null count should be 0.
print(data.isnull().sum())


Date             0
Close            0
High             0
Low              0
Open             0
Volume           0
RSI_14           0
RSI_30           0
RSI_200          0
MOM_10           0
MOM_30           0
MACD_12_26_9     0
MACDh_12_26_9    0
MACDs_12_26_9    0
PROC_9           0
EMA_10           0
EMA_30           0
Pct_Change       0
Log_Returns      0
Volatility_10    0
Volatility_30    0
dtype: int64


  data.fillna(method='ffill', inplace=True)


## Define the target variable

In [8]:
# Build the classification target from the close price `horizon` rows ahead.
horizon = 5       # look-ahead distance, in rows
threshold = 0.02  # minimum relative move (0.02 = 2%) that counts as a signal

# shift(-horizon) aligns each row with the close recorded `horizon` rows later.
data['Future_Close'] = data['Close'].shift(-horizon)
# Absolute change in price units; kept as a column for reference.
data['Price_Change'] = data['Future_Close'] - data['Close']

# The last `horizon` rows have no future close. Drop them before labelling so
# they never receive a label derived from NaN.
data.dropna(subset=['Future_Close'], inplace=True)

# Label on the relative move rather than the absolute price difference: the
# price level in this series changes by orders of magnitude over time, so a
# fixed absolute threshold would mean something different in every era.
#   1 = rise of more than `threshold`
#   0 = fall of more than `threshold`
#   2 = move within +/- `threshold` (these rows are kept, not removed)
relative_change = data['Price_Change'] / data['Close']
data['Signal'] = np.where(relative_change > threshold, 1,
                          np.where(relative_change < -threshold, 0, 2))

In [9]:
# List the final column set to confirm the features and target columns are present.
data.columns

Index(['Date', 'Close', 'High', 'Low', 'Open', 'Volume', 'RSI_14', 'RSI_30',
       'RSI_200', 'MOM_10', 'MOM_30', 'MACD_12_26_9', 'MACDh_12_26_9',
       'MACDs_12_26_9', 'PROC_9', 'EMA_10', 'EMA_30', 'Pct_Change',
       'Log_Returns', 'Volatility_10', 'Volatility_30', 'Future_Close',
       'Price_Change', 'Signal'],
      dtype='object')

In [10]:
# Write the prepared dataset to disk without the row index.
# NOTE(review): Future_Close and Price_Change are exported with the features;
# both are computed from future prices, so exclude them from model inputs.
data.to_csv(
    'final_data.csv',
    index=False,
)
