In [151]:
import pandas as pd
import pandas_ta as ta 
import numpy as np

In [152]:
# Location of the raw AAPL price history, relative to the notebook's working directory.
path = "Data Historical/AAPL_data.csv"

# Read the CSV with default parsing; no dtypes are forced here, so the next
# expression shows what pandas inferred for each column.
data = pd.read_csv(filepath_or_buffer=path)
data.dtypes

Date         object
Adj Close    object
Close        object
High         object
Low          object
Open         object
Volume       object
dtype: object

In [153]:
# Price/volume columns that must be numeric before any indicator is computed.
numeric_cols = ['Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume']

# Convert each column in turn; values that cannot be parsed become NaN
# (errors='coerce') instead of raising.
for column_name in numeric_cols:
    data[column_name] = pd.to_numeric(data[column_name], errors='coerce')


In [154]:
# Feature engineering: every indicator below is derived from the 'Close' column.
close = data['Close']

# Relative Strength Index over 14-, 30- and 200-row windows.
for length in (14, 30, 200):
    data[f'RSI_{length}'] = ta.rsi(close, length=length)

# Momentum over 10- and 30-row windows.
for length in (10, 30):
    data[f'MOM_{length}'] = ta.mom(close, length=length)

# MACD (fast=12, slow=26, signal=9).
# The columns are assigned by name instead of pd.concat-ing the frame onto `data`:
# concat appended a second copy of every MACD column each time this cell was
# re-run, whereas assignment simply overwrites the existing columns.
macd = ta.macd(close, fast=12, slow=26, signal=9)
for macd_col in macd.columns:
    data[macd_col] = macd[macd_col]

# Price Rate of Change over 9 rows.
data['PROC_9'] = ta.roc(close, length=9)

# Exponential moving averages over 10 and 30 rows.
for length in (10, 30):
    data[f'EMA_{length}'] = ta.ema(close, length=length)

# One-row simple and logarithmic returns.
data['Pct_Change'] = close.pct_change()
data['Log_Returns'] = np.log(close / close.shift(1))

# Rolling standard deviation of the close price itself (price units, not returns).
for window in (10, 30):
    data[f'Volatility_{window}'] = close.rolling(window=window).std()

# Preview the most recent rows of the price columns and the new indicators.
preview_cols = [
    'Open', 'High', 'Low', 'Close', 'Adj Close',
    'RSI_14', 'RSI_30', 'RSI_200',
    'MOM_10', 'MOM_30',
    'MACD_12_26_9', 'MACDh_12_26_9', 'MACDs_12_26_9',
    'PROC_9',
    'EMA_10', 'EMA_30',
]
print(data[preview_cols].tail())

             Open        High         Low       Close   Adj Close     RSI_14  \
11076  225.250000  229.740005  225.169998  228.020004  228.020004  51.542650   
11077  226.979996  230.160004  226.660004  228.279999  228.279999  51.986984   
11078  228.059998  229.929993  225.889999  229.000000  229.000000  53.265028   
11079  228.880005  230.160004  225.710007  228.520004  228.520004  52.266187   
11080  228.059998  230.130005  228.059998  229.520004  229.520004  54.193399   

          RSI_30    RSI_200    MOM_10    MOM_30  MACD_12_26_9  MACDh_12_26_9  \
11076  51.873211  53.925678  6.010010  6.330002     -0.680781      -0.026659   
11077  52.068742  53.954426  4.830002  2.509995     -0.485303       0.135055   
11078  52.620201  54.034246  6.279999 -0.539993     -0.269184       0.280939   
11079  52.205970  53.971560  1.040009 -0.519989     -0.135083       0.332032   
11080  53.003310  54.083096  2.559998  1.970001      0.051294       0.414727   

       MACDs_12_26_9    PROC_9      EM

In [155]:
# Rich display of the last five rows, now including every engineered column.
data.tail(n=5)

Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume,RSI_14,RSI_30,RSI_200,...,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,PROC_9,EMA_10,EMA_30,Pct_Change,Log_Returns,Volatility_10,Volatility_30
11076,2024-11-18,228.020004,228.020004,229.740005,225.169998,225.25,44686000.0,51.54265,51.873211,53.925678,...,-0.680781,-0.026659,-0.654122,2.045204,226.341986,227.157843,0.013422,0.013333,1.981399,4.157991
11077,2024-11-19,228.279999,228.279999,230.160004,226.660004,226.979996,36211800.0,51.986984,52.068742,53.954426,...,-0.485303,0.135055,-0.620358,2.496407,226.694352,227.23024,0.00114,0.00114,2.003083,4.119655
11078,2024-11-20,229.0,229.0,229.929993,225.889999,228.059998,35169600.0,53.265028,52.620201,54.034246,...,-0.269184,0.280939,-0.550124,0.668193,227.113561,227.344418,0.003154,0.003149,1.828245,4.117942
11079,2024-11-21,228.520004,228.520004,230.160004,225.710007,228.880005,42071900.0,52.266187,52.20597,53.97156,...,-0.135083,0.332032,-0.467115,0.687345,227.369278,227.420263,-0.002096,-0.002098,1.908282,4.118348
11080,2024-11-22,229.520004,229.520004,230.130005,228.059998,228.059998,8449370.0,54.193399,53.00331,54.083096,...,0.051294,0.414727,-0.363434,2.359189,227.760319,227.55573,0.004376,0.004366,2.100432,4.112365


In [156]:
# Handle null values in three passes:
#   1. Drop rows that are missing any raw price field.
#   2. Forward-fill the remaining gaps from the previous row. DataFrame.ffill()
#      replaces the deprecated fillna(method='ffill') and fills every column,
#      not only the indicator columns.
#   3. Drop whatever is still null: forward-fill cannot fill leading NaNs, such
#      as the rows at the start of each rolling/indicator window.
# The result is reassigned rather than mutated in place so the cell reads as a
# single pipeline.
price_cols = ['Open', 'High', 'Low', 'Close', 'Adj Close']
data = (
    data
    .dropna(subset=price_cols)
    .ffill()
    .dropna()
)

# Sanity check: every column should report zero remaining nulls.
print(data.isnull().sum())


Date             0
Adj Close        0
Close            0
High             0
Low              0
Open             0
Volume           0
RSI_14           0
RSI_30           0
RSI_200          0
MOM_10           0
MOM_30           0
MACD_12_26_9     0
MACDh_12_26_9    0
MACDs_12_26_9    0
PROC_9           0
EMA_10           0
EMA_30           0
Pct_Change       0
Log_Returns      0
Volatility_10    0
Volatility_30    0
dtype: int64


  data.fillna(method='ffill', inplace=True)


## Define target value

In [157]:
# Prediction horizon in rows: each label compares a row's close with the close
# `horizon` rows later.
horizon = 5

# Minimum absolute move, in the same units as 'Close', for a row to be labelled
# as an up or down move.
# NOTE(review): `threshold` was not defined anywhere in the visible notebook, so
# this cell raised NameError on a fresh kernel. 0.0 is a placeholder (any rise
# -> 1, any fall -> 0, unchanged -> 2) -- confirm the intended value.
threshold = 0.0

# Close price `horizon` rows ahead, and the change from the current close.
data['Future_Close'] = data['Close'].shift(-horizon)
data['Price_Change'] = data['Future_Close'] - data['Close']

# Three-class label: 1 = rise above threshold, 0 = fall below -threshold,
# 2 = move within the threshold band. Rows labelled 2 are kept; np.where never
# produces NaN, so the former dropna(subset=['Signal']) removed nothing and has
# been dropped.
rises = data['Price_Change'] > threshold
falls = data['Price_Change'] < -threshold
data['Signal'] = np.where(rises, 1, np.where(falls, 0, 2))

# The last `horizon` rows have no future close (NaN after the shift), so their
# label is meaningless; remove them.
data = data.dropna(subset=['Future_Close'])

In [158]:
# List the DataFrame's columns after feature and target creation.
data.columns

Index(['Date', 'Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume', 'RSI_14',
       'RSI_30', 'RSI_200', 'MOM_10', 'MOM_30', 'MACD_12_26_9',
       'MACDh_12_26_9', 'MACDs_12_26_9', 'PROC_9', 'EMA_10', 'EMA_30',
       'Pct_Change', 'Log_Returns', 'Volatility_10', 'Volatility_30',
       'Future_Close', 'Price_Change', 'Signal'],
      dtype='object')

In [160]:
# Write the full DataFrame to CSV without the index column.
# NOTE(review): the export includes 'Future_Close' and 'Price_Change', which are
# computed from future prices -- confirm downstream code does not use them as
# model inputs.
output_file = 'final_data.csv'
data.to_csv(output_file, index=False)
