In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
import pandas_ta as ta
from sklearn.preprocessing import LabelEncoder

## Cleaning the data

In [2]:
def cleaning_data(data):
    """
    Clean raw OHLCV stock data and return it in chronological order.

    Steps (performed on a copy; ``data`` itself is never modified):
      1. Coerce the price/volume columns to numeric; unparseable values become NaN.
      2. Replace +/-inf in those columns with NaN.
      3. Drop every row that contains a NaN in any column.
      4. If a 'Date' column exists, sort the rows chronologically and reset
         the index.

    The 'Date' values are left exactly as they were (no dtype conversion).
    They are parsed only temporarily to build the sort key, so string dates
    such as '12/31/2023' and '01/02/2024' are ordered by calendar time
    instead of alphabetically.

    Parameters:
        data (DataFrame): Raw data with the columns 'Adj Close', 'Close',
            'High', 'Low', 'Open' and 'Volume', and optionally 'Date'.

    Returns:
        DataFrame: Cleaned copy of ``data``. The index is reset to 0..n-1
        only when a 'Date' column is present.

    Raises:
        KeyError: If any of the six numeric columns is missing.
    """
    df = data.copy()

    # Convert numeric columns; anything that is not a number becomes NaN
    numeric_cols = ['Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume']
    df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

    # Treat infinities as missing so the dropna() below removes them as well
    df[numeric_cols] = df[numeric_cols].replace([np.inf, -np.inf], np.nan)

    # Drop remaining NaN values (any column, not only the numeric ones)
    df = df.dropna()

    # Sort by date and reset index. Sorting goes through a parsed datetime key
    # because plain string comparison is only chronological for zero-padded
    # ISO dates; values that cannot be parsed become NaT and are placed last.
    if 'Date' in df.columns:
        df = df.sort_values(
            by='Date',
            key=lambda col: pd.to_datetime(col, errors='coerce'),
        ).reset_index(drop=True)

    return df


## Feature Engineering

def extract_features(data):
    """
    Build technical-indicator features on top of cleaned stock data.

    NOTE: a later cell defines another ``extract_features`` with the same
    name; once that cell has run, it replaces this definition.

    Working on a copy of ``data``, the function adds, in this order:
    'Daily Return', '5d_momentum', 'MACD_Line', 'Signal_Line', 'RSI',
    'Daily_Return_Lag_1..3', 'MACD_Signal_Diff', 'RSI_Trend', the three
    Bollinger Band columns, 'ADX', 'Volume_Change' and 'Avg_Volume'.
    Bollinger Bands and ADX are best effort: a failure or a ``None`` result
    is printed and the corresponding columns are left out.

    Parameters:
        data (DataFrame): Cleaned data with 'Adj Close', 'Close', 'High',
            'Low' and 'Volume' columns.

    Returns:
        DataFrame: Copy of ``data`` with the feature columns added and every
        row that contains a NaN removed.
    """
    frame = data.copy()
    adj_close = frame['Adj Close']

    # Return over one row and price ratio against five rows earlier
    frame['Daily Return'] = adj_close.pct_change()
    frame['5d_momentum'] = adj_close / adj_close.shift(5)

    # MACD = EMA(12) - EMA(26); the signal line is the EMA(9) of the MACD line
    ema_fast = adj_close.ewm(span=12, adjust=False).mean()
    ema_slow = adj_close.ewm(span=26, adjust=False).mean()
    frame['MACD_Line'] = ema_fast - ema_slow
    frame['Signal_Line'] = frame['MACD_Line'].ewm(span=9, adjust=False).mean()

    # 14-period RSI via pandas_ta
    frame['RSI'] = ta.rsi(adj_close, length=14)

    # Previous 1, 2 and 3 daily returns as separate columns
    for lag in (1, 2, 3):
        frame[f'Daily_Return_Lag_{lag}'] = frame['Daily Return'].shift(lag)

    # Features derived from the indicators above
    frame['MACD_Signal_Diff'] = frame['MACD_Line'] - frame['Signal_Line']
    frame['RSI_Trend'] = frame['RSI'].diff()

    # Bollinger Bands: the first three result columns are taken by position
    try:
        bands = ta.bbands(adj_close, length=20)
        if bands is None:
            print("Bollinger Bands calculation returned None.")
        else:
            for position, column in enumerate(('BB_Lower', 'BB_Middle', 'BB_Upper')):
                frame[column] = bands.iloc[:, position]
    except Exception as e:
        print(f"Error calculating Bollinger Bands: {e}")

    # ADX: only the first result column is kept
    try:
        adx_result = ta.adx(frame['High'], frame['Low'], frame['Close'], length=14)
        if adx_result is None:
            print("ADX calculation returned None.")
        else:
            frame['ADX'] = adx_result.iloc[:, 0]
    except Exception as e:
        print(f"Error calculating ADX: {e}")

    # Volume-based features
    volume = frame['Volume']
    frame['Volume_Change'] = volume.pct_change()
    frame['Avg_Volume'] = volume.rolling(window=14).mean()

    # Shifts, diffs and rolling windows leave NaN rows behind; drop them
    return frame.dropna()


In [3]:
import logging

logging.basicConfig(level=logging.INFO)

def extract_features(data, macd_short=12, macd_long=26, macd_signal=9, rsi_length=14, bb_length=20, adx_length=14):
    """
    Extracts technical indicators and features from the cleaned stock data.

    Works on a copy of ``data``. MACD is computed with pandas exponential
    moving averages; RSI, Bollinger Bands and ADX are delegated to pandas_ta.
    Bollinger Bands and ADX are best effort: if the calculation raises or
    returns ``None`` the problem is logged and those columns are left out.
    Errors in the MACD or RSI steps propagate to the caller. Finally, every
    row containing a NaN in any column is dropped.

    Parameters:
        data (DataFrame): The input stock data; must contain 'Adj Close',
            'Close', 'High' and 'Low'.
        macd_short (int): The short EMA period for MACD.
        macd_long (int): The long EMA period for MACD.
        macd_signal (int): The EMA period for the MACD signal line.
        rsi_length (int): The period for calculating RSI.
        bb_length (int): The period for Bollinger Bands.
        adx_length (int): The period for ADX.

    Returns:
        DataFrame: A copy of ``data`` with 'MACD_Line', 'Signal_Line' and
        'RSI' added, plus 'BB_Lower', 'BB_Middle', 'BB_Upper' and 'ADX' when
        those calculations succeed; rows containing NaN are removed.
    """
    df = data.copy()

    # MACD and Signal Line
    logging.info("Calculating MACD...")
    short_ema = df['Adj Close'].ewm(span=macd_short, adjust=False).mean()
    long_ema = df['Adj Close'].ewm(span=macd_long, adjust=False).mean()
    df['MACD_Line'] = short_ema - long_ema
    df['Signal_Line'] = df['MACD_Line'].ewm(span=macd_signal, adjust=False).mean()
    logging.info("MACD calculation complete.")

    # RSI
    logging.info("Calculating RSI...")
    df['RSI'] = ta.rsi(df['Adj Close'], length=rsi_length)
    logging.info("RSI calculation complete.")

    # Bollinger Bands (best effort)
    try:
        bbands = ta.bbands(df['Adj Close'], length=bb_length)
        if bbands is not None:
            # Columns are picked by position.
            # NOTE(review): assumes pandas_ta returns lower, middle, upper
            # as the first three columns -- confirm against the installed version.
            df['BB_Lower'] = bbands.iloc[:, 0]
            df['BB_Middle'] = bbands.iloc[:, 1]
            df['BB_Upper'] = bbands.iloc[:, 2]
        else:
            logging.warning("Bollinger Bands calculation returned None.")
    except Exception:
        # Report through logging (with traceback) like the rest of the function
        logging.exception("Error calculating Bollinger Bands")

    # ADX (best effort); only the first result column is kept
    try:
        adx = ta.adx(df['High'], df['Low'], df['Close'], length=adx_length)
        if adx is not None:
            df['ADX'] = adx.iloc[:, 0]
        else:
            logging.warning("ADX calculation returned None.")
    except Exception:
        logging.exception("Error calculating ADX")

    # Drop NaN values caused by rolling calculations
    df = df.dropna()
    logging.info("Feature extraction complete.")

    return df


## Apply Function to the dataset

In [4]:
# Load the raw AAPL CSV; the path is relative to the notebook's working directory.
path = "Data Historical/AAPL_data.csv"
stock_df = pd.read_csv(path)


In [5]:
cleaned_data = cleaning_data(stock_df)
cleaned_data

Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume
0,1980-12-12,0.098834,0.128348,0.128906,0.128348,0.128348,469033600.0
1,1980-12-15,0.093678,0.121652,0.122210,0.121652,0.122210,175884800.0
2,1980-12-16,0.086802,0.112723,0.113281,0.112723,0.113281,105728000.0
3,1980-12-17,0.088951,0.115513,0.116071,0.115513,0.115513,86441600.0
4,1980-12-18,0.091530,0.118862,0.119420,0.118862,0.118862,73449600.0
...,...,...,...,...,...,...,...
11075,2024-11-18,228.020004,228.020004,229.740005,225.169998,225.250000,44686000.0
11076,2024-11-19,228.279999,228.279999,230.160004,226.660004,226.979996,36211800.0
11077,2024-11-20,229.000000,229.000000,229.929993,225.889999,228.059998,35169600.0
11078,2024-11-21,228.520004,228.520004,230.160004,225.710007,228.880005,42071900.0


In [6]:
featured_data = extract_features(cleaned_data)

INFO:root:Calculating MACD...
INFO:root:MACD calculation complete.
INFO:root:Calculating RSI...
INFO:root:RSI calculation complete.
INFO:root:Feature extraction complete.


In [7]:
featured_data

Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume,MACD_Line,Signal_Line,RSI,BB_Lower,BB_Middle,BB_Upper,ADX
27,1981-01-22,0.113015,0.146763,0.147879,0.146763,0.146763,35548800.0,0.001851,0.001999,57.172613,0.100432,0.112048,0.123665,9.309829
28,1981-01-23,0.112585,0.146205,0.147321,0.146205,0.146763,11222400.0,0.001930,0.001985,56.493064,0.100474,0.112091,0.123709,9.898616
29,1981-01-26,0.110867,0.143973,0.144531,0.143973,0.144531,24640000.0,0.001833,0.001954,53.741887,0.100845,0.111533,0.122220,9.843361
30,1981-01-27,0.110007,0.142857,0.143973,0.142857,0.143973,23699200.0,0.001667,0.001897,52.368568,0.101740,0.110845,0.119951,9.522146
31,1981-01-28,0.106570,0.138393,0.138951,0.138393,0.138951,28156800.0,0.001244,0.001766,47.175750,0.102078,0.110136,0.118194,8.963527
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11075,2024-11-18,228.020004,228.020004,229.740005,225.169998,225.250000,44686000.0,-0.610971,-0.607063,51.821302,219.615305,227.438286,235.261267,12.246967
11076,2024-11-19,228.279999,228.279999,230.160004,226.660004,226.979996,36211800.0,-0.414482,-0.568547,52.265694,220.181711,227.072246,233.962780,11.782090
11077,2024-11-20,229.000000,229.000000,229.929993,225.889999,228.059998,35169600.0,-0.198379,-0.494513,53.543633,220.226302,226.996926,233.767550,11.121455
11078,2024-11-21,228.520004,228.520004,230.160004,225.710007,228.880005,42071900.0,-0.065097,-0.408630,52.533896,220.268651,226.907095,233.545540,10.575246


In [8]:
print(featured_data.dtypes)  # Check the data types of each column


Date            object
Adj Close      float64
Close          float64
High           float64
Low            float64
Open           float64
Volume         float64
MACD_Line      float64
Signal_Line    float64
RSI            float64
BB_Lower       float64
BB_Middle      float64
BB_Upper       float64
ADX            float64
dtype: object


In [9]:
featured_data.columns

Index(['Date', 'Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume',
       'MACD_Line', 'Signal_Line', 'RSI', 'BB_Lower', 'BB_Middle', 'BB_Upper',
       'ADX'],
      dtype='object')

# Handle remaining NaN values if any by filling each numeric column with its mean.
# numeric_only=True is needed because 'Date' is an object (string) column:
# without it, pandas >= 2.0 raises a TypeError when it tries to average it.
featured_data = featured_data.fillna(featured_data.mean(numeric_only=True))


## Add Target Column

# Add target column (direction: 1 when the next row's Close is higher, else 0).
# NOTE(review): the last row has no next Close, so shift(-1) yields NaN there and
# the comparison is False -- that row is labelled 0 rather than "unknown".
# NOTE(review): this adds the column to featured_data in place; the later
# create_target_column() cell derives the same label on a copy.
featured_data['direction'] = (featured_data['Close'].shift(-1) > featured_data['Close']).astype(int)
featured_data

In [10]:
print(featured_data.columns)


Index(['Date', 'Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume',
       'MACD_Line', 'Signal_Line', 'RSI', 'BB_Lower', 'BB_Middle', 'BB_Upper',
       'ADX'],
      dtype='object')


def create_target_column(df):
    """
    Adds a target column 'direction' based on MACD and RSI indicators.

    The input is copied, so the caller's DataFrame is not modified. Two helper
    columns, 'direction_macd' and 'direction_rsi', are added alongside
    'direction'.

    NOTE(review): a later cell redefines ``create_target_column`` with a
    next-row-price label; once that cell runs it replaces this definition.

    NOTE(review): as written, 'direction' always ends up equal to
    'direction_macd'. The combining lambda only yields 1 when direction_macd
    is 1 and only yields 0 when direction_macd is 0; every other row is then
    filled from 'direction_macd'. The RSI signal never changes the label.

    Parameters:
        df (DataFrame): Processed stock data with 'MACD_Line', 'Signal_Line'
            and 'RSI' columns.

    Returns:
        DataFrame: Copy of the input with 'direction_macd', 'direction_rsi'
        and 'direction' columns added.
    """
    df = df.copy()

    # MACD-based direction: 1 when the MACD line is above the signal line, else 0
    df['direction_macd'] = (df['MACD_Line'] > df['Signal_Line']).astype(int)

    # RSI-based direction (optional): 1 when RSI < 30, 0 when RSI > 70, otherwise None (no signal)
    # NOTE(review): pandas may store these None results as NaN, in which case the
    # `is None` checks below would never match -- confirm.
    df['direction_rsi'] = df['RSI'].apply(lambda x: 1 if x < 30 else (0 if x > 70 else None))

    # Combine MACD and RSI for final direction (optional):
    # keep the MACD direction when RSI agrees or is None, otherwise produce None
    df['direction'] = df.apply(
        lambda row: 1 if row['direction_macd'] == 1 and (row['direction_rsi'] == 1 or row['direction_rsi'] is None)
        else (0 if row['direction_macd'] == 0 and (row['direction_rsi'] == 0 or row['direction_rsi'] is None) else None),
        axis=1
    )

    # Fill None values in 'direction' (if RSI didn't signal) with MACD direction
    df['direction'] = df['direction'].fillna(df['direction_macd'])

    return df


In [16]:
def create_target_column(df):
    """
    Adds a next-row price target to a copy of the stock data.

    Two columns are appended:
      * 'future_price' - the 'Close' value of the following row.
      * 'direction'    - 1 when 'future_price' is greater than the current
                         'Close', otherwise 0.

    Rows containing a NaN in any column are dropped afterwards; this always
    removes the final row, whose 'future_price' is unknown.

    WARNING: 'future_price' stays in the returned frame. It holds the value
    'direction' is derived from, so it must not be used as a model feature.

    Parameters:
        df (DataFrame): Stock data with a 'Close' column.

    Returns:
        DataFrame: Copy of ``df`` with 'future_price' and 'direction' added
        and NaN rows removed.
    """
    labelled = df.copy()

    # Close of the following row, aligned to the current row
    next_close = labelled['Close'].shift(-1)
    labelled['future_price'] = next_close

    # 1 for Buy (price rises on the next row), 0 for Sell (flat or falling)
    labelled['direction'] = np.where(labelled['Close'] < next_close, 1, 0)

    return labelled.dropna()


In [14]:
final_data = create_target_column(featured_data)
final_data

Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume,MACD_Line,Signal_Line,RSI,BB_Lower,BB_Middle,BB_Upper,ADX,future_price,direction
27,1981-01-22,0.113015,0.146763,0.147879,0.146763,0.146763,35548800.0,0.001851,0.001999,57.172613,0.100432,0.112048,0.123665,9.309829,0.146205,0
28,1981-01-23,0.112585,0.146205,0.147321,0.146205,0.146763,11222400.0,0.001930,0.001985,56.493064,0.100474,0.112091,0.123709,9.898616,0.143973,0
29,1981-01-26,0.110867,0.143973,0.144531,0.143973,0.144531,24640000.0,0.001833,0.001954,53.741887,0.100845,0.111533,0.122220,9.843361,0.142857,0
30,1981-01-27,0.110007,0.142857,0.143973,0.142857,0.143973,23699200.0,0.001667,0.001897,52.368568,0.101740,0.110845,0.119951,9.522146,0.138393,0
31,1981-01-28,0.106570,0.138393,0.138951,0.138393,0.138951,28156800.0,0.001244,0.001766,47.175750,0.102078,0.110136,0.118194,8.963527,0.133371,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11075,2024-11-18,228.020004,228.020004,229.740005,225.169998,225.250000,44686000.0,-0.610971,-0.607063,51.821302,219.615305,227.438286,235.261267,12.246967,228.279999,1
11076,2024-11-19,228.279999,228.279999,230.160004,226.660004,226.979996,36211800.0,-0.414482,-0.568547,52.265694,220.181711,227.072246,233.962780,11.782090,229.000000,1
11077,2024-11-20,229.000000,229.000000,229.929993,225.889999,228.059998,35169600.0,-0.198379,-0.494513,53.543633,220.226302,226.996926,233.767550,11.121455,228.520004,0
11078,2024-11-21,228.520004,228.520004,230.160004,225.710007,228.880005,42071900.0,-0.065097,-0.408630,52.533896,220.268651,226.907095,233.545540,10.575246,229.520004,1


In [None]:
# Option 1: Replace Neutral (NaN) with MACD Signal
# 'direction_rsi' and 'direction_macd' only exist when final_data was built by the
# MACD/RSI-based create_target_column variant. The next-row-price variant does not
# create them, so guard against a KeyError when the notebook is run top to bottom.
if {'direction_rsi', 'direction_macd'}.issubset(final_data.columns):
    # Fill missing RSI-based signals with MACD-based signals
    final_data['direction_rsi'] = final_data['direction_rsi'].fillna(final_data['direction_macd'])

    # Use this final 'direction_rsi' as your target column
    final_data['direction'] = final_data['direction_rsi']
else:
    print("Skipping Option 1: 'direction_rsi'/'direction_macd' are not present in final_data.")


In [19]:
# Drop every row of final_data that still contains a missing value.
final_data = final_data.dropna() 


In [20]:
# Display remaining null values (if any)
print(final_data.isnull().sum())


Date            0
Adj Close       0
Close           0
High            0
Low             0
Open            0
Volume          0
MACD_Line       0
Signal_Line     0
RSI             0
BB_Lower        0
BB_Middle       0
BB_Upper        0
ADX             0
future_price    0
direction       0
dtype: int64


In [None]:
# Visualize the price movement as a candlestick chart
fig = go.Figure()
fig.add_trace(
    go.Candlestick(
        x=final_data['Date'],
        open=final_data['Open'],
        high=final_data['High'],
        low=final_data['Low'],
        close=final_data['Close'],
    )
)

# Titles for the chart and both axes; the range slider under the x-axis is hidden
fig.update_layout(
    title='Stock Price Candlestick Chart',
    xaxis_title='Date',
    yaxis_title='Price',
    xaxis_rangeslider_visible=False,
)

fig.show()


# Plot the MACD line against its signal line.
# 'Date' holds strings, which matplotlib would treat as one category per row;
# parse them so the x-axis is a real time axis.
macd_dates = pd.to_datetime(final_data['Date'])

plt.figure(figsize=(12, 6))
plt.plot(macd_dates, final_data['MACD_Line'], label='MACD Line', color='blue')
plt.plot(macd_dates, final_data['Signal_Line'], label='Signal Line', color='orange')
# NOTE(review): the title literal previously had a stray
# `clean_data.to_csv('final_data/user_journey.csv', index = False)` fragment pasted
# into it, which made this cell a SyntaxError. `clean_data` is not defined in the
# visible notebook, so the fragment is not executed here.
plt.title('MACD for AAPL')
plt.xlabel('Date')
plt.ylabel('MACD')
plt.legend(loc='upper left')
plt.show()


## Store data

In [None]:
final_data.head(10)

In [21]:
# Class balance of the target, as shown by the recorded output of this cell:
#   Direction 0 (Sell): 5,553 instances (~50.2%)
#   Direction 1 (Buy):  5,499 instances (~49.8%)
final_data["direction"].value_counts()

direction
0    5553
1    5499
Name: count, dtype: int64

In [22]:
# Remove the 'Date' column before export. errors='ignore' keeps the cell
# re-runnable: without it a second execution raises KeyError because 'Date'
# is already gone.
final_data = final_data.drop(columns=['Date'], errors='ignore')


# NOTE(review): `model`, `X_resampled` and `y_resampled` are not defined in any
# cell visible in this notebook, so this raises NameError on a fresh kernel --
# define them first or remove this cell.
# NOTE(review): this import sits mid-notebook; the other imports are in the first cell.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X_resampled, y_resampled, cv=5)  # 5-fold cross-validation
print("Cross-validation scores:", scores)
print("Mean CV score:", scores.mean())


// eliminate highly correlated features
// try direction with MACD (5 > 15 => 1, else 0)
// try direction with RSI


## Store the data into CSV 

In [23]:
# NOTE(review): final_data still contains 'future_price' (the next row's Close that
# 'direction' is derived from). Drop it before training on this file, otherwise
# the target leaks into the features.
final_data.to_csv('final_data.csv', index = False)
