In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# List of stock symbols
stock_symbols = ["AMZN", "AAPL", "NVDA", "MSFT", "GOOG", "META", "TSLA", "WMT", "JPM", "NFLX"]  # Extend this list as needed
stock = {}

# Dynamically construct file path and load the data for each stock symbol
for symbol in stock_symbols:
    file_path = f"../clean_data/clean_data{symbol}.csv"  # Dynamically generate file path
    stock[symbol] = pd.read_csv(file_path)
# Apply EWMA to the Close column for each stock symbol
for i in range(len(stock_symbols)):
    stock[stock_symbols[i]]['Close'] = stock[stock_symbols[i]]['Close'].ewm(alpha=0.65).mean()

for i in range(len(stock_symbols)):
    stock[stock_symbols[i]]['today'] = stock[stock_symbols[i]]['Close'].pct_change() * 100

for i in range(len(stock_symbols)):
    for j in range(1, 6):
        stock[stock_symbols[i]][f'previous{str(j)}'] = stock[stock_symbols[i]]['today'].shift(j)

for i in range(len(stock_symbols)):
    stock[stock_symbols[i]]['ema50'] = stock[stock_symbols[i]]['Close'] / stock[stock_symbols[i]]['Close'].ewm(50).mean()
    stock[stock_symbols[i]]['ema21'] = stock[stock_symbols[i]]['Close'] / stock[stock_symbols[i]]['Close'].ewm(21).mean()
    stock[stock_symbols[i]]['ema14'] = stock[stock_symbols[i]]['Close'] / stock[stock_symbols[i]]['Close'].ewm(14).mean()
    stock[stock_symbols[i]]['ema5'] = stock[stock_symbols[i]]['Close'] / stock[stock_symbols[i]]['Close'].ewm(5).mean()

def rsi(X, window=14):
    delta = X.diff(1)

    gains = delta.where(delta > 0, 0)
    losses = -delta.where(delta < 0, 0)

    avg_gains = gains.rolling(window=window, min_periods=1).mean()
    avg_losses = losses.rolling(window=window, min_periods=1).mean()

    rs = avg_gains / avg_losses
    rsi = 100 - (100 / (1 + rs))

    return rsi
for i in range(len(stock_symbols)):
    stock[stock_symbols[i]]['rsi'] = rsi(stock[stock_symbols[i]]['Close'])


def macd(X, short_window=12, long_window=29, signal=9):
    short_ema = X.ewm(span=short_window, adjust=False).mean()

    long_ema = X.ewm(span=long_window, adjust=False).mean()

    macd = short_ema - long_ema

    signal = macd.ewm(span=signal, adjust=False).mean()

    return signal
for i in range(len(stock_symbols)):
    stock[stock_symbols[i]]['macd'] = macd(stock[stock_symbols[i]]['Close'])


window = 6
for i in range(len(stock_symbols)):
    stock[stock_symbols[i]]['roc'] = ((stock[stock_symbols[i]]['Close'] - stock[stock_symbols[i]]['Close'].shift(window)) / stock[stock_symbols[i]]['Close'].shift(window)) * 100


window = 16
for i in range(len(stock_symbols)):
    stock[stock_symbols[i]]['high-low'] = stock[stock_symbols[i]]['High'] - stock[stock_symbols[i]]['Low']
    stock[stock_symbols[i]]['high-preclose'] = abs(stock[stock_symbols[i]]['High'] - stock[stock_symbols[i]]['Close'].shift(1))
    stock[stock_symbols[i]]['low-preclose'] = abs(stock[stock_symbols[i]]['Low'] - stock[stock_symbols[i]]['Close'].shift(1))

    stock[stock_symbols[i]]['tr'] = stock[stock_symbols[i]][['high-low', 'high-preclose', 'low-preclose']].max(axis=1)

    stock[stock_symbols[i]]['atr'] = stock[stock_symbols[i]]['tr'].rolling(window=window).mean()

for i in range(len(stock_symbols)):
    stock[stock_symbols[i]] = stock[stock_symbols[i]].drop(['high-low', 'high-preclose', 'low-preclose', 'tr'], axis=1)


def obv(X):
    obv = pd.Series(index=X.index)
    obv.iloc[0] = 0

    for i in range(1, len(X)):
        if X['Close'].iloc[i] > X['Close'].iloc[i-1]:
            obv.iloc[i] = obv.iloc[i-1] + X['Volume'].iloc[i]
        elif X['Close'].iloc[i] < X['Close'].iloc[i-1]:
            obv.iloc[i] = obv.iloc[i-1] - X['Volume'].iloc[i]
        else:
            obv.iloc[i] = obv.iloc[i-1]

    return obv

for i in range(len(stock_symbols)):
    stock[stock_symbols[i]]['obv'] = obv(stock[stock_symbols[i]])

def cmf(X, window=16):
    money_flow_multiplier = ((X['Close'] - X['Low']) - (X['High'] - X['Close']))/(X['High']-X['Low'])

    money_flow_volume = money_flow_multiplier * X['Volume']

    cmf = money_flow_volume.rolling(window=window).sum() / X['Volume'].rolling(window=window).sum()

    return cmf

for i in range(len(stock_symbols)):
  stock[stock_symbols[i]]['cmf'] = cmf(stock[stock_symbols[i]])

def emv(X):
    emv = pd.Series(index=X.index)
    emv.iloc[0] = np.nan  # Set the first value to NaN as there is no previous data for comparison

    for i in range(1, len(X)):
        # Calculate the directional movement (dm)
        dm = 0.5 * ((X['High'].iloc[i] + X['Low'].iloc[i]) - (X['High'].iloc[i-1] + X['Low'].iloc[i-1]))
        
        # Calculate the buying pressure (br)
        br = X['Volume'].iloc[i] / (1000000 * (X['High'].iloc[i] - X['Low'].iloc[i]))

        # Calculate the EMV (Ease of Movement) value
        emv.iloc[i] = dm / br if br != 0 else 0  # Avoid division by zero

    return emv

for i in range(len(stock_symbols)):
    stock[stock_symbols[i]]['emv'] = emv(stock[stock_symbols[i]])

window = 16
for i in range(len(stock_symbols)):
    # Calculate the minimum low and maximum high over the rolling window
    stock[stock_symbols[i]]['minimum_low'] = stock[stock_symbols[i]]['Low'].rolling(window=window).min()
    stock[stock_symbols[i]]['maximum_high'] = stock[stock_symbols[i]]['High'].rolling(window=window).max()

    # Calculate the stochastic oscillator
    stock[stock_symbols[i]]['stoch'] = ((stock[stock_symbols[i]]['Close'] - stock[stock_symbols[i]]['minimum_low']) /
                                        (stock[stock_symbols[i]]['maximum_high'] - stock[stock_symbols[i]]['minimum_low'])) * 100

for i in range(len(stock_symbols)):
    stock[stock_symbols[i]] = stock[stock_symbols[i]].drop(['minimum_low', 'maximum_high'], axis=1)

def mfi(X, window=14):
    combine_price = (X['High'] + X['Low'] + X['Close']) / 3

    raw_money_flow = combine_price * X['Volume']

    flow_direction = (combine_price.diff() > 0).astype(int)

    positive_money_flow = flow_direction * raw_money_flow
    negative_money_flow = (1 - flow_direction) * raw_money_flow

    positive = positive_money_flow.rolling(window=window, min_periods=1).sum()
    negative = negative_money_flow.rolling(window=window, min_periods=1).sum()

    mf = positive / negative
    mfi = 100 - (100 / (1 + mf))

    return mfi

for i in range(len(stock_symbols)):
    stock[stock_symbols[i]]['mfi'] = mfi(stock[stock_symbols[i]])


window = 21
for i in range(len(stock_symbols)):
    # Calculate the combined price (average of High, Low, and Close)
    stock[stock_symbols[i]]['combine_price'] = (stock[stock_symbols[i]]['High'] + stock[stock_symbols[i]]['Low'] + stock[stock_symbols[i]]['Close']) / 3

    # Calculate the simple moving average of the combined price
    stock[stock_symbols[i]]['sma_combine_price'] = stock[stock_symbols[i]]['combine_price'].rolling(window=window).mean()

    # Calculate the mean deviation of the combined price
    # Apply custom function to calculate the mean absolute deviation (MAD) for each rolling window
    stock[stock_symbols[i]]['mean_deviation'] = stock[stock_symbols[i]]['combine_price'].rolling(window=window).apply(lambda x: (x - x.mean()).abs().mean(), raw=False)

    # Calculate the Commodity Channel Index (CCI)
    stock[stock_symbols[i]]['cci'] = (stock[stock_symbols[i]]['combine_price'] - stock[stock_symbols[i]]['sma_combine_price']) / (0.015 * stock[stock_symbols[i]]['mean_deviation'])

for i in range(len(stock_symbols)):
    stock[stock_symbols[i]] = stock[stock_symbols[i]].drop(['combine_price', 'sma_combine_price', 'mean_deviation'], axis=1)

for i in range(len(stock_symbols)):
    stock[stock_symbols[i]]['Volume'] = stock[stock_symbols[i]]['Volume'] / stock[stock_symbols[i]]['Volume'].ewm(5).mean()

for i in range(len(stock_symbols)):
    stock[stock_symbols[i]].replace(0, np.nan, inplace=True)
    stock[stock_symbols[i]] = stock[stock_symbols[i]].dropna()

for i in range(len(stock_symbols)):
    stock[stock_symbols[i]]['trend'] = (stock[stock_symbols[i]]['today'].iloc[:] > 0).astype(int)

df = {}
for i in range(len(stock_symbols)):
    df[stock_symbols[i]] = stock[stock_symbols[i]][['today', 'previous1', 'previous2', 'previous3', 'previous4', 'previous5', 'Volume',
                                                'ema50', 'ema21', 'ema14', 'ema5', 'rsi', 'macd', 'roc', 'atr', 'obv', 'cmf', 'emv', 'stoch', 'mfi', 'cci', 'trend']]



In [35]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from scipy.stats import loguniform

stock_symbols = ["AMZN", "AAPL", "NVDA", "MSFT", "GOOG", "META", "TSLA", "WMT", "JPM", "NFLX"]

for stock in stock_symbols:
    print(f"Processing {stock}...")

    # # LOGISTIC REGRESSION
    X = df[stock_symbols[0]].loc[:, df[stock_symbols[0]].columns != 'trend']
    y = df[stock_symbols[0]]['trend']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=21)

    scaler = MinMaxScaler()

    X_train_scaled = scaler.fit_transform(X_train.values)
    X_test_scaled = scaler.transform(X_test.values)

    lr = LogisticRegression(penalty='l2', C=0.1, random_state=42)

    lr.fit(X_train_scaled, y_train.values)

    predictions = lr.predict(X_test_scaled)
    accuracy = accuracy_score(y_test.values, predictions)
    print(f"{stock} Logistic Regression Accuracy: {accuracy:.2f}")
    classification_rep = classification_report(y_test.values, predictions)
    print(f"{stock} Logistic Regression Classification Report:\n", classification_rep)


    #RANDOM FOREST
    X = df[stock_symbols[0]][['Volume', 'ema50', 'ema21',
                          'ema14', 'ema5', 'rsi',
                           'macd', 'roc', 'obv',
                            'atr', 'cmf', 'emv',
                             'stoch', 'cci', 'mfi']]
    y = df[stock_symbols[0]]['trend']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

    rf = RandomForestClassifier(n_estimators=110, random_state=21)
    rf.fit(X_train.values, y_train.values)

    y_pred = rf.predict(X_test.values)
    accuracy = accuracy_score(y_test.values, y_pred)
    print(f"{stock} Random Forest Accuracy: {accuracy:.2f}")
    classification_rep = classification_report(y_test.values, y_pred)
    print(f"{stock} Random Forest Classification Report:\n", classification_rep)


    # LSTM
    X = df[stock].loc[:, df[stock].columns != 'trend']
    y = df[stock]['trend']
    scale = MinMaxScaler(feature_range=(0,1))
    X_scaled = scale.fit_transform(X)

    timesteps = 10
    feature = X_scaled.shape[1]
    
    test_len = int(len(X_scaled)*0.25)
    train_len = X_scaled.shape[0] - test_len
    
    X_train = []
    y_train = []

    for i in range(timesteps, train_len):
        X_train.append(X_scaled[i-timesteps:i])
        y_train.append(y.values[i])

    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)

    X_test = []
    y_test = []
    for i in range(train_len-1, len(X_scaled)):
        X_test.append(X_scaled[i-timesteps: i])
        y_test.append(y.values[i])

    X_test = np.asarray(X_test)
    y_test = np.asarray(y_test)

    model = Sequential()
    model.add(LSTM(128, return_sequences=True, input_shape=(timesteps, feature)))
    model.add(Dropout(0.1))
    model.add(LSTM(64))
    model.add(Dropout(0.1))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='mse')

    model.fit(X_train, y_train, epochs=15, batch_size=8, validation_split=0.1, verbose=0)

    y_pred = (model.predict(X_test) > 0.5).astype("int32")
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{stock} LSTM Model Accuracy: {accuracy:.2f}")
    classification_rep = classification_report(y_test, y_pred)
    print(f"{stock} LSTM Classification Report:\n", classification_rep)


Processing AMZN...
AMZN Logistic Regression Accuracy: 0.78
AMZN Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.63      0.71       241
           1       0.76      0.89      0.82       315

    accuracy                           0.78       556
   macro avg       0.79      0.76      0.77       556
weighted avg       0.78      0.78      0.77       556

AMZN Random Forest Accuracy: 0.80
AMZN Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.78      0.78       255
           1       0.81      0.81      0.81       301

    accuracy                           0.80       556
   macro avg       0.80      0.80      0.80       556
weighted avg       0.80      0.80      0.80       556



  super().__init__(**kwargs)


[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step
AMZN LSTM Model Accuracy: 0.58
AMZN LSTM Classification Report:
               precision    recall  f1-score   support

           0       0.56      0.38      0.45       255
           1       0.59      0.74      0.65       301

    accuracy                           0.58       556
   macro avg       0.57      0.56      0.55       556
weighted avg       0.57      0.58      0.56       556

Processing AAPL...
AAPL Logistic Regression Accuracy: 0.78
AAPL Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.63      0.71       241
           1       0.76      0.89      0.82       315

    accuracy                           0.78       556
   macro avg       0.79      0.76      0.77       556
weighted avg       0.78      0.78      0.77       556

AAPL Random Forest Accuracy: 0.80
AAPL Random Forest Classification Report:
               precisio

  super().__init__(**kwargs)


[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step
AAPL LSTM Model Accuracy: 0.63
AAPL LSTM Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.62      0.60       248
           1       0.68      0.63      0.65       308

    accuracy                           0.63       556
   macro avg       0.63      0.63      0.63       556
weighted avg       0.63      0.63      0.63       556

Processing NVDA...
NVDA Logistic Regression Accuracy: 0.78
NVDA Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.63      0.71       241
           1       0.76      0.89      0.82       315

    accuracy                           0.78       556
   macro avg       0.79      0.76      0.77       556
weighted avg       0.78      0.78      0.77       556

NVDA Random Forest Accuracy: 0.80
NVDA Random Forest Classification Report:
               precisio

  super().__init__(**kwargs)


[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step
NVDA LSTM Model Accuracy: 0.58
NVDA LSTM Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.69      0.58       234
           1       0.69      0.50      0.58       322

    accuracy                           0.58       556
   macro avg       0.60      0.60      0.58       556
weighted avg       0.61      0.58      0.58       556

Processing MSFT...
MSFT Logistic Regression Accuracy: 0.78
MSFT Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.63      0.71       241
           1       0.76      0.89      0.82       315

    accuracy                           0.78       556
   macro avg       0.79      0.76      0.77       556
weighted avg       0.78      0.78      0.77       556

MSFT Random Forest Accuracy: 0.80
MSFT Random Forest Classification Report:
               precisio

  super().__init__(**kwargs)


[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 58ms/step
MSFT LSTM Model Accuracy: 0.58
MSFT LSTM Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.15      0.25       252
           1       0.57      0.93      0.71       304

    accuracy                           0.58       556
   macro avg       0.61      0.54      0.48       556
weighted avg       0.61      0.58      0.50       556

Processing GOOG...
GOOG Logistic Regression Accuracy: 0.78
GOOG Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.63      0.71       241
           1       0.76      0.89      0.82       315

    accuracy                           0.78       556
   macro avg       0.79      0.76      0.77       556
weighted avg       0.78      0.78      0.77       556

GOOG Random Forest Accuracy: 0.80
GOOG Random Forest Classification Report:
               precisio

  super().__init__(**kwargs)


[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step
GOOG LSTM Model Accuracy: 0.62
GOOG LSTM Classification Report:
               precision    recall  f1-score   support

           0       0.57      0.56      0.57       248
           1       0.65      0.66      0.65       308

    accuracy                           0.62       556
   macro avg       0.61      0.61      0.61       556
weighted avg       0.61      0.62      0.61       556

Processing META...
META Logistic Regression Accuracy: 0.78
META Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.63      0.71       241
           1       0.76      0.89      0.82       315

    accuracy                           0.78       556
   macro avg       0.79      0.76      0.77       556
weighted avg       0.78      0.78      0.77       556

META Random Forest Accuracy: 0.80
META Random Forest Classification Report:
               precisio

  super().__init__(**kwargs)


[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step
META LSTM Model Accuracy: 0.58
META LSTM Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.04      0.08       239
           1       0.58      0.99      0.73       317

    accuracy                           0.58       556
   macro avg       0.67      0.52      0.40       556
weighted avg       0.66      0.58      0.45       556

Processing TSLA...
TSLA Logistic Regression Accuracy: 0.78
TSLA Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.63      0.71       241
           1       0.76      0.89      0.82       315

    accuracy                           0.78       556
   macro avg       0.79      0.76      0.77       556
weighted avg       0.78      0.78      0.77       556

TSLA Random Forest Accuracy: 0.80
TSLA Random Forest Classification Report:
               precisio

  super().__init__(**kwargs)


[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step
TSLA LSTM Model Accuracy: 0.62
TSLA LSTM Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.43      0.53       277
           1       0.59      0.81      0.68       279

    accuracy                           0.62       556
   macro avg       0.64      0.62      0.60       556
weighted avg       0.64      0.62      0.60       556

Processing WMT...
WMT Logistic Regression Accuracy: 0.78
WMT Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.63      0.71       241
           1       0.76      0.89      0.82       315

    accuracy                           0.78       556
   macro avg       0.79      0.76      0.77       556
weighted avg       0.78      0.78      0.77       556

WMT Random Forest Accuracy: 0.80
WMT Random Forest Classification Report:
               precision    

  super().__init__(**kwargs)


[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step
WMT LSTM Model Accuracy: 0.61
WMT LSTM Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.50      0.52       231
           1       0.66      0.69      0.67       325

    accuracy                           0.61       556
   macro avg       0.60      0.60      0.60       556
weighted avg       0.61      0.61      0.61       556

Processing JPM...
JPM Logistic Regression Accuracy: 0.78
JPM Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.63      0.71       241
           1       0.76      0.89      0.82       315

    accuracy                           0.78       556
   macro avg       0.79      0.76      0.77       556
weighted avg       0.78      0.78      0.77       556

JPM Random Forest Accuracy: 0.80
JPM Random Forest Classification Report:
               precision    re

  super().__init__(**kwargs)


[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step
JPM LSTM Model Accuracy: 0.63
JPM LSTM Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.19      0.29       228
           1       0.62      0.93      0.75       328

    accuracy                           0.63       556
   macro avg       0.64      0.56      0.52       556
weighted avg       0.64      0.63      0.56       556

Processing NFLX...
NFLX Logistic Regression Accuracy: 0.78
NFLX Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.63      0.71       241
           1       0.76      0.89      0.82       315

    accuracy                           0.78       556
   macro avg       0.79      0.76      0.77       556
weighted avg       0.78      0.78      0.77       556

NFLX Random Forest Accuracy: 0.80
NFLX Random Forest Classification Report:
               precision 

  super().__init__(**kwargs)


[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step
NFLX LSTM Model Accuracy: 0.56
NFLX LSTM Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.45      0.48       248
           1       0.60      0.65      0.62       307

    accuracy                           0.56       555
   macro avg       0.55      0.55      0.55       555
weighted avg       0.56      0.56      0.56       555

