In [1]:
import importlib
import importlib.metadata
import subprocess
import sys
from packaging import version


def install_dependencies(recheck=False):
    """Verify that the notebook's third-party packages are importable, installing any that are not.

    The function runs in two passes:

    * ``recheck=False`` (default): try to import every package; on ``ImportError``
      (including a pinned-version mismatch) install it with pip, then call itself
      once with ``recheck=True`` to confirm the result.
    * ``recheck=True``: only verify; nothing is installed.

    Args:
        recheck: When True, perform verification only and return a tuple.

    Returns:
        * ``recheck=False``: ``True`` if every package verified after the install
          pass, otherwise ``False``.
        * ``recheck=True``: ``(True, None)`` if every package verified, otherwise
          ``(False, failed_pkgs)`` where ``failed_pkgs`` lists the pip names that
          could not be verified.
    """
    if not recheck:
        print("Checking Dependencies")

    pkgs = ['pandas', 'numpy', 'pandas_ta', 'tqdm', 'tensorflow', 'matplotlib', 'scikit-learn', 'openpyxl', 'jupyter',
            'keras-tuner']
    # Packages that must match an exact version.
    pkgs_ver = {
        'tensorflow': '2.16.1',
        'scikit-learn': '1.4.2',
    }

    # Dictionary to handle special import names (pip name -> importable module name)
    import_names = {
        'scikit-learn': 'sklearn',
        'keras-tuner': 'kerastuner'
    }

    failed_pkgs = []

    for pkg in pkgs:
        try:
            importlib.import_module(import_names.get(pkg, pkg))

            if pkg in pkgs_ver:
                # Check if the installed version matches the required version
                installed_version = importlib.metadata.version(pkg)
                required_version = pkgs_ver[pkg]

                if version.parse(installed_version) != version.parse(required_version):
                    # Raised as ImportError so a mismatch takes the same install path as a missing package.
                    raise ImportError(f"{pkg} version mismatch: {installed_version} != {required_version}")

        except ImportError:
            if recheck:
                failed_pkgs.append(pkg)
                continue

            print("Installing Package:", pkg)
            pkg_to_install = f"{pkg}=={pkgs_ver[pkg]}" if pkg in pkgs_ver else pkg
            try:
                # sys.executable makes pip target the interpreter running this code.
                subprocess.run([sys.executable, '-m', 'pip', 'install', pkg_to_install, '-q'], check=True)
                print(f"{pkg} Installed Successfully")
            except subprocess.CalledProcessError:
                print(f"{pkg} Installation Failed")
                failed_pkgs.append(pkg)
        except Exception as e:
            # Any other failure while importing still means the package is unusable,
            # so record it instead of letting the recheck report success.
            print("Error:", e)
            failed_pkgs.append(pkg)

    if recheck:
        if failed_pkgs:
            return False, failed_pkgs
        return True, None

    # Packages may have been written to disk after the interpreter started;
    # refresh the import system's finder caches before verifying.
    # NOTE: a module that was already imported with the wrong version stays loaded
    # in this process even after pip replaces it on disk.
    importlib.invalidate_caches()

    rst, pkg = install_dependencies(recheck=True)
    if not rst:
        print("Failed to Install The Following Packages:", pkg)
        print("KINDLY INSTALL IT MANUALLY AND TRY AGAIN")
        return False
    print("All Dependencies Installed Successfully")
    return True

In [2]:
# Stop this cell with a readable error when dependencies are missing.
# `exit()` is avoided because it is meant for the interactive shell and, inside a
# Jupyter kernel, shuts the kernel down instead of reporting a failure.
if not install_dependencies():
    raise RuntimeError("Required dependencies could not be installed; install them manually and re-run this notebook")

import warnings

warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import pandas_ta as ta
from tqdm import tqdm
from tqdm.keras import TqdmCallback
from sklearn.preprocessing import MinMaxScaler
from pathlib import Path
import tensorflow as tf
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, TensorBoard, History
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, Activation, Bidirectional
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error

Checking Dependencies
Installing Package: keras-tuner
keras-tuner Installed Successfully
All Dependencies Installed Successfully


In [3]:
# Location of the Excel workbook holding the stock data (one sheet per timeframe).
# NOTE(review): this is an absolute, machine-specific path; adjust it before running elsewhere.
_data_path = r"C:\Users\vinis\OneDrive\Desktop\Vinish Codes\ML\data\TATAMOTORS.xlsx"
if not Path(_data_path).exists():
    print("Data not found")
    # A single formatted message: passing the path as a second positional argument
    # would be interpreted as (errno, strerror) and render a misleading message.
    raise FileNotFoundError(f"Stock Data Not Found AT : {_data_path}")

# Reached only when the workbook exists; opened once and handed to Trainer below.
_data = pd.ExcelFile(_data_path)

In [4]:
class Trainer:
    """LSTM training pipeline for one stock workbook with one sheet per timeframe.

    Intended call order (as used by the cells below):
    ``fetch_data`` -> ``process_data`` -> ``train_model`` -> ``eval_model`` ->
    ``graphical_report_analysis`` -> ``save_model``.

    All per-timeframe artefacts are stored in dictionaries keyed by timeframe
    name (the sheet name with the ``<stock_name>_`` prefix removed).
    """

    # Fraction of the windowed samples used for training in split_data().
    TRAIN_FRACTION = 0.8
    # Look-back window length used when a timeframe has no explicit entry.
    DEFAULT_BACK_DATA = 60

    def __init__(self, _data):
        """Store the workbook handle, derive names from it and create output paths.

        Args:
            _data: An opened ``pandas.ExcelFile``. Its ``io`` attribute is assumed
                to be a filesystem path (str or PathLike); the file stem is used
                as the stock name.

        Raises:
            TypeError: If ``_data.io`` is not path-like (e.g. an in-memory buffer).
        """
        self._data = _data
        self.data_path = _data.io
        try:
            # Use the workbook that was actually passed in, not a notebook global.
            self.stock_name = Path(self.data_path).stem
        except TypeError as err:
            raise TypeError("Trainer needs an ExcelFile opened from a file path") from err
        self.sheet_name = _data.sheet_names
        self.total_sheets = len(self.sheet_name)
        self.timeframes = [sheet.removeprefix(self.stock_name + '_') for sheet in
                           self.sheet_name]
        # Remember the real sheet behind each timeframe so sheets without the
        # "<stock>_" prefix can still be parsed.
        self.sheet_by_timeframe = dict(zip(self.timeframes, self.sheet_name))
        self.paths = self.path_initializer()
        self.raw_data = {}
        self.data = {}
        self.data_summary = {}
        self.column_names = {}
        self.scaled_data = {}
        self.X = {}
        self.Y = {}
        self.X_train = {}
        self.X_test = {}
        self.Y_train = {}
        self.Y_test = {}
        self.loss_fn = 'mse'
        self.model = {}
        self.history = {}
        self.y_pred = {}
        self.model_metrics = {}

    def path_initializer(self):
        """Create the output directory tree and the summary file.

        Layout: ``ALGO DATA/<stock>/MODEL/LOG`` (directories) and
        ``ALGO DATA/<stock>/DATA`` (a text file that save_model appends to).

        Returns:
            dict mapping ``base_path``, ``stock_path``, ``model_path``,
            ``log_path`` and ``model_data_path`` to ``pathlib.Path`` objects.
        """
        print("Creating Necessary Paths")
        base_path = Path('ALGO DATA')
        stock_path = base_path / self.stock_name
        model_path = stock_path / 'MODEL'
        log_path = model_path / 'LOG'
        # Parents come first in the tuple, so each mkdir finds its parent in place.
        for directory in (base_path, stock_path, model_path, log_path):
            directory.mkdir(exist_ok=True)

        model_data_path = stock_path / 'DATA'
        if not model_data_path.exists():
            with open(model_data_path, 'w') as f:
                f.write('Model Datas : \n')

        paths = {
            'base_path': base_path,
            'stock_path': stock_path,
            'model_path': model_path,
            'log_path': log_path,
            'model_data_path': model_data_path
        }

        # Verify every path, not just the first one.
        missing = [value for value in paths.values() if not value.exists()]
        for value in missing:
            print("Path not found : ", value)
        if not missing:
            print("All Necessary Paths Created Successfully")
        return paths

    def fetch_data(self):
        """Parse every sheet into ``raw_data`` indexed by its ``DateTime`` column.

        Also records start date, end date and row count in ``data_summary``.

        Returns:
            True on success.

        Raises:
            ValueError: If the workbook produced no timeframes.
        """
        if len(self.timeframes) == 0:
            raise ValueError("Timeframe Not Found")
        with tqdm(total=len(self.timeframes), desc="Loading Data") as pbar:
            for timeframe in self.timeframes:
                sheet = self.sheet_by_timeframe.get(timeframe, self.stock_name + '_' + timeframe)
                df = self._data.parse(sheet)
                df.index = pd.to_datetime(df['DateTime'], format='%d-%m-%Y %H:%M:%S')
                df = df.drop(columns='DateTime')
                self.raw_data[timeframe] = df
                self.data_summary[timeframe] = {
                    'start_date': df.index.min(),
                    'end_date': df.index.max(),
                    'total_data': len(df)
                }
                pbar.update(1)
            pbar.set_postfix({"STATUS": "SUCCESS"})
        return True

    def feature_engneering(self):
        """Add technical-indicator columns to a copy of each raw frame (Stage 1).

        Rows containing NaN after the indicators are added are dropped. The
        result is stored in ``data``; ``raw_data`` is left untouched.

        Returns:
            True on success.

        Raises:
            ValueError: If a timeframe has no raw data or its frame is empty.
        """
        with tqdm(total=len(self.timeframes), desc="Processing Data (Stage 1)") as pbar:
            for timeframe in self.timeframes:
                raw = self.raw_data.get(timeframe)
                if raw is None or raw.empty:
                    raise ValueError("Data Not Found (LOOK LIKE LOADING DATA NOT DONE PROPERLY)")
                # Work on a copy so re-running this stage starts from the same input.
                df = raw.copy()
                close = df['Close']
                high = df['High']
                low = df['Low']
                volume = df['Volume']

                df['PRICE_CHANDE'] = close.pct_change()

                df.ta.rsi(close=close, length=14, append=True)
                df.ta.macd(close=close, append=True)
                df.ta.ema(close=close, length=8, append=True)
                df.ta.ema(close=close, length=21, append=True)
                df.ta.ema(close=close, length=50, append=True)

                df.ta.sma(close=close, length=8, append=True)
                df.ta.sma(close=close, length=21, append=True)
                df.ta.sma(close=close, length=50, append=True)

                df.ta.vwap(high=high, low=low, close=close, volume=volume, append=True)
                df.ta.atr(high=high, low=low, close=close, append=True)
                df.ta.adx(high=high, low=low, close=close, append=True)

                macd = ta.macd(close)
                df['MACD'] = macd['MACD_12_26_9']
                df['MACD_SIGNAL'] = macd['MACDs_12_26_9']
                df['MACD_HIST'] = macd['MACDh_12_26_9']

                stochastic = ta.stoch(high, low, close)
                df['STOCH_K'] = stochastic['STOCHk_14_3_3']
                df['STOCH_D'] = stochastic['STOCHd_14_3_3']

                bband = ta.bbands(close)
                df['BBAND_UPPER'] = bband['BBU_5_2.0']
                df['BBAND_LOWER'] = bband['BBL_5_2.0']
                df['BBAND_MIDDLE'] = bband['BBM_5_2.0']
                df['BBAND_WIDTH'] = bband['BBB_5_2.0']
                df['BBAND_PERCENT'] = bband['BBP_5_2.0']

                supert = ta.supertrend(high, low, close)
                df['SUPERT'] = supert['SUPERT_7_3.0']
                df['SUPERT_DIRECTION'] = supert['SUPERTd_7_3.0']

                df = df.dropna()
                self.column_names[timeframe] = df.columns
                self.data[timeframe] = df
                summary = self.data_summary.setdefault(timeframe, {})
                summary['total_features'] = len(df.columns)
                summary['features'] = df.columns
                pbar.update(1)
            pbar.set_postfix({"STATUS": "SUCCESS"})
        return True

    def scale_data(self):
        """Min-max scale every engineered frame into ``scaled_data`` (Stage 2).

        The fitted per-column minima and maxima are kept in ``data_summary``.

        Returns:
            True on success.

        Raises:
            ValueError: If Stage 1 output is missing or cannot be scaled.
        """
        with tqdm(total=len(self.timeframes), desc="Processing Data (Stage 2)") as pbar:
            for timeframe in self.timeframes:
                if timeframe not in self.data:
                    raise ValueError("Data Not Found (LOOK LIKE Processing Data (Stage 1) NOT DONE PROPERLY)")
                df = self.data[timeframe]
                sc = MinMaxScaler()
                try:
                    self.scaled_data[timeframe] = sc.fit_transform(df)
                except ValueError as err:
                    # Keep the original error attached so the real cause stays visible.
                    raise ValueError(
                        "Data Not Found (LOOK LIKE Processing Data (Stage 1) NOT DONE PROPERLY)") from err
                summary = self.data_summary.setdefault(timeframe, {})
                summary['scaler_min'] = sc.data_min_
                summary['scaler_max'] = sc.data_max_
                pbar.update(1)
            pbar.set_postfix({"STATUS": "SUCCESS"})
        return True

    def split_backdata(self):
        """Turn each scaled array into look-back windows ``X`` and targets ``Y`` (Stage 3).

        ``X[timeframe]`` has shape ``(samples, back_data, n_features)`` and
        ``Y[timeframe][i]`` is the scaled ``Close`` value of the row right after
        window ``i``.

        Returns:
            True on success.

        Raises:
            ValueError: If earlier stages were not run, the ``Close`` column is
                missing, or there are too few rows to build a single window.
        """
        with tqdm(total=self.total_sheets, desc="Processing Data (Stage 3)") as pbar:
            backdata = {
                'ONE_MINUTE': 60,
                'THREE_MINUTE': 60,
                'FIVE_MINUTE': 60,

            }
            for timeframe in self.timeframes:
                if timeframe not in self.scaled_data or timeframe not in self.column_names:
                    raise ValueError("Data Not Found (LOOK LIKE Processing Data (Stage 2) NOT DONE PROPERLY)")
                # Timeframes without an explicit entry fall back to the default window.
                back_data = backdata.get(timeframe, self.DEFAULT_BACK_DATA)
                scaled = self.scaled_data[timeframe]
                total_df = len(self.column_names[timeframe])
                try:
                    feature = self.column_names[timeframe].get_loc('Close')
                except KeyError:
                    raise ValueError("Close Price Not Found")

                # The "- 1" is kept from the original windowing; it leaves the
                # final row unused as a target.
                n_windows = len(scaled) - back_data - 1
                if n_windows <= 0:
                    raise ValueError(
                        f"Not enough rows for timeframe {timeframe}: "
                        f"need more than {back_data + 1}, got {len(scaled)}")

                X = np.array([scaled[i:(i + back_data), :] for i in range(n_windows)])
                Y = np.array([scaled[i + back_data, feature] for i in range(n_windows)])
                self.X[timeframe] = X.reshape(X.shape[0], X.shape[1], total_df)
                self.Y[timeframe] = Y
                pbar.update(1)
                self.data_summary.setdefault(timeframe, {})['total_data'] = self.X[timeframe].shape[0]
            pbar.set_postfix({"STATUS": "SUCCESS"})
        return True

    def split_data(self):
        """Chronologically split the windows into train and test sets (Stage 4).

        The first ``TRAIN_FRACTION`` of the windowed samples becomes the training
        set and the remainder the test set.

        Returns:
            True on success.

        Raises:
            ValueError: If Stage 3 output is missing for a timeframe.
        """
        with tqdm(total=self.total_sheets, desc="Processing Data (Stage 4)") as pbar:
            for timeframe in self.timeframes:
                if timeframe not in self.X or timeframe not in self.Y:
                    raise ValueError("Data Not Found (LOOK LIKE Processing Data (Stage 3) NOT DONE PROPERLY)")
                # The cut must be based on the number of windows, not on the
                # longer scaled array, otherwise the test share shrinks.
                n_samples = len(self.X[timeframe])
                split_limit = int(n_samples * self.TRAIN_FRACTION)
                self.X_train[timeframe] = self.X[timeframe][:split_limit]
                self.X_test[timeframe] = self.X[timeframe][split_limit:]
                self.Y_train[timeframe] = self.Y[timeframe][:split_limit]
                self.Y_test[timeframe] = self.Y[timeframe][split_limit:]
                pbar.update(1)
                summary = self.data_summary.setdefault(timeframe, {})
                summary['train_data'] = self.X_train[timeframe].shape[0]
                summary['test_data'] = self.X_test[timeframe].shape[0]
            pbar.set_postfix({"STATUS": "SUCCESS"})
        return True

    def process_data(self):
        """Run Stages 1-4 in order on previously fetched data.

        Returns:
            True when every stage reports success.

        Raises:
            ValueError: If no data was fetched or a stage does not return True.
        """
        if not bool(self.raw_data):
            raise ValueError("Data Not Found (Kindly Fetch Data First)")
        if not self.feature_engneering():
            raise ValueError("Feature Engineering Not Completed")
        if not self.scale_data():
            raise ValueError("Data Scaling Not Completed")
        if not self.split_backdata():
            raise ValueError("Data Splitting Not Completed")
        if not self.split_data():
            raise ValueError("Data Splitting Not Completed")
        return True

    def train_model(self):
        """Build and fit one bidirectional-LSTM regressor per timeframe.

        The best checkpoint by ``val_loss`` is written under the model path;
        the fitted model is kept in ``model`` and its per-epoch metrics in
        ``history``.

        Returns:
            True on success.

        Raises:
            ValueError: If training data is missing for a timeframe.
        """
        def build_model(timeframe):
            # Input shape is (window length, feature count) taken from the training windows.
            lstm_input = Input(shape=(self.X_train[timeframe].shape[1], self.X_train[timeframe].shape[2]),
                               name='lstm_input')
            hidden = Bidirectional(LSTM(units=250, return_sequences=False, activation='sigmoid', name='lstm_1'))(
                lstm_input)
            hidden = Dropout(0.1)(hidden)
            hidden = Dense(1)(hidden)
            output = Activation('linear', name="output")(hidden)
            model = Model(inputs=lstm_input, outputs=output, name=f"{self.stock_name}_{timeframe}")
            optimizer = optimizers.Adam()
            model.compile(optimizer=optimizer, loss=self.loss_fn)
            return model

        with tqdm(total=self.total_sheets, desc="Training Model") as pbar:
            for timeframe in self.timeframes:
                if timeframe not in self.X_train:
                    raise ValueError("Data Not Found (LOOK LIKE Processing Data (Stage 3) NOT DONE PROPERLY)")
                model = build_model(timeframe)
                model.summary()
                model_path_checkpt = self.paths['model_path'] / f'{self.stock_name}_{timeframe}_checkpoint.keras'
                model_checkpoint = ModelCheckpoint(model_path_checkpt, monitor='val_loss', save_best_only=True,
                                                   verbose=0)
                early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=0)
                reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, verbose=0)
                tensorboard = TensorBoard(log_dir=str(self.paths['log_path']), histogram_freq=1, write_graph=True,
                                          write_images=True)
                # fit() returns the History object, so no separate History callback is needed.
                fit_history = model.fit(self.X_train[timeframe], self.Y_train[timeframe], epochs=100, batch_size=16,
                                        validation_split=0.2, verbose=1,
                                        callbacks=[TqdmCallback(), model_checkpoint, early_stopping, reduce_lr,
                                                   tensorboard])
                pbar.update(1)
                self.model[timeframe] = model
                self.history[timeframe] = fit_history.history
            pbar.set_postfix({"STATUS": "SUCCESS"})
        return True

    def eval_model(self):
        """Predict on each test set and compute MSE, MAE, MAPE and R2.

        Metrics are computed on the scaled values and stored in both
        ``model_metrics`` and ``data_summary``.

        Returns:
            True on success.

        Raises:
            ValueError: If a timeframe has no trained model or no test data.
        """
        with tqdm(total=self.total_sheets, desc="Evaluating Model") as pbar:
            for timeframe in self.timeframes:
                if timeframe not in self.model:
                    raise ValueError("Model Not Found (LOOK LIKE Training Model NOT DONE PROPERLY)")
                if timeframe not in self.X_test or timeframe not in self.Y_test:
                    raise ValueError("Data Not Found (LOOK LIKE Processing Data (Stage 4) NOT DONE PROPERLY)")
                print("EVALUATING MODEL FOR TIMEFRAME: ", timeframe)
                y_true = self.Y_test[timeframe]
                y_pred = self.model[timeframe].predict(self.X_test[timeframe])
                self.y_pred[timeframe] = y_pred
                metrics = {
                    "MSE": mean_squared_error(y_true, y_pred),
                    "MAE": mean_absolute_error(y_true, y_pred),
                    "MAPE": mean_absolute_percentage_error(y_true, y_pred),
                    "R2": r2_score(y_true, y_pred),
                }
                self.model_metrics[timeframe] = metrics
                self.data_summary.setdefault(timeframe, {})['model_metrics'] = metrics
                pbar.update(1)
            pbar.set_postfix({"STATUS": "SUCCESS"})
        return True

    def graphical_report_analysis(self):
        """Plot actual versus predicted scaled close values for each test set.

        Returns:
            True on success.

        Raises:
            ValueError: If predictions are missing for a timeframe.
        """
        with tqdm(total=self.total_sheets, desc="Graphical Analysis") as pbar:
            for timeframe in self.timeframes:
                if timeframe not in self.y_pred or timeframe not in self.Y_test:
                    raise ValueError("Prediction Not Found (LOOK LIKE Evaluating Model NOT DONE PROPERLY)")
                plt.figure(figsize=(16, 8))
                plt.title(f"{self.stock_name} {timeframe}")
                plt.plot(self.Y_test[timeframe], label='Actual')
                plt.plot(self.y_pred[timeframe], label='Predicted')
                plt.xlabel("Test sample")
                plt.ylabel("Scaled close")
                plt.legend()
                plt.show()
                pbar.update(1)
            pbar.set_postfix({"STATUS": "SUCCESS"})
        return True

    def save_model(self):
        """Save each model and its weights, and append its summary to the DATA file.

        Returns:
            True on success.

        Raises:
            ValueError: If a timeframe has no trained model.
        """
        with tqdm(total=self.total_sheets, desc="Saving Model") as pbar:
            for timeframe in self.timeframes:
                if timeframe not in self.model:
                    raise ValueError("Model Not Found (LOOK LIKE Training Model NOT DONE PROPERLY)")
                model_path = self.paths['model_path'] / f'{self.stock_name}_{timeframe}_model.keras'
                model_weight_path = self.paths['model_path'] / f'{self.stock_name}_{timeframe}_model.weights.h5'
                self.model[timeframe].save(model_path)
                self.model[timeframe].save_weights(model_weight_path)
                with open(self.paths['model_data_path'], 'a') as f:
                    for key, value in self.data_summary.get(timeframe, {}).items():
                        f.write(f"{key} : {value}\n")
                pbar.update(1)
            pbar.set_postfix({"STATUS": "SUCCESS"})
        return True

In [5]:
# Build the trainer from the opened workbook; the constructor also creates the output paths.
trainer = Trainer(_data)

Creating Necessary Paths
All Necessary Paths Created Successfully


In [6]:
# Parse every sheet of the workbook into trainer.raw_data (one frame per timeframe).
trainer.fetch_data()

Loading Data: 100%|██████████████████████████████████████████████████████| 1/1 [00:12<00:00, 12.28s/it, STATUS=SUCCESS]


True

In [7]:
# Run feature engineering, scaling, windowing and the train/test split, in that order.
trainer.process_data()

Processing Data (Stage 1): 100%|█████████████████████████████████████████| 1/1 [00:07<00:00,  7.91s/it, STATUS=SUCCESS]
Processing Data (Stage 2): 100%|█████████████████████████████████████████| 1/1 [00:00<00:00,  7.05it/s, STATUS=SUCCESS]
Processing Data (Stage 3): 100%|█████████████████████████████████████████| 1/1 [00:01<00:00,  1.90s/it, STATUS=SUCCESS]
Processing Data (Stage 4): 100%|█████████████████████████████████████████| 1/1 [00:00<00:00, 25.32it/s, STATUS=SUCCESS]


True

In [None]:
# Fit one model per timeframe (up to 100 epochs each, with early stopping on val_loss).
trainer.train_model()

Training Model:   0%|                                                                            | 0/1 [00:00<?, ?it/s]

0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

Epoch 1/100
[1m1766/5086[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m7:48[0m 141ms/step - loss: 0.0217

In [None]:
# Predict on the test split of each timeframe and compute MSE / MAE / MAPE / R2.
trainer.eval_model()
# Show the metrics dictionary, keyed by timeframe.
print(trainer.model_metrics)

In [None]:
# Plot actual vs. predicted values for the test split of each timeframe.
trainer.graphical_report_analysis()

In [None]:
# Save each trained model and its weights, and append the per-timeframe summary to the DATA file.
trainer.save_model()