In [None]:
# Import necessary libraries
from statsmodels.regression.rolling import RollingOLS
import pandas_datareader.data as web
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pandas as pd
import numpy as np
import datetime as dt
import yfinance as yf
import pandas_ta
import warnings
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler



# Silence every Python warning for the remainder of the session.
warnings.filterwarnings('ignore')


# Yahoo Finance symbols (NSE listings, ".NS" suffix) of the stocks analysed below.
nifty50_tickers = [
    "ASIANPAINT.NS", "BRITANNIA.NS", "CIPLA.NS", "EICHERMOT.NS",
    "NESTLEIND.NS", "GRASIM.NS", "HEROMOTOCO.NS", "HINDALCO.NS",
    "HINDUNILVR.NS", "ITC.NS", "LT.NS", "M&M.NS",
    "RELIANCE.NS", "TATACONSUM.NS", "TATAMOTORS.NS", "TATASTEEL.NS",
    "WIPRO.NS", "APOLLOHOSP.NS", "DRREDDY.NS", "TITAN.NS",
    "SBIN.NS", "SHRIRAMFIN.NS", "BPCL.NS", "KOTAKBANK.NS",
    "INFY.NS", "BAJFINANCE.NS", "ADANIENT.NS", "SUNPHARMA.NS",
    "JSWSTEEL.NS", "HDFCBANK.NS", "TCS.NS", "ICICIBANK.NS",
    "POWERGRID.NS", "MARUTI.NS", "INDUSINDBK.NS", "AXISBANK.NS",
    "HCLTECH.NS", "ONGC.NS", "NTPC.NS", "COALINDIA.NS",
    "BHARTIARTL.NS", "TECHM.NS", "LTIM.NS", "DIVISLAB.NS",
    "ADANIPORTS.NS", "HDFCLIFE.NS", "SBILIFE.NS", "ULTRACEMCO.NS",
]

# Date range (roughly a 10-year period)
end_date = '2024-07-23'
start_date = pd.to_datetime('2014-01-01')

# Download daily price/volume data from Yahoo Finance via yfinance.
# auto_adjust=False is passed explicitly: every later cell reads the 'adj close'
# column, and recent yfinance releases default to auto_adjust=True, which folds the
# adjustment into 'Close' and no longer returns a separate 'Adj Close' column.
# .stack() moves the ticker level of the columns into the row index.
df = yf.download(tickers=nifty50_tickers,
                 start=start_date,
                 end=end_date,
                 auto_adjust=False).stack()

# Name the two index levels of the resulting multi-index dataframe
df.index.names = ['date', 'ticker']
# Lower-case the column names (e.g. 'Adj Close' -> 'adj close')
df.columns = df.columns.str.lower()
df
# This dataframe is at date level (daily stock prices of the tickers listed above)



: 

In [None]:
# FEATURE ENGINEERING (technical indicators) using the pandas_ta library

# 1. Garman-Klass volatility
# As coded: 0.5 * (log(high) - log(low))^2 - (2*log(2) - 1) * (log(adj close) - log(open))^2
# Note that the second term compares the *adjusted* close with the raw open.
log_high_low = np.log(df['high']) - np.log(df['low'])
log_close_open = np.log(df['adj close']) - np.log(df['open'])
df['garman_klass_vol'] = (
    (log_high_low ** 2) / 2 -
    (2 * np.log(2) - 1) * (log_close_open ** 2)
)

# 2. RSI (Relative Strength Index) over a 20-row window, computed per ticker
#    (index level 1) on the adjusted close.
def _rsi_20(prices):
    """Return the 20-period RSI of one ticker's adjusted-close series."""
    return pandas_ta.rsi(close=prices, length=20)

df['rsi'] = df.groupby(level=1)['adj close'].transform(_rsi_20)

# 3. Bollinger Bands over a 20-row window, computed per ticker on log1p(adj close).
#    The first three columns returned by pandas_ta.bbands are taken, in order, as the
#    lower band, the middle band (SMA) and the upper band.
bollinger_columns = {'bb_low': 0, 'bb_mid': 1, 'bb_high': 2}
for band_name, band_position in bollinger_columns.items():
    df[band_name] = df.groupby(level=1)['adj close'].transform(
        lambda x, pos=band_position: pandas_ta.bbands(close=np.log1p(x), length=20).iloc[:, pos]
    )

#4. ATR (Average True Range): Function to compute ATR, then normalize by subtracting mean and dividing by standard deviation
def compute_atr(stock_data):
    """Return the standardised 14-period Average True Range of one ticker.

    Parameters
    ----------
    stock_data : DataFrame
        Rows of a single ticker; the 'high', 'low' and 'close' columns are read.

    Returns
    -------
    Series
        The ATR minus its mean, divided by its standard deviation. Both statistics
        are taken over the whole series passed in.
    """
    raw_atr = pandas_ta.atr(
        high=stock_data['high'],
        low=stock_data['low'],
        close=stock_data['close'],
        length=14,
    )
    centred_atr = raw_atr - raw_atr.mean()
    return centred_atr / raw_atr.std()


# One ATR series per ticker (index level 1), written back aligned on the original index.
df['atr'] = df.groupby(level=1, group_keys=False).apply(compute_atr)

#5. MACD (Moving Average Convergence Divergence): Function to compute MACD, then normalize by subtracting mean and dividing by standard deviation
def compute_macd(close):
    """Return the standardised MACD line of one ticker's price series.

    Parameters
    ----------
    close : Series
        Price series of a single ticker (the caller passes the adjusted close).

    Returns
    -------
    Series
        First column of the pandas_ta.macd result (the MACD line), minus its mean
        and divided by its standard deviation, both taken over the whole series.
    """
    # pandas_ta.macd is configured through fast/slow/signal; it has no `length`
    # parameter, so the previous `length=20` was swallowed by **kwargs and ignored.
    # The library defaults that were effectively in use are spelled out instead,
    # which keeps the computed values unchanged.
    macd = pandas_ta.macd(close=close, fast=12, slow=26, signal=9).iloc[:, 0]
    return macd.sub(macd.mean()).div(macd.std())

# One MACD series per ticker (index level 1), computed on the adjusted close.
df['macd'] = df.groupby(level=1, group_keys=False)['adj close'].apply(compute_macd)

# 6. Traded value in crores of INR: (adjusted close * volume) / 1 crore (1e7)
df['traded_value'] = df['adj close'].mul(df['volume']).div(1e7)

# Display the resulting dataframe
df


In [None]:
# Aggregate the daily data to monthly level

# Columns that are NOT carried forward as month-end snapshots: raw price levels,
# volume, and traded_value (which is averaged separately below).
excluded_cols = {'traded_value', 'volume', 'open', 'high', 'low', 'close'}
last_cols = [c for c in df.columns.unique(0) if c not in excluded_cols]

# Pivot tickers into the columns so that resampling works on a plain date index.
wide_daily = df.unstack('ticker')

# 1. traded_value: monthly mean of the daily values.
monthly_traded_value = (wide_daily['traded_value']
                        .resample('M').mean()
                        .stack('ticker')
                        .to_frame('traded_value'))

# 2. every column in last_cols: the last daily value available in each month.
monthly_last_values = wide_daily[last_cols].resample('M').last().stack('ticker')

# Combine both pieces side by side and drop rows that have any missing value.
data = pd.concat([monthly_traded_value, monthly_last_values], axis=1).dropna()

# Display the aggregated monthly data.
data
# data dataframe is at monthly level

In [None]:

# 2-year rolling average of traded value for each stock: a 24-month window that
# needs at least 12 monthly observations before it produces a value.
traded_value_wide = data['traded_value'].unstack('ticker')
rolling_traded_value = traded_value_wide.rolling(2 * 12, min_periods=12).mean()

# Back to long (date, ticker) form; assignment aligns on the index of `data`.
data['traded_value'] = rolling_traded_value.stack()

# Calculate Monthly Returns for different time horizons as features.

# function to calculate monthly returns for various lag periods(1,2,3,6,9,12)
# also handles outliers by clipping return values outside a certain range.
def calculate_returns(df, lags=(1, 2, 3, 6, 9, 12), outlier_cutoff=0.005):
    """Add clipped, per-period-compounded return columns for several horizons.

    For every ``lag`` a column ``returns_{lag}m`` is added. It holds the
    ``lag``-row percentage change of 'adj close', clipped to the
    ``[outlier_cutoff, 1 - outlier_cutoff]`` quantiles of that change series and
    then converted to a per-row rate via ``(1 + r) ** (1 / lag) - 1``.

    Parameters
    ----------
    df : DataFrame
        Rows of one ticker in chronological order; only 'adj close' is read.
    lags : iterable of int, optional
        Horizons, in rows (months for the monthly data used here).
    outlier_cutoff : float, optional
        Quantile used for the lower clip bound; the upper bound uses
        ``1 - outlier_cutoff``.

    Returns
    -------
    DataFrame
        A copy of ``df`` with the return columns appended. The input frame is
        left untouched, because mutating the object handed over by
        ``groupby(...).apply`` is not supported by pandas.
    """
    result = df.copy()
    prices = result['adj close']

    for lag in lags:
        raw_returns = prices.pct_change(lag)
        # Clip bounds come from this group's own distribution of `lag`-row returns.
        lower_bound = raw_returns.quantile(outlier_cutoff)
        upper_bound = raw_returns.quantile(1 - outlier_cutoff)
        result[f'returns_{lag}m'] = (raw_returns
                                     .clip(lower=lower_bound, upper=upper_bound)
                                     .add(1)
                                     .pow(1 / lag)
                                     .sub(1))
    return result
# Compute the return features ticker by ticker (index level 1) ...
returns_by_ticker = data.groupby(level=1, group_keys=False).apply(calculate_returns)
# ... then discard every row that still contains a NaN.
data = returns_by_ticker.dropna()

# Display the resulting dataframe with returns for each specified time horizon.
data



In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import joblib  # For saving scalers


In [None]:

# Turn the (date, ticker) index levels into ordinary columns. The check makes a
# second run of this cell a no-op, because the index is no longer a MultiIndex then.
index_is_multi = isinstance(data.index, pd.MultiIndex)
if index_is_multi:
    data = data.reset_index()


In [None]:
# FEATURE SELECTION
features = [
    'traded_value', 'atr', 'bb_high', 'bb_low', 'bb_mid',
    'garman_klass_vol', 'macd', 'rsi',
    'returns_1m', 'returns_2m', 'returns_3m',
    'returns_6m', 'returns_9m', 'returns_12m'
]
target = 'adj close'

# SCALING THE FEATURES
feature_scaler = MinMaxScaler()
target_scaler = MinMaxScaler()

# NOTE(review): both scalers are fitted on every row, including rows that end up in
# the test split further down, so test-set statistics leak into the scaling.
data[features] = feature_scaler.fit_transform(data[features])

# Shift the target column to predict the next month's closing price
# NOTE(review): this overwrites 'adj close' in place, so running the cell a second
# time on the same `data` shifts the target again.
data[target] = data.groupby('ticker')[target].shift(-1)

# Drop the rows left without a target by the shift, then renumber the rows 0..n-1.
# The renumbering matters: X and y_scaled below are plain NumPy arrays addressed by
# position, and code that looks rows up through data.index only hits the right row
# when the index labels equal the row positions (dropna alone leaves gaps).
data = data.dropna().reset_index(drop=True)

# Split into features and target variable
X = data[features].values
y = data[target].values.reshape(-1, 1)  # Reshape for scaler

# Scale the target variable
y_scaled = target_scaler.fit_transform(y)

print("\nFeatures and Target Scaled:")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")


In [None]:
# Define the number of time steps (e.g., 12 months) fed to the LSTM per sample
time_steps = 12

X_lstm = []
y_lstm = []
tickers = data['ticker'].unique()

# X and y_scaled are NumPy arrays with one row per row of `data`, so they have to be
# addressed by row *position*. Index labels are not used here: after rows have been
# dropped the labels can have gaps and would then select the wrong rows (or run past
# the end of the arrays).
assert len(X) == len(data) == len(y_scaled), "X / y_scaled are out of sync with data"
ticker_of_row = data['ticker'].to_numpy()
date_of_row = data['date'].to_numpy()
all_positions = np.arange(len(data))

for ticker in tickers:
    belongs_to_ticker = ticker_of_row == ticker
    positions = all_positions[belongs_to_ticker]
    # Chronological order within the ticker.
    positions = positions[np.argsort(date_of_row[belongs_to_ticker], kind='stable')]

    for end in range(time_steps, len(positions) + 1):
        window = positions[end - time_steps:end]
        X_lstm.append(X[window])
        # The target column was already shifted by -1, i.e. each row carries the
        # NEXT month's price. Pairing the window with its own last row therefore
        # gives a one-month-ahead label; taking the row after the window would
        # silently turn this into a two-month-ahead forecast.
        y_lstm.append(y_scaled[window[-1]])

# Convert lists to numpy arrays
X_lstm, y_lstm = np.array(X_lstm), np.array(y_lstm)

print(f"Shape of X_lstm: {X_lstm.shape}")  # Expected: (samples, time_steps, features)
print(f"Shape of y_lstm: {y_lstm.shape}")  # Expected: (samples, 1)


In [None]:
# Hold out the trailing 20% of the samples for testing.
# NOTE(review): the samples were appended ticker by ticker, so this positional cut
# separates tickers rather than time periods - confirm that this is intended.
test_size = 0.2
n_samples = X_lstm.shape[0]
split_index = int(n_samples * (1 - test_size))

# Everything before split_index is used for training, the remainder for testing.
X_train, y_train = X_lstm[:split_index], y_lstm[:split_index]
X_test, y_test = X_lstm[split_index:], y_lstm[split_index:]

print(f"Training Samples: {X_train.shape[0]}")
print(f"Testing Samples: {X_test.shape[0]}")


In [None]:
# Network definition: a bidirectional LSTM that returns the full sequence, a second
# LSTM that returns only its final state, and a small dense head with one output.
n_timesteps, n_features = X_train.shape[1], X_train.shape[2]

model = Sequential([
    Bidirectional(LSTM(100, return_sequences=True), input_shape=(n_timesteps, n_features)),
    BatchNormalization(),
    Dropout(0.2),
    LSTM(100, return_sequences=False),
    BatchNormalization(),
    Dropout(0.2),
    Dense(50, activation='relu'),
    Dense(1),  # single linear output for regression
])

# Adam optimiser with learning rate 0.001, mean squared error as the loss.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='mean_squared_error')

# Print the layer-by-layer summary.
model.summary()


In [None]:
# Stop once the validation loss has not improved for 10 consecutive epochs and
# roll the model back to the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# Training configuration: at most 100 epochs, mini-batches of 32, with 20% of the
# training samples set aside for validation.
fit_options = dict(
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1,
)

# Train the model; `history` keeps the per-epoch losses.
history = model.fit(X_train, y_train, **fit_options)


In [None]:
# Make predictions on the test set
predictions = model.predict(X_test)

# Inverse transform the predictions and actual values to their original scale
predictions_actual = target_scaler.inverse_transform(predictions)
y_test_actual = target_scaler.inverse_transform(y_test)

# Calculate evaluation metrics
mae = mean_absolute_error(y_test_actual, predictions_actual)
mse = mean_squared_error(y_test_actual, predictions_actual)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_actual, predictions_actual)
# scikit-learn returns MAPE as a fraction (0.05 means 5%), not as a percentage.
mape = mean_absolute_percentage_error(y_test_actual, predictions_actual)

# Print evaluation metrics
print("\nModel Evaluation Metrics:")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R-squared (R²): {r2:.4f}")
# The '%' format spec multiplies by 100, so the printed figure really is a percentage
# (previously the raw fraction was printed with a '%' sign, understating it 100x).
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2%}")


In [None]:
# Training vs. validation loss per epoch
loss_curves = (
    ('loss', 'Train Loss', 'blue'),
    ('val_loss', 'Validation Loss', 'orange'),
)
plt.figure(figsize=(14, 7))
for history_key, curve_label, curve_color in loss_curves:
    plt.plot(history.history[history_key], label=curve_label, color=curve_color)
plt.title('Model Loss Over Epochs')
plt.ylabel('Loss (MSE)')
plt.xlabel('Epoch')
plt.legend()
plt.show()



# Flatten both arrays to 1-D (ravel leaves an already 1-D array as it is)
y_test_actual_1d = y_test_actual.ravel()
predictions_actual_1d = predictions_actual.ravel()

# One row per test sample, actual next to predicted
predictions_df = pd.DataFrame({
    'Actual Closing Price': y_test_actual_1d,
    'Predicted Closing Price': predictions_actual_1d
})
import cufflinks as cf
cf.go_offline() 

# Interactive line chart of actual vs. predicted prices via cufflinks
iplot_options = dict(
    kind='line',
    title='Actual vs Predicted Closing Prices',
    xTitle='Time',
    yTitle='Closing Price (in Rs)',
    colors=['blue', 'red'],
    theme='white',
    mode='lines',
    linewidth=3,
    asFigure=False,
    legend=True,
)
predictions_df.iplot(**iplot_options)

