# NIFTY50 PRICE PREDICTION

### This Program is made to predict the next days NIFTY50 index price using machine learning 

# Importing Useful libraries
### From sklearn - StandardScaler, mean_squared_error, r2_score, mean_absolute_error, train_test_split, LinearRegression, RandomForestRegressor, SVR, DecisionTreeRegressor and some tree libraries Lightgbm and Xgboost

### Pandas, Numpy, Joblib

In [2]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Loading Datasets
#### Converting Date column to datetime type
#### Setting index of DataFrame to Date column

In [3]:
data = pd.read_csv('../data/NIFTY50_data.csv', parse_dates=['Date'], index_col='Date')
data.sort_index(inplace=True)

# Feature Engineering

In [4]:
data['Return'] = data['Close'].pct_change()
data['Rel_Volume'] = data['Volume'] / data['Volume'].rolling(20).mean()
data['Return_10'] = data['Close'].pct_change(10)
data['Return_3'] = data['Close'].pct_change(3)
data['Volatility'] = data['Return'].rolling(20).std()
data['Volatility_5'] = data['Return'].rolling(5).std()
data['MA20'] = data['Close'].rolling(20).mean()
data['MA50'] = data['Close'].rolling(50).mean()
data['Trend20'] = (data['Close'] - data['MA20']) / data['MA20']
data['Trend50'] = (data['Close'] - data['MA50']) / data['MA50']
data['HL_Range'] = (data['High'] - data['Low']) / data['Close']
data['Gap'] = (data['Open'] - data['Close'].shift(1)) / data['Close'].shift(1)

# Calculate RSI
delta = data['Close'].diff()
gain = delta.clip(lower=0)
loss = -delta.clip(upper=0)
# Wilder smoothing
avg_gain = gain.ewm(alpha=1/14, adjust=False).mean()
avg_loss = loss.ewm(alpha=1/14, adjust=False).mean()
rs = avg_gain / avg_loss
data['RSI'] = 100 - (100 / (1 + rs))

data['Target'] = data['Return'].shift(-1)  # Predict next day's return

data.drop(columns=['MA20', 'MA50'], inplace=True)
data.dropna(inplace=True)

# Splitting the Data into Train and Test

In [5]:
train_size = int(len(data) * 0.8)

y_train = data['Target'][:train_size]
y_test = data['Target'][train_size:]

data.drop(columns=['Target'], inplace=True)
X_train = data[:train_size]
X_test = data[train_size:]

# Creating Piplines to Train Different models and measure metrics

In [6]:
tree_models = {
    "rf": RandomForestRegressor(n_estimators=200, random_state=42),
    "xgb": XGBRegressor(n_estimators=300, max_depth=3, learning_rate=0.05, random_state=42),
    "lgbm": LGBMRegressor(n_estimators=300, max_depth=3, learning_rate=0.05, random_state=42)
}

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

linear_models = {
    "lr": LinearRegression(),
    "svr": SVR(kernel='rbf')
}

### If I trade based on the prediction, how much money do I make? -> That’s strategy return.
### If prediction > 0 → go LONG
### If prediction < 0 → go SHORT

In [7]:
def print_metrics(name, y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    direction_acc = np.mean(np.sign(y_true) == np.sign(y_pred))
    
    position = np.sign(y_pred)              # long or short
    strategy_return = position * y_true     # actual profit
    cumulative_return = (1 + strategy_return).cumprod()

    all_metrics = {
        "MSE": mse,
        "MAE": mae,
        "Directional Accuracy": direction_acc,
        "Final Equity": cumulative_return.iloc[-1]
    }

    import json
    import os

    os.makedirs('../results', exist_ok=True)
    with open(f'../results/{name}_metrics.json', 'w') as f:
        json.dump(all_metrics, f, indent=4)


In [8]:
# Tree models
for name, model in tree_models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print_metrics(name, y_test, y_pred)

# Linear models
for name, model in linear_models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    print_metrics(name, y_test, y_pred)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000638 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4080
[LightGBM] [Info] Number of data points in the train set: 2376, number of used features: 16
[LightGBM] [Info] Start training from score 0.000493


# Fine Tuning LIGHTGBM model as it is best so far

In [9]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    "n_estimators": [200, 300, 400],
    "learning_rate": [0.01, 0.02, 0.03],
    "max_depth": [-1, 3, 5],
    "num_leaves": [31, 50, 80],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0],
    "min_child_samples": [20, 30]
}

model = LGBMRegressor(random_state=42)

search = RandomizedSearchCV(
    model,
    param_grid,
    n_iter=30,
    scoring="neg_mean_squared_error",
    cv=3,
    random_state=42,
    n_jobs=-1
)

search.fit(X_train, y_train)

best_model = search.best_estimator_

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000432 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4080
[LightGBM] [Info] Number of data points in the train set: 2376, number of used features: 16
[LightGBM] [Info] Start training from score 0.000493


In [10]:
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
print_metrics("best_lgbm", y_test, y_pred)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000548 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4080
[LightGBM] [Info] Number of data points in the train set: 2376, number of used features: 16
[LightGBM] [Info] Start training from score 0.000493


# Pickling the model for production

In [11]:
import os

os.makedirs('../models', exist_ok=True)

joblib.dump(search.best_params_, '../models/best_lgbm_params.json')
joblib.dump(best_model, '../models/best_lgbm_model.pkl')

['../models/best_lgbm_model.pkl']