In [31]:
#Predictive Model
#The libraries below must be installed before running this notebook.
import yfinance as yf
import pandas as pd
import os

In [32]:
# Download the Nifty 50 index history from Yahoo Finance (symbol "^NSEI"),
# requesting the maximum available period.
nifty50 = yf.Ticker("^NSEI").history(period="max")
print(nifty50)

                                   Open          High           Low  \
Date                                                                  
2007-09-17 00:00:00+05:30   4518.450195   4549.049805   4482.850098   
2007-09-18 00:00:00+05:30   4494.100098   4551.799805   4481.549805   
2007-09-19 00:00:00+05:30   4550.250000   4739.000000   4550.250000   
2007-09-20 00:00:00+05:30   4734.850098   4760.850098   4721.149902   
2007-09-21 00:00:00+05:30   4752.950195   4855.700195   4733.700195   
...                                 ...           ...           ...   
2024-12-02 00:00:00+05:30  24140.849609  24301.699219  24008.650391   
2024-12-03 00:00:00+05:30  24367.500000  24481.349609  24280.000000   
2024-12-04 00:00:00+05:30  24488.750000  24573.199219  24366.300781   
2024-12-05 00:00:00+05:30  24539.150391  24857.750000  24295.550781   
2024-12-06 00:00:00+05:30  24729.449219  24751.050781  24620.500000   

                                  Close  Volume  Dividends  Stock Splits  
D

In [33]:
#Pre-processing Data
# Remove the "Dividends" and "Stock Splits" columns, which are not used later.
# drop(..., errors="ignore") keeps this cell safe to re-run: the previous `del`
# statements raised KeyError once the columns had already been removed.
nifty50 = nifty50.drop(columns=["Dividends", "Stock Splits"], errors="ignore")

In [34]:
# Label each day by whether the next row's close is higher than this row's close.
# "Tomorrow" holds the next row's close; "Target" is 1 when it is higher, else 0.
# Note: the final row has no following close, so its "Tomorrow" is NaN and the
# comparison evaluates to False, which gives Target = 0 for that row.
nifty50["Tomorrow"] = nifty50["Close"].shift(-1)
nifty50["Target"] = nifty50["Tomorrow"].gt(nifty50["Close"]).astype(int)

In [35]:
# Keep only the rows dated 2010-01-01 or later. The .copy() gives an independent
# frame rather than a slice of the original one.
nifty50 = nifty50.loc["2010-01-01":].copy()

In [36]:
# Split the data into two frames: `train` to fit the model and `test` (the most
# recent 100 rows) to check how well it performs on data it has not seen.
# A Random Forest classifier is used because it can work effectively on non-linear data.
#Training Model
from sklearn.ensemble import RandomForestClassifier

predictors = ["Close", "Volume", "Open", "High", "Low"]

# Everything except the last 100 rows / only the last 100 rows.
train, test = nifty50.iloc[:-100], nifty50.iloc[-100:]

model = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=100,
    random_state=1,
)
model.fit(train[predictors], train["Target"])

In [37]:
from sklearn.metrics import precision_score

# Predict the held-out rows and re-attach the test index so the predictions line
# up with the true labels, then score them with precision.
preds = pd.Series(model.predict(test[predictors]), index=test.index)
precision_score(y_true=test["Target"], y_pred=preds)

0.5204081632653061

# Put the actual labels and the predictions side by side. `preds` is an unnamed
# Series, so it is named here; otherwise its column would be labelled 0, unlike
# the "Predictions" column that predict() produces.
combined = pd.concat([test["Target"], preds.rename("Predictions")], axis=1)

In [38]:
def predict(train, test, predictors, model):
    """Fit ``model`` on ``train`` and return its predictions for ``test``.

    Parameters
    ----------
    train, test : DataFrame
        Frames containing the ``predictors`` columns and a "Target" column.
    predictors : list
        Column names passed to the model as features.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is refitted
        in place on every call.

    Returns
    -------
    DataFrame
        Indexed like ``test``, with the true "Target" column next to a
        "Predictions" column.
    """
    model.fit(train[predictors], train["Target"])
    predicted = pd.Series(
        model.predict(test[predictors]),
        index=test.index,
        name="Predictions",
    )
    return pd.concat([test["Target"], predicted], axis=1)

In [39]:
#Backtesting Model
def backtest(data, model, predictors, start=2500, step=250):
    """Evaluate ``model`` with an expanding-window backtest.

    For each fold the model is trained on all rows before position ``i`` and
    tested on the following ``step`` rows; ``i`` begins at ``start`` and
    advances by ``step`` until the end of ``data``. The defaults follow the
    notebook's assumption of about 250 trading days per year: roughly 10 years
    of initial training data, then one more year per fold.

    Parameters
    ----------
    data : DataFrame
        Rows in chronological order, containing ``predictors`` and "Target".
    model : object
        Estimator with ``fit``/``predict``; refitted on every fold.
    predictors : list
        Feature column names.
    start : int
        Number of rows in the first training window.
    step : int
        Number of rows in each test window.

    Returns
    -------
    DataFrame
        The per-fold outputs of ``predict`` concatenated in order
        ("Target" and "Predictions" columns).

    Raises
    ------
    ValueError
        If no fold can be formed (for example, ``data`` has no more than
        ``start`` rows).
    """
    all_predictions = []

    for i in range(start, data.shape[0], step):
        train = data.iloc[0:i].copy()              # every row before position i
        test = data.iloc[i:(i+step)].copy()        # the next `step` rows (fewer in the last fold)
        predictions = predict(train, test, predictors, model)
        all_predictions.append(predictions)

    # Without this check pd.concat([]) fails with the opaque
    # "No objects to concatenate"; report the real cause instead.
    if not all_predictions:
        raise ValueError(
            f"backtest produced no folds: data has {data.shape[0]} rows, "
            f"which leaves nothing to test with start={start} and step={step}"
        )

    return pd.concat(all_predictions)

In [40]:
# Drop rows that have a missing value in any column other than "Tomorrow".
# Leaving "Tomorrow" out means a row is not discarded merely because its
# next-day close is missing (as it is for the final row).
nifty50 = nifty50.dropna(
    subset=[col for col in nifty50.columns if col != "Tomorrow"]
)

In [41]:
# Display the cleaned frame, including the added "Tomorrow" and "Target" columns.
nifty50

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Tomorrow,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-01-04 00:00:00+05:30,5200.899902,5238.450195,5167.100098,5232.200195,0,5277.899902,1
2010-01-05 00:00:00+05:30,5277.149902,5288.350098,5242.399902,5277.899902,0,5281.799805,1
2010-01-06 00:00:00+05:30,5278.149902,5310.850098,5260.049805,5281.799805,0,5263.100098,0
2010-01-07 00:00:00+05:30,5281.799805,5302.549805,5244.750000,5263.100098,0,5244.750000,0
2010-01-08 00:00:00+05:30,5264.250000,5276.750000,5234.700195,5244.750000,0,5249.399902,1
...,...,...,...,...,...,...,...
2024-12-02 00:00:00+05:30,24140.849609,24301.699219,24008.650391,24276.050781,220400,24457.150391,1
2024-12-03 00:00:00+05:30,24367.500000,24481.349609,24280.000000,24457.150391,339500,24467.449219,1
2024-12-04 00:00:00+05:30,24488.750000,24573.199219,24366.300781,24467.449219,348000,24708.400391,1
2024-12-05 00:00:00+05:30,24539.150391,24857.750000,24295.550781,24708.400391,361500,24677.800781,0


In [42]:
# Run the backtest over the whole frame with backtest()'s default start and step.
predictions = backtest(data=nifty50, model=model, predictors=predictors)

In [43]:
# Count how often the model predicted 0 (not up) versus 1 (up) across the backtest.
predictions["Predictions"].value_counts()

Predictions
0    842
1    322
Name: count, dtype: int64

In [44]:
# Precision over all backtest folds: of the days predicted as 1, the share whose Target was 1.
precision_score(
    y_true=predictions["Target"],
    y_pred=predictions["Predictions"],
)

0.5590062111801242

In [45]:
# Share of backtested days with each actual Target value; the share of 1s is the
# baseline to compare the precision against.
predictions["Target"].value_counts() / len(predictions)

Target
1    0.564433
0    0.435567
Name: count, dtype: float64

In [46]:
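#The backtest precision is about 56%: on the days the model predicted Target = 1 (price up the next day), the price actually rose about 56% of the time. Note that this is essentially the same as the 56.4% share of up days shown above, so the model does not yet beat simply predicting "up" every day.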