In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import joblib

In [2]:
stock_data = pd.read_csv('nifty50_stock_data.csv')
stock_data['date'] = pd.to_datetime(stock_data['date'])
stock_data.set_index('date', inplace=True)

In [4]:
stock_data['20_MA'] = stock_data['4. close'].rolling(window=20).mean()
stock_data['50_MA'] = stock_data['4. close'].rolling(window=50).mean()

In [5]:
def compute_rsi(data, window=14):
    delta = data['4. close'].diff()
    gain = delta.where(delta > 0, 0)
    loss = -delta.where(delta < 0, 0)
    avg_gain = gain.rolling(window=window).mean()
    avg_loss = loss.rolling(window=window).mean()
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

def compute_macd(data):
    ema_12 = data['4. close'].ewm(span=12, adjust=False).mean()
    ema_26 = data['4. close'].ewm(span=26, adjust=False).mean()
    macd = ema_12 - ema_26
    signal = macd.ewm(span=9, adjust=False).mean()
    return macd, signal

stock_data['RSI'] = compute_rsi(stock_data)
stock_data['MACD'], stock_data['MACD_Signal'] = compute_macd(stock_data)

In [7]:
stock_data.isnull().sum()

1. open        0
2. high        0
3. low         0
4. close       0
5. volume      0
Company        0
Symbol         0
20_MA          0
50_MA          0
RSI            0
MACD           0
MACD_Signal    0
dtype: int64

In [8]:
stock_data['Buy_Signal'] = (
    (stock_data['RSI'] < 30) &
    (stock_data['20_MA'] > stock_data['50_MA']) &
    (stock_data['20_MA'].shift(1) <= stock_data['50_MA'].shift(1))
)

In [9]:
stock_data['next_day_close'] = stock_data['4. close'].shift(-1)
stock_data['Target'] = (stock_data['next_day_close'] > stock_data['4. close']).astype(int)

In [10]:
stock_data.head()

Unnamed: 0_level_0,1. open,2. high,3. low,4. close,5. volume,Company,Symbol,20_MA,50_MA,RSI,MACD,MACD_Signal,Buy_Signal,next_day_close,Target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2025-05-22,1420.0,1422.5,1397.4,1409.4,388189.0,Reliance Industries,RELIANCE.BSE,1428.69,1454.925,35.431322,-11.379929,-11.270447,False,1428.9,1
2025-05-21,1424.95,1436.1,1418.35,1428.9,507422.0,Reliance Industries,RELIANCE.BSE,1428.6425,1455.305,46.641274,-10.192441,-11.054846,False,1425.3,0
2025-05-20,1435.65,1444.55,1421.2,1425.3,670532.0,Reliance Industries,RELIANCE.BSE,1428.3425,1455.452,41.557971,-9.433099,-10.730496,False,1441.65,1
2025-05-19,1454.3,1454.3,1438.05,1441.65,550729.0,Reliance Industries,RELIANCE.BSE,1428.4975,1456.539,49.363697,-7.4264,-10.069677,False,1456.6,1
2025-05-16,1453.75,1460.2,1447.6,1456.6,338754.0,Reliance Industries,RELIANCE.BSE,1429.945,1457.829,54.269972,-4.576976,-8.971137,False,1453.8,0


In [11]:
features = ['5. volume', '20_MA', '50_MA', 'RSI', 'MACD', 'MACD_Signal']
X = stock_data[features]
y = stock_data['Target']

In [12]:
split_index = int(len(X) * 0.8)
X_train, X_test = X[:split_index], X.iloc[-126:]
y_train, y_test = y[:split_index], y.iloc[-126:]

In [16]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [27]:
model = RandomForestRegressor()
model.fit(X_train_scaled, y_train)

In [31]:
prediction = model.predict(X_train_scaled)
accuracy = metrics.r2_score(y_train, prediction)
print(f"R Squared Value (6-month backtest): {accuracy * 100:.2f}")

R Squared Value (6-month backtest): 85.35


In [32]:
joblib.dump(model, 'stock_prediction_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']