In [2]:
import pandas as pd 
import yfinance as yf
import os
from datetime import datetime, timedelta

In [26]:
end_date = datetime.today().strftime('%Y-%m-%d')
start_date = (datetime.today() - timedelta(days=6*365)).strftime('%Y-%m-%d')
data = yf.download('^NSEI', start=start_date, end=end_date)
df = pd.DataFrame(data)
df.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in df.columns]
df.rename(columns={'Close_^NSEI': 'Close',	'High_^NSEI': 'High',	'Low_^NSEI': 'Low', 	'Open_^NSEI': 'Open',	'Volume_^NSEI': 'Volume'}, inplace=True)
df.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-03-18,11462.200195,11530.150391,11412.5,11473.849609,320300
2019-03-19,11532.400391,11543.849609,11451.25,11500.299805,326100
2019-03-20,11521.049805,11556.099609,11503.099609,11553.349609,366300
2019-03-22,11456.900391,11572.799805,11434.549805,11549.200195,386200
2019-03-25,11354.25,11395.650391,11311.599609,11395.650391,294500


In [27]:
df['Year'] = df.index.year
df['Month'] = df.index.month
df['Day'] = df.index.day
df['DayOfWeek'] = df.index.dayofweek  # 0=Monday, 6=Sunday
df['IsMonthEnd'] = df.index.is_month_end.astype(int)
df['IsMonthStart'] = df.index.is_month_start.astype(int)
df['IsBudgetDay'] = (df.index == '2024-02-01').astype(int)
df['Quarter'] = df.index.quarter
# Add technical indicators
    # 1. Moving averages
df['MA5'] = df['Close'].rolling(window=5).mean()
df['MA20'] = df['Close'].rolling(window=20).mean()
df['MA50'] = df['Close'].rolling(window=50).mean()
    
    # 2. Relative Strength Index (RSI)
delta = df['Close'].diff()
gain = delta.where(delta > 0, 0).rolling(window=14).mean()
loss = -delta.where(delta < 0, 0).rolling(window=14).mean()
rs = gain / loss
df['RSI'] = 100 - (100 / (1 + rs))
    
    # 3. MACD
df['EMA12'] = df['Close'].ewm(span=12).mean()
df['EMA26'] = df['Close'].ewm(span=26).mean()
df['MACD'] = df['EMA12'] - df['EMA26']
df['Signal'] = df['MACD'].ewm(span=9).mean()
    
    # 4. Bollinger Bands
df['20MA'] = df['Close'].rolling(window=20).mean()
df['20STD'] = df['Close'].rolling(window=20).std()
df['Upper_Band'] = df['20MA'] + 2 * df['20STD']
df['Lower_Band'] = df['20MA'] - 2 * df['20STD']
    
    # 5. Volume features
df['Volume_Change'] = df['Volume'].pct_change()
df['Volume_MA5'] = df['Volume'].rolling(window=5).mean()
    
    # 6. Price momentum
df['Price_Change'] = df['Close'].pct_change()
df['Price_Change_5d'] = df['Close'].pct_change(periods=5)
    
    # Drop NaN values that result from calculations
df = df.dropna()

In [28]:
df.head()

Unnamed: 0_level_0,Close,High,Low,Open,Volume,Year,Month,Day,DayOfWeek,IsMonthEnd,...,MACD,Signal,20MA,20STD,Upper_Band,Lower_Band,Volume_Change,Volume_MA5,Price_Change,Price_Change_5d
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-06-03,12088.549805,12103.049805,11920.099609,11953.75,315300,2019,6,3,0,0,...,115.926835,62.549934,11603.88999,311.878917,12227.647824,10980.132156,-0.281613,418360.0,0.013902,0.013736
2019-06-04,12021.650391,12095.200195,12005.849609,12052.650391,289200,2019,6,4,1,0,...,122.727309,74.585547,11630.07749,324.254203,12278.585896,10981.569085,-0.082778,356540.0,-0.005534,0.007788
2019-06-06,11843.75,12039.799805,11830.25,12039.799805,415200,2019,6,6,3,0,...,112.700695,82.208646,11654.29248,321.047993,12296.388467,11012.196494,0.435685,375960.0,-0.014798,-0.001463
2019-06-07,11870.650391,11897.5,11769.5,11865.200195,302500,2019,6,7,4,0,...,105.671624,86.901276,11682.73501,313.280148,12309.295306,11056.174713,-0.271435,352220.0,0.002271,-0.006299
2019-06-10,11922.700195,11975.049805,11871.75,11934.900391,303300,2019,6,10,0,0,...,103.050408,90.131121,11714.925,302.491545,12319.908089,11109.941911,0.002645,325100.0,0.004385,-8e-06


In [29]:
df['Target'] = df['Close'].shift(-1)
df['Target'][-1] = df['Close'][-1]
df.tail()

  df['Target'][-1] = df['Close'][-1]
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['Target'][-1] = df['Close'][-1]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

Unnamed: 0_level_0,Close,High,Low,Open,Volume,Year,Month,Day,DayOfWeek,IsMonthEnd,...,Signal,20MA,20STD,Upper_Band,Lower_Band,Volume_Change,Volume_MA5,Price_Change,Price_Change_5d,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-03-07,22552.5,22633.800781,22464.75,22508.650391,289800,2025,3,7,4,0,...,-254.941534,22748.655176,404.476881,23557.608938,21939.701414,-0.221177,300700.0,0.000346,0.019336,22460.300781
2025-03-10,22460.300781,22676.75,22429.050781,22521.849609,293900,2025,3,10,0,0,...,-253.375488,22693.672754,360.768062,23415.208878,21972.13663,0.014148,303000.0,-0.004088,0.015416,22497.900391
2025-03-11,22497.900391,22522.099609,22314.699219,22345.949219,347900,2025,3,11,1,0,...,-248.788182,22649.487793,324.358114,23298.204022,22000.771564,0.183736,321920.0,0.001674,0.018804,22470.5
2025-03-12,22470.5,22577.400391,22329.550781,22536.349609,369700,2025,3,12,2,0,...,-242.417802,22619.422754,310.734764,23240.892282,21997.953226,0.062662,334680.0,-0.001218,0.005963,22397.199219
2025-03-13,22397.199219,22558.050781,22377.349609,22541.5,287500,2025,3,13,3,0,...,-235.880724,22587.020215,297.500265,23182.020744,21992.019685,-0.222342,317760.0,-0.003262,-0.006543,22397.199219


In [22]:
import xgboost as xgb
import pandas as pd
import joblib


In [20]:
x = df.drop(columns=['Target'])
y = df['Target']


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1427 entries, 2019-06-03 to 2025-03-13
Data columns (total 30 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Close            1427 non-null   float64
 1   High             1427 non-null   float64
 2   Low              1427 non-null   float64
 3   Open             1427 non-null   float64
 4   Volume           1427 non-null   int64  
 5   Year             1427 non-null   int32  
 6   Month            1427 non-null   int32  
 7   Day              1427 non-null   int32  
 8   DayOfWeek        1427 non-null   int32  
 9   IsMonthEnd       1427 non-null   int64  
 10  IsMonthStart     1427 non-null   int64  
 11  IsBudgetDay      1427 non-null   int64  
 12  Quarter          1427 non-null   int32  
 13  MA5              1427 non-null   float64
 14  MA20             1427 non-null   float64
 15  MA50             1427 non-null   float64
 16  RSI              1427 non-null   float64
 

In [32]:
import numpy as np
# Replace inf values with max/min finite values
x = np.where(np.isinf(x), np.nan, x)  # Replace inf with NaN
y = np.where(np.isinf(y), np.nan, y)

# Replace NaN with column mean
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="mean")
x = imputer.fit_transform(x)
y = imputer.fit_transform(y.reshape(-1, 1)).flatten()
# Convert to XGBoost DMatrix format
dtrain = xgb.DMatrix(x, label=y)

['xgboost_model.pkl']

In [37]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(x)

In [40]:
# XGBoost Regressor
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Define hyperparameter grid
param_grid = {
    'n_estimators': [100, 300, 500],  # Number of trees
    'learning_rate': [0.01, 0.05, 0.1],  # Step size shrinkage
    'max_depth': [3, 5, 7],  # Tree depth
    'subsample': [0.6, 0.8, 1.0],  # Row sampling
    'colsample_bytree': [0.6, 0.8, 1.0],  # Feature sampling
    'gamma': [0, 0.1, 0.2],  # Minimum loss reduction
    'reg_lambda': [0, 1, 10],  # L2 regularization
    'reg_alpha': [0, 1, 10]  # L1 regularization
}

# Grid Search with 5-fold Cross Validation
grid_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1, n_iter=20)

# Train
grid_search.fit(X_scaled, y)


Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [42]:
# Use best parameters
best_params = grid_search.best_params_

# Train XGBoost with best parameters
final_model = xgb.XGBRegressor(**best_params, objective='reg:squarederror', random_state=42)
final_model.fit(X_scaled, y)

joblib.dump(final_model, "optimized_Nifty50.pkl")
print("✅ Optimized model saved!")

✅ Optimized model saved!
