In [60]:
import pandas as pd
from sklearn.preprocessing import StandardScaler 
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor 
from xgboost import XGBRegressor 
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_absolute_error, r2_score

In [6]:
temp = pd.read_csv(r'C:\Users\sandeep\OneDrive\Documents\Project\scoring_model\finops\sandeep_work\data\testdata.csv').drop('Unnamed: 0',axis=1)

In [28]:
df = temp.copy()
df['date'] = pd.to_datetime(df['date'])
df['date'] = df['date'].dt.strftime('%Y-%m-%d')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116317 entries, 0 to 116316
Data columns (total 8 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   date       116317 non-null  object 
 1   stock      116317 non-null  object 
 2   Open       116317 non-null  float64
 3   High       116317 non-null  float64
 4   Low        116317 non-null  float64
 5   Close      116317 non-null  float64
 6   Adj Close  116317 non-null  float64
 7   Volume     116317 non-null  float64
dtypes: float64(6), object(2)
memory usage: 7.1+ MB


In [29]:
test_stock = df['stock'].unique().tolist()[10]
test_stock

'ANANTRAJ.NS'

In [31]:
df_test = df[df['stock']==test_stock][['date','Open']].reset_index(drop = True)
df_test

Unnamed: 0,date,Open
0,2021-08-02,64.500000
1,2021-08-03,66.900002
2,2021-08-04,70.199997
3,2021-08-05,71.699997
4,2021-08-06,70.949997
...,...,...
735,2024-07-25,513.900024
736,2024-07-26,516.000000
737,2024-07-29,561.400024
738,2024-07-30,557.799988


## Feature Engineering

In [39]:
def add_lag_features(df:pd.DataFrame,lags:int):
    temp = df.copy()
    for i in range(1,lags+1):
        temp[f'lag{i}'] = temp['Open'].shift(i)    
    return temp

def add_ma_std_ewm_features(df:pd.DataFrame,windows:list):
    temp = df.copy()
    for i in windows:
        temp[f'ma_{i}'] = temp['Open'].rolling(window=i).mean()
        temp[f'std_{i}'] = temp['Open'].rolling(window=i).std()
        temp[f'ewm{i}'] = temp['Open'].ewm(span=i,adjust=False).mean()
    return temp
 

df_with_lag = add_lag_features(df_test,7)
df_fe = add_ma_std_ewm_features(df_with_lag,[5,10,15,20])
df_fe.head(10)

Unnamed: 0,date,Open,lag1,lag2,lag3,lag4,lag5,lag6,lag7,ma_5,...,ewm5,ma_10,std_10,ewm10,ma_15,std_15,ewm15,ma_20,std_20,ewm20
0,2021-08-02,64.5,,,,,,,,,...,64.5,,,64.5,,,64.5,,,64.5
1,2021-08-03,66.900002,64.5,,,,,,,,...,65.300001,,,64.936364,,,64.8,,,64.728572
2,2021-08-04,70.199997,66.900002,64.5,,,,,,,...,66.933333,,,65.893388,,,65.475,,,65.24966
3,2021-08-05,71.699997,70.199997,66.900002,64.5,,,,,,...,68.522221,,,66.949135,,,66.253124,,,65.863978
4,2021-08-06,70.949997,71.699997,70.199997,66.900002,64.5,,,,68.849998,...,69.331479,,,67.676565,,,66.840233,,,66.34836
5,2021-08-09,68.949997,70.949997,71.699997,70.199997,66.900002,64.5,,,69.739998,...,69.204319,,,67.908098,,,67.103954,,,66.596135
6,2021-08-10,67.800003,68.949997,70.949997,71.699997,70.199997,66.900002,64.5,,69.919998,...,68.736213,,,67.888444,,,67.19096,,,66.710789
7,2021-08-11,64.0,67.800003,68.949997,70.949997,71.699997,70.199997,66.900002,64.5,68.679999,...,67.157476,,,67.181454,,,66.79209,,,66.452619
8,2021-08-12,64.0,64.0,67.800003,68.949997,70.949997,71.699997,70.199997,66.900002,67.139999,...,66.104984,,,66.603008,,,66.443079,,,66.219036
9,2021-08-13,63.400002,64.0,64.0,67.800003,68.949997,70.949997,71.699997,70.199997,65.63,...,65.203323,67.239999,3.144994,66.020643,,,66.062694,,,65.950557


In [None]:
df_fe.dropna()

## Model Building

In [43]:
X = df_fe.iloc[:,2:]
y = df_fe.iloc[:,1]

In [53]:
train_length = int(0.9*X.shape[0])

x_train = X.iloc[:train_length,:]
x_test = X.iloc[train_length:,:]
y_train = y[:train_length]
y_test = y[train_length:]

In [56]:
scaler_x = StandardScaler()
x_train_scaled = scaler_x.fit_transform(x_train)
x_test_scaled = scaler_x.transform(x_test)


In [62]:
rf_model = RandomForestRegressor(n_estimators=100)

rf_model.fit(x_train_scaled,y_train)
rf_pred = rf_model.predict(x_test_scaled)
r2_score_rf = r2_score(y_test,rf_pred)
mae_rf = mean_absolute_error(y_test,rf_pred)
r2_score_rf,mae

(-1.1820538103956282, np.float64(73.33009487255201))

In [63]:
gb_model = XGBRegressor(n_estimators=100)

gb_model.fit(x_train_scaled,y_train)
gb_pred = rf_model.predict(x_test_scaled)
r2_score_gb = r2_score(y_test,gb_pred)
mae_gb = mean_absolute_error(y_test,gb_pred)
r2_score_gb,mae_gb

(-1.1820538103956282, np.float64(73.34317644686313))

1. Worse model r2 and mae (-1.1820538103956282, np.float64(73.34317644686313)) 
when we just add features of lag, ma, ewm. Apply scaling and fitting. 

2. 