# Stock Prediction

* The notebook that you submit **must** run (without *any* errors), and create a `submission.csv` file, in the required format.

* This notebook should serve as a template to your final notebook submission.

In [None]:
!pip install pmdarima

In [None]:
import os
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
from pylab import rcParams
rcParams['figure.figsize'] = 10, 6
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima_model import ARIMA
from pmdarima.arima import auto_arima

from sklearn.metrics import mean_squared_error, mean_absolute_error
import math

import os
# Print the full path of every file found under the Kaggle input directory.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

In [None]:
# Parser for ISO-formatted (YYYY-MM-DD) date strings. `pd.datetime` was removed in
# pandas 2.0, so parsing goes through `pd.to_datetime` instead.
dateparse = lambda dates: pd.to_datetime(dates, format='%Y-%m-%d')

# Load the competition data; the first CSV column becomes the initial index.
train = pd.read_csv("/kaggle/input/ue21cs342aa2/train.csv", index_col = 0)
test = pd.read_csv("/kaggle/input/ue21cs342aa2/test.csv", index_col = 0)

# Convert 'Date' to datetime once, then index both frames by it. Passing the Series
# (not the column name) to set_index keeps 'Date' available as a regular column too.
train['Date'] = pd.to_datetime(train['Date'])
test['Date'] = pd.to_datetime(test['Date'])
train = train.set_index(train['Date'])
test = test.set_index(test['Date'])

In [None]:
# Show the column names of the training frame.
train.columns

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the last 25 rows of the training frame.
train.tail(25)

In [None]:
# Closing price over time.
fig, ax = plt.subplots(figsize=(10, 6))
ax.grid(True)
ax.set_xlabel('Date')
ax.set_ylabel('Close Prices')
ax.plot(train['Close'])
ax.set_title('Closing price Plot')
plt.show()

In [None]:
# Kernel density estimate of the closing prices.
train['Close'].plot.kde()

In [None]:
# Opening price over time.
fig, ax = plt.subplots(figsize=(10, 6))
ax.grid(True)
ax.set_xlabel('Date')
ax.set_ylabel('Open Prices')
ax.plot(train['Open'])
ax.set_title('Opening price Plot')
plt.show()

In [None]:
# Traded volume over time.
fig, ax = plt.subplots(figsize=(10, 6))
ax.grid(True)
ax.set_xlabel('Date')
ax.set_ylabel('Vol Traded')
ax.plot(train['Volume'])
ax.set_title('Date vs Vol Traded')
plt.show()

In [None]:
# Lagged differences of the Open price in `train`: Open[t] - Open[t - k] for k = 1..4.
for lag in (1, 2, 3, 4):
    train[f'Open_Lag_Diff{lag}'] = train['Open'].diff(lag)

# Rolling mean and standard deviation of Open over a 4-row window.
window = 4
open_rolling = train['Open'].rolling(window=window)
train['RollingMean_Open'] = open_rolling.mean()
train['RollingStd_Open'] = open_rolling.std()

# Relative change in traded volume versus the previous row.
train['Volume_Change'] = train['Volume'].pct_change()

# MACD-style indicator: short-span EMA minus long-span EMA of Open.
train['EMA_4_Open'] = train['Open'].ewm(span=4, adjust=False).mean()
train['EMA_12_Open'] = train['Open'].ewm(span=12, adjust=False).mean()
train['MACD_Open'] = train['EMA_4_Open'] - train['EMA_12_Open']

In [None]:
# Lagged differences of the Open price in `test`: Open[t] - Open[t - k] for k = 1..4.
for lag in (1, 2, 3, 4):
    test[f'Open_Lag_Diff{lag}'] = test['Open'].diff(lag)

# Rolling mean and standard deviation of Open over a 4-row window (test set).
window = 4
open_rolling = test['Open'].rolling(window=window)
test['RollingMean_Open'] = open_rolling.mean()
test['RollingStd_Open'] = open_rolling.std()

# Relative change in traded volume versus the previous row.
test['Volume_Change'] = test['Volume'].pct_change()

# MACD-style indicator for the test set: short-span EMA minus long-span EMA of Open.
test['EMA_4_Open'] = test['Open'].ewm(span=4, adjust=False).mean()
test['EMA_12_Open'] = test['Open'].ewm(span=12, adjust=False).mean()
test['MACD_Open'] = test['EMA_4_Open'] - test['EMA_12_Open']

In [None]:
# Inspect the first 10 training rows, including the engineered Open/Volume features.
train.head(10)

In [None]:
# Inspect the first 10 test rows, including the engineered Open/Volume features.
test.head(10)

In [None]:
# Restrict to float64/int64 columns, since only those enter the correlation.
numeric_cols = train.select_dtypes(include=[np.float64, np.int64])

# Pairwise correlation between the numeric columns.
correlation_matrix = numeric_cols.corr()

# Plain-text dump of the matrix.
print(correlation_matrix)


#close and open prices are not strongly correlated with volume so we need not worry about volume

In [None]:
# Chronological split: the first 80% of rows train the model, the rest validate it.
train_size = int(len(train) * 0.80)
# Take explicit copies: later cells add columns to and impute values in these frames,
# which must not operate on (or leak into) slices of `train`.
train_data = train.iloc[:train_size].copy()
val_data = train.iloc[train_size:].copy()

#remove later

In [None]:
# train_data = train

In [None]:
# Chart styling for the plots that follow.
sns.set(style="darkgrid")
plt.rcParams['figure.figsize'] = [10, 10]

# Show which part of the Close series is used for training and which for validation.
fig, ax = plt.subplots()
ax.plot(train_data['Close'], label='Train')
ax.plot(val_data['Close'], label='Validate')
ax.set_title('Train Valid Split of Data')
ax.set_ylabel('Closing Price')
ax.set_xlabel('Timestep in Days')
ax.legend()

In [None]:
# Test for stationarity
def test_stationarity(timeseries):
    """Plot rolling statistics of a series and print its augmented Dickey-Fuller test.

    Parameters
    ----------
    timeseries : pandas.Series
        Series to examine. Missing values (for example the leading NaN left by
        ``.diff()``) are dropped before the ADF test is run.

    Returns
    -------
    None
        The function only draws a figure and prints the test summary.
    """
    # Rolling statistics over a 4-observation window.
    rolmean = timeseries.rolling(4).mean()
    rolstd = timeseries.rolling(4).std()

    # Plot the series together with its rolling mean and standard deviation.
    plt.plot(timeseries, color='blue',label='Original')
    plt.plot(rolmean, color='red', label='Rolling Mean')
    plt.plot(rolstd, color='black', label = 'Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean and Standard Deviation')
    plt.show(block=False)

    print("Results of dickey fuller test")
    # Drop NaNs first so that differenced series can be tested as well.
    adft = adfuller(timeseries.dropna(),autolag='AIC')
    # adfuller returns a bare tuple; label the first four entries and append the
    # critical values so that the printed output is self-explanatory.
    output = pd.Series(adft[0:4],index=['Test Statistics','p-value','No. of lags used','Number of observations used'])
    for key,values in adft[4].items():
        output['critical value (%s)'%key] =  values
    print(output)

test_stationarity(train['Close'])

The test statistic is less negative than the critical values, and the p-value is also very large. Therefore we fail to reject the null hypothesis and conclude that the data is non-stationary.

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Autocorrelation of the raw Close series for the first 30 lags.
plot_acf(train['Close'], lags=30)
plt.show()

# Partial autocorrelation of the raw Close series for the first 30 lags.
plot_pacf(train['Close'], lags=30)
plt.show()

The ACF plot tells us a lot about the seasonality present in the data: each value depends strongly on its previous values, which is quite evident in the ACF plot above.

The PACF plot shows that the value of Y depends heavily on lag 1, followed by a seasonal variation across the lags in the range 12-25.

In [None]:
# Separate trend, seasonality and residual with a multiplicative decomposition.

# Quarterly decomposition (period of 12 observations).
res2 = seasonal_decompose(train['Close'], model='mul', period=12)
# DecomposeResult.plot() creates and returns its own figure, so no plt.figure()
# call is needed beforehand (it would only leave an empty extra figure behind).
fig = res2.plot()
fig.set_size_inches(16, 9)

In [None]:
# Separate trend, seasonality and residual with a multiplicative decomposition.

# Monthly decomposition (period of 4 observations).
result = seasonal_decompose(train['Close'], model='mul', period=4)
# DecomposeResult.plot() creates and returns its own figure, so no plt.figure()
# call is needed beforehand (it would only leave an empty extra figure behind).
fig = result.plot()
fig.set_size_inches(16, 9)

In [None]:
# Inspect the trend: 30-observation rolling mean and standard deviation of Close.
from pylab import rcParams
rcParams['figure.figsize'] = 10, 6
# Log-transformed closing price (computed here; not plotted in this cell).
train_log = np.log(train['Close'])
moving_avg = train['Close'].rolling(30).mean()
std_dev = train['Close'].rolling(30).std()
plt.title('Moving Average')
plt.plot(std_dev, color ="black", label = "Standard Deviation")
plt.plot(moving_avg, color="red", label = "Mean")
# The legend is drawn once, after both labelled lines exist.
plt.legend()
plt.show()

In [None]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Double exponential smoothing: level plus multiplicative trend, fitted on the training split.
model = ExponentialSmoothing(train_data['Close'], trend='mul')
fitted_exp = model.fit()

# In-sample predictions covering the whole training series.
n_train_points = len(train_data['Close'])
predictions_exp = fitted_exp.predict(start=0, end=n_train_points - 1)

# Overlay the smoothed series on the original data.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(train_data['Close'], label='Original Data')
ax.plot(predictions_exp, color='red', label='Double Exponential Smoothing')
ax.legend()
ax.set_title('Double Exponential Smoothing')
plt.show()

In [None]:
# Forecast the validation horizon with the fitted double exponential smoothing model.
val_pred_exp = fitted_exp.forecast(steps=len(val_data['Close']))

# Compare forecast and actuals by position, not by index label: if the forecast does
# not carry the validation dates as its index, index-aligned arithmetic would yield
# NaN everywhere.
val_pred_exp_values = np.asarray(val_pred_exp, dtype=float)

# Plot the validation data against the forecast on the validation dates.
plt.figure(figsize=(10, 6))
plt.plot(val_data['Close'], label='Original Validation Data')
plt.plot(val_data.index, val_pred_exp_values, color='green', label='Double Exponential Smoothing (Validation)')
plt.legend()
plt.title('Double Exponential Smoothing - Validation Data')
plt.show()

# Residuals (actual minus predicted), kept as a Series indexed like val_data.
residuals = val_data['Close'] - val_pred_exp_values
mse = np.mean((residuals) ** 2)
mae = np.mean(np.abs(residuals))

print(f"MSE: {mse}")
print(f"MAE: {mae}")

def calculate_smape(actual, forecast):
    """Return the symmetric mean absolute percentage error, in percent.

    Computes ``100 / n * sum(2 * |forecast - actual| / (|actual| + |forecast|))``.

    Parameters
    ----------
    actual : array-like
        Observed values.
    forecast : array-like
        Predicted values, in the same order as ``actual``.

    Returns
    -------
    float
        SMAPE in percent. A pair where both values are zero yields NaN.
    """
    # Convert to plain arrays so that values are paired by position. With pandas
    # Series the arithmetic would align on index labels, and differing indexes
    # would turn every term into NaN, which the sum then silently skips.
    actual_values = np.asarray(actual, dtype=float)
    forecast_values = np.asarray(forecast, dtype=float)
    return 100/len(actual_values) * np.sum(2 * np.abs(forecast_values - actual_values) / (np.abs(actual_values) + np.abs(forecast_values)))

# SMAPE of the double exponential smoothing forecast against the validation Close prices.
smape = calculate_smape(val_data['Close'], val_pred_exp)
print("SMAPE:", smape)


#Tested on the double exponential smoothing method because it was fitting the data very well

In [None]:
#It's time to choose the parameters p, d and q for the ARIMA model. Last time we chose the values of p, d and q by observing the ACF and PACF plots, but now we are going to use Auto ARIMA to get the best parameters without even plotting the ACF and PACF graphs.

In [None]:
# The SARIMAX model does not use the 'Strategy' label, so drop it from the training split.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
train_data = train_data.drop(columns='Strategy', errors='ignore')

In [None]:
# Preview the training split after dropping 'Strategy'.
train_data.head()

In [None]:
# Make sure the 'Date' column of the training split holds datetimes.
train_data['Date'] = pd.to_datetime(train_data['Date'])
# Displays the container type of the column.
type(train_data['Date'])

In [None]:
train_data['Close_diff'] = train_data['Close'].diff(1)  # first-order (lag-1) differencing
# ADF test on the differenced series; the leading NaN produced by diff() is dropped first.
result_diff = adfuller(train_data['Close_diff'].dropna())
print('ADF Statistic:', result_diff[0])
print('p-value:', result_diff[1])

# Display the differenced series.
train_data['Close_diff']

In [None]:
# Autocorrelation of the differenced Close series for the first 30 lags.
close_diff = train_data['Close_diff'].dropna()
plot_acf(close_diff, lags=30)
plt.show()

In [None]:

# Partial autocorrelation of the differenced Close series for the first 30 lags.
close_diff = train_data['Close_diff'].dropna()
plot_pacf(close_diff, lags=30)
plt.show()

In [None]:
# Print the columns of every frame to check that the feature sets line up.
for frame in (train_data, val_data, test, train):
    print(frame.columns)

In [None]:
# Count the missing values per column in the training split.
print(train_data.isna().sum())

In [None]:
# # #run only once or twice to get the best arima model parameters:

# # #Finding the best value for ARIMA
# import warnings
# NOTE: this import is live (the rest of this cell is commented out); the SARIMAX
# cells further down use `sm`.
import statsmodels.api as sm
# warnings.filterwarnings("ignore")

# import itertools
# p=q=range (0,7)
# d = range(0,3)
# pdq = list(itertools.product (p, d, q))

# store = {}
# for param in pdq:
#     try:
#         model_arima = sm.tsa.arima.ARIMA (train_data.Close, order = param)
#         model_arima_fit = model_arima.fit()
#         store[param] =  model_arima_fit.aic
#         #print(param, model_arima_fit.aic)
#     except:
#         continue

# sorted_dict = dict(sorted(store.items(), key=lambda item: item[1]))
# print(sorted_dict)
# # The Akaike information criterion (AIC) is an estimator of in-sample prediction error and thereby relative quality of
# # statistical models for a given set of data
# # It's like the mean squared error in Regression - The smaller the number, the better

In [None]:
# Check the column dtypes of the training split before modelling.
train_data.dtypes

In [None]:
# ARIMA order (p, d, q) passed to the SARIMAX fit on the training split.
order = (5, 1, 6)
# seasonal_order = model.get_params()['seasonal_order']
# Target series for the model.
endog = train_data['Close']
# NOTE(review): `exog` is captured before the imputation cell runs, so it still holds the
# NaNs left by diff()/rolling(). The SARIMAX cell re-selects these columns from train_data
# rather than using this variable.
exog = train_data[['Open', 'Volume', 'Open_Lag_Diff1', 'Open_Lag_Diff2', 'Open_Lag_Diff3', 'Open_Lag_Diff4', 'RollingMean_Open', 'RollingStd_Open', 'Volume_Change', 'EMA_4_Open', 'EMA_12_Open', 'MACD_Open']]

In [None]:
# Columns whose leading rows are NaN because of diff()/rolling()/pct_change().
columns_to_impute = ['Open_Lag_Diff1', 'Open_Lag_Diff2', 'Open_Lag_Diff3', 'Open_Lag_Diff4', 'RollingMean_Open', 'RollingStd_Open', 'Volume_Change']

# Fill the NaNs of each frame with that frame's own column means. The result is
# assigned back instead of calling `frame[col].fillna(..., inplace=True)`, because
# the chained in-place form works on a temporary object and is not guaranteed to
# update the frame.
for frame in (train_data, val_data, test, train):
    column_means = frame[columns_to_impute].mean()
    frame[columns_to_impute] = frame[columns_to_impute].fillna(column_means)

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Exogenous regressors supplied alongside the Close series.
exog_columns = [
    'Open', 'Volume',
    'Open_Lag_Diff1', 'Open_Lag_Diff2', 'Open_Lag_Diff3', 'Open_Lag_Diff4',
    'RollingMean_Open', 'RollingStd_Open', 'Volume_Change',
    'EMA_4_Open', 'EMA_12_Open', 'MACD_Open',
]

# Fit SARIMAX with the chosen (p, d, q) order on the training split.
model = sm.tsa.statespace.SARIMAX(endog=endog, exog=train_data[exog_columns], order=order)
model_fit = model.fit()
model_fit.summary()

In [None]:
# fig, ax= plt.subplots(figsize=(15,8))
# ax.plot(train_data['Close'], label='Actual return')

# #plot the fitted values of model (in sample data predicted values)
# train_pred = results.fittedvalues
# ax.plot(train_data.index, train_pred, color='green', label='fitted')


# Actual training Close prices with the in-sample fitted values on the same axes.
ax = train_data['Close'].plot(figsize=(25,10))
model_fit.fittedvalues.plot(ax=ax)
plt.show()

In [None]:
# #plot the forecast values of model (out of sample data predicted values)
# Ntest = 30
# prediction_res = results.get_forecast(Ntest)
# conf_int = prediction_res.conf_int()
# #lower and upper limits of prediction 
# lower, upper = conf_int[conf_int.columns[0]], conf_int[conf_int.columns[1]]
# forecast = prediction_res.predicted_mean
# ax.plot(val_data.index, forecast, label='forecast')
# ax.fill_between(val_data.index, lower, upper, color='red', alpha=0.3)
# ax.legend()
# # print(conf_int)


# Out-of-sample prediction for the validation period, using its exogenous features.
val_exog_columns = ['Open', 'Volume', 'Open_Lag_Diff1', 'Open_Lag_Diff2', 'Open_Lag_Diff3', 'Open_Lag_Diff4', 'RollingMean_Open', 'RollingStd_Open', 'Volume_Change', 'EMA_4_Open', 'EMA_12_Open', 'MACD_Open']
predict = model_fit.predict(start = len(train_data),end = len(train_data)+len(val_data)-1, exog = val_data[val_exog_columns])
# assign() returns a new frame, so the column is not written into a slice of `train`.
# The raw values are used so that predictions are matched to rows by position.
val_data = val_data.assign(predicted=predict.values)
val_data.head(10)

In [None]:
# Validation metrics for the SARIMAX predictions.
MAE = mean_absolute_error(val_data["Close"], val_data["predicted"])
RMSE = math.sqrt(mean_squared_error(val_data["Close"], val_data["predicted"]))
# Use the notebook's calculate_smape helper so that this SMAPE is on the same scale
# (percent, with the factor 2 and absolute values in the denominator) as the one
# reported for the exponential smoothing model.
smape = calculate_smape(val_data['Close'], val_data['predicted'])
print("MAE:", MAE)
print("RMSE:", RMSE)
print("SMAPE:", smape)

In [None]:
# y_true = val_data['Close'].values
# rmse = np.sqrt(mean_squared_error(y_true,forecast))
# mae = mean_absolute_error(y_true,forecast)



# Actual validation Close prices (red) against the SARIMAX predictions.
ax = val_data['Close'].plot(figsize=(25,10), color='red')
val_data['predicted'].plot(ax=ax)
plt.show()

In [None]:
# ACF of the lag-2 difference (Close[t] - Close[t-2]) of the full training series.
plot_acf(train.Close.diff(2).dropna(),lags=30)
plt.show()

In [None]:
# PACF of the lag-2 difference (Close[t] - Close[t-2]) of the full training series.
plot_pacf(train.Close.diff(2).dropna(),lags=30)
plt.show()

In [None]:
# # #run only once or twice to get the best arima model parameters:

# # #Finding the best value for ARIMA
# import warnings
# import statsmodels.api as sm
# warnings.filterwarnings("ignore")

# import itertools
# p=q=range (0,8)
# d = range(0,3)
# pdq2 = list(itertools.product (p, d, q))

# store2 = {}
# for param in pdq2:
#     try:
#         model_arima = sm.tsa.arima.ARIMA (train.Close, order = param)
#         model_arima_fit = model_arima.fit()
#         store2[param] =  model_arima_fit.aic
#         #print(param, model_arima_fit.aic)
#     except:
#         continue

# sorted_dict2 = dict(sorted(store2.items(), key=lambda item: item[1]))
# print(sorted_dict2)
# # The Akaike information criterion (AIC) is an estimator of in-sample prediction error and thereby relative quality of
# # statistical models for a given set of data
# # It's like the mean squared error in Regression - The smaller the number, the better

In [None]:
# Final model: same (p, d, q) order, refitted on the complete training set. (4, 1, 2) is an alternative.
order2 = (5, 1, 6)
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Exogenous regressors supplied alongside the Close series.
full_exog_columns = [
    'Open', 'Volume',
    'Open_Lag_Diff1', 'Open_Lag_Diff2', 'Open_Lag_Diff3', 'Open_Lag_Diff4',
    'RollingMean_Open', 'RollingStd_Open', 'Volume_Change',
    'EMA_4_Open', 'EMA_12_Open', 'MACD_Open',
]

model2 = sm.tsa.statespace.SARIMAX(endog=train[['Close']], exog=train[full_exog_columns], order=order2)
model_fit2 = model2.fit()
model_fit2.summary()

In [None]:
# Forecast Close for the test period from the test-set exogenous features.
test_exog_columns = [
    'Open', 'Volume',
    'Open_Lag_Diff1', 'Open_Lag_Diff2', 'Open_Lag_Diff3', 'Open_Lag_Diff4',
    'RollingMean_Open', 'RollingStd_Open', 'Volume_Change',
    'EMA_4_Open', 'EMA_12_Open', 'MACD_Open',
]
forecast_start = len(train)
forecast_end = len(train) + len(test) - 1
predicted_test = model_fit2.predict(start=forecast_start, end=forecast_end, exog=test[test_exog_columns])
# Store the raw values so that predictions are matched to test rows by position.
test['Close'] = predicted_test.values
test.tail(10)

In [None]:
# Display the validation frame.
val_data

In [None]:
# Preview the test frame, which now carries the predicted 'Close' column.
test.head()

In [None]:
# Lagged differences of the (predicted) Close price in `test`: Close[t] - Close[t - k], k = 1..4.
for lag in (1, 2, 3, 4):
    test[f'Close_Lag_Diff{lag}'] = test['Close'].diff(lag)

# Rolling mean and standard deviation of Close over a 4-row window (test set).
window = 4
close_rolling = test['Close'].rolling(window=window)
test['RollingMean_Close'] = close_rolling.mean()
test['RollingStd_Close'] = close_rolling.std()

# MACD-style indicator for the test set: short-span EMA minus long-span EMA of Close.
test['EMA_4_Close'] = test['Close'].ewm(span=4, adjust=False).mean()
test['EMA_12_Close'] = test['Close'].ewm(span=12, adjust=False).mean()
test['MACD_Close'] = test['EMA_4_Close'] - test['EMA_12_Close']

In [None]:
# Lagged differences of the Close price in `train`: Close[t] - Close[t - k], k = 1..4.
for lag in (1, 2, 3, 4):
    train[f'Close_Lag_Diff{lag}'] = train['Close'].diff(lag)

# Rolling mean and standard deviation of Close over a 4-row window (training set).
window = 4
close_rolling = train['Close'].rolling(window=window)
train['RollingMean_Close'] = close_rolling.mean()
train['RollingStd_Close'] = close_rolling.std()

# MACD-style indicator for the training set: short-span EMA minus long-span EMA of Close.
train['EMA_4_Close'] = train['Close'].ewm(span=4, adjust=False).mean()
train['EMA_12_Close'] = train['Close'].ewm(span=12, adjust=False).mean()
train['MACD_Close'] = train['EMA_4_Close'] - train['EMA_12_Close']

In [None]:
# Close-based lag columns whose leading rows are NaN because of diff().
columns_to_impute2 = ['Close_Lag_Diff1', 'Close_Lag_Diff2', 'Close_Lag_Diff3', 'Close_Lag_Diff4']

# Fill the NaNs of each frame with that frame's own column means. The result is
# assigned back instead of calling `frame[col].fillna(..., inplace=True)`, because
# the chained in-place form works on a temporary object and is not guaranteed to
# update the frame.
for frame in (test, train):
    column_means = frame[columns_to_impute2].mean()
    frame[columns_to_impute2] = frame[columns_to_impute2].fillna(column_means)

In [None]:
#Prediction of strategy using random forest method : 
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier  # Importing XGBoost's classifier
#from sklearn.svm import SVC
#from sklearn.naive_bayes import GaussianNB

In [None]:
# Feature matrix for the Strategy classifier: raw prices/volume plus the engineered
# Open- and Close-based indicators.
feature_columns = [
    "Open", "Close", "Volume",
    'Open_Lag_Diff1', 'Open_Lag_Diff2', 'Open_Lag_Diff3', 'Open_Lag_Diff4',
    'RollingMean_Open', 'RollingStd_Open',
    'EMA_4_Open', 'EMA_12_Open', 'MACD_Open',
    'Close_Lag_Diff1', 'Close_Lag_Diff2', 'Close_Lag_Diff3', 'Close_Lag_Diff4',
    'RollingMean_Close', 'RollingStd_Close',
    'EMA_4_Close', 'EMA_12_Close', 'MACD_Close',
]
X = train[feature_columns]
# Target labels.
y_strategy = train["Strategy"]

In [None]:
# Hold out 20% of the rows for validation (random split with a fixed seed).
X_train, X_val, y_strategy_train, y_strategy_val = train_test_split(
    X, y_strategy, test_size=0.2, random_state=42
)

# Encode the string labels as integers; the encoder is fitted on the training labels only.
label_encoder = LabelEncoder()
label_encoder.fit(y_strategy_train)
y_strategy_train_encoded = label_encoder.transform(y_strategy_train)

# Apply the same mapping to the validation labels.
y_strategy_val_encoded = label_encoder.transform(y_strategy_val)

In [None]:
# Alternative tried earlier (kept for reference):
# strategy_model = RandomForestClassifier(n_estimators=450, random_state=42)
# strategy_model.fit(X_train, y_strategy_train)

# Initialize and train the LightGBM classifier on the integer-encoded labels.
strategy_model = LGBMClassifier(
    learning_rate=0.5,
    n_estimators=300,
    max_depth=5,
    min_child_weight=3,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='multiclass',
    class_weight='balanced',
    random_state=42
)
strategy_model.fit(X_train, y_strategy_train_encoded)

# Other alternatives tried earlier (kept for reference):
#strategy_model = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=0)
#strategy_model.fit(X_train, y_strategy_train)

#strategy_model = GradientBoostingClassifier(n_estimators=100, random_state=0)
#strategy_model.fit(X_train, y_strategy_train)

#strategy_model = SVC(kernel='linear', random_state=0)
#strategy_model.fit(X_train, y_strategy_train)

#strategy_model = GaussianNB()
#strategy_model.fit(X_train, y_strategy_train)

# Make predictions for 'Strategy' on the validation set (integer-encoded classes).
y_strategy_val_pred = strategy_model.predict(X_val)

In [None]:
# Validation accuracy, computed on the integer-encoded labels.
accuracy = accuracy_score(y_strategy_val_encoded, y_strategy_val_pred)
print("Accuracy for Strategy:", accuracy)

# Final 'Strategy' predictions for the test set, using the same features as in training.
test_feature_columns = [
    "Open", "Close", "Volume",
    'Open_Lag_Diff1', 'Open_Lag_Diff2', 'Open_Lag_Diff3', 'Open_Lag_Diff4',
    'RollingMean_Open', 'RollingStd_Open',
    'EMA_4_Open', 'EMA_12_Open', 'MACD_Open',
    'Close_Lag_Diff1', 'Close_Lag_Diff2', 'Close_Lag_Diff3', 'Close_Lag_Diff4',
    'RollingMean_Close', 'RollingStd_Close',
    'EMA_4_Close', 'EMA_12_Close', 'MACD_Close',
]
X_test = test[test_feature_columns]
test_strategy_predictions = strategy_model.predict(X_test)

In [None]:
# Inspect up to the first 100 rows of the test frame with all engineered features.
test.head(100)

In [None]:
# Predicted strategy classes for the test set (still integer-encoded).
print(test_strategy_predictions)

In [None]:
# Create a Submission DataFrame
submission = pd.DataFrame()
submission["Date"] = test["Date"]
submission["Close"] = predicted_test.values
submission["Strategy"] = test_strategy_predictions

In [None]:
# Map encoded class ids back to strategy labels.
# NOTE(review): assumes the LabelEncoder assigned 0/1/2 to 'Buy'/'Hold'/'Sell' - confirm
# against label_encoder.classes_.
dicti = {0: 'Buy', 1: 'Hold', 2: 'Sell'}

In [None]:
# Replace the encoded class ids with their labels in one vectorised step. The previous
# row-by-row `submission['Strategy'][i] = ...` was chained assignment and used integer
# positions as keys on a date-indexed Series. Values that are not keys of `dicti`
# (e.g. labels already decoded on a re-run) are left unchanged.
submission['Strategy'] = submission['Strategy'].map(lambda code: dicti.get(code, code))

In [None]:
# Preview the first 10 rows of the submission.
submission.head(10)

In [None]:
# Sequential row ids starting at 0, sized from the submission itself instead of a
# hard-coded 100 so that the cell works for any test-set length.
numberIdxs = list(range(len(submission)))
submission['id'] = numberIdxs

In [None]:
# Move the last column (the freshly added 'id') to the front, keeping the others in order.
last_column_name = submission.columns[-1]
remaining_columns = list(filter(lambda name: name != last_column_name, submission.columns))
new_order = [last_column_name] + remaining_columns

# Re-select the columns in the new order and rebind `submission` to the result.
new_df = submission[new_order]
submission = new_df

In [None]:
# Write the submission file; index=False leaves the date index out of the CSV.
submission.to_csv('submission.csv', index=False)