In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.holtwinters import ExponentialSmoothing, SimpleExpSmoothing


# Load the training data and key it by the 'Date' column (the column is
# moved into the index, not kept as a regular column).
train_df = pd.read_csv(r"/kaggle/input/ue21cs342aa2/train.csv").set_index('Date')
train_df.head()

In [None]:
# Feature columns plus the Close and Strategy target series.
X = train_df.loc[:, ['Open', 'Volume']]
y_close, y_strategy = train_df['Close'], train_df['Strategy']

In [None]:
# plt.figure(figsize = (10, 5))
# plt.plot(diff['dif'])
# plt.title("Differencing price")
# plt.show()

In [None]:
# Line plot of the raw Close series.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(train_df['Close'])
ax.set_title("Close price")
plt.show()

In [None]:
# Decompose Close with a 7-observation period under both the additive and
# the multiplicative model.
result_add, result_mul = (
    seasonal_decompose(train_df['Close'], model=kind, period=7)
    for kind in ('additive', 'multiplicative')
)


In [None]:
# Enlarge the default figure size (global rcParams change), then draw the
# multiplicative decomposition followed by the additive one.
plt.rcParams['figure.figsize'] = (20, 10)
for decomposition in (result_mul, result_add):
    decomposition.plot()
plt.show()

In [None]:
# plt.rcParams.update({'figure.figsize': (20, 10)})
# result_mul.resid.plot(label = 'multiplicative')
# result_add.resid.plot(label = 'additive')
# plt.legend()
# plt.show()

In [None]:
# Side-by-side PACF (left) and ACF (right) of the raw Close series.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(8, 3))
pacf_ax, acf_ax = axes
plot_pacf(train_df['Close'], ax=pacf_ax, title="PACF Plot")
plot_acf(train_df['Close'], ax=acf_ax, title="ACF Plot")
plt.show()

In [None]:
from statsmodels.tsa.stattools import adfuller
# Augmented Dickey-Fuller test on the raw Close series.
time_series = train_df['Close']
result = adfuller(time_series)

# The first two entries of the result are the test statistic and the p-value.
test_statistic, p_value = result[0], result[1]

print(f'Test Statistic: {test_statistic}')
print(f'p-value: {p_value}')

# Compare the p-value against the 5% significance level.
print(
    "Reject the null hypothesis. The time series is likely stationary."
    if p_value <= 0.05
    else "Fail to reject the null hypothesis. The time series may be non-stationary."
)

In [None]:
# Hold out the final 60 rows as the test window and train on everything
# before them.
#  - The test slice runs to the end of the frame; the previous upper bound
#    of len(train_df) - 1 silently dropped the last observation.
#  - Both pieces are explicit copies so that columns can be added to them
#    later without pandas chained-assignment warnings and without touching
#    train_df.
holdout_rows = 60
split_at = len(train_df) - holdout_rows
train = train_df.iloc[:split_at].copy()
test = train_df.iloc[split_at:].copy()

In [None]:
# Add the first difference of Close and drop the row(s) where it is undefined.
#  - .assign builds a new frame instead of writing a column onto a slice of
#    train_df, which avoids the chained-assignment warning.
#  - dropna is restricted to 'close_diff' so that NaNs in unrelated columns
#    (e.g. a 'forecast' column added to train_df by a later cell in a
#    previous run) cannot wipe out otherwise valid rows.
# Note: re-running this cell without re-running the split cell trims one
# more leading row each time, because the first difference is always NaN.
train = (
    train
    .assign(close_diff=train['Close'].diff(1))
    .dropna(subset=['close_diff'])
)
train

In [None]:
# Side-by-side PACF (left) and ACF (right) of the differenced Close series.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(8, 3))
pacf_ax, acf_ax = axes
plot_pacf(train['close_diff'], ax=pacf_ax, title="PACF Plot")
plot_acf(train['close_diff'], ax=acf_ax, title="ACF Plot")
plt.show()

In [None]:
# fig, axes = plt.subplots(1, 2, figsize=(8, 3))
# plot_pacf(train_df['Close_diff'],ax = axes[0], title = "PACF Plot")
# plot_acf(train_df['Close_diff'], ax = axes[1], title = "ACF Plot")
# plt.show()

In [None]:
from statsmodels.tsa.stattools import adfuller
# Augmented Dickey-Fuller test on the first-differenced Close series.
time_series = train['close_diff']
result = adfuller(time_series)

# The first two entries of the result are the test statistic and the p-value.
test_statistic, p_value = result[0], result[1]

print(f'Test Statistic: {test_statistic}')
print(f'p-value: {p_value}')

# Compare the p-value against the 5% significance level.
print(
    "Reject the null hypothesis. The time series is likely stationary."
    if p_value <= 0.05
    else "Fail to reject the null hypothesis. The time series may be non-stationary."
)

In [None]:
# train_df.shape

In [None]:
# Fit ARIMA(1, 1, 1) on the full Close series.
model = ARIMA(train_df['Close'], order=(1, 1, 1))
model_fit = model.fit()

# A bare `model_fit.summary()` in the middle of a cell is evaluated and then
# discarded (only a cell's last expression is displayed), so print it.
print(model_fit.summary())

# Dynamic prediction over positions 229..299, stored next to the actual
# values in train_df and plotted against them.
train_df['forecast'] = model_fit.predict(start=229, end=299, dynamic=True)
train_df[['Close', 'forecast']].plot(figsize=(12, 8))

In [None]:
# Fit a seasonal ARIMA (1, 1, 1) x (1, 1, 1, 7) on the full Close series.
model2 = SARIMAX(train_df['Close'], order=(1, 1, 1), seasonal_order=(1, 1, 1, 7))
results = model2.fit()

# Predict with the SARIMAX fit (`results`). The previous code called
# `model_fit.predict`, i.e. the ARIMA model from the preceding cell, so the
# seasonal model was fitted but never actually used.
# The end position is clamped to the last in-sample row: the predictions are
# stored as a column of train_df, which cannot hold out-of-sample steps.
forecast_end = min(300, len(train_df) - 1)
train_df['forecast'] = results.predict(start=229, end=forecast_end, dynamic=True)
train_df[['Close', 'forecast']].plot(figsize=(12, 8))

In [None]:
# Simple exponential smoothing over the Close values sliced as [229:300],
# with two fixed (non-optimised) smoothing levels.
data = train_df['Close'][229:300]

fit1, fit2 = (
    SimpleExpSmoothing(data).fit(smoothing_level=alpha, optimized=False)
    for alpha in (0.95, 0.8)
)

# Plot the actual values against the 0.95 fit; fit2 is computed but not drawn.
x_labels = list(data.index)
fig, ax = plt.subplots(figsize=(18, 8))
ax.plot(x_labels, data.values, marker='o', color="black", label='Actual Data')
ax.plot(x_labels, fit1.fittedvalues, marker="o", color="b", label='Fitted (Smoothing Level = 0.95)')
plt.xticks(rotation="vertical")
ax.legend()
plt.show()


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Predict 'Strategy' from Open / Volume / Close with a random forest and
# report accuracy on a random 20% validation split.
feature_columns = ['Open', 'Volume', 'Close']
X, y = train_df[feature_columns], train_df['Strategy']

X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, test_size=0.2, random_state=42
)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_validation)
accuracy = accuracy_score(y_validation, y_pred)
print("Accuracy = ", accuracy)

In [None]:
# Display the training frame; by this point it also carries the 'forecast'
# column written by the ARIMA / SARIMAX cells above.
train_df

In [None]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from itertools import product
from sklearn.metrics import mean_squared_error
from tqdm import tqdm

# Grid search over ARIMA(p, d, q) orders, scored by RMSE on a chronological
# 80/20 split of the Close series.
target_col = 'Close'

data = train_df[target_col]

# First 80% of the rows (by position) for fitting, the rest for scoring.
train_size = int(len(data) * 0.8)
train_data, test_data = data.iloc[:train_size], data.iloc[train_size:]

p_values = range(0, 3)
d_values = range(0, 2)
q_values = range(0, 3)

best_rmse = np.inf
best_order_rmse = None
failed_orders = []  # (order, exception) for every candidate that could not be scored

candidate_orders = list(product(p_values, d_values, q_values))
for p, d, q in tqdm(candidate_orders, desc='Progress', position=0, leave=True):
    order = (p, d, q)
    try:
        model = ARIMA(train_data, order=order)
        results = model.fit()
        predictions = results.forecast(steps=len(test_data))
        rmse = np.sqrt(mean_squared_error(test_data, predictions))
    except Exception as exc:
        # Catch only ordinary errors (a bare `except:` would also swallow
        # KeyboardInterrupt / SystemExit) and remember what failed instead
        # of skipping it silently.
        failed_orders.append((order, exc))
        continue
    if rmse < best_rmse:
        best_rmse = rmse
        best_order_rmse = order

print(f'Best RMSE: {best_rmse}')
print(f'Best Order (p, d, q) using RMSE: {best_order_rmse}')
if failed_orders:
    print(f'Skipped {len(failed_orders)} order(s) that failed to fit or score:')
    for failed_order, error in failed_orders:
        print(f'  {failed_order}: {type(error).__name__}: {error}')


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX

# SARIMAX on Close with Open as the exogenous regressor, evaluated on a
# chronological 80/20 split.
target_col = 'Close'
feature_col = 'Open'

data = train_df[[target_col, feature_col]]

# First 80% of the rows (by position) for fitting, the rest for evaluation.
train_size = int(len(data) * 0.8)
train_data = data.iloc[:train_size]
test_data = data.iloc[train_size:]

order = (0, 0, 3)
seasonal_order = (0, 0, 3, 7)
model = SARIMAX(
    endog=train_data[target_col],
    exog=train_data[feature_col],
    order=order,
    seasonal_order=seasonal_order,
)
model_fit = model.fit()

# One forecast step per held-out row, driven by the held-out Open values.
forecast = model_fit.forecast(steps=len(test_data), exog=test_data[feature_col])

# True vs. predicted Close over the held-out window.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(test_data.index, test_data[target_col], label='True')
ax.plot(test_data.index, forecast, label='Predicted', color='red')
ax.legend()
ax.set_title('Stock Price Prediction with SARIMAX')
ax.set_xlabel('Date')
ax.set_ylabel('Stock Price')
plt.show()


In [None]:
# Strip the pandas indexes: go through plain lists and back to numpy arrays
# so the two series are compared by position.
true_values = np.array(test_data[target_col].tolist())
predicted_values = np.array(forecast.tolist())

# Calculate SMAPE
def calculate_smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error (SMAPE) in percent.

    Computes ``mean(2 * |y_pred - y_true| / (|y_pred| + |y_true| + eps)) * 100``
    where ``eps`` is a small constant that keeps the ratio defined when both
    values of a pair are zero.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, paired with ``y_true`` by position.

    Returns
    -------
    float
        SMAPE as a percentage.
    """
    # Coerce to plain float arrays so that lists are accepted and pandas
    # objects are compared by position instead of being aligned on their
    # index labels (which yields NaN when the two indexes differ).
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    epsilon = 1e-10  # Small constant to avoid division by zero
    abs_error = np.abs(y_pred - y_true)
    scale = np.abs(y_pred) + np.abs(y_true) + epsilon
    return np.mean(2 * abs_error / scale) * 100

# Score the held-out forecast and report it with two decimals.
smape = calculate_smape(y_true=true_values, y_pred=predicted_values)
print(f'SMAPE: {smape:.2f}%')


In [None]:
# test_df = pd.read_csv(r'/kaggle/input/ue21cs342aa2/test.csv')
# test_df.head()

In [None]:
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.ensemble import RandomForestClassifier

# Column roles for the SARIMAX model: Close is predicted, Open is the
# exogenous regressor.
target_col = 'Close'
feature_col = 'Open'

# Competition test set.
test_df = pd.read_csv(r'/kaggle/input/ue21cs342aa2/test.csv')

# Fit SARIMAX on the whole training frame.
train_data = train_df[[target_col, feature_col]]
order = (0, 0, 3)
seasonal_order = (0, 0, 3, 7)
model = SARIMAX(
    endog=train_data[target_col],
    exog=train_data[feature_col],
    order=order,
    seasonal_order=seasonal_order,
)
model_fit = model.fit()

# Forecast one Close value per test row from the test Open values and store
# the result on test_df so it can serve as a classifier feature.
test_data = test_df[[feature_col]]
forecast = model_fit.forecast(steps=len(test_data), exog=test_data)
test_df['Close'] = forecast.values

# Fit the Strategy classifier on the full training frame.
X_train, y_train = train_df[['Open', 'Volume', 'Close']], train_df['Strategy']
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predict Strategy for the test rows (using the forecast Close as a feature).
X_test = test_df[['Open', 'Volume', 'Close']]
strategy_predictions = clf.predict(X_test)

# Assemble the submission: id, Date, Strategy, Close.
submission_df = test_df[['id', 'Date']].assign(
    Strategy=strategy_predictions,
    Close=forecast.values,
)

# Write the submission without the index column.
submission_df.to_csv('submission.csv', index=False)
