In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read() call in the download loop.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<percent-encoded download URL>" entries (a single entry here).
# NOTE(review): the URL carries a time-limited signature (X-Goog-Date=20240525T105912Z,
# X-Goog-Expires=259200), so the download presumably stops working once that window has passed.
DATA_SOURCE_MAPPING = 'ue21cs342aa2:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F62334%2F6834706%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240525%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240525T105912Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D878cd5c7282670972e92595b8a5e51f51b8d4cdc3e624a2bb05af44d2d7a1e5f4e4c88857bbff1c0800e9aea6781c7d6021d08d0e5686212e6f4217a3e36e942f0dbc856f84c696fe2644211c943a03688e5881f679fe9612cf9d5a14fb8b110259e84eed9b692d27aca9bd49b538fe5cefe2f8058d8efae34a7c3ed150b7b1d2bca3de07a632e246d6e9c458dbc04acfad60ab8907cb94db3b44c6f6778d02f73a918f2b9fc0044144d6bb489f8a397889a08d287374b2a83e2fa49b71bc617f86c176e8c60944c4bf611cc37e9d5114467e50d4bb67b75105ac14ee73cb3d85b33d0f8eb83d038cdf9327c585b20d58d1b585f25cf28da33fdeb337b5a35d4'

# Directories created below; downloaded archives are extracted under KAGGLE_INPUT_PATH.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# Not referenced by any of the visible code.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source into a temporary file and unpack it
# under KAGGLE_INPUT_PATH/<directory>. Failures are reported and skipped so the
# remaining sources are still attempted.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<percent-encoded URL>"; split only on the first
    # ':' so an unexpected extra colon cannot break the unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            # The header may be absent (e.g. chunked transfer); fall back to 0
            # and simply skip drawing the progress bar in that case.
            total_bytes = int(total_length) if total_length else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            # read() returns b'' at end of stream, which terminates the iterator.
            for data in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / total_bytes) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # The tar branch reopens the file by name, so buffered bytes must
            # be on disk before extraction starts.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here would
                # replace the imported module with the opened archive object.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it has to be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and print the full path of every file found in it.
for folder, _subdirs, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import xgboost as xgb
from statsmodels.tsa.arima.model import ARIMA

# Load the training data and parse its dates
train_data = pd.read_csv("/kaggle/input/ue21cs342aa2/train.csv")
train_data['Date'] = pd.to_datetime(train_data['Date'])

# Smoothing factor: weight of the current Open against the running forecast
alpha = 0.25

# Seed the recursive forecast with the LAST observed training close
forecast = train_data["Close"].iloc[-1]

# Load the test data and parse its dates
test_data = pd.read_csv("/kaggle/input/ue21cs342aa2/test.csv")
test_data['Date'] = pd.to_datetime(test_data['Date'])

# Fit ARIMA once, outside the loop. The model only ever sees
# train_data["Close"], which does not change between test rows, so its
# one-step-ahead forecast is the same value for every row; refitting it per
# row only multiplied the runtime by len(test_data).
arima_model = ARIMA(train_data["Close"], order=(5, 1, 7))  # You can set the order according to your data
arima_results = arima_model.fit()
arima_forecast = arima_results.forecast(steps=1)
arima_close = arima_forecast.iat[0]

predicted_closes = []
for open_price in test_data["Open"]:
    # Exponential smoothing step driven by the day's Open
    close_value = alpha * open_price + (1 - alpha) * forecast
    forecast = close_value  # Update the forecast for the next period

    # Use the ARIMA prediction only if it is within 1.0 of the smoothed value
    if np.abs(arima_close - close_value) < 1.0:
        predicted_close = arima_close
    else:
        predicted_close = close_value

    predicted_closes.append(predicted_close)

# Add the predicted close values to the test data and echo them
test_data["Close"] = predicted_closes
for predicted_close in predicted_closes:
    print(predicted_close)

In [None]:
# Display the last 10 rows of the training frame.
train_data.tail(10)

In [None]:
# Display the first 10 rows of the test frame.
test_data.head(10)

In [None]:
# Pre-create the engineered feature columns on both frames, filled with the
# integer 0, in the same order on each frame.
for feature_name in ('p_change', 'close-1', 'close-1%', 'volume-1', 'volume-1%', 'logv'):
    train_data[feature_name] = 0
    test_data[feature_name] = 0

In [None]:
# Print the column index of the training frame, then of the test frame.
for frame in (train_data, test_data):
    print(frame.columns)


In [None]:
import warnings
import math
warnings.filterwarnings("ignore")


new_features = ['p_change', 'close-1', 'close-1%', 'volume-1', 'volume-1%','logv']


def _add_price_volume_features(frame):
    """Fill the price/volume feature columns of ``frame`` from its own data.

    Reads the ``Open``, ``Close``, ``Volume`` and ``Date`` columns and writes,
    in place, the columns listed in ``new_features`` plus ``year`` and
    ``month``. All engineered columns are float-typed.

    - ``p_change``: Close minus Open of the same row.
    - ``close-1`` / ``close-1%``: absolute / percentage change of Close
      against the previous row; 0 for the first row.
    - ``volume-1`` / ``volume-1%``: same for Volume, but only where both the
      current and the previous Volume are non-zero; 0 otherwise.
    - ``logv``: natural log of Volume; 0 where Volume is not positive, so a
      zero-volume row does not raise.

    Returns the same frame for convenience.
    """
    close = frame['Close'].astype('float')
    open_price = frame['Open'].astype('float')
    volume = frame['Volume'].astype('float')
    prev_close = close.shift(1)
    prev_volume = volume.shift(1)
    # The first row has no predecessor, so its "change" features stay at 0.
    has_prev = pd.Series(np.arange(len(frame)) > 0, index=frame.index)

    # Whole-column assignment instead of df[col][k] = value: chained item
    # assignment is not guaranteed to write through to the frame.
    frame['p_change'] = close - open_price
    frame['close-1'] = (close - prev_close).where(has_prev, 0.0)
    frame['close-1%'] = ((close / prev_close) * 100 - 100).where(has_prev, 0.0)

    both_traded = has_prev & (volume != 0) & (prev_volume != 0)
    frame['volume-1'] = (volume - prev_volume).where(both_traded, 0.0)
    frame['volume-1%'] = ((volume / prev_volume) * 100 - 100).where(both_traded, 0.0)

    frame['logv'] = np.log(volume.where(volume > 0)).fillna(0.0)

    # Calendar features come from the frame's OWN dates.
    frame['Date'] = pd.to_datetime(frame['Date'])
    frame['year'] = frame['Date'].dt.year
    frame['month'] = frame['Date'].dt.month
    return frame


# Each frame is processed independently so that test features are never
# derived from training rows and vice versa.
for frame in (test_data, train_data):
    _add_price_volume_features(frame)
    assert set(new_features) <= set(frame.columns), "missing engineered columns"




In [None]:
import pandas as pd
import numpy as np
import warnings
from statsmodels.tsa.seasonal import seasonal_decompose

warnings.filterwarnings("ignore")

# Define the lag order for lagged values and rolling window for rolling means
lag_order = 3
rolling_window = 7  # Choose an appropriate window size


def _add_time_series_features(frame):
    """Add lag, rolling and seasonal-decomposition features of ``Close`` in place.

    Columns written: ``lag_1`` .. ``lag_<lag_order>``, ``rolling_mean``
    (window ``rolling_window``), ``close_diff``, ``50_day_MA`` and the
    ``trend`` / ``seasonality`` / ``residuals`` components of a multiplicative
    decomposition with period 12. Missing values produced by the shifts and
    windows are then back-filled and, for gaps at the end of the frame,
    forward-filled.

    Returns the same frame for convenience.
    """
    close = frame['Close']

    # Lagged values
    for lag in range(1, lag_order + 1):
        frame[f'lag_{lag}'] = close.shift(lag)

    # Rolling mean, first difference and 50-row moving average
    frame['rolling_mean'] = close.rolling(window=rolling_window).mean()
    frame['close_diff'] = close.diff()
    frame['50_day_MA'] = close.rolling(window=50).mean()

    # Decompose data into trend, seasonality, and residuals
    decomposition = seasonal_decompose(close, model='multiplicative', period=12)  # Period depends on your data
    frame['trend'] = decomposition.trend
    frame['seasonality'] = decomposition.seasonal
    frame['residuals'] = decomposition.resid

    # bfill()/ffill() replace the deprecated fillna(method=...). Back-fill
    # covers the leading gaps; the forward-fill covers the trailing gaps that
    # a back-fill alone cannot reach.
    frame.bfill(inplace=True)
    frame.ffill(inplace=True)
    return frame


# Same feature pipeline for both frames so their columns stay aligned.
for frame in (train_data, test_data):
    _add_time_series_features(frame)

# Print the first few rows of the updated datasets to check the new features
print(train_data.head())
print(test_data.head())


In [None]:
# import pandas as pd
# import numpy as np
# import warnings
# from statsmodels.tsa.seasonal import seasonal_decompose

# warnings.filterwarnings("ignore")

# # Load your dataset, assuming it's already loaded into 'test_data'

# # Define the lag order for lagged values and rolling window for rolling means
# lag_order = 3
# rolling_window = 7  # Choose an appropriate window size

# # Lagged values
# for lag in range(1, lag_order + 1):
#     test_data[f'lag_{lag}'] = test_data['Close'].shift(lag)

# # Rolling means
# test_data['rolling_mean'] = test_data['Close'].rolling(window=rolling_window).mean()

# # Calculate differenced values
# test_data['close_diff'] = test_data['Close'].diff()

# # Calculate moving averages
# test_data['50_day_MA'] = test_data['Close'].rolling(window=50).mean()
# test_data['200_day_MA'] = test_data['Close'].rolling(window=200).mean()

# # Decompose data into trend, seasonality, and residuals
# result = seasonal_decompose(test_data['Close'], model='additive', period=52)  # Period depends on your data
# test_data['trend'] = result.trend
# test_data['seasonality'] = result.seasonal
# test_data['residuals'] = result.resid

# # Calculate residuals from other models if available
# # For example, if you have a SARIMA model, calculate residuals as follows:
# # model = sm.tsa.statespace.SARIMAX(endog=test_data['Close'], order=(2, 1, 3))
# # model_fit = model.fit()
# # test_data['sarima_residuals'] = model_fit.resid

# # Handling missing values (e.g., forward-fill or backward-fill)
# test_data.fillna(method='bfill', inplace=True)

# # Error handling for unexpected errors
# try:
#     # Your code that may raise exceptions
#     pass
# except Exception as e:
#     # Handle the exception gracefully
#     print(f"An error occurred: {str(e)}")

# # Continue with the rest of your data preprocessing steps

# # Print or display the first few rows of the updated dataset to check the new features
# print(test_data.head())


In [None]:
# Display the first 10 rows of the training frame.
train_data.head(10)

In [None]:
# Display the last 10 rows of the test frame.
test_data.tail(10)

In [None]:
# Display the column index of the training frame.
train_data.columns

In [None]:
# Count the missing values in each column of the test frame.
test_data.isnull().sum()

In [None]:
# Linearly interpolate remaining gaps in the decomposition columns. The result
# is assigned back to the column: calling interpolate(inplace=True) on a
# column selection is not guaranteed to update the parent frame. The test
# frame gets the same treatment so both frames are prepared consistently.
for frame in (train_data, test_data):
    for column in ('trend', 'residuals'):
        frame[column] = frame[column].interpolate(method='linear')


In [None]:
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Threshold rule on the test Close: above threshold_buy -> "Buy", below
# threshold_sell -> "Sell", otherwise "Hold".
# NOTE(review): both thresholds are 0.0, so any positive Close maps to "Buy" — confirm intent.
threshold_buy = 0.0
threshold_sell = 0.0
test_close = test_data["Close"]
test_data["Strategy"] = np.where(
    test_close > threshold_buy,
    "Buy",
    np.where(test_close < threshold_sell, "Sell", "Hold"),
)

# Encode the training labels as integers for the classifier.
label_encoder = LabelEncoder()
train_data["Strategy_Label"] = label_encoder.fit_transform(train_data["Strategy"])

# Feature matrix and target for the XGBoost classifier.
feature_columns = ['Open', 'seasonality', 'residuals', 'month']
X = train_data[feature_columns]
y = train_data["Strategy_Label"]

# Hold out 20% of the training rows for validation.
# NOTE(review): train_test_split shuffles by default, so validation rows are
# interleaved in time with training rows — confirm that is acceptable here.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the classifier on the training part only.
model = xgb.XGBClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Score the held-out rows.
predicted_strategy = model.predict(X_val)
accuracy = accuracy_score(y_val, predicted_strategy)
print("Model accuracy is:", accuracy)

In [None]:
# Run the fitted classifier on the test features and turn the integer class
# ids back into the original strategy labels.
test_features = test_data[['Open', 'seasonality', 'residuals', 'month']]
encoded_predictions = model.predict(test_features)
predicted_strategy = label_encoder.inverse_transform(encoded_predictions)

# Store the predicted labels on the test frame.
test_data["Strategy"] = predicted_strategy

# Keep only the submission columns and write them to "submission.csv".
submission_columns = ['id', "Date", "Close", "Strategy"]
submission_df = test_data[submission_columns]
submission_df.to_csv("submission.csv", index=False)