# Requirements

In [2]:
import pandas as pd
import numpy as np
from sklearn import metrics
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [3]:
# Add as many imports as you need.

## Introduction
In this laboratory assignment, the focus is on time series forecasting, specifically targeting the prediction of the current **close price** for Bitcoin. To accomplish this, you will use data from the preceding 7 days, and past statistics. 


## The Bitcoin (BTC/USD) Daily Price Dataset

The dataset comprises the following columns:
- date - the date of the recorded price
- symbol - the resource for prediction
- open - the open price of BTC
- high - the high price of BTC
- low - the low price of BTC
- Volume BTC - the volume of trades, denominated in BTC
- Volume USD - the volume of trades, denominated in USD
- close - the close price of BTC

Target:
close

Load the dataset into a `pandas` data frame.

In [8]:
# Write your code here. Add as many boxes as you need.
# Read the daily BTC/USD price table from the CSV that sits next to the notebook
# and show its first five rows.
data = pd.read_csv(filepath_or_buffer='BTC-Daily.csv')
data.head(n=5)

Unnamed: 0,date,symbol,open,high,low,close,Volume BTC,Volume USD
0,3/1/2022 0:00,BTC/USD,43221.71,43626.49,43185.48,43185.48,49.006289,2116360.0
1,2/28/2022 0:00,BTC/USD,37717.1,44256.08,37468.99,43178.98,3160.61807,136472300.0
2,2/27/2022 0:00,BTC/USD,39146.66,39886.92,37015.74,37712.68,1701.817043,64180080.0
3,2/26/2022 0:00,BTC/USD,39242.64,40330.99,38600.0,39146.66,912.724087,35730100.0
4,2/25/2022 0:00,BTC/USD,38360.93,39727.97,38027.61,39231.64,2202.851827,86421490.0


In [9]:
# Per-column count of missing values (`isna` is the canonical name of `isnull`).
data.isna().sum()

date          0
symbol        0
open          0
high          0
low           0
close         0
Volume BTC    0
Volume USD    0
dtype: int64

In [10]:
# Print column names, non-null counts and dtypes of the freshly loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2651 entries, 0 to 2650
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   date        2651 non-null   object 
 1   symbol      2651 non-null   object 
 2   open        2651 non-null   float64
 3   high        2651 non-null   float64
 4   low         2651 non-null   float64
 5   close       2651 non-null   float64
 6   Volume BTC  2651 non-null   float64
 7   Volume USD  2651 non-null   float64
dtypes: float64(6), object(2)
memory usage: 165.8+ KB


Explore the dataset using visualizations of your choice.

In [12]:
import matplotlib.pyplot as plt

KeyboardInterrupt: 

In [None]:
# Write your code here. Add as many boxes as you need.
# One histogram per numeric column; keep the returned axes in a variable so the
# cell does not echo the array repr, then render the figure.
histogram_axes = data.hist()
plt.show()

In [None]:
# Preview the first rows before `symbol` is label-encoded in the next cell.
data.head()

In [None]:
# Replace the textual `symbol` column with integer codes.
# The fitted encoder stays available as `label`.
label = LabelEncoder()
label.fit(data['symbol'])
data['symbol'] = label.transform(data['symbol'])

In [None]:
# Preview the frame again now that `symbol` holds integer codes.
data.head()

In [None]:
# Turn `date` into a DatetimeIndex and put the rows in chronological order.
#
# The CSV lists the newest day first (its first rows are 3/1/2022, 2/28/2022, ...).
# Without sorting, `shift(k)` on this frame would pull values from k days in the
# *future* (look-ahead leakage), and an unshuffled 80:20 split would test on the
# oldest rows instead of the most recent ones.
#
# The `date` column check makes the cell safe to re-run after the index is set.
if 'date' in data.columns:
    data = data.assign(date=pd.to_datetime(data['date'])).set_index('date')
data = data.sort_index()
data.info()

In [None]:
# Preview the frame after `date` became the index.
data.head()

# Feature Extraction
Select the relevant features for prediction and apply a lag of up to 7 days to each chosen feature

Hint: Use `df['column_name'].shift(period)`. Check the documentation at https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.shift.html.

In [None]:
# Write your code here. Add as many boxes as you need.
# Lagged predictors: for every base column and every lag of 1..7 rows, add a
# column holding that base column shifted down by `lag` rows.
# Keys are the prefixes used in the new column names, values the source columns.
lag_sources = {
    'open': 'open',
    'high': 'high',
    'low': 'low',
    'volumeBTC': 'Volume BTC',
    'volumeUSD': 'Volume USD',
}

for lag in range(1, 8):  # lag between 1 day and 7 days
    for prefix, column in lag_sources.items():
        data[f'{prefix}_lag_{lag}'] = data[column].shift(lag)

# Shifting leaves NaN in the rows that lack a full 7-row history: discard those
# rows, then remove the un-lagged base columns so only lagged values (plus
# `symbol` and the `close` target) remain.
data = data.dropna(axis=0).drop(columns=list(lag_sources.values()))
data.sample(5)

## Dataset Splitting
Partition the dataset into training and testing sets with an 80:20 ratio.

**WARNING: DO NOT SHUFFLE THE DATASET.**



In [None]:
# Write your code here. Add as many boxes as you need.
# Separate the prediction target (`close`) from the predictor columns.
# NOTE(review): `input` shadows the Python builtin of the same name; the name is
# kept because the following cells refer to it.
target = data['close']
input = data.drop(columns='close')

In [None]:
# Preview the feature matrix (every column of `data` except `close`).
input.head()

In [None]:
# 80:20 partition in row order: with shuffle=False the first 80% of the rows
# become the training set and the remaining 20% the test set.
split_parts = train_test_split(input, target, test_size=0.2, shuffle=False)
X_train, X_test, Y_train, Y_test = split_parts

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
#scaler = MinMaxScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)

In [None]:
#columns_to_keep = (X_train.shape[1] // lag) * lag
#X_train = X_train[:, :columns_to_keep]
#X_train = X_train.reshape(X_train.shape[0], lag, (X_train.shape[1] // lag))


In [None]:
#columns_to_keep = (X_test.shape[1] // lag) * lag
#X_test = X_test[:, :columns_to_keep] 
#X_test = X_test.reshape(X_test.shape[0], lag, (X_test.shape[1] // lag))


In [None]:
import numpy as np

In [None]:
# Convert all four splits to plain NumPy arrays in one pass.
X_train, Y_train, X_test, Y_test = (
    np.array(split) for split in (X_train, Y_train, X_test, Y_test)
)

In [None]:
# Give the training features the 3-D layout (samples, timesteps=1, features)
# that the LSTM input layer expects.
# Using -1 for the last axis keeps the cell idempotent: re-running it on the
# already reshaped array is a no-op instead of raising a size-mismatch error.
X_train = np.reshape(X_train, (X_train.shape[0], 1, -1))

## Neural Networks

Create an LSTM model and train it using the `train` function.

In [None]:
from keras.models import Sequential
from keras.layers import Input, LSTM, Dense

In [None]:
#model = Sequential([
    #Input(shape=(lag, X_train.shape[2])), # (timesteps, features)
    #LSTM(64, activation="relu", return_sequences=True),
    #LSTM(32, activation="relu"),
    #Dense(1, activation="linear")
#])

# Two stacked LSTM layers followed by a single linear output unit.
# The input shape (timesteps, features) is read from the reshaped training array.
model = Sequential([
    Input(shape=X_train.shape[1:]),
    LSTM(100, activation='relu', return_sequences=True),  # emits the full sequence for the next LSTM
    LSTM(64, activation='relu'),
    Dense(1, activation='linear'),
])
          

In [None]:
# Print the layer-by-layer summary of the network built above.
model.summary()

In [None]:
# Configure training for a regression target: minimise mean squared error with
# Adam and track mean absolute error as a readable metric.
# ("accuracy" is a classification metric and carries no meaning for a
# continuous price target, so it is not reported.)
model.compile(
    loss="mse",
    optimizer="adam",
    metrics=["mae"],
)

In [None]:
# Train for 64 epochs in mini-batches of 8, holding out 20% of the training
# data for validation. The per-epoch log is kept in `history`.
fit_options = dict(validation_split=0.2, epochs=64, batch_size=8)
history = model.fit(X_train, Y_train, **fit_options)

Use the trained model to make predictions for the test set.

In [None]:
# Give the test features the same 3-D layout (samples, timesteps=1, features)
# as the training features.
# Using -1 for the last axis keeps the cell idempotent: re-running it on the
# already reshaped array is a no-op instead of raising a size-mismatch error.
X_test = np.reshape(X_test, (X_test.shape[0], 1, -1))

In [None]:
# Write your code here. Add as many boxes as you need.
# Run the trained network on the held-out test features.
y_pred = model.predict(x=X_test)

Assess the performance of the model by using different metrics provided by the `scikit-learn` library.

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Write your code here. Add as many boxes as you need.
# Report several regression metrics on the test set rather than R² alone:
# MSE and RMSE (squared-error based), MAE (absolute error, in price units)
# and R² (share of variance explained).
mse = mean_squared_error(Y_test, y_pred)
pd.Series(
    {
        'MSE': mse,
        'RMSE': np.sqrt(mse),
        'MAE': metrics.mean_absolute_error(Y_test, y_pred),
        'R2': r2_score(Y_test, y_pred),
    },
    name='test set',
)

# Additional Bonus Task

Group the data by month. You can use [pandas.Grouper](https://pandas.pydata.org/docs/reference/api/pandas.Grouper.html) function.

Create an LSTM model to predict the 'close' price on a monthly frequency.