<a href="https://colab.research.google.com/github/Praise-Atadja/BTC-Forecasting/blob/main/btc_forecasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **PROJECT NAME:** Forecasting Bitcoin Prices with Time Series Analysis

# **SUMMARY**
In this notebook, we perform time series forecasting on Bitcoin (BTC) price data. Time series forecasting is a crucial aspect of financial analysis, helping investors and analysts predict future price movements based on historical data. Accurate forecasts can aid in decision-making processes, risk management, and strategic planning.

# **DATASET**
Data files for select bitcoin exchanges covering January 2012 to March 2021, with minute-by-minute updates of OHLC (Open, High, Low, Close), volume in BTC and in the indicated currency, and the weighted bitcoin price. Timestamps are in Unix time. Timestamps without any trades or activity have their data fields filled with NaNs.

# Data Cleaning, Feature Engineering, Transformation

In [2]:
#Import Necessary Libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import joblib
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Load Datasets

In [3]:
#load data
def load_data(data_dir='/content/drive/MyDrive/datasources'):
    """Read the raw Bitstamp and Coinbase CSV exports.

    Args:
        data_dir (str): Directory that contains ``bitstamp.csv`` and
            ``coinbase.csv``. Defaults to the Google Drive folder this
            notebook was written against, so ``load_data()`` behaves as before.

    Returns:
        tuple: ``(bitstamp, coinbase)`` DataFrames, in that order.
    """
    import os  # local import: `os` is not part of the notebook's import cell

    bitstamp = pd.read_csv(os.path.join(data_dir, 'bitstamp.csv'))
    coinbase = pd.read_csv(os.path.join(data_dir, 'coinbase.csv'))

    return bitstamp, coinbase

bitstamp, coinbase = load_data()
coinbase.head()



Unnamed: 0,Timestamp,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
0,1417411980,300.0,300.0,300.0,300.0,0.01,3.0,300.0
1,1417412040,,,,,,,
2,1417412100,,,,,,,
3,1417412160,,,,,,,
4,1417412220,,,,,,,


In [4]:
bitstamp.head()

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
0,1325317920,4.39,4.39,4.39,4.39,0.455581,2.0,4.39
1,1325317980,,,,,,,
2,1325318040,,,,,,,
3,1325318100,,,,,,,
4,1325318160,,,,,,,


In [5]:
coinbase.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2099760 entries, 0 to 2099759
Data columns (total 8 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Timestamp          int64  
 1   Open               float64
 2   High               float64
 3   Low                float64
 4   Close              float64
 5   Volume_(BTC)       float64
 6   Volume_(Currency)  float64
 7   Weighted_Price     float64
dtypes: float64(7), int64(1)
memory usage: 128.2 MB


In [6]:
print(coinbase.columns)

Index(['Timestamp', 'Open', 'High', 'Low', 'Close', 'Volume_(BTC)',
       'Volume_(Currency)', 'Weighted_Price'],
      dtype='object')


In [7]:
print(bitstamp.columns)

Index(['Timestamp', 'Open', 'High', 'Low', 'Close', 'Volume_(BTC)',
       'Volume_(Currency)', 'Weighted_Price'],
      dtype='object')


Preprocessing

In [8]:
def preprocess_data(df):
    """Index by datetime, keep the price/volume features, drop empty rows, and min-max scale.

    The input frame is left untouched: the original version set the index and
    dropped ``Timestamp`` in place, so running the cell a second time failed
    with ``KeyError: 'Timestamp'``.

    Args:
        df (pandas.DataFrame): Raw exchange data with a ``Timestamp`` column
            (Unix seconds) plus the OHLC, volume and weighted-price columns.

    Returns:
        tuple: ``(df_scaled, scaler)`` where ``df_scaled`` is the scaled frame
        indexed by ``datetime`` and ``scaler`` is the fitted ``MinMaxScaler``
        (needed later to invert predictions).
    """
    # Selecting useful features
    features = ['Open', 'High', 'Low', 'Close', 'Volume_(BTC)', 'Volume_(Currency)','Weighted_Price']

    # Convert Unix time to datetime and use it as the index of a *copy*
    timestamps = pd.to_datetime(df['Timestamp'], unit='s')
    selected = df[features].copy()
    selected.index = pd.DatetimeIndex(timestamps, name='datetime')

    # Handling missing values: minutes without trades have every field NaN
    selected = selected.dropna(how='all')

    # Normalizing the data
    # NOTE(review): the scaler is fitted on the full history, i.e. before any
    # train/validation split happens downstream.
    scaler = MinMaxScaler()
    df_scaled = pd.DataFrame(
        scaler.fit_transform(selected),
        columns=selected.columns,
        index=selected.index,
    )

    return df_scaled, scaler

# Preprocess data
coinbase_data_preprocessed, coinbase_scaler = preprocess_data(coinbase)
bitstamp_data_preprocessed, bitstamp_scaler = preprocess_data(bitstamp)




In [9]:
# Display the first few rows of the preprocessed data
coinbase_data_preprocessed.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014-12-01 05:33:00,0.015078,0.015078,0.015078,0.015078,6e-06,1.502195e-07,0.015078
2014-12-01 05:40:00,0.015078,0.015078,0.015078,0.015078,6e-06,1.502195e-07,0.015078
2014-12-01 06:24:00,0.018597,0.018597,0.018598,0.018597,6e-06,1.852707e-07,0.018597
2014-12-01 06:50:00,0.018597,0.018597,0.018598,0.018597,1.7e-05,4.919965e-07,0.018597
2014-12-02 05:29:00,0.018949,0.018949,0.018949,0.018949,6e-06,1.887758e-07,0.018949


In [10]:
# Display the first few rows of the preprocessed data
bitstamp_data_preprocessed.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2011-12-31 07:52:00,3e-05,3e-05,0.000147,0.000147,7.8e-05,3.647458e-07,3e-05
2011-12-31 15:50:00,3e-05,3e-05,0.000147,0.000147,0.0082,3.842961e-05,3e-05
2011-12-31 16:59:00,3.6e-05,3.9e-05,0.000153,0.000156,0.006468,3.125513e-05,3.7e-05
2011-12-31 17:00:00,4e-05,4e-05,0.000157,0.000157,0.001537,7.51741e-06,4e-05
2012-01-01 04:16:00,4e-05,4e-05,0.000157,0.000157,0.000257,1.254572e-06,4e-05


In [11]:
def save_preprocessed_data(df, file_path):
    """Persist a preprocessed frame as CSV at ``file_path`` (index column included)."""
    df.to_csv(path_or_buf=file_path)

# Save preprocessed data
save_preprocessed_data(coinbase_data_preprocessed, 'coinbase_preprocessed.csv')
save_preprocessed_data(bitstamp_data_preprocessed, 'bitstamp_preprocessed.csv')

# to inverse transform predictions later
joblib.dump(coinbase_scaler, 'coinbase_scaler.pkl')
joblib.dump(bitstamp_scaler, 'bitstamp_scaler.pkl')


['bitstamp_scaler.pkl']

In [12]:
#load preprocessed data
def load_preprocessed_data(file_path):
    """Read a preprocessed CSV back, using its ``datetime`` column as a parsed index."""
    frame = pd.read_csv(file_path, parse_dates=True, index_col='datetime')
    return frame

def create_dataset(data, look_back=24):
    """Build sliding-window samples for sequence forecasting.

    Sample ``i`` consists of rows ``i .. i + look_back - 1`` of ``data`` (all
    columns) and its target is the ``'Close'`` value of row ``i + look_back``.

    The windows are produced with a strided NumPy view instead of a Python
    loop over ``.iloc``, which is far faster on minute-level data, and the
    empty case keeps a 3-D shape rather than collapsing to ``(0,)``.

    Args:
        data (pandas.DataFrame): Feature frame containing a ``'Close'`` column.
        look_back (int): Number of consecutive rows per input window.

    Returns:
        tuple: ``(X, y)`` with ``X`` of shape
        ``(n_samples, look_back, n_features)`` and ``y`` of shape
        ``(n_samples,)``, where ``n_samples = max(len(data) - look_back, 0)``.

    Raises:
        ValueError: If ``look_back`` is negative.
    """
    if look_back < 0:
        raise ValueError("look_back must be non-negative")

    values = data.to_numpy()
    close = data['Close'].to_numpy()  # 'Close' column is the target variable
    n_samples = max(len(values) - look_back, 0)

    if n_samples == 0 or look_back == 0:
        # Degenerate cases: no full window fits, or windows have zero length.
        X = np.empty((n_samples, look_back, values.shape[1]), dtype=values.dtype)
    else:
        # View has shape (len - look_back + 1, n_features, look_back); the last
        # window has no following row to predict, so it is dropped.
        windows = np.lib.stride_tricks.sliding_window_view(values, look_back, axis=0)
        X = np.ascontiguousarray(windows[:n_samples].transpose(0, 2, 1))

    y = close[len(close) - n_samples:].copy()
    return X, y

# Load preprocessed data
coinbase_file = 'coinbase_preprocessed.csv'
bitstamp_file = 'bitstamp_preprocessed.csv'

coinbase_data = load_preprocessed_data(coinbase_file)
bitstamp_data = load_preprocessed_data(bitstamp_file)

look_back = 24

# Create datasets
coinbase_X, coinbase_y = create_dataset(coinbase_data, look_back)
bitstamp_X, bitstamp_y = create_dataset(bitstamp_data, look_back)

# Split data into training and validation sets.
# The split is chronological (shuffle=False): consecutive windows overlap in
# all but one row, so a random split would put near-duplicates of every
# validation window into the training set and inflate the validation score.
X_train, X_val, y_train, y_val = train_test_split(coinbase_X, coinbase_y, test_size=0.2, shuffle=False)


Setting Up tf.data.Dataset for Model Inputs

In [None]:
def setup_tf_dataset(X, y, batch_size=32, shuffle_buffer_size=10000):
    """
    Set up a TensorFlow dataset for training/validation.

    Args:
    X (numpy.ndarray): Array of input sequences.
    y (numpy.ndarray): Array of target values.
    batch_size (int): Size of each batch.
    shuffle_buffer_size (int): Buffer size for shuffling the dataset.
        Pass 0 (or None) to skip shuffling entirely, e.g. for validation data.

    Returns:
    tf.data.Dataset: TensorFlow dataset object.
    """
    dataset = tf.data.Dataset.from_tensor_slices((X, y))
    # Dataset.shuffle rejects a buffer size of 0, so a falsy value means
    # "do not shuffle" instead of being forwarded to TensorFlow.
    if shuffle_buffer_size:
        dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return dataset

# Wrap the training and validation arrays as batched tf.data pipelines
train_dataset = setup_tf_dataset(X=X_train, y=y_train)
# Validation data is passed with a shuffle buffer size of 0 (no shuffling wanted)
val_dataset = setup_tf_dataset(X=X_val, y=y_val, shuffle_buffer_size=0)


# **MODEL TRAINING**

Model Architecture

In [None]:
# Fix TensorFlow's global seed so weight initialisation, dropout masks and
# dataset shuffling are repeatable across Restart & Run All.
tf.random.set_seed(42)

# Two stacked LSTM layers with dropout, followed by a single-unit regression
# head that predicts the next (scaled) Close value.
model = Sequential([
    # return_sequences=True so the second LSTM receives the full sequence
    LSTM(50, return_sequences=True, input_shape=(look_back, coinbase_X.shape[2])),
    Dropout(0.2),
    LSTM(50, return_sequences=False),
    Dropout(0.2),
    Dense(1)
])

model.compile(optimizer='adam', loss='mean_squared_error')

model.fit(train_dataset, validation_data=val_dataset, epochs=10)

# Save the model
model.save('btc_forecasting_model.h5')


Results and Evaluations

In [None]:
# Inverse transform the predictions and the true values
def inverse_transform(predictions, y_test, scaler, look_back, num_features, target_index=3):
    """Map scaled predictions and targets back to the original price scale.

    The scaler was fitted on ``num_features`` columns, so each 1-D series is
    embedded into a zero-filled matrix of that width before inversion. The
    series must sit in the column it was scaled with: the target is ``Close``,
    which is column 3 of the feature list used in preprocessing. The previous
    version used the last column (``Weighted_Price``) and therefore applied
    that column's min/max to Close values.

    Args:
        predictions (numpy.ndarray): Model output, shape ``(n,)`` or ``(n, 1)``.
        y_test (numpy.ndarray): Scaled true targets, shape ``(n,)`` or ``(n, 1)``.
        scaler: Fitted scaler exposing ``inverse_transform``.
        look_back (int): Unused; kept so existing calls keep working.
        num_features (int): Number of columns the scaler was fitted on.
        target_index (int): Column position of the target within the scaler's
            feature order. Defaults to 3 (``Close``).

    Returns:
        tuple: ``(predictions_inversed, y_test_inversed)`` as 1-D arrays.
    """
    def _unscale(series):
        # Place the series in the target column of an otherwise-zero matrix
        column = np.asarray(series).reshape(-1)
        padded = np.zeros((column.shape[0], num_features))
        padded[:, target_index] = column
        return scaler.inverse_transform(padded)[:, target_index]

    predictions_inversed = _unscale(predictions)
    y_test_inversed = _unscale(y_test)

    return predictions_inversed, y_test_inversed

# Predict on the validation windows and bring both series back to price scale
predictions = model.predict(X_val)
predictions_inversed, y_test_inversed = inverse_transform(
    predictions,
    y_val,
    coinbase_scaler,
    look_back,
    coinbase_X.shape[2],
)

# Error metrics on the un-scaled values
mae = mean_absolute_error(y_test_inversed, predictions_inversed)
mse = mean_squared_error(y_test_inversed, predictions_inversed)
rmse = np.sqrt(mse)

print("Mean Absolute Error (MAE): {}".format(mae))
print("Root Mean Squared Error (RMSE): {}".format(rmse))


In [None]:
# Compare the true and predicted price series on a single set of axes
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(y_test_inversed, label='True Price')
ax.plot(predictions_inversed, label='Predicted Price')
ax.set_title('Bitcoin Price Prediction')
ax.set_xlabel('Time')
ax.set_ylabel('Price (USD)')
ax.legend()
plt.show()