<a href="https://colab.research.google.com/github/RoozbehSanaei/deep-learning-notebooks/blob/master/data_exploration_with_adversarial_autoencoders.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Exploration with Adversarial Autoencoders

In [0]:
# Install additional dependencies if needed.
# This assumes tensorflow, keras, numpy and pandas are already installed;
# later cells also import matplotlib and scikit-learn.
# NOTE(review): versions are unpinned. The `ta` calls further down pass
# `n=` / `n_fast=` / `n_slow=` / `n_sign=` keywords -- confirm the installed
# release still accepts them and consider pinning a version.
!pip install requests
!pip install tables
!pip install arrow
!pip install ta
!pip install tqdm

In [0]:
import os

import arrow as time
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests as http
import ta
from tqdm import tqdm

from keras.layers.advanced_activations import PReLU
from keras.optimizers import Nadam
from keras.models import Model
from keras.layers import *

## Loading the Data from Oanda

In [0]:
class InstrumentLoader:
    """Fetch Oanda candles one day at a time and cache each day on disk.

    Every (instrument, granularity, day) combination is requested from the
    Oanda practice REST API at most once; the resulting frame is written to
    the HDF5 store ``oanda_api_store.h5`` and served from there afterwards.

    The store stays open for the lifetime of the loader. Call :meth:`close`
    (or use the loader in a ``with`` statement) to release the file.
    """

    # Seconds to wait for a single HTTP request before giving up, so a
    # stalled connection cannot hang the download loop forever.
    request_timeout = 30

    def __init__(self, config):
        """Open the on-disk cache and remember the API configuration.

        Args:
            config: mapping whose ``'token'`` entry is sent verbatim as the
                ``Authorization`` header of every request.
        """
        self.config = config
        self.store = pd.HDFStore('oanda_api_store.h5')
        self.time_format = 'YYYY-MM-DDTHH:mm:ss.SSSSSSSSZ'
        self.inst_base_url = 'https://api-fxpractice.oanda.com/v3/instruments/'

    def close(self):
        """Close the underlying HDF5 store."""
        self.store.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def load_period(self, instrument, granularity, start, end):
        """Load every kept day between ``start`` and ``end``.

        Args:
            instrument: instrument name used in the request URL.
            granularity: candle granularity passed to the API.
            start, end: bounds handed to ``Arrow.range('day', start, end)``.

        Returns:
            A list with one DataFrame per kept day, in chronological order.
        """
        # arrow's 'd' token formats the ISO weekday (1 = Monday ... 7 = Sunday),
        # so this drops Saturdays and keeps every other day.
        trading_days = [
            day for day in time.Arrow.range('day', start, end)
            if day.format('d') != '6'
        ]

        return [
            self.load_into_df(day, instrument, granularity)
            for day in tqdm(trading_days, desc="downloading data")
        ]

    def load_into_df(self, day, instrument, granularity):
        """Return the candles of one day as a DataFrame, using the cache.

        The frame is indexed by ``time_of_day`` (the candle time formatted as
        ``HHmmss`` and converted to an int) and has the columns ``open``,
        ``close``, ``high``, ``low`` and ``volume``. On a cache miss the day
        is downloaded with mid prices ("M") and written to the store.
        """
        day_key = instrument + granularity + day.format('YYYYMMDD')
        if day_key in self.store:
            return self.store[day_key]

        candles = self.load_day(day, instrument, granularity, "M")

        def mid_price(field, name):
            # Explicit dtype keeps the column numeric even when the API
            # returned no candles for the day.
            return pd.Series(
                [float(candle['mid'][field]) for candle in candles],
                dtype='float64', name=name)

        time_of_day = pd.Series(
            [
                int(time.get(candle['time'], self.time_format).format('HHmmss'))
                for candle in candles
            ],
            dtype='int64', name='time_of_day')
        day_volume = pd.Series(
            [int(candle['volume']) for candle in candles],
            dtype='int64', name='volume')

        signals = [
            time_of_day,
            mid_price('o', 'open'),
            mid_price('c', 'close'),
            mid_price('h', 'high'),
            mid_price('l', 'low'),
            day_volume,
        ]

        raw_day = pd.concat(signals, axis=1).set_index('time_of_day')

        self.store[day_key] = raw_day
        return raw_day

    def load_day(self, day, instrument, granularity, price):
        """Request the candles of a single day from the API.

        Args:
            day: start of the requested range; the range ends one day later.
            instrument: instrument name used in the request URL.
            granularity: candle granularity passed to the API.
            price: value of the ``price`` query parameter.

        Returns:
            The ``'candles'`` list of the JSON response.

        Raises:
            requests.HTTPError: if the API answers with an error status
                (for example when the token is missing or invalid).
        """
        time_format = 'YYYY-MM-DDTHH:mm:ssZ'
        base_uri = self.inst_base_url + instrument + "/candles"
        headers = {"Authorization": self.config['token']}
        parameters = {
            "from": day.format(time_format),
            "to": day.shift(days=1).format(time_format),
            "price": price,
            "granularity": granularity,
            "includeFirst": "True",
        }
        response = http.get(
            base_uri, params=parameters, headers=headers,
            timeout=self.request_timeout)
        # Fail with the HTTP error itself instead of an opaque
        # KeyError('candles') on the error payload.
        response.raise_for_status()
        return response.json()['candles']



In [0]:
# The API token is read from the OANDA_API_TOKEN environment variable so it
# does not have to be pasted into the notebook; the placeholder is used when
# the variable is not set.
config = {
    'token': "Bearer " + os.environ.get('OANDA_API_TOKEN', "<YOUR API TOKEN HERE!>"),
}

# The timestamps are ISO-8601, so arrow parses them without a format string.
# An explicit 'YYYY-MM-DDTHH:mm:ss' format leaves the trailing "Z" unmatched,
# which arrow releases with strict format matching reject.
start = time.get("2018-01-02T00:00:00Z")
end = time.get("2018-12-30T00:00:00Z")
api = InstrumentLoader(config)

days = api.load_period('EUR_USD', 'M5', start, end)

# Every day is now an in-memory DataFrame; release the HDF5 cache file so
# re-running this cell does not stack up open handles on the same file.
api.store.close()

# First loaded day -- presumably kept for ad-hoc inspection.
test = days[0]

## Enhancing and Preprocessing Data

In [0]:
def roll_stats(data, window=32):
    """Rolling mean, variance and skew of a named Series.

    Args:
        data: Series whose ``name`` is used as the column prefix.
        window: number of observations in each rolling window.

    Returns:
        DataFrame with the columns ``<name>_mean``, ``<name>_var`` and
        ``<name>_skew``, aligned with the index of ``data``.
    """
    prefix = data.name
    roller = data.rolling(window)
    columns = [
        roller.mean().rename(prefix + '_mean'),
        roller.var().rename(prefix + '_var'),
        roller.skew().rename(prefix + '_skew'),
    ]
    return pd.concat(columns, axis=1)

def log_returns(frame):
    """Log return of each row relative to the previous row.

    The first row has no predecessor and is therefore NaN.
    """
    return np.log(frame).diff()

# One feature frame per day: log returns of close/high/low, rolling statistics
# of the close returns, raw volume and three technical indicators.
enhanced_days = []

for day in days:
    close = day['close']
    high, low, volume = day['high'], day['low'], day['volume']

    close_returns = log_returns(close)

    indicators = [
        ta.momentum.rsi(close, n=13, fillna=False).rename('rsi'),
        ta.volatility.average_true_range(high, low, close, n=13, fillna=False).rename('atr'),
        ta.trend.macd_signal(close, n_fast=13, n_slow=35, n_sign=8, fillna=False).rename('macd_signal'),
    ]

    features = [
        close_returns,
        log_returns(high),
        log_returns(low),
        roll_stats(close_returns),
        volume,
    ] + indicators

    # Rolling windows and indicators leave NaNs in their warm-up rows;
    # dropping them keeps only rows where every feature is defined.
    enhanced_days.append(pd.concat(features, axis=1).dropna())

In [0]:
window_size = 32
step_size = 16

# Statistics over all days; the per-window normalisation below does not use
# them.
all_days = pd.concat(enhanced_days)
all_mean = all_days.mean()
all_std = all_days.std()

data_x = []            # normalised windows, one (window_size, n_features) array each
windowed_signals = []  # the matching un-normalised DataFrame slices

for signal in enhanced_days:
    # `+ 1` makes the last admissible start index inclusive, so a day with
    # exactly `window_size` rows (or one that ends on a step boundary) still
    # yields its final full window. Days shorter than a window give an empty
    # range and are skipped.
    for start_row in range(0, len(signal) - window_size + 1, step_size):
        window_x = signal.iloc[start_row:start_row + window_size]
        windowed_signals.append(window_x)

        # Standardise every feature within its own window. A feature that is
        # constant inside the window has zero std; dividing by 1 instead maps
        # it to all zeros rather than NaN/inf, which would otherwise poison
        # the training batches.
        window_std = window_x.std().replace(0, 1)
        window_x = (window_x - window_x.mean()) / window_std
        data_x.append(window_x.values)

data_x = np.array(data_x)

# Chronological 70/30 split into training and test windows.
split_train = int(len(data_x) * 0.7)

train_x = data_x[:split_train]
originals_x = windowed_signals[:split_train]

test_x = data_x[split_train:]

print(train_x.shape)


In [0]:
def sample_normal(latent_dim, batch_size, window_size=None):
    """Draw standard-normal samples.

    Returns an array of shape ``(batch_size, latent_dim)``, or
    ``(batch_size, window_size, latent_dim)`` when ``window_size`` is given.
    """
    if window_size is None:
        shape = (batch_size, latent_dim)
    else:
        shape = (batch_size, window_size, latent_dim)
    return np.random.normal(size=shape)
  
def sample_categories(cat_dim, batch_size):
    """Draw ``batch_size`` uniformly random one-hot vectors of length ``cat_dim``.

    Returns:
        Float array of shape ``(batch_size, cat_dim)`` with exactly one 1 per
        row.
    """
    # Draw all category indices at once and set them with a single fancy-index
    # assignment instead of looping over the rows in Python.
    picks = np.random.randint(0, cat_dim, size=batch_size)
    cats = np.zeros((batch_size, cat_dim))
    cats[np.arange(batch_size), picks] = 1
    return cats
  
def create_encoder(latent_dim, cat_dim, window_size, input_dim):
    """Build the sequence autoencoder (encoder and decoder in one model).

    The input has shape ``(window_size, input_dim)``. It is encoded by two
    bidirectional LSTM stages into a feature vector from which two heads are
    derived: a softmax over ``cat_dim`` categories and a linear latent vector
    of size ``latent_dim``. Both are concatenated, repeated ``window_size``
    times and decoded back to the input shape.

    Returns:
        A Model mapping the input to
        ``[reconstruction, latent vector, category, input - reconstruction]``.
    """

    def dense_head(features, out_units, out_activation):
        # Dense(64) -> BatchNorm -> PReLU -> output layer.
        hidden = Dense(64)(features)
        hidden = BatchNormalization()(hidden)
        hidden = PReLU()(hidden)
        return Dense(out_units, activation=out_activation)(hidden)

    input_layer = Input(shape=(window_size, input_dim))

    # Encoder: per-timestep projection, then two normalised BiLSTM stages;
    # only the first stage keeps the time axis.
    code = TimeDistributed(Dense(64, activation='linear'))(input_layer)
    for units, keep_sequence in ((128, True), (64, False)):
        code = Bidirectional(LSTM(units, return_sequences=keep_sequence))(code)
        code = BatchNormalization()(code)
        code = ELU()(code)

    cat = dense_head(code, cat_dim, 'softmax')
    latent_repr = dense_head(code, latent_dim, 'linear')

    # Decoder: the joint code is fed to every timestep of the output window.
    decode = Concatenate()([latent_repr, cat])
    decode = RepeatVector(window_size)(decode)
    for units in (64, 128):
        decode = Bidirectional(LSTM(units, return_sequences=True))(decode)
        decode = ELU()(decode)
    decode = TimeDistributed(Dense(64))(decode)
    decode = ELU()(decode)
    decode = TimeDistributed(Dense(input_dim, activation='linear'))(decode)

    error = Subtract()([input_layer, decode])

    return Model(input_layer, [decode, latent_repr, cat, error])

  
def create_discriminator(latent_dim):
    """Build a small MLP that maps a ``latent_dim`` vector to one sigmoid output.

    Two hidden Dense layers (128 and 64 units), each followed by ELU, feed a
    single sigmoid unit.
    """
    input_layer = Input(shape=(latent_dim,))

    hidden = input_layer
    for width in (128, 64):
        hidden = Dense(width)(hidden)
        hidden = ELU()(hidden)

    verdict = Dense(1, activation="sigmoid")(hidden)
    return Model(input_layer, verdict)

## Encoding Data to Find Clusters



In [0]:
window_size = train_x.shape[1]
input_dim = train_x.shape[2]
latent_dim = 32
cat_dim = 8


def _build_frozen_discriminator(code_dim):
    """Create a discriminator, compile it, then mark it non-trainable.

    The flag is flipped only after the discriminator's own compile call --
    presumably so it still trains on its own while staying fixed inside the
    combined model assembled below (confirm against the Keras version in use).
    """
    discriminator = create_discriminator(code_dim)
    discriminator.compile(loss='binary_crossentropy',
                          optimizer=Nadam(0.0002, 0.5),
                          metrics=['accuracy'])
    discriminator.trainable = False
    return discriminator


# One discriminator judges the latent vector, the other the category vector.
prior_discriminator = _build_frozen_discriminator(latent_dim)
cat_discriminator = _build_frozen_discriminator(cat_dim)

encoder = create_encoder(latent_dim, cat_dim, window_size, input_dim)

# Combined model: signal -> (reconstruction, prior verdict, category verdict).
signal_in = Input(shape=(window_size, input_dim))
reconstructed_signal, encoded_repr, category, _ = encoder(signal_in)

is_real_prior = prior_discriminator(encoded_repr)
is_real_cat = cat_discriminator(category)

autoencoder = Model(
    signal_in,
    [reconstructed_signal, is_real_prior, is_real_cat],
)
# Reconstruction dominates the objective; the two adversarial terms share the
# remaining 1% of the weight.
autoencoder.compile(
    loss=['mse', 'binary_crossentropy', 'binary_crossentropy'],
    loss_weights=[0.99, 0.005, 0.005],
    optimizer=Nadam(0.0002, 0.5),
)

In [0]:
# Print the layer-by-layer summary of the combined model.
autoencoder.summary()

In [0]:
# Training schedule: number of batches and samples per batch.
batches = 10000
batch_size = 64

# Per-batch loss histories, filled by the training loop.
losses_disc, losses_disc_cat, losses_ae, losses_val = [], [], [], []

# Discriminator targets, one label per sample in a batch.
real = np.ones((batch_size, 1))
fake = np.zeros_like(real)

def discriminator_training(discriminator, real, fake):
    """Create a training step for one discriminator.

    Args:
        discriminator: model exposing ``trainable`` and ``train_on_batch``.
        real: target labels used for the real samples.
        fake: target labels used for the fake samples.

    Returns:
        A function ``train(real_samples, fake_samples)`` that trains the
        discriminator on both batches and returns the element-wise average of
        the two ``train_on_batch`` results.
    """
    def train(real_samples, fake_samples):
        discriminator.trainable = True
        try:
            loss_real = discriminator.train_on_batch(real_samples, real)
            loss_fake = discriminator.train_on_batch(fake_samples, fake)
        finally:
            # Always switch the flag back off, even when a batch fails or the
            # cell is interrupted, so the discriminator is never left
            # trainable by accident.
            discriminator.trainable = False

        return np.add(loss_real, loss_fake) * 0.5
    return train

train_prior_discriminator = discriminator_training(prior_discriminator, real, fake)
train_cat_discriminator = discriminator_training(cat_discriminator, real, fake)

pbar = tqdm(range(batches))

for _ in pbar:
    # Random training batch (sampled with replacement).
    ids = np.random.randint(0, train_x.shape[0], batch_size)
    signals = train_x[ids]

    # Codes produced by the encoder are the "fake" samples; draws from the
    # target distributions are the "real" ones.
    _, latent_fake, category_fake, _ = encoder.predict(signals)

    latent_real = sample_normal(latent_dim, batch_size)
    category_real = sample_categories(cat_dim, batch_size)

    prior_loss = train_prior_discriminator(latent_real, latent_fake)
    cat_loss = train_cat_discriminator(category_real, category_fake)

    losses_disc.append(prior_loss)
    losses_disc_cat.append(cat_loss)

    # Train the autoencoder to reconstruct the batch while making both
    # discriminators label its codes as real.
    encoder_loss = autoencoder.train_on_batch(signals, [signals, real, real])
    losses_ae.append(encoder_loss)

    # Validate on held-out windows. Evaluating on the batch that was just
    # trained on would only report the training loss a second time.
    val_ids = np.random.randint(0, test_x.shape[0], batch_size)
    val_signals = test_x[val_ids]
    val_loss = autoencoder.test_on_batch(val_signals, [val_signals, real, real])
    losses_val.append(val_loss)

    # Index 1 of the discriminator results is the accuracy metric; index 1 of
    # the autoencoder results is the loss of its first output (the MSE).
    pbar.set_description("[Acc. Prior/Cat: %.2f%% / %.2f%%] [MSE train/val: %f / %f]" 
            % (100*prior_loss[1], 100*cat_loss[1], encoder_loss[1], val_loss[1]))

In [0]:
# Persist the weights of the combined model to 'aae.hdf'.
autoencoder.save_weights('aae.hdf')

## Show Loss and Result Samples

In [0]:
# --- Training curves -------------------------------------------------------
fig, axes = plt.subplots(nrows=1, ncols=3)
fig.set_size_inches(30, 6)

# Entry 1 of each discriminator record is its accuracy; entry 1 of each
# autoencoder record is the reconstruction MSE.
axes[0].plot([loss[1] for loss in losses_disc])
axes[0].set_title('Prior discriminator accuracy')
axes[1].plot([loss[1] for loss in losses_disc_cat])
axes[1].set_title('Category discriminator accuracy')
axes[2].plot([loss[1] for loss in losses_ae], label='train')
axes[2].plot([loss[1] for loss in losses_val], label='val')
axes[2].set_title('Reconstruction MSE')
axes[2].legend()
for ax in axes:
    ax.set_xlabel('batch')

# plt.show() renders under the notebook backend; Figure.show() expects a GUI
# backend and only emits a warning there.
plt.show()

# --- Reconstruction samples ------------------------------------------------
size = 5
offset = 5

fig, axes = plt.subplots(nrows=size, ncols=5)
fig.set_size_inches(20, 3 * size)

(dec, rep, cat, error) = encoder.predict(test_x[offset:size+offset])

column_titles = ['input', 'latent code', 'category', 'reconstruction', 'error']
for column, title in enumerate(column_titles):
    axes[0, column].set_title(title)

for i in range(size):
    axes[i,0].plot(test_x[i+offset])
    # Four columns wide whatever the latent size is (8x4 for latent_dim=32).
    axes[i,1].imshow(rep[i].reshape(-1, 4))
    axes[i,2].imshow(cat[i].reshape(-1, 1))
    axes[i,3].plot(dec[i])
    axes[i,4].plot(error[i])

plt.show()

## Inspect Categories

In [0]:
# Run the encoder over every window (train and test); this rebinds
# dec / rep / cat / error to the outputs for the full data set.
(dec, rep, cat, error) = encoder.predict(data_x)

In [0]:
from collections import Counter

# Hard category per window: index of the largest entry in its category vector.
categories = list(np.argmax(cat, axis=1))
counts = Counter(categories)
print(counts)

# Split the (category, count) pairs into two parallel tuples for plotting.
labels, count = zip(*counts.items())

plt.figure(figsize=(14, 7))
plt.bar(labels, count)
plt.show()

## Latent Distribution

In [0]:
from sklearn.manifold import TSNE

# One t-SNE embedding of the latent vectors per perplexity, coloured by the
# category assigned to each window.
perplexities = (10, 30, 50)

fig, axes = plt.subplots(nrows=1, ncols=len(perplexities))
fig.set_size_inches(30,7)

for ax, perplexity in zip(axes, perplexities):
    X = TSNE(n_components=2, perplexity=perplexity).fit_transform(rep)
    ax.scatter(X[:, 0], X[:, 1], c=categories, cmap='viridis')
    ax.set_title('t-SNE of latent vectors (perplexity=%d)' % perplexity)

# plt.show() renders under the notebook backend; Figure.show() expects a GUI
# backend and only emits a warning there.
plt.show()

## Categories in the Input Data


In [0]:
from sklearn.manifold import TSNE

# One t-SNE embedding of the flattened input windows per perplexity, coloured
# by the category assigned to each window.
perplexities = (10, 30, 50)

fig, axes = plt.subplots(nrows=1, ncols=len(perplexities))
fig.set_size_inches(30,7)

# Flatten every (window_size, n_features) window into a single vector.
input_data = data_x.reshape(data_x.shape[0], -1)

for ax, perplexity in zip(axes, perplexities):
    X = TSNE(n_components=2, perplexity=perplexity).fit_transform(input_data)
    ax.scatter(X[:, 0], X[:, 1], c=categories, cmap='viridis')
    ax.set_title('t-SNE of input windows (perplexity=%d)' % perplexity)

plt.show()