## Project Title: Determine whether data augmentation using the method proposed in 'Finding Order in Chaos: A Novel Data Augmentation Method for Time Series in Contrastive Learning' leads to better 1-day-ahead prediction results.



In [1]:
import numpy as np
import tensorflow as tf
import random
import os

# Single seed shared by every random number generator seeded in this cell
seed_value= 42

# 1. Export `PYTHONHASHSEED` with the fixed seed.
# NOTE(review): the interpreter is already running when this line executes, so
# this presumably only affects processes started afterwards - confirm.
os.environ['PYTHONHASHSEED']=str(seed_value)

# 2. Seed Python's built-in `random` module
random.seed(seed_value)

# 3. Seed NumPy's global pseudo-random generator (used by the mixing helpers
#    that call `np.random.*`)
np.random.seed(seed_value)

# 4. Seed TensorFlow's global pseudo-random generator
tf.random.set_seed(seed_value)


In [2]:
import tensorflow as tf
import pandas as pd
import yfinance as yf
import seaborn as sns
from tensorflow.keras import layers, Model
import numpy as np
import torch
import matplotlib.pyplot as plt
from copy import deepcopy
from scipy.fft import rfft, rfftfreq, irfft
from IPython.display import display, HTML
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
# from tcn import TCN  # If you have the tcn package installed
from sklearn.metrics import mean_squared_error

import optuna
from optuna.samplers import TPESampler
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from pykalman import KalmanFilter

# Inject a CSS rule into the notebook output that stretches `.container`
# elements to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))

### Initialize Parameters

In [3]:
# Minimal stand-in configuration: a class created on the fly whose only
# attribute is `seed`.
cfg = type('config', (object,), {'seed': 42})
# Make the sampler behave in a deterministic way.
sampler = TPESampler(seed=cfg.seed)

# Mixing coefficient forwarded to the augmentation helpers by run_model.
alpha = 0.4
# Number of consecutive rows per model input window (forwarded by run_model).
seq_len = 20
# Test-set fraction.
# NOTE(review): the visible functions pass a literal 0.3 to train_test_split
# instead of reading this name - confirm whether it is used elsewhere.
test_size = 0.3

### Helper Functions

In [4]:
# Function to import stock data
def get_stock_data(ticker, start_date, end_date):
    """Download price history for one ticker through ``yf.download``.

    Args:
        ticker: Symbol handed to ``yf.download``.
        start_date: Forwarded as the ``start`` argument.
        end_date: Forwarded as the ``end`` argument.

    Returns:
        The object returned by ``yf.download`` for that request.
    """
    return yf.download(ticker, start=start_date, end=end_date)

def z_score_normalize(series):
    """Standardise ``series`` by subtracting its mean and dividing by its std.

    Args:
        series: Object exposing ``mean()`` and ``std()`` and supporting
            arithmetic (e.g. a pandas Series).

    Returns:
        The centred and scaled values, same type as the arithmetic result.
    """
    centered = series - series.mean()
    return centered / series.std()

def denormalize_z_score(normalized_series, original_mean, original_std):
    """Undo a z-score transform: scale by the std, then add the mean back.

    Args:
        normalized_series: Standardised values.
        original_mean: Mean that was subtracted during normalisation.
        original_std: Standard deviation that was divided out.

    Returns:
        The values mapped back to the original scale.
    """
    rescaled = normalized_series * original_std
    return rescaled + original_mean

# Function to create model (make sure this is defined in your environment)
def create_model(best_params, input_shape):
    """Build and compile a stacked-LSTM regressor with a single linear output.

    Args:
        best_params: Dict of hyperparameters. Reads 'lstm_units',
            'dropout_rate' and 'learning_rate'; 'n_layers' is optional and
            falls back to 2, the depth this function used before it honoured
            the key.
        input_shape: Shape passed to the first LSTM layer as ``input_shape``.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, MSE loss).

    Raises:
        ValueError: If 'n_layers' is smaller than 1.
    """
    # Previously the tuned depth was ignored and two LSTM layers were always
    # stacked, so the network could differ from the one the parameters were
    # tuned for.
    n_layers = best_params.get('n_layers', 2)
    if n_layers < 1:
        raise ValueError(f"n_layers must be >= 1, got {n_layers}")

    model = Sequential()
    for depth in range(n_layers):
        is_last = depth == n_layers - 1
        # Only the first layer needs to be told the input shape.
        extra = {'input_shape': input_shape} if depth == 0 else {}
        # Every LSTM except the last must emit the full sequence so the next
        # LSTM receives 3-D input.
        model.add(LSTM(best_params['lstm_units'], return_sequences=not is_last, **extra))
        model.add(Dropout(best_params['dropout_rate']))
    model.add(Dense(1))  # Output layer
    model.compile(optimizer=Adam(learning_rate=best_params['learning_rate']), loss='mse')
    return model

def create_sequences(features, target, seq_len):
    """Slice aligned features/targets into sliding windows.

    Window ``i`` holds ``features[i:i + seq_len]`` and is paired with the
    target at *position* ``i + seq_len``.

    Args:
        features: Row-sliceable container of feature rows.
        target: 1-D sequence of target values; converted to a NumPy array so
            that indexing is always positional.
        seq_len: Number of rows per window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays with ``len(target) - seq_len``
        entries each (empty arrays when there are not enough rows).
    """
    # A pandas Series indexed with an integer is looked up by *label* when its
    # index is integer-typed, and by deprecated positional fallback otherwise.
    # Converting up front makes the lookup positional in every case.
    target = np.asarray(target)
    n_windows = max(len(target) - seq_len, 0)

    X = [features[start:start + seq_len] for start in range(n_windows)]
    y = [target[start + seq_len] for start in range(n_windows)]
    return np.array(X), np.array(y)

In [5]:
def plot_correlation(df):
    """Show an annotated heatmap of the pairwise column correlations of ``df``.

    Args:
        df: DataFrame whose ``corr()`` matrix is plotted.
    """
    corr = df.corr()

    # New 10x8 figure for the heatmap
    plt.figure(figsize=(10, 8))

    # Square cells, two-decimal annotations, shrunken colour bar
    heatmap_options = dict(
        annot=True,
        fmt=".2f",
        cmap='coolwarm',
        square=True,
        linewidths=.5,
        cbar_kws={"shrink": .5},
    )
    sns.heatmap(corr, **heatmap_options)

    # Tilt the x labels, keep the y labels horizontal, and tighten the layout
    # so nothing overlaps
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()

    plt.show()

In [6]:
def engineer_features(data):
    """Derive technical-indicator features from an OHLCV frame.

    Added columns: 'RSI' (14-row simple-average relative strength index),
    'VWAP' (cumulative volume-weighted typical price), 'high_to_low_ratio',
    'open_to_close_ratio' and 'volatility_10' (10-row rolling std of Close).
    The raw 'Open', 'High', 'Low' columns are then removed, together with
    'Adj Close' when it is present.

    Args:
        data: DataFrame with 'Open', 'High', 'Low', 'Close' and 'Volume'
            columns; it is not modified.

    Returns:
        A new DataFrame with the engineered columns, restricted to rows that
        have no NaN (the rolling warm-up rows are dropped).
    """
    df = data.copy(deep=True)

    # Relative Strength Index: average gain vs. average loss over 14 rows
    delta = df['Close'].diff()
    gains = delta.clip(lower=0)
    losses = delta.clip(upper=0).abs()
    avg_gain = gains.rolling(window=14).mean()
    avg_loss = losses.rolling(window=14).mean()
    relative_strength = avg_gain / avg_loss
    df['RSI'] = 100.0 - (100.0 / (1.0 + relative_strength))

    # Volume Weighted Average Price (VWAP), accumulated from the first row
    vwap = (df['Volume'] * (df['High'] + df['Low'] + df['Close']) / 3).cumsum() / df['Volume'].cumsum()
    df['VWAP'] = vwap

    # Price Ratios
    df['high_to_low_ratio'] = df['High'] / df['Low']
    df['open_to_close_ratio'] = df['Open'] / df['Close']

    # Volatility
    df['volatility_10'] = df['Close'].rolling(window=10).std()

    # 'Adj Close' is not guaranteed to be in the input; errors='ignore' keeps
    # the drop from raising KeyError when it is missing.
    raw_columns = ['Open', 'High', 'Low', 'Adj Close']
    return df.drop(columns=raw_columns, errors='ignore').dropna()

In [7]:
def cut_mix(df1, df2, alpha=0.4):
    """Replace one contiguous run of rows in ``df1`` with the rows of ``df2``.

    The run starts at a row drawn uniformly with ``np.random.randint`` and
    spans ``int(len(df1) * alpha)`` rows; it is cut short when it would run
    past the end of the frame.

    Args:
        df1: Base DataFrame (left untouched; a copy is modified).
        df2: Donor DataFrame with the same shape as ``df1``.
        alpha: Fraction of rows that the replaced run spans.

    Returns:
        A copy of ``df1`` with the selected rows taken from ``df2``.
    """
    assert df1.shape == df2.shape
    n_rows = len(df1)
    start = np.random.randint(0, n_rows)
    span = int(n_rows * alpha)
    window = slice(start, start + span)

    result = df1.copy()
    result.iloc[window] = df2.iloc[window]
    return result

def binary_mix(data1, data2, alpha=0.4):
    """Mix two equally shaped data sets element-wise with a random mask.

    Each element is taken from ``data1`` with probability ``alpha`` and from
    ``data2`` otherwise (mask drawn with ``np.random.binomial``).

    Args:
        data1: First data set (DataFrame or array-like).
        data2: Second data set with the same length as ``data1``.
        alpha: Probability of keeping the ``data1`` element.

    Returns:
        A DataFrame of mixed values. When ``data1`` is a DataFrame its index
        and column labels are reused; a six-column array input keeps the
        historical OHLCV column names.
    """
    assert len(data1) == len(data2)
    take_first = np.random.binomial(1, alpha, size=np.shape(data1)).astype(bool)
    mixed = np.where(take_first, data1, data2)

    # The column names used to be hard-coded to the six raw OHLCV labels,
    # which raised for frames with any other column layout.
    if isinstance(data1, pd.DataFrame):
        return pd.DataFrame(mixed, index=data1.index, columns=data1.columns)

    legacy_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
    if mixed.ndim == 2 and mixed.shape[1] == len(legacy_columns):
        return pd.DataFrame(mixed, columns=legacy_columns)
    return pd.DataFrame(mixed)

def linear_mix(data1, data2, alpha=0.4):
    """Return the weighted sum ``alpha * data1 + (1 - alpha) * data2``.

    Args:
        data1: First operand.
        data2: Second operand; must have the same length as ``data1``.
        alpha: Weight given to ``data1``.

    Returns:
        The element-wise weighted combination.
    """
    assert len(data1) == len(data2)
    weight_second = 1 - alpha
    return alpha * data1 + weight_second * data2

def geometric_mix(data1, data2, alpha=0.4):
    """Return the weighted geometric mean ``data1**alpha * data2**(1 - alpha)``.

    Args:
        data1: First operand.
        data2: Second operand; must have the same length as ``data1``.
        alpha: Exponent applied to ``data1``.

    Returns:
        The element-wise weighted geometric combination.
    """
    assert len(data1) == len(data2)
    first_factor = data1**alpha
    second_factor = data2**(1 - alpha)
    return first_factor * second_factor

def amplitude_mix(data1, data2, alpha=0.4):
    """Mix the Fourier magnitudes of two series while keeping ``data1``'s phase.

    Every column is transformed along the time (row) axis, the magnitudes are
    blended as ``alpha * |F1| + (1 - alpha) * |F2|``, and the result is
    transformed back using the phase of ``data1``.

    Args:
        data1: First data set (DataFrame or array-like, rows = time steps).
        data2: Second data set with the same length as ``data1``.
        alpha: Weight given to the magnitudes of ``data1``.

    Returns:
        A DataFrame with the same number of rows as the input. When ``data1``
        is a DataFrame its index and column labels are reused; a six-column
        array input keeps the historical OHLCV column names.
    """
    assert len(data1) == len(data2)
    values1 = np.asarray(data1, dtype=float)
    values2 = np.asarray(data2, dtype=float)
    n_samples = values1.shape[0]

    # axis=0 transforms each column over time; the default (last axis) would
    # transform across the feature columns of a 2-D frame instead.
    fft1 = np.fft.rfft(values1, axis=0)
    fft2 = np.fft.rfft(values2, axis=0)

    # Mix the magnitudes
    mixed_magnitude = alpha * np.abs(fft1) + (1 - alpha) * np.abs(fft2)

    # Keep the phase of the first data
    mixed_fft = mixed_magnitude * np.exp(1j * np.angle(fft1))

    # Passing n restores the original length; without it an odd-length input
    # comes back one sample short.
    mixed = np.fft.irfft(mixed_fft, n=n_samples, axis=0)

    if isinstance(data1, pd.DataFrame):
        return pd.DataFrame(mixed, index=data1.index, columns=data1.columns)

    legacy_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
    if mixed.ndim == 2 and mixed.shape[1] == len(legacy_columns):
        return pd.DataFrame(mixed, columns=legacy_columns)
    return pd.DataFrame(mixed)

### PROPOSED TECHNIQUE BELOW

def _mix_significant_bins(data1, data2, threshold, alpha):
    """Blend ``data2`` into ``data1`` at the dominant frequency bins of ``data1``."""
    data1 = np.asarray(data1, dtype=float)
    data2 = np.asarray(data2, dtype=float)
    if data1.shape != data2.shape:
        raise ValueError("both series must have the same shape")
    n_samples = data1.size
    if n_samples == 0:
        return data1.copy()

    # Both series have the same length, so bin k of one spectrum is the same
    # frequency as bin k of the other; no frequency lookup is needed.
    spectrum1 = rfft(data1)
    spectrum2 = rfft(data2)
    amplitude1 = np.abs(spectrum1)
    amplitude2 = np.abs(spectrum2)

    # Bins whose amplitude exceeds `threshold` times the largest amplitude of
    # the first series are the only ones that get mixed.
    significant = amplitude1 > (amplitude1.max() * threshold)

    phase1 = np.angle(spectrum1)
    phase2 = np.angle(spectrum2)
    # Wrap the phase difference into (-pi, pi] so the interpolation takes the
    # shorter way around the circle.
    phase_diff = (phase2 - phase1) % (2 * np.pi)
    phase_diff = np.where(phase_diff > np.pi, phase_diff - 2 * np.pi, phase_diff)

    new_amplitude = alpha * amplitude1 + (1 - alpha) * amplitude2
    new_phase = phase1 + alpha * phase_diff

    mixed_spectrum = spectrum1.copy()
    mixed_spectrum[significant] = (
        new_amplitude[significant] * np.exp(1j * new_phase[significant])
    )

    # n restores the original length (odd lengths would otherwise lose a sample).
    return irfft(mixed_spectrum, n=n_samples)


def proposed_mixup(df1, df2, threshold=0.1, alpha=0.4):
    """Frequency-domain mixup of two frames, applied column by column.

    For every column, the spectra of both series are computed; at the bins
    that are significant in ``df1`` the amplitude becomes
    ``alpha * |F1| + (1 - alpha) * |F2|`` and the phase becomes
    ``phase1 + alpha * wrapped(phase2 - phase1)``. All other bins keep the
    spectrum of ``df1``. The series is then transformed back to the time
    domain.

    Args:
        df1: Anchor DataFrame whose columns drive the result.
        df2: DataFrame providing the second series for each column of
            ``df1``; must have the same number of rows.
        threshold: Significance threshold, relative to the largest amplitude
            in the ``df1`` column.
        alpha: Mixing coefficient used for both amplitude and phase.

    Returns:
        DataFrame with the index, columns and length of ``df1``.

    Raises:
        ValueError: If a column has a different length in ``df1`` and ``df2``.
    """
    mixed_columns = {
        feature: _mix_significant_bins(
            df1[feature].to_numpy(), df2[feature].to_numpy(), threshold, alpha
        )
        for feature in df1.columns
    }
    return pd.DataFrame(mixed_columns, index=df1.index, columns=df1.columns)

In [8]:
def objective(trial):
    """Optuna objective: mean test RMSE of an LSTM over every stock.

    For each frame in the module-level ``historical_data_augmented`` dict the
    features are standardised, cut into 20-row windows, split 70/30, and used
    to train a freshly built LSTM that predicts the return of the row that
    follows each window.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The RMSE averaged across all stocks (lower is better).
    """
    # Hyperparameters to be tuned by Optuna. suggest_float replaces the
    # deprecated suggest_uniform / suggest_loguniform calls.
    n_layers = trial.suggest_int('n_layers', 1, 3)
    lstm_units = trial.suggest_categorical('lstm_units', [50, 100, 150])
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128])
    epochs = trial.suggest_int('epochs', 20, 100)

    seq_len = 20

    # Dictionary to hold RMSE for each stock
    stock_rmse = {}

    for stock, df in historical_data_augmented.items():
        # Keep the leading NaN so rets[j] is the return realised on row j and
        # lines up with feature row j. Window rows i..i+seq_len-1 are then
        # paired with the return of row i+seq_len, i.e. the next day.
        # Dropping the NaN first would shift every target one day later.
        rets = df['Close'].pct_change().to_numpy()
        scaled_features = StandardScaler().fit_transform(df.to_numpy())
        X, y = create_sequences(scaled_features, rets, seq_len)

        # Fixed random_state so trials are compared on identical splits.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=42
        )

        # Model architecture: n_layers LSTM+Dropout pairs; only the last LSTM
        # collapses the sequence.
        model = Sequential()
        for i in range(n_layers):
            model.add(LSTM(units=lstm_units, return_sequences=(i < n_layers - 1)))
            model.add(Dropout(rate=dropout_rate))
        model.add(Dense(units=1))
        model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')

        # Fit the model
        model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)

        # Predictions and evaluate
        predictions = model.predict(X_test)
        stock_rmse[stock] = np.sqrt(mean_squared_error(y_test, predictions))

    # Calculate the average RMSE across all stocks
    return np.mean(list(stock_rmse.values()))

In [9]:
def create_augmented_data(rets, df1, df2, method, alpha, window_size=20):
    """Augment ``df1`` with the chosen mixing method and window the result.

    Args:
        rets: 1-D target values, positionally aligned with the rows of
            ``df1``; converted to a NumPy array before windowing.
        df1: Base DataFrame.
        df2: Second DataFrame handed to the mixing function.
        method: One of 'cut_mix', 'binary_mix', 'linear_mix', 'geometric_mix'
            (the old spelling 'geometrix_mix' is still accepted),
            'amplitude_mix', 'proposed_mix' or 'original'. Any other value
            falls back to the unmodified data, with a warning.
        alpha: Mixing coefficient forwarded to the mixing function.
        window_size: Number of rows per sequence.

    Returns:
        Tuple ``(X, y, df)``: windowed standardised features, their targets,
        and the (possibly augmented) DataFrame they were built from.
    """
    import warnings

    mixers = {
        'cut_mix': cut_mix,
        'binary_mix': binary_mix,
        'linear_mix': linear_mix,
        'geometric_mix': geometric_mix,
        # Misspelled key this function used to require; kept so that existing
        # callers continue to work.
        'geometrix_mix': geometric_mix,
        'amplitude_mix': amplitude_mix,
    }

    if method in mixers:
        df = mixers[method](df1, df2, alpha)
    elif method == 'proposed_mix':
        # alpha must go by keyword: the third positional parameter of
        # proposed_mixup is `threshold`, not `alpha`.
        df = proposed_mixup(df1, df2, alpha=alpha)
    else:
        if method != 'original':
            warnings.warn(
                f"Unknown augmentation method {method!r}; using the original data.",
                stacklevel=2,
            )
        df = df1.copy()

    scaled_features = StandardScaler().fit_transform(df.to_numpy())

    # Create sequences (np.asarray makes the target lookup positional even
    # when a pandas Series is passed in)
    X, y = create_sequences(scaled_features, np.asarray(rets), window_size)

    return X, y, df

In [26]:
def plot_TSNE(df1, df2):
    """Scatter-plot a joint 2-D t-SNE embedding of two frames.

    Both frames are passed through ``log(x + 1)``, stacked, embedded together,
    and then drawn as two labelled point clouds.

    Args:
        df1: Frame plotted under the label 'Original'.
        df2: Frame plotted under the label 'Augmented'.
    """
    n_original = len(df1)

    # The +1 keeps zero entries away from log(0)
    stacked = pd.concat([np.log(df1 + 1), np.log(df2 + 1)])

    embedder = TSNE(n_components=2, random_state=0, perplexity=100, n_iter=1000)
    embedding = embedder.fit_transform(stacked)

    # The first n_original rows belong to df1, the remainder to df2
    groups = (
        ('Original', embedding[:n_original, :]),
        ('Augmented', embedding[n_original:, :]),
    )

    plt.figure(figsize=(12,8))
    for label, points in groups:
        plt.scatter(points[:, 0], points[:, 1], label=label, alpha=0.8)
    plt.legend()
    plt.show()

### Pull Data from Yahoo Finance

In [27]:
# Date range forwarded to get_stock_data for every ticker
start_date = '2010-01-01'
end_date = '2023-01-01'

# Define the list of Dow Jones Industrial Average companies
tickers = [
    "MMM", "AXP", "AMGN", "AAPL", "BA", "CAT", "CVX", "CSCO", "KO", "DIS",
    "DOW", "GS", "HD", "HON", "IBM", "INTC", "JNJ", "JPM", "MCD", "MRK",
    "MSFT", "NKE", "PG", "CRM", "TRV", "UNH", "V", "WBA", "WMT"
]

# tickers = ['AAPL']
# Dictionary mapping each ticker symbol to its downloaded data
historical_data = {}

# Download every ticker in turn and store the result under its symbol
for ticker in tickers:
    stock_data = get_stock_data(ticker, start_date, end_date)
    historical_data[ticker] = stock_data

In [30]:
historical_data

In [31]:
# One column of 'Close' prices per ticker
close_prices = pd.DataFrame()

# Copy the 'Close' column of each stock into close_prices under its ticker
for ticker, data in historical_data.items():
    close_prices[ticker] = data['Close']

# Pairwise correlation between the closing-price columns
correlation_matrix = close_prices.corr()

# Plot the correlation matrix as an annotated heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix for Closing Prices')
plt.show()

In [42]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Uses 'correlation_matrix' (ticker-by-ticker closing-price correlations) and
# 'historical_data' (ticker -> downloaded frame) from the cells above.

# A pair counts as "low correlation" when its coefficient is below this value
low_correlation_threshold = 0.6

# For each stock, count how many stocks it has a low correlation with
low_correlation_counts = (correlation_matrix < low_correlation_threshold).sum(axis=1)

# Keep the stocks whose low-correlation count exceeds half the number of
# stocks in the correlation matrix
threshold_num_low_correlations = len(correlation_matrix) // 2
least_correlated_stocks = low_correlation_counts[low_correlation_counts > threshold_num_low_correlations].index

# least_correlated_stocks now holds the tickers that passed the filter

# Restrict the downloaded data to those tickers
filtered_data = {ticker: historical_data[ticker] for ticker in least_correlated_stocks}

# Closing prices of the selected tickers, one column per ticker
filtered_close_prices = pd.DataFrame({ticker: data['Close'] for ticker, data in filtered_data.items()})

# Calculate the new correlation matrix for the filtered stocks
filtered_correlation_matrix = filtered_close_prices.corr()

# Plot the new correlation matrix as a heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(filtered_correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Filtered Correlation Matrix for Least Correlated Closing Prices')
plt.show()


### Simple EDA

In [12]:
plot_correlation(historical_data['AAPL'])

In [13]:
#     plot_correlation(historical_data['MSFT'])

### Feature Engineering - needed because our current features are all nearly identical

In [14]:
# Ticker -> engineered feature frame (output of engineer_features)
historical_data_augmented = {}

# Run engineer_features on every downloaded frame and store the result under
# the same ticker key
for key, value in historical_data.items():
    df = historical_data[key]
    new_df = engineer_features(df)
    historical_data_augmented[key] = new_df

In [15]:
plot_correlation(historical_data_augmented['AAPL'])

In [16]:
# plot_correlation(historical_data_augmented['MSFT'])

## Original

In [17]:
def apply_lowess_smoothing(df, frac=0.1):
    """Smooth every column of ``df`` with ``lowess`` against the frame's index.

    Args:
        df: DataFrame whose columns are smoothed one at a time.
        frac: Forwarded to ``lowess`` as ``frac``.

    Returns:
        A new DataFrame with the same index and columns holding the values
        returned by ``lowess`` (called with ``return_sorted=False``).
    """
    result = pd.DataFrame(index=df.index)

    # One lowess call per column, written back under the same column name
    for name in df.columns:
        result[name] = lowess(df[name], df.index, frac=frac, return_sorted=False)

    return result

In [18]:
from statsmodels.nonparametric.smoothers_lowess import lowess

In [19]:
# Best hyperparameters: the fixed set passed to run_model in the cells below
# (presumably taken from an earlier Optuna study - confirm)
best_params = {
    'n_layers': 1, 
    'lstm_units': 50, 
    'dropout_rate': 0.5, 
    'learning_rate': 0.001, 
    'batch_size': 128,
    'epochs': 50
}

def run_model(historical_data_augmented, best_params, method):
    """Train and evaluate one model per stock with the given augmentation.

    For every stock the frame is cleaned of NaN rows, optionally mixed with a
    LOWESS-smoothed copy of itself, windowed, split 70/30, and fed to a model
    from ``create_model``. The test RMSE is recorded and a t-SNE plot of the
    original vs. augmented frame is shown.

    Reads the module-level ``alpha`` and ``seq_len``.

    Args:
        historical_data_augmented: Dict mapping stock name to a DataFrame
            that contains a 'Close' column.
        best_params: Hyperparameter dict; 'epochs' and 'batch_size' are read
            here and the whole dict is forwarded to ``create_model``.
        method: Augmentation name forwarded to ``create_augmented_data``;
            'original' skips the smoothing step.

    Returns:
        Tuple ``(stock_rmse, df1, stock_augmented)``: RMSE per stock, the
        cleaned frame of the last stock processed (``None`` for an empty
        input dict), and the augmented frame per stock.
    """
    stock_rmse = {}
    stock_augmented = {}
    df1 = None

    for stock, df in historical_data_augmented.items():
        df1 = df.copy(deep=True).dropna()
        print(f'Processing stock: {stock}')

        if method == 'original':
            df_to_augment = df1
        else:
            # Smooth the cleaned frame so both mixing inputs have the same
            # rows and no NaN reaches lowess.
            df_to_augment = apply_lowess_smoothing(df1)

        # Keep the leading NaN so rets[j] is the return realised on row j of
        # df1. A window over rows i..i+seq_len-1 is then paired with the
        # return of row i+seq_len (the next day); dropping the NaN would
        # shift every target one day later. The NaN itself is never a target.
        rets = df1['Close'].pct_change().to_numpy()

        X, y, df_augmented = create_augmented_data(rets, df1, df_to_augment, method, alpha, seq_len)

        # Train Test Split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

        # Model creation
        input_shape = (X_train.shape[1], X_train.shape[2])
        model = create_model(best_params, input_shape)

        # Fit the model
        model.fit(X_train, y_train, epochs=best_params['epochs'], batch_size=best_params['batch_size'], verbose=0)

        # Predictions
        predictions = model.predict(X_test)

        # Compute RMSE for the current stock
        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        stock_rmse[stock] = rmse
        stock_augmented[stock] = df_augmented
        print(f'Stock: {stock}, RMSE: {rmse}')

        # Plotting t-SNE
        plot_TSNE(df1, df_augmented)

    # The return used to sit inside the loop, so only the first stock was
    # ever processed.
    return stock_rmse, df1, stock_augmented

In [20]:
rmse, df, df_to_augment = run_model(historical_data_augmented, best_params, 'cut_mix')

In [21]:
rmse, df, df_to_augment = run_model(historical_data, best_params, 'cut_mix')

In [22]:
df

In [23]:
df_to_augment

In [24]:
plot_TSNE(df, df_to_augment['AAPL'])

In [25]:
# study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=42))  # Set seed for reproducibility
# study.optimize(objective, n_trials=50)

# # Print the best hyperparameters
# print('Number of finished trials:', len(study.trials))
# print('Best trial:', study.best_trial.params)