# **Data Pipeline**

In [1]:
import yfinance as yf
from datetime import datetime, timedelta

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

"""
Claim: Develop some system that when backtracking produces substantial Alpha by exploiting hidden market trends
    Method: We will be conducting Sector Research, and we will attempt to generate some level of predicted alpha, or even some trend prediction
"""

'\nClaim: Develop some system that when backtracking produces substantial Alpha by exploiting hidden market trends\n    Method: We will be conducting Sector Research, and we will attempt to generate some level of predicted alpha, or even some trend prediction\n'

In [2]:
# Sector Conversion Map
sector_mapping = {
        'XLK': 'Technology',
        'XLF': 'Financials', 
        'XLE': 'Energy',
        'XLV': 'Health Care',
        'XLI': 'Industrials',
        'XLP': 'Consumer Staples',
        'XLY': 'Consumer Discretionary',
        'XLU': 'Utilities',
        'XLB': 'Materials',
        'XLRE': 'Real Estate',
        'XLC': 'Communication Services'
    }

node_mapping = {}

for i, key in enumerate(sector_mapping.keys()):
    node_mapping[i] = key

print(node_mapping)

{0: 'XLK', 1: 'XLF', 2: 'XLE', 3: 'XLV', 4: 'XLI', 5: 'XLP', 6: 'XLY', 7: 'XLU', 8: 'XLB', 9: 'XLRE', 10: 'XLC'}


In [3]:
def download_sector_data(sector_mapping) -> pd.DataFrame:

    start_date = (datetime.now() - timedelta(days=5 * 365)).strftime('%Y-%m-%d')
    end_date = datetime.now().strftime('%Y-%m-%d')

    print(f"Downloading Sector-Based Data...")
    sector_data = yf.download(tickers=list(sector_mapping.keys()), start=start_date, end=end_date)
    print(f"Downloaded yfinance sector data for =:\n-> Keys: {list(sector_mapping.keys())}\n-> Values: list{sector_mapping.values()}")

    print(f"Flattening Column Labels to Remove Tuple-Based Column Structure... & Shifting Date into Column")
    sector_data.columns = [f"{sector}_{metric}" for metric, sector in sector_data.columns]
    sector_data = sector_data.reset_index()

    print(f"* Done Preprocessing and Organizing Data *")
    return sector_data

sector_data = download_sector_data(sector_mapping=sector_mapping)
sector_data.head()

Downloading Sector-Based Data...


  sector_data = yf.download(tickers=list(sector_mapping.keys()), start=start_date, end=end_date)
[*********************100%***********************]  11 of 11 completed

Downloaded yfinance sector data for =:
-> Keys: ['XLK', 'XLF', 'XLE', 'XLV', 'XLI', 'XLP', 'XLY', 'XLU', 'XLB', 'XLRE', 'XLC']
-> Values: listdict_values(['Technology', 'Financials', 'Energy', 'Health Care', 'Industrials', 'Consumer Staples', 'Consumer Discretionary', 'Utilities', 'Materials', 'Real Estate', 'Communication Services'])
Flattening Column Labels to Remove Tuple-Based Column Structure... & Shifting Date into Column
* Done Preprocessing and Organizing Data *





Unnamed: 0,Date,XLB_Close,XLC_Close,XLE_Close,XLF_Close,XLI_Close,XLK_Close,XLP_Close,XLRE_Close,XLU_Close,...,XLC_Volume,XLE_Volume,XLF_Volume,XLI_Volume,XLK_Volume,XLP_Volume,XLRE_Volume,XLU_Volume,XLV_Volume,XLY_Volume
0,2020-07-16,54.920506,54.941647,30.416573,22.060677,66.268852,102.061813,53.747543,29.181149,50.420036,...,2216700,18822800,74678500,13017100,8590300,7770200,3339300,15063200,6412200,3024700
1,2020-07-17,55.382324,54.81752,29.991512,21.877981,66.667023,102.561005,54.037067,29.58667,51.558929,...,1928900,17716900,47033200,8605200,6355200,6321700,8696300,22950800,7548600,3094600
2,2020-07-20,54.902401,55.476357,29.492876,21.777494,65.852211,105.181931,53.493111,29.307875,50.882427,...,2205300,20076500,40127600,9404600,8509200,6818600,2792300,12127600,6776300,2735800
3,2020-07-21,55.382324,55.209,31.274883,22.206837,66.722588,104.125885,53.958111,29.31633,51.113644,...,2228100,36170200,46491300,9344300,9397400,7847300,5930700,15108000,7458600,2657900
4,2020-07-22,56.025253,55.266296,30.890682,22.206837,67.259598,104.961121,54.388023,29.671158,51.901459,...,1528700,19621700,44437400,7620200,7514000,7022700,21831500,14847100,5648400,2660500


## **Feature Engineering**

In [4]:
# Create a list of Different Dataframes
def get_sector_data_separated(sector_mapping, original_data, original_columns) -> dict:
    sector_separated_data = {}

    for sector in sector_mapping.keys():
        sector_separated_data[sector] = original_data[[f"{sector}_{metric[metric.index("_") + 1:]}" for metric in original_columns if sector in metric]].copy()
        sector_data = sector_separated_data[sector] # Dynamic State any change conserved
        new_columns = [old[old.index("_") + 1:] for old in sector_data.columns if "_" in old]
        sector_data.columns = new_columns
        sector_data["Date"] = original_data["Date"].values
        sector_data["Sector"] = sector
        sector_data["Liquidity"] = sector_data["Volume"] * sector_data["Close"]

        sector_data["IntraDay_Return($)"] = sector_data["Close"] - sector_data["Open"]
        sector_data["InterDay_Return($)"] = sector_data["Close"] - sector_data["Close"].shift(1)

        sector_data["IntraDay_Return(%)"] = sector_data["IntraDay_Return($)"] / sector_data["Open"]
        sector_data["InterDay_Return(%)"] = sector_data["InterDay_Return($)"] / sector_data["Close"].shift(1)

        for bound in ["Intra", "Inter"]:
            sector_data[f"{bound}_Gain"] = np.max(sector_data["IntraDay_Return($)"], 0)
            sector_data[f"{bound}_Loss"] = np.max(-1 * sector_data["IntraDay_Return($)"], 0)
            sector_data[f"Log_{bound}_Return(%)"] = np.log(sector_data[f"{bound}Day_Return(%)"])
    return sector_separated_data

In [5]:
sector_separated_data = get_sector_data_separated(sector_mapping=sector_mapping, original_data=sector_data, original_columns=sector_data.columns)
print(f"Separated Sector Data Keys: {list(sector_separated_data.keys())}")

Separated Sector Data Keys: ['XLK', 'XLF', 'XLE', 'XLV', 'XLI', 'XLP', 'XLY', 'XLU', 'XLB', 'XLRE', 'XLC']


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **k

### Momentum

In [6]:
def calculate_sector_momentums(sector_separated_data, momentums) -> dict:
    for sector in sector_separated_data.keys():
        sector_data = sector_separated_data[sector] # Not using copy so it remains in a dynamic state

        # Calcuting the separate momentums for specific "sector"
        for momentum in momentums:
            sector_data[f"Opening_Momentum_{momentum}Days"] = sector_data["Open"] - sector_data["Open"].shift(momentum)
            sector_data[f"Closing_Momentum_{momentum}Days"] = sector_data["Close"] - sector_data["Close"].shift(momentum)
            sector_data[f"High_Momentum_{momentum}Days"] = sector_data["High"] - sector_data["High"].shift(momentum)
            sector_data[f"Low_Momentum_{momentum}Days"] = sector_data["Low"] - sector_data["Low"].shift(momentum)
            sector_data[f"Volume_Momentum_{momentum}Days"] = sector_data["Volume"] - sector_data["Volume"].shift(momentum)
            
    return sector_separated_data

In [None]:
momentums = [5, 10, 20, 30]
sector_separated_data = calculate_sector_momentums(sector_separated_data=sector_separated_data, momentums=momentums)

### Moving Averages

In [8]:
def calcualate_sector_moving_averages(sector_separated_data, moving_averages) -> dict:
    for sector in sector_separated_data.keys():
        sector_data = sector_separated_data[sector] # Not using copy so it remains in a dynamic state

        # Calcuting the separate momentums for specific "sector"
        for ma in moving_averages:
            sector_data[f"Opening_MovingAverage_{ma}Days"] = sector_data["Open"].rolling(window=ma).mean()
            sector_data[f"Closing_MovingAverage_{ma}Days"] = sector_data["Close"].rolling(window=ma).mean()
            sector_data[f"High_MovingAverage_{ma}Days"] = sector_data["High"].rolling(window=ma).mean()
            sector_data[f"Low_MovingAverage_{ma}Days"] = sector_data["Low"].rolling(window=ma).mean()
            sector_data[f"Volume_MovingAverage_{ma}Days"] = sector_data["Volume"].rolling(window=ma).mean()
            
        for bound in ["Intra", "Inter"]:
            sector_data[f"{bound}_Gain_MovingAverage_{14}Days"] = sector_data[f"{bound}_Gain"].rolling(window=14).mean()
            sector_data[f"{bound}_Loss_MovingAverage_{14}Days"] = sector_data[f"{bound}_Loss"].rolling(window=14).mean()
                
    return sector_separated_data

In [9]:
moving_averages = [5, 10, 20, 30]
sector_separated_data = calcualate_sector_moving_averages(sector_separated_data=sector_separated_data, moving_averages=moving_averages)

In [10]:
print(f"Current Features: {[i for i in sector_separated_data["XLB"].columns]}")

Current Features: ['Close', 'High', 'Low', 'Open', 'Volume', 'Date', 'Sector', 'Liquidity', 'IntraDay_Return($)', 'InterDay_Return($)', 'IntraDay_Return(%)', 'InterDay_Return(%)', 'Intra_Gain', 'Intra_Loss', 'Log_Intra_Return(%)', 'Inter_Gain', 'Inter_Loss', 'Log_Inter_Return(%)', 'Opening_Momentum_5Days', 'Closing_Momentum_5Days', 'High_Momentum_5Days', 'Low_Momentum_5Days', 'Volume_Momentum_5Days', 'Opening_Momentum_10Days', 'Closing_Momentum_10Days', 'High_Momentum_10Days', 'Low_Momentum_10Days', 'Volume_Momentum_10Days', 'Opening_Momentum_20Days', 'Closing_Momentum_20Days', 'High_Momentum_20Days', 'Low_Momentum_20Days', 'Volume_Momentum_20Days', 'Opening_Momentum_30Days', 'Closing_Momentum_30Days', 'High_Momentum_30Days', 'Low_Momentum_30Days', 'Volume_Momentum_30Days', 'Opening_MovingAverage_5Days', 'Closing_MovingAverage_5Days', 'High_MovingAverage_5Days', 'Low_MovingAverage_5Days', 'Volume_MovingAverage_5Days', 'Opening_MovingAverage_10Days', 'Closing_MovingAverage_10Days', 'Hig

### Relative Strength

In [11]:
def calculate_sector_relative_strengths(sector_separated_data, rsi_periods=[14]) -> dict:
    for sector in sector_separated_data.keys():
        sector_data = sector_separated_data[sector] # Not using copy so it remains in a dynamic state

        # Calcuting the separate momentums for specific "sector"
        for prd in rsi_periods:
            for bound in ["Intra", "Inter"]:    
                sector_data[f"{bound}_RSI_{prd}Days"] = 100 - (100 / (1 + (sector_data[f"{bound}_Gain_MovingAverage_{prd}Days"] / sector_data[f"{bound}_Loss_MovingAverage_{prd}Days"])))

    return sector_separated_data

In [12]:
rsi_periods = [14]
sector_separated_data = calculate_sector_relative_strengths(sector_separated_data=sector_separated_data, rsi_periods=rsi_periods)

In [13]:
print(f"Number of Features: {len(sector_separated_data["XLB"].columns)}")

Number of Features: 64


### Volatility

In [14]:
def calculate_sector_realized_volatility(sector_separated_data, periods) -> dict:
    """
    Inner Volatility DURING the day -> Intra Only
    """
    for sector in sector_separated_data.keys():
        sector_data = sector_separated_data[sector]

        for prd in periods:
            sector_data[f"RV_{prd}Days"] = (
                sector_data["IntraDay_Return(%)"]
                .rolling(window=prd)
                .apply(lambda x: np.sqrt(np.sum(x**2)), raw=True)
            )
            sector_data[f"RV_{prd}Days_1DayDelta(%)"] = (sector_data[f"RV_{prd}Days"] - sector_data[f"RV_{prd}Days"].shift(1)) / sector_data[f"RV_{prd}Days"].shift(1)

    return sector_separated_data

In [15]:
realized_volatility_periods = [5, 10, 20, 30]
sector_separated_data = calculate_sector_realized_volatility(sector_separated_data=sector_separated_data, periods=realized_volatility_periods)

In [16]:
def remove_nans(sector_separated_data):
    for sector in sector_separated_data.keys():
        sector_data = sector_separated_data[sector]
        sector_data = sector_data.dropna()

    return sector_separated_data

In [17]:
def define_target(sector_separated_data):
    for sector in sector_separated_data.keys():
        sector_data = sector_separated_data[sector]
        sector_data["Target"] = sector_data["RV_5Days_1DayDelta(%)"].shift(-1)

    return sector_separated_data

In [18]:
sector_separated_data = remove_nans(sector_separated_data)
sector_separated_data = define_target(sector_separated_data)

In [19]:
print(f"Length Data: {len(sector_separated_data["XLB"]) * len(sector_separated_data.keys())}")
print(f"# of Features: {len(sector_separated_data["XLB"].columns)}")

print(f"Features:")
for idx, i in enumerate(sector_separated_data["XLB"].columns):
    if idx % 6 != 0:
        print(f"{i}", end="|")
    else:
        print(f"{i}\n")

Length Data: 13794
# of Features: 73
Features:
Close

High|Low|Open|Volume|Date|Sector

Liquidity|IntraDay_Return($)|InterDay_Return($)|IntraDay_Return(%)|InterDay_Return(%)|Intra_Gain

Intra_Loss|Log_Intra_Return(%)|Inter_Gain|Inter_Loss|Log_Inter_Return(%)|Opening_Momentum_5Days

Closing_Momentum_5Days|High_Momentum_5Days|Low_Momentum_5Days|Volume_Momentum_5Days|Opening_Momentum_10Days|Closing_Momentum_10Days

High_Momentum_10Days|Low_Momentum_10Days|Volume_Momentum_10Days|Opening_Momentum_20Days|Closing_Momentum_20Days|High_Momentum_20Days

Low_Momentum_20Days|Volume_Momentum_20Days|Opening_Momentum_30Days|Closing_Momentum_30Days|High_Momentum_30Days|Low_Momentum_30Days

Volume_Momentum_30Days|Opening_MovingAverage_5Days|Closing_MovingAverage_5Days|High_MovingAverage_5Days|Low_MovingAverage_5Days|Volume_MovingAverage_5Days

Opening_MovingAverage_10Days|Closing_MovingAverage_10Days|High_MovingAverage_10Days|Low_MovingAverage_10Days|Volume_MovingAverage_10Days|Opening_MovingAverage_20