### Imports

In [1]:
# Remove unwanted warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Data extraction and management
import polars as pl
import numpy as np
import yfinance as yf

# Feature Engineering
from sklearn.preprocessing import StandardScaler

# Machine Learning
from sklearn.cluster import KMeans
from sklearn import metrics
from kneed import KneeLocator

# Cointegration and Statistics
from statsmodels.tsa.stattools import coint
import statsmodels.api as sm

# Reporting visualization
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline

## Data Extraction 

In [2]:
# Set Data Extraction parameters for trading with Polars
start_date = "2019-01-01"  # Start date for historical data
end_date = "2024-06-01"    # End date for historical data (adjust as needed)

# File paths for storing and retrieving data
file_name = "data/raw_data_etf.csv"  # Path for ETF data
file_name_coint = "data/raw_data_coint_pairs.csv"  # Path for cointegrated pairs data

# Flags for data loading
load_existing = True     # Set to True to load existing ETF data from file
load_coint_pairs = False   # Set to True to load existing cointegrated pairs data

# Note: When using Polars for trading:
# - Ensure date parsing is handled correctly when reading/writing CSV files
# - Consider using Polars' memory-efficient data types for large datasets
# - Leverage Polars' fast I/O operations for reading/writing data
# - Utilize Polars' vectorized operations for quick data manipulations

### Get Symbols

In [3]:
import yfinance as yf  # Import the yfinance library for financial data
import polars as pl    # Import the Polars library for DataFrame operations
import pandas as pd    # Import the pandas library for data manipulation

# Variable to determine whether to load existing data or fetch new data
load_existing = True

# Function to fetch NASDAQ symbols
def get_nasdaq_symbols():
    # URL of the NASDAQ listed symbols file
    url = 'ftp://ftp.nasdaqtrader.com/SymbolDirectory/nasdaqlisted.txt'
    # Read the symbols data from the URL using pandas
    symbols = pd.read_csv(url, sep='|')
    # Drop rows with any missing values
    symbols = symbols.dropna()
    # Set the 'Symbol' column as the index of the DataFrame
    symbols.set_index('Symbol', inplace=True)
    # Return the DataFrame containing the symbols
    return symbols

# Function to load existing symbols from a file
def load_existing_symbols(file_path):
    # Read the symbols from a CSV file
    existing_symbols = pd.read_csv(file_path)
    # Set the 'Symbol' column as the index of the DataFrame
    existing_symbols.set_index('Symbol', inplace=True)
    # Return the DataFrame containing the existing symbols
    return existing_symbols

# Path to the file where existing symbols are stored
existing_symbols_file = 'data/existing_symbols.csv'

# Check if we should load existing data or fetch new data
if load_existing:
    # Load existing symbols from the file
    existing_symbols = load_existing_symbols(existing_symbols_file)
else:
    # Fetch the NASDAQ symbols using the defined function
    symbols = get_nasdaq_symbols()
    # Filter the symbols to include only those that are ETFs (ETF == 'Y') and belong to the NASDAQ Global Market (Market Category == 'G')
    symbols = symbols[(symbols['ETF'] == 'Y') & (symbols['Market Category'] == 'G')]
    # Convert the filtered symbols DataFrame index to a list of symbol strings
    new_symbols = list(symbols.index.values)

    # Compare new symbols with existing symbols (if they exist)
    try:
        existing_symbols = load_existing_symbols(existing_symbols_file)
        existing_symbols_list = list(existing_symbols.index.values)

        if new_symbols != existing_symbols_list:
            # If there are new or updated symbols, replace the existing symbols
            symbols.to_csv(existing_symbols_file)
            print("Symbols updated.")
        else:
            print("No new symbols.")
    except FileNotFoundError:
        # If the existing symbols file does not exist, save the new symbols
        symbols.to_csv(existing_symbols_file)
        print("Symbols saved.")

    # Print the list of new symbols
    print(new_symbols)


Symbols saved.
['AADR', 'AAPB', 'AAPD', 'AAPU', 'AAXJ', 'ABCS', 'ACWI', 'ACWX', 'AGMI', 'AGNG', 'AGZD', 'AIA', 'AIPI', 'AIQ', 'AIRL', 'AIRR', 'ALTY', 'AMDL', 'AMDS', 'AMID', 'AMZD', 'AMZU', 'AMZZ', 'ANGL', 'AOTG', 'AQWA', 'ARVR', 'ASET', 'AUMI', 'AVXC', 'BABX', 'BBH', 'BDGS', 'BEEZ', 'BELT', 'BGRN', 'BGRO', 'BIB', 'BIS', 'BITS', 'BJK', 'BKCH', 'BKIV', 'BKWO', 'BLCN', 'BLCR', 'BLLD', 'BMDL', 'BND', 'BNDW', 'BNDX', 'BOTT', 'BOTZ', 'BRHY', 'BRNY', 'BRRR', 'BRTR', 'BSCO', 'BSCP', 'BSCQ', 'BSCR', 'BSCS', 'BSCT', 'BSCU', 'BSCV', 'BSCW', 'BSCX', 'BSCY', 'BSJO', 'BSJP', 'BSJQ', 'BSJR', 'BSJS', 'BSJT', 'BSJU', 'BSJV', 'BSJW', 'BSMO', 'BSMP', 'BSMQ', 'BSMR', 'BSMS', 'BSMT', 'BSMU', 'BSMV', 'BSMW', 'BSSX', 'BSVO', 'BTEC', 'BTF', 'BTFX', 'BUFC', 'BUG', 'BULD', 'CA', 'CAFG', 'CALY', 'CANC', 'CANQ', 'CARZ', 'CATH', 'CCSB', 'CCSO', 'CDC', 'CDL', 'CEFA', 'CFA', 'CFO', 'CHPS', 'CIBR', 'CID', 'CIL', 'CIZ', 'CLOA', 'CLOD', 'CLOU', 'CLSM', 'CNCR', 'COMT', 'CONL', 'COPJ', 'COPP', 'COWG', 'COWS', 'CPLS', 'C

In [4]:
print(len(symbols))

644


In [5]:
import yfinance as yf  
import pandas as pd   

# Ensure symbols is a list of strings, somehow errored because it was not in string format
if isinstance(symbols, pd.DataFrame):
    symbols = symbols.index.tolist()

# Fetch adjusted close prices for the symbols from Yahoo Finance
data = yf.download(symbols, start=start_date, end=end_date)['Adj Close']

# Save the data to a CSV file
data.to_csv(file_name)

print(f"Data saved to {file_name}")


[*********************100%%**********************]  644 of 644 completed


19 Failed downloads:
['AIPI', 'TMET', 'BSCY', 'QBUF', 'BRHY', 'BGRO', 'METU', 'METD', 'BELT', 'BSJW', 'FCTE', 'IBTP', 'BMDL', 'EVSD', 'QXQ', 'IBGK', 'IBGA', 'GLOW', 'QQQT']: Exception("%ticker%: Data doesn't exist for startDate = 1546318800, endDate = 1717214400")



Data saved to data/raw_data_etf.csv


In [31]:
import polars as pl

# Load (or re-load for consistency) Data and remove features with NaN's
data = pl.read_csv(file_name)

# Filter columns that have all null values
data = data.filter(~pl.all_horizontal(pl.all().is_null()))

# Replace null values with zero for each column explicitly
data = data.with_columns(
    [pl.col(column).fill_null(0) for column in data.columns]
)

# Print the shape of the dataset
print("Shape:", data.shape)

# Check for any null values and print a boolean
has_null_values = data.null_count().sum_horizontal().sum() > 0
print("Null Values:", has_null_values)

# Display the first 50 rows to verify
print(data.head(50))


Shape: (1363, 645)
Null Values: False
shape: (50, 645)
┌─────────────────────┬───────────┬──────┬──────┬───┬──────┬───────────┬───────────┬─────┐
│ Date                ┆ AADR      ┆ AAPB ┆ AAPD ┆ … ┆ XFIX ┆ XT        ┆ YLDE      ┆ ZZZ │
│ ---                 ┆ ---       ┆ ---  ┆ ---  ┆   ┆ ---  ┆ ---       ┆ ---       ┆ --- │
│ str                 ┆ f64       ┆ str  ┆ str  ┆   ┆ str  ┆ f64       ┆ f64       ┆ str │
╞═════════════════════╪═══════════╪══════╪══════╪═══╪══════╪═══════════╪═══════════╪═════╡
│ 2019-01-02 00:00:00 ┆ 37.611729 ┆ 0    ┆ 0    ┆ … ┆ 0    ┆ 31.699692 ┆ 23.761585 ┆ 0   │
│ 2019-01-03 00:00:00 ┆ 37.192055 ┆ 0    ┆ 0    ┆ … ┆ 0    ┆ 30.861727 ┆ 23.422525 ┆ 0   │
│ 2019-01-04 00:00:00 ┆ 38.637589 ┆ 0    ┆ 0    ┆ … ┆ 0    ┆ 31.918703 ┆ 23.999844 ┆ 0   │
│ 2019-01-07 00:00:00 ┆ 39.094551 ┆ 0    ┆ 0    ┆ … ┆ 0    ┆ 32.404335 ┆ 24.228937 ┆ 0   │
│ 2019-01-08 00:00:00 ┆ 39.364998 ┆ 0    ┆ 0    ┆ … ┆ 0    ┆ 32.58527  ┆ 24.250925 ┆ 0   │
│ …                   ┆ …         ┆