### Imports

In [1]:
# Remove unwanted warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Data extraction and management
import polars as pl
import numpy as np
import yfinance as yf

# Feature Engineering
from sklearn.preprocessing import StandardScaler

# Machine Learning
from sklearn.cluster import KMeans
from sklearn import metrics
from kneed import KneeLocator

# Cointegration and Statistics
from statsmodels.tsa.stattools import coint
import statsmodels.api as sm

# Reporting visualization
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline

## Data Extraction 

In [2]:
# Set Data Extraction parameters for trading with Polars
start_date = "2019-01-01"  # Start date for historical data
end_date = "2024-06-01"    # End date for historical data (adjust as needed)

# File paths for storing and retrieving data
file_name = "data/raw_data_etf.csv"  # Path for ETF data
file_name_coint = "data/raw_data_coint_pairs.csv"  # Path for cointegrated pairs data

# Flags for data loading
load_existing = True     # Set to True to load existing ETF data from file
load_coint_pairs = False   # Set to True to load existing cointegrated pairs data

# Note: When using Polars for trading:
# - Ensure date parsing is handled correctly when reading/writing CSV files
# - Consider using Polars' memory-efficient data types for large datasets
# - Leverage Polars' fast I/O operations for reading/writing data
# - Utilize Polars' vectorized operations for quick data manipulations

### Get Symbols

In [3]:
import yfinance as yf  # Import the yfinance library for financial data
import polars as pl    # Import the Polars library for DataFrame operations
import pandas as pd    # Import the pandas library for data manipulation

# Variable to determine whether to load existing data or fetch new data
load_existing = True

# Function to fetch NASDAQ symbols
def get_nasdaq_symbols():
    # URL of the NASDAQ listed symbols file
    url = 'ftp://ftp.nasdaqtrader.com/SymbolDirectory/nasdaqlisted.txt'
    # Read the symbols data from the URL using pandas
    symbols = pd.read_csv(url, sep='|')
    # Drop rows with any missing values
    symbols = symbols.dropna()
    # Set the 'Symbol' column as the index of the DataFrame
    symbols.set_index('Symbol', inplace=True)
    # Return the DataFrame containing the symbols
    return symbols

# Function to load existing symbols from a file
def load_existing_symbols(file_path):
    # Read the symbols from a CSV file
    existing_symbols = pd.read_csv(file_path)
    # Set the 'Symbol' column as the index of the DataFrame
    existing_symbols.set_index('Symbol', inplace=True)
    # Return the DataFrame containing the existing symbols
    return existing_symbols

# Path to the file where existing symbols are stored
existing_symbols_file = 'data/existing_symbols.csv'

# Check if we should load existing data or fetch new data
if load_existing:
    # Load existing symbols from the file
    existing_symbols = load_existing_symbols(existing_symbols_file)
else:
    # Fetch the NASDAQ symbols using the defined function
    symbols = get_nasdaq_symbols()
    # Filter the symbols to include only those that are ETFs (ETF == 'Y') and belong to the NASDAQ Global Market (Market Category == 'G')
    symbols = symbols[(symbols['ETF'] == 'Y') & (symbols['Market Category'] == 'G')]
    # Convert the filtered symbols DataFrame index to a list of symbol strings
    new_symbols = list(symbols.index.values)

    # Compare new symbols with existing symbols (if they exist)
    try:
        existing_symbols = load_existing_symbols(existing_symbols_file)
        existing_symbols_list = list(existing_symbols.index.values)

        if new_symbols != existing_symbols_list:
            # If there are new or updated symbols, replace the existing symbols
            symbols.to_csv(existing_symbols_file)
            print("Symbols updated.")
        else:
            print("No new symbols.")
    except FileNotFoundError:
        # If the existing symbols file does not exist, save the new symbols
        symbols.to_csv(existing_symbols_file)
        print("Symbols saved.")

    # Print the list of new symbols
    print(new_symbols)


Symbols saved.
['AADR', 'AAPB', 'AAPD', 'AAPU', 'AAXJ', 'ABCS', 'ACWI', 'ACWX', 'AGMI', 'AGNG', 'AGZD', 'AIA', 'AIPI', 'AIQ', 'AIRL', 'AIRR', 'ALTY', 'AMDL', 'AMDS', 'AMID', 'AMZD', 'AMZU', 'AMZZ', 'ANGL', 'AOTG', 'AQWA', 'ARVR', 'ASET', 'AUMI', 'AVXC', 'BABX', 'BBH', 'BDGS', 'BEEZ', 'BELT', 'BGRN', 'BGRO', 'BIB', 'BIS', 'BITS', 'BJK', 'BKCH', 'BKIV', 'BKWO', 'BLCN', 'BLCR', 'BLLD', 'BMDL', 'BND', 'BNDW', 'BNDX', 'BOTT', 'BOTZ', 'BRHY', 'BRNY', 'BRRR', 'BRTR', 'BSCO', 'BSCP', 'BSCQ', 'BSCR', 'BSCS', 'BSCT', 'BSCU', 'BSCV', 'BSCW', 'BSCX', 'BSCY', 'BSJO', 'BSJP', 'BSJQ', 'BSJR', 'BSJS', 'BSJT', 'BSJU', 'BSJV', 'BSJW', 'BSMO', 'BSMP', 'BSMQ', 'BSMR', 'BSMS', 'BSMT', 'BSMU', 'BSMV', 'BSMW', 'BSSX', 'BSVO', 'BTEC', 'BTF', 'BTFX', 'BUFC', 'BUG', 'BULD', 'CA', 'CAFG', 'CALY', 'CANC', 'CANQ', 'CARZ', 'CATH', 'CCSB', 'CCSO', 'CDC', 'CDL', 'CEFA', 'CFA', 'CFO', 'CHPS', 'CIBR', 'CID', 'CIL', 'CIZ', 'CLOA', 'CLOD', 'CLOU', 'CLSM', 'CNCR', 'COMT', 'CONL', 'COPJ', 'COPP', 'COWG', 'COWS', 'CPLS', 'C

In [4]:
print(len(symbols))

644


In [5]:
import yfinance as yf  
import pandas as pd   

# Ensure symbols is a list of strings, somehow errored because it was not in string format
if isinstance(symbols, pd.DataFrame):
    symbols = symbols.index.tolist()

# Fetch adjusted close prices for the symbols from Yahoo Finance
data = yf.download(symbols, start=start_date, end=end_date)['Adj Close']

# Save the data to a CSV file
data.to_csv(file_name)

print(f"Data saved to {file_name}")


[*********************100%%**********************]  644 of 644 completed


19 Failed downloads:
['AIPI', 'TMET', 'BSCY', 'QBUF', 'BRHY', 'BGRO', 'METU', 'METD', 'BELT', 'BSJW', 'FCTE', 'IBTP', 'BMDL', 'EVSD', 'QXQ', 'IBGK', 'IBGA', 'GLOW', 'QQQT']: Exception("%ticker%: Data doesn't exist for startDate = 1546318800, endDate = 1717214400")



Data saved to data/raw_data_etf.csv


In [33]:
import polars as pl

# Load (or re-load for consistency) Data and remove features with NaN's
data = pl.read_csv(file_name)

# Filter columns that have all null values
data = data.filter(~pl.all_horizontal(pl.all().is_null()))

# Replace null values with zero for each column explicitly
data = data.with_columns(
    [pl.col(column).fill_null(0) for column in data.columns]
)

# Print the shape of the dataset
print("Shape:", data.shape)

# Check for any null values and print a boolean
has_null_values = data.null_count().sum_horizontal().sum() > 0
print("Null Values:", has_null_values)

# Display the first 50 rows to verify
data


Shape: (1363, 645)
Null Values: False


Date,AADR,AAPB,AAPD,AAPU,AAXJ,ABCS,ACWI,ACWX,AGMI,AGNG,AGZD,AIA,AIPI,AIQ,AIRL,AIRR,ALTY,AMDL,AMDS,AMID,AMZD,AMZU,AMZZ,ANGL,AOTG,AQWA,ARVR,ASET,AUMI,AVXC,BABX,BBH,BDGS,BEEZ,BELT,BGRN,…,VIGI,VMBS,VMOT,VNQI,VONE,VONG,VONV,VPLS,VRIG,VSDA,VSMV,VTC,VTHR,VTIP,VTWG,VTWO,VTWV,VWOB,VXUS,VYMI,WABF,WBND,WCBR,WCLD,WEEI,WGMI,WINC,WISE,WNDY,WOOD,WRND,WTBN,XBIL,XFIX,XT,YLDE,ZZZ
str,f64,str,str,str,f64,str,f64,f64,str,f64,f64,f64,str,f64,str,f64,f64,str,str,str,str,str,str,f64,str,str,str,f64,str,str,str,f64,str,str,str,f64,…,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,str,str,str,str,f64,str,str,f64,str,str,str,str,f64,f64,str
"""2019-01-02 00:00:00""",37.611729,"""0""","""0""","""0""",57.365273,"""0""",57.852909,35.929523,"""0""",18.078403,18.813509,49.29137,"""0""",12.734489,"""0""",21.639997,8.677715,"""0""","""0""","""0""","""0""","""0""","""0""",20.298042,"""0""","""0""","""0""",22.040138,"""0""","""0""","""0""",109.191833,"""0""","""0""","""0""",43.900219,…,49.100071,45.093952,22.233305,42.919491,104.99736,31.997368,43.003792,"""0""",20.791592,25.126347,23.901342,67.175781,104.828293,39.93549,117.246986,50.260658,84.56337,57.170979,40.046909,44.070271,"""0""",70.805153,"""0""","""0""","""0""","""0""",0.0,"""0""","""0""",51.86504,"""0""","""0""","""0""","""0""",31.699692,23.761585,"""0"""
"""2019-01-03 00:00:00""",37.192055,"""0""","""0""","""0""",56.01046,"""0""",56.869751,35.594364,"""0""",18.043737,18.773779,47.861725,"""0""",12.401689,"""0""",21.451567,8.691301,"""0""","""0""","""0""","""0""","""0""","""0""",20.320822,"""0""","""0""","""0""",21.988029,"""0""","""0""","""0""",109.97773,"""0""","""0""","""0""",43.900219,…,48.399132,45.277599,22.203083,42.98547,102.711037,31.044077,42.305412,"""0""",20.80003,24.629292,23.610014,67.293098,102.631538,40.043842,115.129013,49.30814,84.382973,57.323936,39.596573,43.921253,"""0""",70.721504,"""0""","""0""","""0""","""0""",0.0,"""0""","""0""",51.638756,"""0""","""0""","""0""","""0""",30.861727,23.422525,"""0"""
"""2019-01-04 00:00:00""",38.637589,"""0""","""0""","""0""",57.856274,"""0""",58.682728,36.69434,"""0""",18.670582,18.869139,49.552124,"""0""",12.900888,"""0""",22.145788,8.808433,"""0""","""0""","""0""","""0""","""0""","""0""",20.632038,"""0""","""0""","""0""",22.378817,"""0""","""0""","""0""",115.390663,"""0""","""0""","""0""",43.982483,…,49.963421,45.120186,22.258947,43.925667,106.126755,32.250004,43.498848,"""0""",20.816893,25.235594,24.027611,67.1548,105.912865,40.027172,119.461166,51.134197,86.574875,57.469265,40.888126,45.160469,"""0""",70.554161,"""0""","""0""","""0""","""0""",0.0,"""0""","""0""",53.449051,"""0""","""0""","""0""","""0""",31.918703,23.999844,"""0"""
"""2019-01-07 00:00:00""",39.094551,"""0""","""0""","""0""",58.20179,"""0""",59.043526,36.745903,"""0""",19.003744,18.809532,49.857834,"""0""",13.145596,"""0""",22.502817,8.988332,"""0""","""0""","""0""","""0""","""0""","""0""",20.912901,"""0""","""0""","""0""",22.569866,"""0""","""0""","""0""",118.102043,"""0""","""0""","""0""",43.982483,…,50.006161,45.067719,22.428373,44.181339,107.054153,32.566971,43.737545,"""0""",20.804247,25.426773,24.244078,67.191704,106.79525,40.010506,122.378151,52.096012,87.702393,57.668129,41.015583,45.199677,"""0""",70.91671,"""0""","""0""","""0""","""0""",0.0,"""0""","""0""",53.783951,"""0""","""0""","""0""","""0""",32.404335,24.228937,"""0"""
"""2019-01-08 00:00:00""",39.364998,"""0""","""0""","""0""",58.356358,"""0""",59.476475,36.986523,"""0""",19.028782,18.908867,49.722958,"""0""",13.27284,"""0""",22.790428,9.078924,"""0""","""0""","""0""","""0""","""0""","""0""",21.049534,"""0""","""0""","""0""",22.84602,"""0""","""0""","""0""",119.781929,"""0""","""0""","""0""",43.956512,…,50.373737,45.05896,22.501638,44.527733,108.027466,32.984043,44.055805,"""0""",20.816893,25.426773,24.402824,67.234398,107.925812,40.002159,124.361267,52.890549,89.100479,57.668129,41.312984,45.411434,"""0""",71.056122,"""0""","""0""","""0""","""0""",0.0,"""0""","""0""",54.290844,"""0""","""0""","""0""","""0""",32.58527,24.250925,"""0"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""2024-05-24 00:00:00""",63.126698,"""20.989999771118164""","""20.237186431884766""","""26.952613830566406""",71.892708,"""26.473419189453125""",110.770073,53.762825,"""28.229999542236328""",29.647701,22.070314,66.198334,"""0""",34.358345,"""25.25""",73.17453,11.20698,"""17.65999984741211""","""12.517600059509277""","""32.98400115966797""","""14.392091751098633""","""33.04756164550781""","""26.097999572753906""",28.301798,"""38.11000061035156""","""17.668235778808594""","""36.51853942871094""",30.676825,"""31.799999237060547""","""51.826969146728516""","""16.190000534057617""",168.050003,"""27.989999771118164""","""29.85300064086914""","""0""",46.1031,…,80.55307,44.700729,25.639999,41.759998,239.539413,88.926918,77.127655,"""75.0330581665039""",25.006441,48.683243,44.657619,74.986801,233.913132,47.709568,192.210449,82.763855,135.939209,62.685833,61.15012,69.752563,"""24.990909576416016""",19.690783,"""24.3799991607666""","""32.09000015258789""","""23.62512969970703""","""17.110000610351562""",23.655672,"""28.329999923706055""","""12.916607856750488""",82.913925,"""28.813793182373047""","""24.4611873626709""","""49.62916564941406""","""50.85038375854492""",59.230984,46.30584,"""24.38761329650879"""
"""2024-05-28 00:00:00""",63.562946,"""21.030000686645508""","""20.237186431884766""","""27.00230598449707""",71.902664,"""26.29389190673828""",110.799835,53.7234,"""29.22170066833496""",29.438425,22.110189,66.367355,"""0""",34.528137,"""25.062999725341797""",72.624802,11.187213,"""18.729999542236328""","""12.140000343322754""","""32.582000732421875""","""14.283435821533203""","""33.61341094970703""","""26.452999114990234""",28.173199,"""38.4640007019043""","""17.295068740844727""","""36.56250762939453""",30.669895,"""32.54199981689453""","""51.7772331237793""","""15.880000114440918""",164.440002,"""28.01099967956543""","""29.62700080871582""","""0""",45.973957,…,80.175591,44.412659,25.620001,41.610001,239.509521,89.346298,76.630188,"""74.81471252441406""",24.996492,48.19968,44.307343,74.629486,233.843353,47.719475,192.450119,82.674187,135.700272,62.339336,61.130276,69.851112,"""24.86711883544922""",19.571743,"""23.979999542236328""","""31.829999923706055""","""23.859655380249023""","""17.25""",23.665588,"""28.43000030517578""","""13.165963172912598""",82.431526,"""28.861526489257812""","""24.32659339904785""","""49.63412857055664""","""50.65788650512695""",59.320744,46.065861,"""24.31945037841797"""
"""2024-05-29 00:00:00""",63.414223,"""21.110000610351562""","""20.227298736572266""","""27.042057037353516""",70.776848,"""25.95079231262207""",109.679237,52.875813,"""28.619800567626953""",29.219181,22.10022,64.915726,"""0""",34.19854,"""24.540000915527344""",72.165016,11.098269,"""17.34000015258789""","""12.59000015258789""","""32.11000061035156""","""14.308130264282227""","""33.56377410888672""","""26.530000686645508""",28.064384,"""37.92100143432617""","""17.115949630737305""","""36.20274353027344""",30.215534,"""31.73699951171875""","""50.947601318359375""","""15.279999732971191""",162.360001,"""28.0""","""29.299999237060547""","""0""",45.894493,…,79.072937,44.293457,25.424999,41.07,237.845016,88.896965,75.774544,"""74.54673767089844""",25.006441,47.743038,44.017944,74.321793,232.168762,47.669933,189.614105,81.458679,133.370743,62.111641,60.108463,68.806503,"""24.765111923217773""",19.482466,"""23.8799991607666""","""31.68000030517578""","""23.476957321166992""","""16.90999984741211""",23.640797,"""28.1299991607666""","""12.981440544128418""",80.96463,"""28.539318084716797""","""24.251821517944336""","""49.63908386230469""","""50.49054718017578""",58.433125,45.606804,"""24.150156021118164"""
"""2024-05-30 00:00:00""",64.256973,"""21.31999969482422""","""20.123493194580078""","""27.300453186035156""",70.607483,"""26.158246994018555""",109.401566,53.161625,"""28.55699920654297""",29.348734,22.090252,64.806358,"""0""",33.389519,"""24.739999771118164""",73.074585,11.177331,"""17.6200008392334""","""12.479999542236328""","""32.17100143432617""","""14.513589859008789""","""32.51149368286133""","""25.68000030517578""",28.113846,"""37.17100143432617""","""17.262229919433594""","""36.12978744506836""",30.509533,"""32.12799835205078""","""50.792423248291016""","""15.529999732971191""",163.820007,"""27.989999771118164""","""29.152000427246094""","""0""",46.053432,…,79.529892,44.50206,25.530001,41.470001,236.339966,87.678787,76.14267,"""74.75614929199219""",25.016394,48.036167,44.038902,74.679108,231.132126,47.749199,190.79245,82.215874,135.122864,62.438332,60.485447,69.387939,"""24.87801170349121""",19.591585,"""23.030000686645508""","""30.559999465942383""","""23.490825653076172""","""16.6200008392334""",23.645756,"""27.545000076293945""","""13.265705108642578""",81.525787,"""28.471694946289062""","""24.344539642333984""","""49.63908386230469""","""50.710391998291016""",58.094036,45.859734,"""24.16814613342285"""
