In [4]:
# Imports
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf


In [5]:
# Download configuration: which asset to pull and over which calendar window
symbol = "ETH-USD"
start_date = "2017-01-01"
end_date = "2022-06-01"

# yfinance returns a pandas frame indexed by date; reset_index() turns that
# index into an ordinary "Date" column so it survives the Polars conversion.
df_yf = yf.download(tickers=symbol, start=start_date, end=end_date)
df = pl.from_pandas(df_yf.reset_index())


[*********************100%%**********************]  1 of 1 completed


In [6]:
# Peek at the first five rows; the bare last expression is rendered by Jupyter
df.head(5)

Date,Open,High,Low,Close,Adj Close,Volume
datetime[ns],f64,f64,f64,f64,f64,i64
2017-11-09 00:00:00,308.644989,329.451996,307.056,320.884003,320.884003,893249984
2017-11-10 00:00:00,320.67099,324.717987,294.541992,299.252991,299.252991,885985984
2017-11-11 00:00:00,298.585999,319.453003,298.191986,314.681,314.681,842300992
2017-11-12 00:00:00,314.690002,319.153015,298.513,307.90799,307.90799,1613479936
2017-11-13 00:00:00,307.024994,328.415009,307.024994,316.716003,316.716003,1041889984


In [7]:
# Show both ends of the frame: first five rows, last five rows, last three rows.
# A single print with a newline separator emits exactly what three prints would.
print(df.head(), df.tail(), df.tail(3), sep="\n")


shape: (5, 7)
┌──────────────┬────────────┬────────────┬────────────┬────────────┬────────────┬────────────┐
│ Date         ┆ Open       ┆ High       ┆ Low        ┆ Close      ┆ Adj Close  ┆ Volume     │
│ ---          ┆ ---        ┆ ---        ┆ ---        ┆ ---        ┆ ---        ┆ ---        │
│ datetime[ns] ┆ f64        ┆ f64        ┆ f64        ┆ f64        ┆ f64        ┆ i64        │
╞══════════════╪════════════╪════════════╪════════════╪════════════╪════════════╪════════════╡
│ 2017-11-09   ┆ 308.644989 ┆ 329.451996 ┆ 307.056    ┆ 320.884003 ┆ 320.884003 ┆ 893249984  │
│ 00:00:00     ┆            ┆            ┆            ┆            ┆            ┆            │
│ 2017-11-10   ┆ 320.67099  ┆ 324.717987 ┆ 294.541992 ┆ 299.252991 ┆ 299.252991 ┆ 885985984  │
│ 00:00:00     ┆            ┆            ┆            ┆            ┆            ┆            │
│ 2017-11-11   ┆ 298.585999 ┆ 319.453003 ┆ 298.191986 ┆ 314.681    ┆ 314.681    ┆ 842300992  │
│ 00:00:00     ┆            ┆       

In [8]:
# Info about the dataframe: print its schema, i.e. the mapping of each column
# name to its Polars dtype, for the freshly loaded frame.
print(df.schema)


OrderedDict({'Date': Datetime(time_unit='ns', time_zone=None), 'Open': Float64, 'High': Float64, 'Low': Float64, 'Close': Float64, 'Adj Close': Float64, 'Volume': Int64})


In [26]:
# Hand-rolled summary of the frame: overall shape, then one line per column
# with its dtype and null / non-null tallies, then the estimated size in MB.
print("DataFrame Info:", f"Shape: {df.shape}", "\nColumn Info:", sep="\n")
for col, dtype in df.schema.items():
    null_count = df.get_column(col).null_count()
    # Whatever is not null is, by definition, the non-null remainder
    non_null_count = df.height - null_count
    print(f"{col}: {dtype} | Non-Null Count: {non_null_count} | Null Count: {null_count}")
# estimated_size() reports bytes; dividing by 1e6 gives decimal megabytes
print(f"\nMemory Usage: {df.estimated_size() / 1e6:.2f} MB")

DataFrame Info:
Shape: (1664, 11)

Column Info:
Date: Datetime(time_unit='ns', time_zone=None) | Non-Null Count: 1664 | Null Count: 0
Open: Float64 | Non-Null Count: 1664 | Null Count: 0
High: Float64 | Non-Null Count: 1664 | Null Count: 0
Low: Float64 | Non-Null Count: 1664 | Null Count: 0
Close: Float64 | Non-Null Count: 1664 | Null Count: 0
Adj Close: Float64 | Non-Null Count: 1664 | Null Count: 0
Volume: Int64 | Non-Null Count: 1664 | Null Count: 0
Returns: Float64 | Non-Null Count: 1663 | Null Count: 1
LRets: Float64 | Non-Null Count: 1664 | Null Count: 0
CumSum: Float64 | Non-Null Count: 1664 | Null Count: 0
RetNormal: Float64 | Non-Null Count: 1664 | Null Count: 0

Memory Usage: 0.15 MB


# Working with returns

In [10]:
# Simple returns: fractional change of each close relative to the previous close.
# Passing the expression as a keyword argument names the new column "Returns".
df = df.with_columns(Returns=pl.col("Close").pct_change())
print(df.head(n=3))

shape: (3, 8)
┌────────────┬────────────┬────────────┬───────────┬───────────┬───────────┬───────────┬───────────┐
│ Date       ┆ Open       ┆ High       ┆ Low       ┆ Close     ┆ Adj Close ┆ Volume    ┆ Returns   │
│ ---        ┆ ---        ┆ ---        ┆ ---       ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│ datetime[n ┆ f64        ┆ f64        ┆ f64       ┆ f64       ┆ f64       ┆ i64       ┆ f64       │
│ s]         ┆            ┆            ┆           ┆           ┆           ┆           ┆           │
╞════════════╪════════════╪════════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 2017-11-09 ┆ 308.644989 ┆ 329.451996 ┆ 307.056   ┆ 320.88400 ┆ 320.88400 ┆ 893249984 ┆ null      │
│ 00:00:00   ┆            ┆            ┆           ┆ 3         ┆ 3         ┆           ┆           │
│ 2017-11-10 ┆ 320.67099  ┆ 324.717987 ┆ 294.54199 ┆ 299.25299 ┆ 299.25299 ┆ 885985984 ┆ -0.067411 │
│ 00:00:00   ┆            ┆            ┆ 2         ┆ 1         ┆ 1         ┆ 

In [25]:
# Add a column of simple returns (annotated version of the cell above).
#
# How the expression is built:
#   * pl.col("Close") references the "Close" column inside a Polars expression.
#   * .pct_change() computes the percentage change between consecutive rows,
#     i.e. (current_value - previous_value) / previous_value.
#   * .alias("Returns") names the result "Returns". Without it the expression
#     would keep its root column name ("Close"), so with_columns would
#     overwrite the closing prices instead of adding a new column.
#
# In short, this:
#   1. takes the "Close" price column,
#   2. calculates the percentage change from one row to the next,
#   3. names this new column of percentage changes "Returns".
#
# In financial terms, this is calculating the simple returns of the asset
# based on its closing prices. For example:
#   * If yesterday's close was 100 and today's close is 105, the return is
#     (105 - 100) / 100 = 0.05, or 5%.
#   * If yesterday's close was 100 and today's close is 95, the return is
#     (95 - 100) / 100 = -0.05, or -5%.
#
# The first row has no previous close, so its return is null.
df = df.with_columns(
    pl.col("Close").pct_change().alias("Returns")
)
print(df.head(3))

shape: (3, 11)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ Date      ┆ Open      ┆ High      ┆ Low       ┆ … ┆ Returns   ┆ LRets     ┆ CumSum    ┆ RetNorma │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ l        │
│ datetime[ ┆ f64       ┆ f64       ┆ f64       ┆   ┆ f64       ┆ f64       ┆ f64       ┆ ---      │
│ ns]       ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ f64      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 2017-11-1 ┆ 320.67099 ┆ 324.71798 ┆ 294.54199 ┆ … ┆ null      ┆ 0.0       ┆ 0.0       ┆ 0.0      │
│ 0         ┆           ┆ 7         ┆ 2         ┆   ┆           ┆           ┆           ┆          │
│ 00:00:00  ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
│ 2017-11-1 ┆ 298.58599 ┆ 319.45300 ┆ 298.19198 ┆ … ┆ inf       ┆ 0.0       

In [11]:
# Discard every row that still contains a null in any column; here that
# removes the first row, whose return is undefined.
df = df.drop_nulls(subset=None)
print(df.head(n=3))

shape: (3, 8)
┌────────────┬────────────┬────────────┬───────────┬───────────┬───────────┬───────────┬───────────┐
│ Date       ┆ Open       ┆ High       ┆ Low       ┆ Close     ┆ Adj Close ┆ Volume    ┆ Returns   │
│ ---        ┆ ---        ┆ ---        ┆ ---       ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│ datetime[n ┆ f64        ┆ f64        ┆ f64       ┆ f64       ┆ f64       ┆ i64       ┆ f64       │
│ s]         ┆            ┆            ┆           ┆           ┆           ┆           ┆           │
╞════════════╪════════════╪════════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 2017-11-10 ┆ 320.67099  ┆ 324.717987 ┆ 294.54199 ┆ 299.25299 ┆ 299.25299 ┆ 885985984 ┆ -0.067411 │
│ 00:00:00   ┆            ┆            ┆ 2         ┆ 1         ┆ 1         ┆           ┆           │
│ 2017-11-11 ┆ 298.585999 ┆ 319.453003 ┆ 298.19198 ┆ 314.681   ┆ 314.681   ┆ 842300992 ┆ 0.051555  │
│ 00:00:00   ┆            ┆            ┆ 6         ┆           ┆           ┆ 

In [12]:

# Info about the dataframe: print the schema again (column name -> dtype) so
# the column added since the previous schema printout is visible.
print(df.schema)


OrderedDict({'Date': Datetime(time_unit='ns', time_zone=None), 'Open': Float64, 'High': Float64, 'Low': Float64, 'Close': Float64, 'Adj Close': Float64, 'Volume': Int64, 'Returns': Float64})


In [13]:
# More comprehensive info about the dataframe: shape, per-column dtype with
# the number of non-null values, and the estimated memory footprint.
print("DataFrame Info:")
print(f"Shape: {df.shape}")
print("\nColumn Info:")
for col in df.columns:
    dtype = df.schema[col]
    # null_count() returns the number of NULL entries; the non-null count is
    # whatever remains out of the total row count.
    null_count = df[col].null_count()
    non_null_count = df.height - null_count
    print(f"{col}: {dtype} | Non-Null Count: {non_null_count}")
# estimated_size() is in bytes; 1e6 converts to decimal megabytes
print(f"\nMemory Usage: {df.estimated_size() / 1e6:.2f} MB")

DataFrame Info:
Shape: (1664, 8)

Column Info:
Date: Datetime(time_unit='ns', time_zone=None) | Non-Null Count: 1664
Open: Float64 | Non-Null Count: 1664
High: Float64 | Non-Null Count: 1664
Low: Float64 | Non-Null Count: 1664
Close: Float64 | Non-Null Count: 1664
Adj Close: Float64 | Non-Null Count: 1664
Volume: Int64 | Non-Null Count: 1664
Returns: Float64 | Non-Null Count: 1664

Memory Usage: 0.11 MB


In [39]:
# Calculate log returns.
#
# Mathematical functions such as log are available directly as Polars
# expression methods (here: Expr.log()).
#
# The expression implements the usual formula for log returns:
#     log(P_t / P_{t-1})
# where
#     P_t      is the price at time t
#     P_{t-1}  is the price at the previous time step
#
# This formula for log returns is widely used in finance because:
#   * It's approximately equal to the percentage change for small changes.
#   * Log returns are additive over time, which is useful for multi-period
#     returns.
#   * They're more likely to be normally distributed, which is an assumption
#     in many financial models.
#
# Role of .shift(1): it gives access to the previous day's closing price.
#   * It shifts the entire "Close" column down by one row.
#   * This creates a new series where each price is matched with the previous
#     day's price.
#
# So, for ln(Close_t / Close_{t-1}):
#   * pl.col("Close")           gives Close_t      (current price)
#   * pl.col("Close").shift(1)  gives Close_{t-1}  (previous price)
#
# The first row has no previous price, so its log return is null.
df = df.with_columns(
    (pl.col("Close") / pl.col("Close").shift(1)).log().alias("Log Returns")
)
print(df.head(3))

shape: (3, 12)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ Date      ┆ Open      ┆ High      ┆ Low       ┆ … ┆ LRets     ┆ CumSum    ┆ RetNormal ┆ Log      │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ Returns  │
│ datetime[ ┆ f64       ┆ f64       ┆ f64       ┆   ┆ f64       ┆ f64       ┆ f64       ┆ ---      │
│ ns]       ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ f64      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 2017-11-1 ┆ 320.67099 ┆ 324.71798 ┆ 294.54199 ┆ … ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ null     │
│ 0         ┆           ┆ 7         ┆ 2         ┆   ┆           ┆           ┆           ┆          │
│ 00:00:00  ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
│ 2017-11-1 ┆ 298.58599 ┆ 319.45300 ┆ 298.19198 ┆ … ┆ 0.0       ┆ 0.0       

In [40]:
# Calculate log returns: ln(Close_t / Close_{t-1}).
#
# This cell previously re-imported numpy (already imported in the notebook's
# import cell) and pushed the ratio through map_batches(np.log). Polars has a
# native natural-log expression, Expr.log(), which computes the same values
# without a Python-level callback, so it is used here instead.
df = df.with_columns(
    (pl.col("Close") / pl.col("Close").shift(1)).log().alias("Log Returns")
)
print(df.head(3))

shape: (3, 12)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ Date      ┆ Open      ┆ High      ┆ Low       ┆ … ┆ LRets     ┆ CumSum    ┆ RetNormal ┆ Log      │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ Returns  │
│ datetime[ ┆ f64       ┆ f64       ┆ f64       ┆   ┆ f64       ┆ f64       ┆ f64       ┆ ---      │
│ ns]       ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ f64      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 2017-11-1 ┆ 320.67099 ┆ 324.71798 ┆ 294.54199 ┆ … ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ null     │
│ 0         ┆           ┆ 7         ┆ 2         ┆   ┆           ┆           ┆           ┆          │
│ 00:00:00  ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
│ 2017-11-1 ┆ 298.58599 ┆ 319.45300 ┆ 298.19198 ┆ … ┆ 0.0       ┆ 0.0       

In [42]:
# Running total of the log returns, stored under the column name "CumSum"
df = df.with_columns(CumSum=pl.col("Log Returns").cum_sum())
print(df.head(n=5))


shape: (5, 12)
┌────────────┬────────────┬───────────┬───────────┬───┬───────────┬────────┬───────────┬───────────┐
│ Date       ┆ Open       ┆ High      ┆ Low       ┆ … ┆ LRets     ┆ CumSum ┆ RetNormal ┆ Log       │
│ ---        ┆ ---        ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---    ┆ ---       ┆ Returns   │
│ datetime[n ┆ f64        ┆ f64       ┆ f64       ┆   ┆ f64       ┆ f64    ┆ f64       ┆ ---       │
│ s]         ┆            ┆           ┆           ┆   ┆           ┆        ┆           ┆ f64       │
╞════════════╪════════════╪═══════════╪═══════════╪═══╪═══════════╪════════╪═══════════╪═══════════╡
│ 2017-11-10 ┆ 320.67099  ┆ 324.71798 ┆ 294.54199 ┆ … ┆ 0.0       ┆ null   ┆ 0.0       ┆ null      │
│ 00:00:00   ┆            ┆ 7         ┆ 2         ┆   ┆           ┆        ┆           ┆           │
│ 2017-11-11 ┆ 298.585999 ┆ 319.45300 ┆ 298.19198 ┆ … ┆ 0.0       ┆ inf    ┆ 0.0       ┆ inf       │
│ 00:00:00   ┆            ┆ 3         ┆ 6         ┆   ┆           ┆        ┆

In [46]:
# Convert cumulative log returns back into a cumulative simple return.
#
# "CumSum" holds the running sum of log returns, i.e. ln(P_t / P_0), so
# exp(CumSum) - 1 equals P_t / P_0 - 1: the total fractional gain or loss
# relative to the first price in the series.
#
# Expr.exp() is the native Polars exponential; it replaces calling the NumPy
# ufunc np.exp on a Polars expression and keeps the computation inside Polars.
df = df.with_columns(
    (pl.col("CumSum").exp() - 1).alias("RetNormal")
)
print(df.head())

shape: (5, 12)
┌────────────┬────────────┬───────────┬───────────┬───┬───────────┬────────┬───────────┬───────────┐
│ Date       ┆ Open       ┆ High      ┆ Low       ┆ … ┆ LRets     ┆ CumSum ┆ RetNormal ┆ Log       │
│ ---        ┆ ---        ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---    ┆ ---       ┆ Returns   │
│ datetime[n ┆ f64        ┆ f64       ┆ f64       ┆   ┆ f64       ┆ f64    ┆ f64       ┆ ---       │
│ s]         ┆            ┆           ┆           ┆   ┆           ┆        ┆           ┆ f64       │
╞════════════╪════════════╪═══════════╪═══════════╪═══╪═══════════╪════════╪═══════════╪═══════════╡
│ 2017-11-10 ┆ 320.67099  ┆ 324.71798 ┆ 294.54199 ┆ … ┆ 0.0       ┆ null   ┆ null      ┆ null      │
│ 00:00:00   ┆            ┆ 7         ┆ 2         ┆   ┆           ┆        ┆           ┆           │
│ 2017-11-11 ┆ 298.585999 ┆ 319.45300 ┆ 298.19198 ┆ … ┆ 0.0       ┆ inf    ┆ inf       ┆ inf       │
│ 00:00:00   ┆            ┆ 3         ┆ 6         ┆   ┆           ┆        ┆

In [24]:
# Replace every remaining null in the frame with zero
df = df.fill_null(value=0)
print(df.head(n=3))

shape: (3, 11)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ Date      ┆ Open      ┆ High      ┆ Low       ┆ … ┆ Returns   ┆ LRets     ┆ CumSum    ┆ RetNorma │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ l        │
│ datetime[ ┆ f64       ┆ f64       ┆ f64       ┆   ┆ f64       ┆ f64       ┆ f64       ┆ ---      │
│ ns]       ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ f64      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 2017-11-1 ┆ 320.67099 ┆ 324.71798 ┆ 294.54199 ┆ … ┆ -0.067411 ┆ 0.0       ┆ 0.0       ┆ 0.0      │
│ 0         ┆           ┆ 7         ┆ 2         ┆   ┆           ┆           ┆           ┆          │
│ 00:00:00  ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
│ 2017-11-1 ┆ 298.58599 ┆ 319.45300 ┆ 298.19198 ┆ … ┆ 0.051555  ┆ 0.0       

In [47]:
# Inspect the last three rows to check the most recent derived values
print(df.tail(n=3))

shape: (3, 12)
┌────────────┬────────────┬───────────┬───────────┬───┬───────────┬────────┬───────────┬───────────┐
│ Date       ┆ Open       ┆ High      ┆ Low       ┆ … ┆ LRets     ┆ CumSum ┆ RetNormal ┆ Log       │
│ ---        ┆ ---        ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---    ┆ ---       ┆ Returns   │
│ datetime[n ┆ f64        ┆ f64       ┆ f64       ┆   ┆ f64       ┆ f64    ┆ f64       ┆ ---       │
│ s]         ┆            ┆           ┆           ┆   ┆           ┆        ┆           ┆ f64       │
╞════════════╪════════════╪═══════════╪═══════════╪═══╪═══════════╪════════╪═══════════╪═══════════╡
│ 2022-05-29 ┆ 1792.18444 ┆ 1818.7766 ┆ 1765.9373 ┆ … ┆ 0.027764  ┆ inf    ┆ inf       ┆ 0.027764  │
│ 00:00:00   ┆ 8          ┆ 11        ┆ 78        ┆   ┆           ┆        ┆           ┆           │
│ 2022-05-30 ┆ 1811.88598 ┆ 2005.2108 ┆ 1804.4560 ┆ … ┆ 0.09287   ┆ inf    ┆ inf       ┆ 0.09287   │
│ 00:00:00   ┆ 6          ┆ 15        ┆ 55        ┆   ┆           ┆        ┆

In [27]:
# Compute the summary-statistics table once, keep it under `description`
# for later reuse, and print it in the same statement.
print(description := df.describe())

shape: (9, 12)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ statistic ┆ Date      ┆ Open      ┆ High      ┆ … ┆ Returns   ┆ LRets     ┆ CumSum    ┆ RetNorma │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ l        │
│ str       ┆ str       ┆ f64       ┆ f64       ┆   ┆ f64       ┆ f64       ┆ f64       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ f64      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ count     ┆ 1664      ┆ 1664.0    ┆ 1664.0    ┆ … ┆ 1663.0    ┆ 1664.0    ┆ 1664.0    ┆ 1664.0   │
│ null_coun ┆ 0         ┆ 0.0       ┆ 0.0       ┆ … ┆ 1.0       ┆ 0.0       ┆ 0.0       ┆ 0.0      │
│ t         ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
│ mean      ┆ 2020-02-1 ┆ 1094.3970 ┆ 1130.6296 ┆ … ┆ inf       ┆ 0.000921  