# Structure Changes

In [1]:
# Imports
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf


In [2]:
# Extract and view data
start_date = "2023-01-01"
end_date = "2024-06-01"
symbol = "SHIB-USD"

# Fetch data using yfinance
df_yf = yf.download(symbol, start=start_date, end=end_date)

# Convert to Polars DataFrame
df = pl.from_pandas(df_yf.reset_index())
df.head(5)

[*********************100%%**********************]  1 of 1 completed


Date,Open,High,Low,Close,Adj Close,Volume
datetime[ns],f64,f64,f64,f64,f64,i64
2023-01-01 00:00:00,8e-06,8e-06,8e-06,8e-06,8e-06,54630954
2023-01-02 00:00:00,8e-06,8e-06,8e-06,8e-06,8e-06,87586024
2023-01-03 00:00:00,8e-06,8e-06,8e-06,8e-06,8e-06,80033128
2023-01-04 00:00:00,8e-06,8e-06,8e-06,8e-06,8e-06,143577598
2023-01-05 00:00:00,8e-06,9e-06,8e-06,8e-06,8e-06,262459409


In [3]:
# View head and tail
print(df.head())
print(df.tail())
print(df.tail(3))


shape: (5, 7)
┌─────────────────────┬──────────┬──────────┬──────────┬──────────┬───────────┬───────────┐
│ Date                ┆ Open     ┆ High     ┆ Low      ┆ Close    ┆ Adj Close ┆ Volume    │
│ ---                 ┆ ---      ┆ ---      ┆ ---      ┆ ---      ┆ ---       ┆ ---       │
│ datetime[ns]        ┆ f64      ┆ f64      ┆ f64      ┆ f64      ┆ f64       ┆ i64       │
╞═════════════════════╪══════════╪══════════╪══════════╪══════════╪═══════════╪═══════════╡
│ 2023-01-01 00:00:00 ┆ 0.000008 ┆ 0.000008 ┆ 0.000008 ┆ 0.000008 ┆ 0.000008  ┆ 54630954  │
│ 2023-01-02 00:00:00 ┆ 0.000008 ┆ 0.000008 ┆ 0.000008 ┆ 0.000008 ┆ 0.000008  ┆ 87586024  │
│ 2023-01-03 00:00:00 ┆ 0.000008 ┆ 0.000008 ┆ 0.000008 ┆ 0.000008 ┆ 0.000008  ┆ 80033128  │
│ 2023-01-04 00:00:00 ┆ 0.000008 ┆ 0.000008 ┆ 0.000008 ┆ 0.000008 ┆ 0.000008  ┆ 143577598 │
│ 2023-01-05 00:00:00 ┆ 0.000008 ┆ 0.000009 ┆ 0.000008 ┆ 0.000008 ┆ 0.000008  ┆ 262459409 │
└─────────────────────┴──────────┴──────────┴──────────┴──────────

In [4]:
# Add a column for returns
df = df.with_columns(
    pl.col("Close").pct_change().alias("Returns")
)
print(df.head(3))

shape: (3, 8)
┌─────────────────────┬──────────┬──────────┬──────────┬──────────┬───────────┬──────────┬─────────┐
│ Date                ┆ Open     ┆ High     ┆ Low      ┆ Close    ┆ Adj Close ┆ Volume   ┆ Returns │
│ ---                 ┆ ---      ┆ ---      ┆ ---      ┆ ---      ┆ ---       ┆ ---      ┆ ---     │
│ datetime[ns]        ┆ f64      ┆ f64      ┆ f64      ┆ f64      ┆ f64       ┆ i64      ┆ f64     │
╞═════════════════════╪══════════╪══════════╪══════════╪══════════╪═══════════╪══════════╪═════════╡
│ 2023-01-01 00:00:00 ┆ 0.000008 ┆ 0.000008 ┆ 0.000008 ┆ 0.000008 ┆ 0.000008  ┆ 54630954 ┆ null    │
│ 2023-01-02 00:00:00 ┆ 0.000008 ┆ 0.000008 ┆ 0.000008 ┆ 0.000008 ┆ 0.000008  ┆ 87586024 ┆ 0.0     │
│ 2023-01-03 00:00:00 ┆ 0.000008 ┆ 0.000008 ┆ 0.000008 ┆ 0.000008 ┆ 0.000008  ┆ 80033128 ┆ 0.0     │
└─────────────────────┴──────────┴──────────┴──────────┴──────────┴───────────┴──────────┴─────────┘


In [5]:
import numpy as np

# Calculate log returns
df = df.with_columns(
    (pl.col("Close") / pl.col("Close").shift(1)).map_batches(np.log).alias("Log Returns")
)
print(df.head(3))

shape: (3, 9)
┌──────────────┬──────────┬──────────┬──────────┬───┬───────────┬──────────┬─────────┬─────────────┐
│ Date         ┆ Open     ┆ High     ┆ Low      ┆ … ┆ Adj Close ┆ Volume   ┆ Returns ┆ Log Returns │
│ ---          ┆ ---      ┆ ---      ┆ ---      ┆   ┆ ---       ┆ ---      ┆ ---     ┆ ---         │
│ datetime[ns] ┆ f64      ┆ f64      ┆ f64      ┆   ┆ f64       ┆ i64      ┆ f64     ┆ f64         │
╞══════════════╪══════════╪══════════╪══════════╪═══╪═══════════╪══════════╪═════════╪═════════════╡
│ 2023-01-01   ┆ 0.000008 ┆ 0.000008 ┆ 0.000008 ┆ … ┆ 0.000008  ┆ 54630954 ┆ null    ┆ null        │
│ 00:00:00     ┆          ┆          ┆          ┆   ┆           ┆          ┆         ┆             │
│ 2023-01-02   ┆ 0.000008 ┆ 0.000008 ┆ 0.000008 ┆ … ┆ 0.000008  ┆ 87586024 ┆ 0.0     ┆ 0.0         │
│ 00:00:00     ┆          ┆          ┆          ┆   ┆           ┆          ┆         ┆             │
│ 2023-01-03   ┆ 0.000008 ┆ 0.000008 ┆ 0.000008 ┆ … ┆ 0.000008  ┆ 80033128 ┆ 

In [6]:
# Cumulative sum of log returns
df = df.with_columns(
    pl.col("Log Returns").cum_sum().alias("CumSum")
)
print(df.head())


shape: (5, 10)
┌────────────────┬──────────┬──────────┬──────────┬───┬───────────┬─────────┬─────────────┬────────┐
│ Date           ┆ Open     ┆ High     ┆ Low      ┆ … ┆ Volume    ┆ Returns ┆ Log Returns ┆ CumSum │
│ ---            ┆ ---      ┆ ---      ┆ ---      ┆   ┆ ---       ┆ ---     ┆ ---         ┆ ---    │
│ datetime[ns]   ┆ f64      ┆ f64      ┆ f64      ┆   ┆ i64       ┆ f64     ┆ f64         ┆ f64    │
╞════════════════╪══════════╪══════════╪══════════╪═══╪═══════════╪═════════╪═════════════╪════════╡
│ 2023-01-01     ┆ 0.000008 ┆ 0.000008 ┆ 0.000008 ┆ … ┆ 54630954  ┆ null    ┆ null        ┆ null   │
│ 00:00:00       ┆          ┆          ┆          ┆   ┆           ┆         ┆             ┆        │
│ 2023-01-02     ┆ 0.000008 ┆ 0.000008 ┆ 0.000008 ┆ … ┆ 87586024  ┆ 0.0     ┆ 0.0         ┆ 0.0    │
│ 00:00:00       ┆          ┆          ┆          ┆   ┆           ┆         ┆             ┆        │
│ 2023-01-03     ┆ 0.000008 ┆ 0.000008 ┆ 0.000008 ┆ … ┆ 80033128  ┆ 0.0     

In [7]:
# Normalize log returns
df = df.with_columns(
    (np.exp(pl.col("CumSum")) - 1).alias("RetNormal")
)
print(df.head())

shape: (5, 11)
┌────────────────┬──────────┬──────────┬──────────┬───┬─────────┬─────────────┬────────┬───────────┐
│ Date           ┆ Open     ┆ High     ┆ Low      ┆ … ┆ Returns ┆ Log Returns ┆ CumSum ┆ RetNormal │
│ ---            ┆ ---      ┆ ---      ┆ ---      ┆   ┆ ---     ┆ ---         ┆ ---    ┆ ---       │
│ datetime[ns]   ┆ f64      ┆ f64      ┆ f64      ┆   ┆ f64     ┆ f64         ┆ f64    ┆ f64       │
╞════════════════╪══════════╪══════════╪══════════╪═══╪═════════╪═════════════╪════════╪═══════════╡
│ 2023-01-01     ┆ 0.000008 ┆ 0.000008 ┆ 0.000008 ┆ … ┆ null    ┆ null        ┆ null   ┆ null      │
│ 00:00:00       ┆          ┆          ┆          ┆   ┆         ┆             ┆        ┆           │
│ 2023-01-02     ┆ 0.000008 ┆ 0.000008 ┆ 0.000008 ┆ … ┆ 0.0     ┆ 0.0         ┆ 0.0    ┆ 0.0       │
│ 00:00:00       ┆          ┆          ┆          ┆   ┆         ┆             ┆        ┆           │
│ 2023-01-03     ┆ 0.000008 ┆ 0.000008 ┆ 0.000008 ┆ … ┆ 0.0     ┆ 0.0       

In [9]:
df_new = df.clone()

In [10]:
df_new.head()

Date,Open,High,Low,Close,Adj Close,Volume,Returns,Log Returns,CumSum,RetNormal
datetime[ns],f64,f64,f64,f64,f64,i64,f64,f64,f64,f64
2023-01-01 00:00:00,8e-06,8e-06,8e-06,8e-06,8e-06,54630954,,,,
2023-01-02 00:00:00,8e-06,8e-06,8e-06,8e-06,8e-06,87586024,0.0,0.0,0.0,0.0
2023-01-03 00:00:00,8e-06,8e-06,8e-06,8e-06,8e-06,80033128,0.0,0.0,0.0,0.0
2023-01-04 00:00:00,8e-06,8e-06,8e-06,8e-06,8e-06,143577598,0.0,0.0,0.0,0.0
2023-01-05 00:00:00,8e-06,9e-06,8e-06,8e-06,8e-06,262459409,0.0,0.0,0.0,0.0


In [11]:
# Drop columns
df_new = df_new.drop(['High', 'Low', 'Adjusted Close'])
print(df_new.head())


shape: (5, 9)
┌───────────────┬──────────┬──────────┬───────────┬───┬─────────┬─────────────┬────────┬───────────┐
│ Date          ┆ Open     ┆ Close    ┆ Adj Close ┆ … ┆ Returns ┆ Log Returns ┆ CumSum ┆ RetNormal │
│ ---           ┆ ---      ┆ ---      ┆ ---       ┆   ┆ ---     ┆ ---         ┆ ---    ┆ ---       │
│ datetime[ns]  ┆ f64      ┆ f64      ┆ f64       ┆   ┆ f64     ┆ f64         ┆ f64    ┆ f64       │
╞═══════════════╪══════════╪══════════╪═══════════╪═══╪═════════╪═════════════╪════════╪═══════════╡
│ 2023-01-01    ┆ 0.000008 ┆ 0.000008 ┆ 0.000008  ┆ … ┆ null    ┆ null        ┆ null   ┆ null      │
│ 00:00:00      ┆          ┆          ┆           ┆   ┆         ┆             ┆        ┆           │
│ 2023-01-02    ┆ 0.000008 ┆ 0.000008 ┆ 0.000008  ┆ … ┆ 0.0     ┆ 0.0         ┆ 0.0    ┆ 0.0       │
│ 00:00:00      ┆          ┆          ┆           ┆   ┆         ┆             ┆        ┆           │
│ 2023-01-03    ┆ 0.000008 ┆ 0.000008 ┆ 0.000008  ┆ … ┆ 0.0     ┆ 0.0        

In [12]:
# Work with certain rows and columns (similar to iloc)
print(df_new.slice(1, 3).select(df_new.columns[2:-1]))

shape: (3, 6)
┌──────────┬───────────┬───────────┬─────────┬─────────────┬────────┐
│ Close    ┆ Adj Close ┆ Volume    ┆ Returns ┆ Log Returns ┆ CumSum │
│ ---      ┆ ---       ┆ ---       ┆ ---     ┆ ---         ┆ ---    │
│ f64      ┆ f64       ┆ i64       ┆ f64     ┆ f64         ┆ f64    │
╞══════════╪═══════════╪═══════════╪═════════╪═════════════╪════════╡
│ 0.000008 ┆ 0.000008  ┆ 87586024  ┆ 0.0     ┆ 0.0         ┆ 0.0    │
│ 0.000008 ┆ 0.000008  ┆ 80033128  ┆ 0.0     ┆ 0.0         ┆ 0.0    │
│ 0.000008 ┆ 0.000008  ┆ 143577598 ┆ 0.0     ┆ 0.0         ┆ 0.0    │
└──────────┴───────────┴───────────┴─────────┴─────────────┴────────┘


# Conditionals 

In [13]:
# Add conditional statements
df_new = df_new.with_columns([
    pl.when(pl.col('Close').shift(-1) > pl.col('Close'))
    .then(1)
    .otherwise(-1)
    .alias('Target')
])
print(df_new.head())


shape: (5, 10)
┌────────────────┬──────────┬──────────┬───────────┬───┬─────────────┬────────┬───────────┬────────┐
│ Date           ┆ Open     ┆ Close    ┆ Adj Close ┆ … ┆ Log Returns ┆ CumSum ┆ RetNormal ┆ Target │
│ ---            ┆ ---      ┆ ---      ┆ ---       ┆   ┆ ---         ┆ ---    ┆ ---       ┆ ---    │
│ datetime[ns]   ┆ f64      ┆ f64      ┆ f64       ┆   ┆ f64         ┆ f64    ┆ f64       ┆ i32    │
╞════════════════╪══════════╪══════════╪═══════════╪═══╪═════════════╪════════╪═══════════╪════════╡
│ 2023-01-01     ┆ 0.000008 ┆ 0.000008 ┆ 0.000008  ┆ … ┆ null        ┆ null   ┆ null      ┆ -1     │
│ 00:00:00       ┆          ┆          ┆           ┆   ┆             ┆        ┆           ┆        │
│ 2023-01-02     ┆ 0.000008 ┆ 0.000008 ┆ 0.000008  ┆ … ┆ 0.0         ┆ 0.0    ┆ 0.0       ┆ -1     │
│ 00:00:00       ┆          ┆          ┆           ┆   ┆             ┆        ┆           ┆        │
│ 2023-01-03     ┆ 0.000008 ┆ 0.000008 ┆ 0.000008  ┆ … ┆ 0.0         ┆ 0.0  

# Iterations 

In [15]:
# Iterations
for i, row in enumerate(df_new.iter_rows(named=True)):
    print(row['Date'], row['Close'], row['Target'])
    if i >= 4:
        break
        '''df_new.iter_rows(named=True):

This method iterates over the rows of the DataFrame.
named=True means each row will be returned as a dictionary where the keys are column names.


enumerate(...):

This Python function adds a counter to an iterable.
It allows us to keep track of the iteration count.


for i, row in ...:

i is the counter (starting from 0).
row is the dictionary representing each row of the DataFrame.


print(row['Date'], row['Close'], row['Target']):

This prints the values of the 'Date', 'Close', and 'Target' columns for each row.
We access these values using dictionary syntax because of named=True.


if i >= 4: break:

This condition checks if we've processed 5 rows (remember, i starts at 0).
If so, it breaks out of the loop, limiting the output to the first 5 rows.



This code effectively:

Iterates through the DataFrame row by row.
Prints specific column values for each row.
Stops after processing 5 rows to avoid printing the entire DataFrame. '''

2023-01-01 00:00:00 7.999999979801942e-06 -1
2023-01-02 00:00:00 7.999999979801942e-06 -1
2023-01-03 00:00:00 7.999999979801942e-06 -1
2023-01-04 00:00:00 7.999999979801942e-06 -1
2023-01-05 00:00:00 7.999999979801942e-06 -1


Key differences and notes:

Copying a DataFrame: Use .clone() in Polars instead of .copy().
Dropping columns: In Polars, drop() returns a new DataFrame, so we reassign it.
Selecting rows and columns: Polars uses slice() for row selection and select() for column selection. The syntax is a bit different from Pandas' iloc.
Conditional statements: Polars uses a different syntax for conditional operations. We use with_columns() to add or modify columns based on conditions.
Iterations: Polars provides iter_rows() method which is similar to Pandas' iterrows(). We use named=True to get dictionary-like access to column values.

Remember that Polars operations generally return new DataFrames instead of modifying in-place. This is why we often reassign the result back to the variable (e.g., df_new = df_new.drop(...)).
Also, note that Polars doesn't have an exact equivalent to Pandas' inplace=True parameter. Instead, you typically reassign the result of operations back to the same variable if you want to update the DataFrame.