# Module 01


In [1]:
import arrow
import numpy as np
import pandas as pd
import polars as pl

In [2]:
# Demo frame with one column per basic dtype: integer, datetime, float, string.
df = pl.DataFrame(
    {
        "integer": [1, 2, 3],
        # 1-3 January 2025, built through arrow (displayed below as UTC datetimes).
        "date": [arrow.get(2025, 1, day).datetime for day in range(1, 4)],
        "float": [4.0, 5.0, 6.0],
        "string": ["a", "b", "c"],
    }
)

In [3]:
df

integer,date,float,string
i64,"datetime[μs, UTC]",f64,str
1,2025-01-01 00:00:00 UTC,4.0,"""a"""
2,2025-01-02 00:00:00 UTC,5.0,"""b"""
3,2025-01-03 00:00:00 UTC,6.0,"""c"""


In [4]:
df.write_csv("../data/output.csv")

In [5]:
# The CSV round trip does not preserve the datetime dtype: `date` comes back as
# str (see the printed schema below), unlike the Parquet round trip further down.
# NOTE(review): `try_parse_dates=True` should restore the dtype — confirm.
df_csv = pl.read_csv("../data/output.csv")

In [6]:
print(df_csv)

shape: (3, 4)
┌─────────┬─────────────────────────────────┬───────┬────────┐
│ integer ┆ date                            ┆ float ┆ string │
│ ---     ┆ ---                             ┆ ---   ┆ ---    │
│ i64     ┆ str                             ┆ f64   ┆ str    │
╞═════════╪═════════════════════════════════╪═══════╪════════╡
│ 1       ┆ 2025-01-01T00:00:00.000000+000… ┆ 4.0   ┆ a      │
│ 2       ┆ 2025-01-02T00:00:00.000000+000… ┆ 5.0   ┆ b      │
│ 3       ┆ 2025-01-03T00:00:00.000000+000… ┆ 6.0   ┆ c      │
└─────────┴─────────────────────────────────┴───────┴────────┘


In [7]:
df.write_parquet("../data/output.parquet")

In [8]:
pl.read_parquet("../data/output.parquet")

integer,date,float,string
i64,"datetime[μs, UTC]",f64,str
1,2025-01-01 00:00:00 UTC,4.0,"""a"""
2,2025-01-02 00:00:00 UTC,5.0,"""b"""
3,2025-01-03 00:00:00 UTC,6.0,"""c"""


In [9]:
df.select("*")

integer,date,float,string
i64,"datetime[μs, UTC]",f64,str
1,2025-01-01 00:00:00 UTC,4.0,"""a"""
2,2025-01-02 00:00:00 UTC,5.0,"""b"""
3,2025-01-03 00:00:00 UTC,6.0,"""c"""


In [10]:
df.select("date", "integer")

date,integer
"datetime[μs, UTC]",i64
2025-01-01 00:00:00 UTC,1
2025-01-02 00:00:00 UTC,2
2025-01-03 00:00:00 UTC,3


In [11]:
df.select(["date", "integer"])

date,integer
"datetime[μs, UTC]",i64
2025-01-01 00:00:00 UTC,1
2025-01-02 00:00:00 UTC,2
2025-01-03 00:00:00 UTC,3


In [12]:
df.select(pl.col("date", "integer"))

date,integer
"datetime[μs, UTC]",i64
2025-01-01 00:00:00 UTC,1
2025-01-02 00:00:00 UTC,2
2025-01-03 00:00:00 UTC,3


In [13]:
df.select(pl.col(["date", "integer"]))

date,integer
"datetime[μs, UTC]",i64
2025-01-01 00:00:00 UTC,1
2025-01-02 00:00:00 UTC,2
2025-01-03 00:00:00 UTC,3


In [14]:
df.select(pl.col("string") + "1", pl.col("float") + 1, "date")

string,float,date
str,f64,"datetime[μs, UTC]"
"""a1""",5.0,2025-01-01 00:00:00 UTC
"""b1""",6.0,2025-01-02 00:00:00 UTC
"""c1""",7.0,2025-01-03 00:00:00 UTC


In [15]:
# Pitfall: Python evaluates `"string" == "a"` to the plain bool False before
# Polars ever sees it, so this filters on a constant and no rows come back.
# Use `pl.col("string") == "a"` to compare the column's values instead.
df.filter("string" == "a")

integer,date,float,string
i64,"datetime[μs, UTC]",f64,str


In [16]:
df.filter(pl.col("string") == "a")

integer,date,float,string
i64,"datetime[μs, UTC]",f64,str
1,2025-01-01 00:00:00 UTC,4.0,"""a"""


In [17]:
df.select(pl.col("string") == "a")

string
bool
True
False
False


In [18]:
# Intentional failure: `df.select(...)` returns a DataFrame, and `filter` rejects
# a DataFrame as a predicate, raising the TypeError shown below.
df.filter(df.select(pl.col("string") == "a"))

TypeError: invalid predicate for `filter`: shape: (3, 1)
┌────────┐
│ string │
│ ---    │
│ bool   │
╞════════╡
│ true   │
│ false  │
│ false  │
└────────┘

In [19]:
df.filter((pl.col("string") <= "b") & (pl.col("float").is_not_nan()))

integer,date,float,string
i64,"datetime[μs, UTC]",f64,str
1,2025-01-01 00:00:00 UTC,4.0,"""a"""
2,2025-01-02 00:00:00 UTC,5.0,"""b"""


In [20]:
df.filter((pl.col("string") <= "b") & (pl.col("float").is_not_nan())).select(
    pl.col("float")
)

float
f64
4.0
5.0


In [21]:
df.with_columns(
    pl.col("integer").sum().alias("sum_int"), (pl.col("float") + 10).alias("float + 10")
)

integer,date,float,string,sum_int,float + 10
i64,"datetime[μs, UTC]",f64,str,i64,f64
1,2025-01-01 00:00:00 UTC,4.0,"""a""",6,14.0
2,2025-01-02 00:00:00 UTC,5.0,"""b""",6,15.0
3,2025-01-03 00:00:00 UTC,6.0,"""c""",6,16.0


In [22]:
df2 = pl.DataFrame(
    {
        "x": range(8),
        "y": ["A", "A", "A", "B", "B", "C", "X", "X"],
    }
)

In [23]:
df2

x,y
i64,str
0,"""A"""
1,"""A"""
2,"""A"""
3,"""B"""
4,"""B"""
5,"""C"""
6,"""X"""
7,"""X"""


In [24]:
df2.group_by("y", maintain_order=True).len()

y,len
str,u32
"""A""",3
"""B""",2
"""C""",1
"""X""",2


In [25]:
df2.group_by("y", "x", maintain_order=True).len()

y,x,len
str,i64,u32
"""A""",0,1
"""A""",1,1
"""A""",2,1
"""B""",3,1
"""B""",4,1
"""C""",5,1
"""X""",6,1
"""X""",7,1


In [26]:
# `GroupBy.count()` is deprecated in favour of `len()`; passing `name="count"`
# keeps the result column called "count" without triggering the warning.
df2.group_by("y", maintain_order=True).len(name="count")

  df2.group_by("y", maintain_order=True).count()


y,count
str,u32
"""A""",3
"""B""",2
"""C""",1
"""X""",2


In [27]:
# Aggregate the "x" column by name. A wildcard combined with a fixed alias would
# give every non-key column the same output name, which only works while "x" is
# the sole non-key column in the frame.
df2.group_by("y", maintain_order=True).agg(
    pl.col("x").count().alias("count"), pl.col("x").sum().alias("sum")
)

y,count,sum
str,u32,i64
"""A""",3,3
"""B""",2,7
"""C""",1,5
"""X""",2,13


In [28]:
df.select(pl.all())

integer,date,float,string
i64,"datetime[μs, UTC]",f64,str
1,2025-01-01 00:00:00 UTC,4.0,"""a"""
2,2025-01-02 00:00:00 UTC,5.0,"""b"""
3,2025-01-03 00:00:00 UTC,6.0,"""c"""


In [29]:
# NOTE: `df` is rebound here to a new numeric frame; it no longer refers to the
# integer/date/float/string frame built at the top of the notebook.
df = pl.DataFrame(
    {
        "a": range(8),
        # Seeded generator so the values are identical on every Restart & Run All.
        "b": np.random.default_rng(42).random(8),
        # Mix of regular values, NaN and a null (None) to show how they differ.
        "d": [1.0, 2.0, float("nan"), float("nan"), 0.0, -5.0, -42.0, None],
    }
)

# Same contents as the `df2` defined earlier; repeated so this section can be
# read on its own.
df2 = pl.DataFrame(
    {
        "x": range(8),
        "y": ["A", "A", "A", "B", "B", "C", "X", "X"],
    }
)

In [30]:
df

a,b,d
i64,f64,f64
0,0.822403,1.0
1,0.963259,2.0
2,0.303286,
3,0.981165,
4,0.436694,0.0
5,0.67776,-5.0
6,0.974445,-42.0
7,0.595284,


In [31]:
df2

x,y
i64,str
0,"""A"""
1,"""A"""
2,"""A"""
3,"""B"""
4,"""B"""
5,"""C"""
6,"""X"""
7,"""X"""


In [32]:
df.join(df2, left_on=pl.col("a"), right_on=pl.col("x"))

a,b,d,y
i64,f64,f64,str
0,0.822403,1.0,"""A"""
1,0.963259,2.0,"""A"""
2,0.303286,,"""A"""
3,0.981165,,"""B"""
4,0.436694,0.0,"""B"""
5,0.67776,-5.0,"""C"""
6,0.974445,-42.0,"""X"""
7,0.595284,,"""X"""


In [33]:
df.hstack(df2)

a,b,d,x,y
i64,f64,f64,i64,str
0,0.822403,1.0,0,"""A"""
1,0.963259,2.0,1,"""A"""
2,0.303286,,2,"""A"""
3,0.981165,,3,"""B"""
4,0.436694,0.0,4,"""B"""
5,0.67776,-5.0,5,"""C"""
6,0.974445,-42.0,6,"""X"""
7,0.595284,,7,"""X"""


In [35]:
# Intentional failure: after the rename, df2's "b" column holds strings while
# df's "b" column is Float64, so the vertical stack raises the SchemaError below.
df.select(pl.col("a", "b")).vstack(df2.rename({"x": "a", "y": "b"}))

SchemaError: type String is incompatible with expected type Float64

In [36]:
df.describe()

statistic,a,b,d
str,f64,f64,f64
"""count""",8.0,8.0,7.0
"""null_count""",0.0,0.0,1.0
"""mean""",3.5,0.719287,
"""std""",2.44949,0.260181,
"""min""",0.0,0.303286,-42.0
"""25%""",2.0,0.595284,0.0
"""50%""",4.0,0.822403,1.0
"""75%""",5.0,0.963259,
"""max""",7.0,0.981165,2.0


In [37]:
df.shape

(8, 3)

In [40]:
df

a,b,d
i64,f64,f64
0,0.822403,1.0
1,0.963259,2.0
2,0.303286,
3,0.981165,
4,0.436694,0.0
5,0.67776,-5.0
6,0.974445,-42.0
7,0.595284,


In [101]:
# Two length-4 vectors used in the vector examples below.
u = np.array([2, 7, 5, 6])
v = np.array([3, 4, 8, 6])

# A 3x4 matrix (3 rows, 4 columns) used in the matrix examples below.
V = np.array(
    [
        [1, 1, 2, 0],
        [0, 0.5, 1, 2],
        [0, 2, 1, 0.5],
    ]
)

In [67]:
# addition
u + v

array([ 5, 11, 13, 12])

In [68]:
# subtraction
u - v

array([-1,  3, -3,  0])

In [69]:
# scalar multiplication
2 * v

array([ 6,  8, 16, 12])

In [70]:
# dot product (vector-vector multiplication)
u @ v

np.int64(110)

In [71]:
u.shape[0]

4

In [76]:
def vector_vector_multiplication(u, v):
    """Compute the dot product of ``u`` and ``v`` with an explicit loop.

    Args:
        u: Array-like exposing ``.shape`` and integer indexing.
        v: Array-like with the same length as ``u`` along axis 0.

    Returns:
        The sum of ``u[i] * v[i]`` over all positions ``i``; ``0`` when the
        inputs are empty.

    Raises:
        AssertionError: If the two inputs differ in length along axis 0.
    """
    length = u.shape[0]
    assert length == v.shape[0]

    total = 0
    for idx in range(length):
        # Accumulate one element-wise product per position.
        total += u[idx] * v[idx]
    return total


vector_vector_multiplication(u, v)

np.int64(110)

In [102]:
u

array([2, 7, 5, 6])

In [103]:
V

array([[1. , 1. , 2. , 0. ],
       [0. , 0.5, 1. , 2. ],
       [0. , 2. , 1. , 0.5]])

In [105]:
V @ u

array([19. , 20.5, 22. ])

In [106]:
u.shape

(4,)

In [107]:
V.shape

(3, 4)

In [114]:
def matrix_vector_multiplication(U, v):
    """Multiply matrix ``U`` by vector ``v`` one row at a time.

    Args:
        U: 2-D array whose number of columns equals the length of ``v``.
        v: 1-D array.

    Returns:
        A float array with one entry per row of ``U``, each entry being the
        dot product of that row with ``v``.

    Raises:
        AssertionError: If ``U.shape[1]`` differs from ``v.shape[0]``.
    """
    assert U.shape[1] == v.shape[0]

    product = np.zeros(U.shape[0])
    for row_idx, row in enumerate(U):
        # Each output entry is the dot product of one matrix row with v.
        product[row_idx] = vector_vector_multiplication(row, v)
    return product


matrix_vector_multiplication(V, u)


array([19. , 20.5, 22. ])

In [130]:
# (3, 4) matrix times its (4, 3) transpose gives a symmetric (3, 3) matrix.
V @ V.transpose()

array([[6.  , 2.5 , 4.  ],
       [2.5 , 5.25, 3.  ],
       [4.  , 3.  , 5.25]])

In [131]:
V

array([[1. , 1. , 2. , 0. ],
       [0. , 0.5, 1. , 2. ],
       [0. , 2. , 1. , 0.5]])

In [132]:
V.transpose()

array([[1. , 0. , 0. ],
       [1. , 0.5, 2. ],
       [2. , 1. , 1. ],
       [0. , 2. , 0.5]])

In [141]:
V.transpose().shape

(4, 3)

In [142]:
V.transpose()[:, 0]

array([1., 1., 2., 0.])