# Import Polars 

In [1]:
import polars as pl
print(pl.__version__)

1.5.0


In [2]:
insurance_df = pl.read_csv("Insurance.csv")

In [3]:
type(insurance_df)

polars.dataframe.frame.DataFrame

In [4]:
insurance_df.head()

id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
i64,str,i64,i64,f64,i64,str,str,f64,f64,i64,i64
0,"""Male""",21,1,35.0,0,"""1-2 Year""","""Yes""",65101.0,124.0,187,0
1,"""Male""",43,1,28.0,0,"""> 2 Years""","""Yes""",58911.0,26.0,288,1
2,"""Female""",25,1,14.0,1,"""< 1 Year""","""No""",38043.0,152.0,254,0
3,"""Female""",35,1,1.0,0,"""1-2 Year""","""Yes""",2630.0,156.0,76,0
4,"""Female""",36,1,15.0,1,"""1-2 Year""","""No""",31951.0,152.0,294,0


#  Attributes Methods 

In [6]:
series1  = insurance_df["Annual_Premium"]

In [7]:
type(series1)

polars.series.series.Series

## polars.Series.dtype

In [8]:
series1.dtype

Float64

## polars.Series.name

In [9]:
series1.name

'Annual_Premium'

## polars.Series.shape

In [10]:
series1.shape

(11504798,)

## polars.Series.flags

In [11]:
series1.flags

{'SORTED_ASC': False, 'SORTED_DESC': False}

# Boolean Methods

## polars.Series.all

In [12]:
bool_series = pl.Series([True, True, True])
bool_series.all()

True

In [13]:
mixed_series = pl.Series([True, False, True])
mixed_series.all()

False

In [14]:
(series1 >0).all() # series1 is the col from the insurance table 

True

## polars.Series.any

In [16]:
bool_series = pl.Series([False, False, True])
bool_series.any()

True

In [17]:
(series1 <1000).any()

False

In [18]:
(series1 > 500).any

<bound method Series.any of shape: (11_504_798,)
Series: 'Annual_Premium' [bool]
[
	true
	true
	true
	true
	true
	…
	true
	true
	true
	true
	true
]>

## polars.Series.not_ explanation

In [19]:
bool_series = pl.Series([False, False, True])
inverted_series = bool_series.not_()
inverted_series

true
True
False


In [21]:
condition = series1 >500 
inverted_condition = condition.not_()
print(inverted_condition)

shape: (11_504_798,)
Series: 'Annual_Premium' [bool]
[
	false
	false
	false
	false
	false
	…
	false
	false
	false
	false
	false
]


# Categories Methods 

In [24]:
series2 = insurance_df["Gender"]
cat_series = series2.cast(pl.Categorical)

## polars.Series.cat.get_categories

In [26]:
cat_series.cat.get_categories()

Gender
str
"""Male"""
"""Female"""


## polars.Series.cat.is_local

In [27]:
cat_series.cat.is_local()

True

## Descriptive 

## polars.Series.describe

In [28]:
series1.describe()

statistic,value
str,f64
"""count""",11504798.0
"""null_count""",0.0
"""mean""",30461.370411
"""std""",16454.745205
"""min""",2630.0
"""25%""",25277.0
"""50%""",31824.0
"""75%""",39451.0
"""max""",540165.0


## polars.Series.has_nulls

In [29]:
series2.has_nulls()

False

## polars.Series.is_duplicated

In [31]:
series1.is_duplicated()

Annual_Premium
bool
true
true
true
true
true
…
true
true
true
true


## polars.Series.is_empty

In [32]:
series1.is_empty()

False

In [33]:
series2.is_empty()

False

## polars.Series.is_finite

In [34]:
series1.is_finite()

Annual_Premium
bool
true
true
true
true
true
…
true
true
true
true


## polars.Series.n_unique

In [36]:
series1.n_unique()

51728

In [37]:
series2.n_unique()

2

## polars.Series.unique_counts

In [39]:
series2.unique_counts()

Gender
u32
6228134
5276664


## polars.Series.value_counts

In [40]:
series2.value_counts()

Gender,count
str,u32
"""Female""",5276664
"""Male""",6228134


## Compuatation 

In [41]:
# polars.Series.cum_count
series1.cum_count()

Annual_Premium
u32
1
2
3
4
5
…
11504794
11504795
11504796
11504797


In [42]:
# polars.Series.log
series1.log()

Annual_Premium
f64
11.083695
10.983783
10.546472
7.874739
10.371959
…
10.218736
10.292451
7.874739
10.788143


In [43]:
# polars.Series.kurtosis

series1.kurtosis()

24.59795591513306

In [44]:
# polars.Series.skew

series1.skew()

0.7769367289427773

In [45]:
# polars.Series.cos

series1.cos()

Annual_Premium
f64
0.608179
0.989442
-0.115977
-0.883765
0.538017
…
0.033295
-0.999796
-0.883765
0.936348


In [48]:
# polars.Series.cos

s = pl.Series([11, 12, 31])
s.exp()

59874.141715
162754.791419
29049000000000.0


In [49]:
# polars.Series.abs

s = pl.Series([21,32,-43,91,-12])
s.abs()

21
32
43
91
12


# Export Methods 

In [50]:
series_arrow = series1.to_arrow()
type(series_arrow)

pyarrow.lib.DoubleArray

In [51]:
series_frame = series1.to_frame()
type(series_frame)

polars.dataframe.frame.DataFrame

In [53]:
series_list = series1.to_list()
type(series_list)

list

In [54]:
series_list - series1.to_numpy()
type(series_list)

list

In [55]:
series_pandas = series1.to_pandas()
type(series_pandas)

pandas.core.series.Series

# List Methods 

In [58]:
s_list1 = pl.Series("Course",[["Maths","Programming","Product","Engineering"],["Data"]])
s_list2 = pl.Series("Training",[["Online"],["Assessment",None]])

In [59]:
# polars.Series.list.concat
s_list1.list.concat(s_list2)

Course
list[str]
"[""Maths"", ""Programming"", … ""Online""]"
"[""Data"", ""Assessment"", null]"


In [62]:
s_list1

Course
list[str]
"[""Maths"", ""Programming"", … ""Engineering""]"
"[""Data""]"


In [69]:
# polars.Series.list.contains
s_list1 = pl.Series("Course",[["Maths","Programming","Product","Engineering"],["Data"]])
s_list1.list.contains("Maths")

Course
bool
True
False


In [70]:
# polars.Series.list.explode

s_list1.list.explode()

Course
str
"""Maths"""
"""Programming"""
"""Product"""
"""Engineering"""
"""Data"""


In [74]:
## polars.Series.list.mean

s_list =pl.Series("Marks",[[20,21],[45,60]])
s_list.list.mean()

Marks
f64
20.5
52.5


In [75]:
# polars.Series.list.reverse
s_list1 = pl.Series("Course",[["Maths","Programming","Product","Engineering"],["Data"]])
s_list1.list.reverse()


Course
list[str]
"[""Engineering"", ""Product"", … ""Maths""]"
"[""Data""]"


In [78]:
s_list1

Course
list[str]
"[""Maths"", ""Programming"", … ""Engineering""]"
"[""Data""]"


In [79]:
# polars.Series.list.slice

list_series = pl.Series("lists", [[1, 2, 3, 4], [5, 6, 7], [8, 9]])
sliced_series = list_series.list.slice(1, 2)
print(sliced_series)

shape: (3,)
Series: 'lists' [list[i64]]
[
	[2, 3]
	[6, 7]
	[9]
]


In [80]:
sliced_series = list_series.list.slice(-2, 2)
print(sliced_series)

shape: (3,)
Series: 'lists' [list[i64]]
[
	[3, 4]
	[6, 7]
	[8, 9]
]


In [81]:
# polars.Series.list.n_unique

list_series = pl.Series("lists", [[1, 2, 2, 3], [4, 5, 4], [6, 6, 6]])
unique_counts = list_series.list.n_unique()
print(unique_counts)

shape: (3,)
Series: 'lists' [u32]
[
	3
	2
	1
]


In [82]:
## polars.Series.list.unique
list_series = pl.Series("lists", [[1, 2, 2, 3], [4, 5, 4], [6, 6, 6]])
unique_elements = list_series.list.unique()
print(unique_elements)


shape: (3,)
Series: 'lists' [list[i64]]
[
	[1, 2, 3]
	[4, 5]
	[6]
]


# Array Methods 

In [88]:
## polars.Series.arr.max
series_arr = pl.Series("Array", [[2, 3, 4], [5, 3, 7], [8, 9,10]], dtype=pl.Array(pl.Int64, 3))
series_arr.arr.max()

Array
i64
4
7
10


In [89]:
# polars.Series.arr.to_list
series_arr.to_list()

[[2, 3, 4], [5, 3, 7], [8, 9, 10]]

In [90]:
# polars.Series.arr.unique
series_arr.arr.unique()

Array
list[i64]
"[2, 3, 4]"
"[3, 5, 7]"
"[8, 9, 10]"


In [91]:
# polars.Series.arr.n_unique
series_arr.arr.n_unique()

Array
u32
3
3
3


In [93]:
# polars.Series.arr.sort

series_arr.arr.sort(descending=True)

Array
"array[i64, 3]"
"[4, 3, 2]"
"[7, 5, 3]"
"[10, 9, 8]"


In [94]:
# polars.Series.arr.first

series_arr.arr.first()

Array
i64
2
5
8


In [95]:
# polars.Series.arr.explode

series_arr.arr.explode()

Array
i64
2
3
4
5
3
7
8
9
10


In [96]:
# polars.Series.arr.contains

series_arr.arr.contains(3)

Array
bool
True
True
False


# Aggregation Methods  

In [97]:
series1  = insurance_df["Annual_Premium"]

In [98]:
# polars.Series.arg_min
series1.arg_min()

3

In [99]:
# polars.Series.arg_max
series1.arg_max()

586353

In [100]:
# polars.Series.max
series1.max()

540165.0

In [101]:
# polars.Series.mean
series1.mean()

30461.370410588694

In [102]:
# polars.Series.median
series1.median()

31824.0

In [103]:
# polars.Series.min
series1.min()

2630.0

In [104]:
# polars.Series.mode
series1.mode()

Annual_Premium
f64
2630.0


In [108]:
# polars.Series.quantile
series1.quantile(0.5)

31824.0

In [109]:
# polars.Series.std
series1.std()

16454.745205061357

In [110]:
# polars.Series.sum
series1.sum()

350451913377.0

In [111]:
# polars.Series.var
series1.var()

270758639.7634897

# Manipulation/Selection

In [112]:
# polars.Series.alias

series1.alias("Premium")

Premium
f64
65101.0
58911.0
38043.0
2630.0
31951.0
…
27412.0
29509.0
2630.0
48443.0


In [113]:
# polars.Series.append

s1 = pl.Series("countries", ["India","Srilanka","Sudan","Germany"])
s2 = pl.Series("more_countries",["Australia","Pakistan","Norway"])
s1.append(s2)

countries
str
"""India"""
"""Srilanka"""
"""Sudan"""
"""Germany"""
"""Australia"""
"""Pakistan"""
"""Norway"""


In [115]:
# polars.Series.clear
s1.clear()

countries
str


In [123]:
# polars.Series.clip

series = pl.Series("values", [1, 5, 10, 15, 20])
clipped_series = series.clip(6,18)

print(clipped_series)

shape: (5,)
Series: 'values' [i64]
[
	6
	6
	10
	15
	18
]


In [124]:
clipped_min_series = series.clip(lower_bound=5)
clipped_max_series = series.clip(upper_bound=15)
print(clipped_min_series)
print(clipped_max_series) 

shape: (5,)
Series: 'values' [i64]
[
	5
	5
	10
	15
	20
]
shape: (5,)
Series: 'values' [i64]
[
	1
	5
	10
	15
	15
]


In [126]:
#polars.Series.clone
series_clone = series1.clone()
series_clone

Annual_Premium
f64
65101.0
58911.0
38043.0
2630.0
31951.0
…
27412.0
29509.0
2630.0
48443.0


In [128]:
# polars.Series.filter

series1.filter(series1 > 3000)

Annual_Premium
f64
65101.0
58911.0
38043.0
31951.0
28150.0
…
29974.0
27412.0
29509.0
48443.0


In [131]:
# polars.Series.floor
s = pl.Series("a", [1.12345, 2.56789, 3.901234,4.1567])
s.floor()

a
f64
1.0
2.0
3.0
4.0


In [132]:
# polars.Series.gather

s.gather([1,3])

a
f64
2.56789
4.1567


In [137]:
# polars.Series.reshape

series = pl.Series("values", list(range(9)))
reshaped_series = series.reshape((3, 3))
print(reshaped_series)

shape: (3,)
Series: 'values' [array[i64, 3]]
[
	[0, 1, 2]
	[3, 4, 5]
	[6, 7, 8]
]


In [138]:
#polars.Series.shuffle

series1.shuffle()

Annual_Premium
f64
29390.0
36263.0
31389.0
32863.0
25166.0
…
29057.0
34709.0
31904.0
37223.0


In [141]:
# polars.Series.to_dummies
series2.to_dummies()

Gender_Female,Gender_Male
u8,u8
0,1
0,1
1,0
1,0
1,0
…,…
0,1
1,0
1,0
1,0


In [142]:
# polars.Series.unique
series2.unique().sort()

Gender
str
"""Female"""
"""Male"""


In [143]:
# polars.Series.drop_nulls

series1.drop_nulls()

Annual_Premium
f64
65101.0
58911.0
38043.0
2630.0
31951.0
…
27412.0
29509.0
2630.0
48443.0


In [144]:
# polars.Series.drop_nans

series1.drop_nans()

Annual_Premium
f64
65101.0
58911.0
38043.0
2630.0
31951.0
…
27412.0
29509.0
2630.0
48443.0


# Miscellaneous 

In [145]:
# polars.Series.equals
s1 = pl.Series("countries", ["India","Srilanka","Sudan","Germany"])
s2 = pl.Series("more_countries",["Australia","Pakistan","Norway"])
s3 = pl.Series("countries", ["India","Srilanka","Sudan","Germany"])

s1.equals(s3)

True

In [146]:
s1.equals(s2)

False

In [149]:
# polars.Series.map_elements
series = pl.Series("numbers", [1, 2, 3, 4, 5])
def square(x):
    return x * x

squared_series = series.map_elements(square,return_dtype=pl.Int64)
squared_series

Series.map_elements is significantly slower than the native series API.
Only use if you absolutely CANNOT implement your logic otherwise.
Replace this expression...
  - srs.map_elements(square)
with this one instead:
  + s * s

  squared_series = series.map_elements(square,return_dtype=pl.Int64)


numbers
i64
1
4
9
16
25


In [150]:
# polars.Series.get_chunks

series1.get_chunks()

[shape: (1_458_252,)
 Series: 'Annual_Premium' [f64]
 [
 	65101.0
 	58911.0
 	38043.0
 	2630.0
 	31951.0
 	…
 	2630.0
 	2630.0
 	2630.0
 	48511.0
 	2630.0
 ],
 shape: (1_438_979,)
 Series: 'Annual_Premium' [f64]
 [
 	55976.0
 	36289.0
 	31821.0
 	41823.0
 	2630.0
 	…
 	51478.0
 	27382.0
 	33887.0
 	28420.0
 	28983.0
 ],
 shape: (1_438_940,)
 Series: 'Annual_Premium' [f64]
 [
 	20447.0
 	25949.0
 	51512.0
 	34347.0
 	52388.0
 	…
 	47965.0
 	34695.0
 	2630.0
 	32584.0
 	57239.0
 ],
 shape: (1_438_982,)
 Series: 'Annual_Premium' [f64]
 [
 	32111.0
 	27658.0
 	32965.0
 	2630.0
 	2630.0
 	…
 	39647.0
 	29317.0
 	43762.0
 	24169.0
 	27147.0
 ],
 shape: (1_438_972,)
 Series: 'Annual_Premium' [f64]
 [
 	36141.0
 	31831.0
 	2630.0
 	2630.0
 	34122.0
 	…
 	27124.0
 	23589.0
 	2630.0
 	26644.0
 	46190.0
 ],
 shape: (1_438_921,)
 Series: 'Annual_Premium' [f64]
 [
 	30061.0
 	31565.0
 	48409.0
 	31939.0
 	37492.0
 	…
 	44231.0
 	32144.0
 	2630.0
 	45011.0
 	2630.0
 ],
 shape: (1_437_357,)
 Series: 

# Struct Methods

In [None]:
# polars.Series.struct.field

series = pl.Series("people", [{"name": "Alice", "age": 30}, {"name": "Bob", "age": 25}])
age_series = series.struct.field("name")
print(age_series)

# polars.Series.struct.rename_fields
series.struct.fields

In [155]:
series.struct.rename_fields(["Name", "Experience"])

people
struct[2]
"{""Alice"",30}"
"{""Bob"",25}"


In [156]:
series.struct.fields

['name', 'age']

In [157]:
# polars.Series.struct.unnest

series.struct.unnest()


name,age
str,i64
"""Alice""",30
"""Bob""",25


# Temporal Functions 

In [None]:
datetime_series = pl.Series("datetime_column", [
    "2024-03-01 12:34:40",
    "2024-04-02 15:20:30",
    "2024-05-03 09:15:45"
])
datetime_series = datetime_series.str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S")

In [158]:
# polars.Series.dt.date
date_series = datetime_series.dt.date()
print(date_series)

shape: (3,)
Series: 'datetime_column' [date]
[
	2024-03-01
	2024-04-02
	2024-05-03
]


In [159]:
# polars.Series.dt.datetime

datetime_series.dt.datetime()

  datetime_series.dt.datetime()


datetime_column
datetime[μs]
2024-03-01 12:34:40
2024-04-02 15:20:30
2024-05-03 09:15:45


In [160]:
# polars.Series.dt.day

datetime_series.dt.day()

datetime_column
i8
1
2
3


In [161]:
# datetime_series.dt.hour

datetime_series.dt.hour()

datetime_column
i8
12
15
9


In [162]:
# polars.Series.dt.time

datetime_series.dt.time()

datetime_column
time
12:34:40
15:20:30
09:15:45


In [163]:
# polars.Series.dt.timestamp
datetime_series.dt.timestamp()

datetime_column
i64
1709296480000000
1712071230000000
1714727745000000


In [164]:
# polars.Series.dt.week
datetime_series.dt.week()

datetime_column
i8
9
14
18


# There are also methods for String and Binary operations. 