### Polars data analysis and transformation

The examples below use `.head()` to reduce the output to a few rows and take up less space.
If you want the full output, remove `.head()` from the code

This notebooked is divided into sections. If your code editor supports it, you can use the **Outline** functionality to easily go to the code section you are interested in.

For more details on Polars functions, check out the Polars API reference: https://pola-rs.github.io/polars/py-polars/html/reference/index.html

In [1]:
import polars as pl

In [2]:
# Configure the number of characters to show for each string column
pl.Config.set_fmt_str_lengths(30)

polars.config.Config

In [3]:
# Create a sample dataset for videogames reviews
data = {
    "title": ["Super Mario", "Zelda", "Super Mario", None, "Halo Infinite", "Zelda"],
    "rating": [9.5, 10, 9.5, None, 9, 8],
    "review_date": [
        "2023-01-15",
        "2023-01-20",
        "2023-01-15",
        None,
        "2023-02-01",
        "2023-03-01",
    ],
    "review_text": ["Amazing game!", "Fun", "Amazing game!", None, None, "Good game!"],
}

video_games_reviews = pl.DataFrame(data)

In [4]:
video_games_reviews

title,rating,review_date,review_text
str,f64,str,str
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
"""Zelda""",10.0,"""2023-01-20""","""Fun"""
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
,,,
"""Halo Infinite""",9.0,"""2023-02-01""",
"""Zelda""",8.0,"""2023-03-01""","""Good game!"""


#### Select a few rows of data

In [5]:
# Show the first 2 rows of the DataFrame
# We will use head extensively to show few rows of a dataframe
video_games_reviews.head(2)

title,rating,review_date,review_text
str,f64,str,str
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
"""Zelda""",10.0,"""2023-01-20""","""Fun"""


In [6]:
# Show the last 2 rows of the DataFrame
video_games_reviews.tail(2)

title,rating,review_date,review_text
str,f64,str,str
"""Halo Infinite""",9.0,"""2023-02-01""",
"""Zelda""",8.0,"""2023-03-01""","""Good game!"""


In [7]:
# Show a sample of 2 rows of the DataFrame
video_games_reviews.sample(2)

title,rating,review_date,review_text
str,f64,str,str
"""Zelda""",8.0,"""2023-03-01""","""Good game!"""
,,,


#### Information about the table

In [8]:
# Schema of the table: column names and types
video_games_reviews.schema

Schema([('title', String),
        ('rating', Float64),
        ('review_date', String),
        ('review_text', String)])

In [9]:
# Column names
video_games_reviews.columns

['title', 'rating', 'review_date', 'review_text']

In [10]:
# Column data types
video_games_reviews.dtypes

[String, Float64, String, String]

In [11]:
# Shape of the DataFrame: number of rows and columns
video_games_reviews.shape

(6, 4)

In [12]:
# Number of rows
video_games_reviews.height

6

In [13]:
# Number of columns
video_games_reviews.width

4

In [14]:
# Visualize statistics about the columns: count, mean, std, min, max, etc.
video_games_reviews.describe()

statistic,title,rating,review_date,review_text
str,str,f64,str,str
"""count""","""5""",5.0,"""5""","""4"""
"""null_count""","""1""",1.0,"""1""","""2"""
"""mean""",,9.2,,
"""std""",,0.758288,,
"""min""","""Halo Infinite""",8.0,"""2023-01-15""","""Amazing game!"""
"""25%""",,9.0,,
"""50%""",,9.5,,
"""75%""",,9.5,,
"""max""","""Zelda""",10.0,"""2023-03-01""","""Good game!"""


In [15]:
# Estimate the memory usage of the DataFrame
video_games_reviews.estimated_size("kb")

0.1787109375

#### Chain methods together

In [16]:
# Visualize statistics, and show only the first 3 rows
video_games_reviews.describe().head(3)

statistic,title,rating,review_date,review_text
str,str,f64,str,str
"""count""","""5""",5.0,"""5""","""4"""
"""null_count""","""1""",1.0,"""1""","""2"""
"""mean""",,9.2,,


#### Sort and add row numbers

In [17]:
# Sort the DataFrame by the time of review in descending order
video_games_reviews.sort("review_date", descending=True)

title,rating,review_date,review_text
str,f64,str,str
,,,
"""Zelda""",8.0,"""2023-03-01""","""Good game!"""
"""Halo Infinite""",9.0,"""2023-02-01""",
"""Zelda""",10.0,"""2023-01-20""","""Fun"""
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""


In [18]:
# Sort the DataFrame by the title and  time of review, both in descending order
video_games_reviews.sort(["title", "review_date"], descending=[True, True])

title,rating,review_date,review_text
str,f64,str,str
,,,
"""Zelda""",8.0,"""2023-03-01""","""Good game!"""
"""Zelda""",10.0,"""2023-01-20""","""Fun"""
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
"""Halo Infinite""",9.0,"""2023-02-01""",


In [19]:
# Add a row number to the dataframe
video_games_reviews.with_row_index()

index,title,rating,review_date,review_text
u32,str,f64,str,str
0,"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
1,"""Zelda""",10.0,"""2023-01-20""","""Fun"""
2,"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
3,,,,
4,"""Halo Infinite""",9.0,"""2023-02-01""",
5,"""Zelda""",8.0,"""2023-03-01""","""Good game!"""


In [20]:
# Add a row number to the dataframe, then sort in reverse order
video_games_reviews.with_row_index().reverse()

index,title,rating,review_date,review_text
u32,str,f64,str,str
5,"""Zelda""",8.0,"""2023-03-01""","""Good game!"""
4,"""Halo Infinite""",9.0,"""2023-02-01""",
3,,,,
2,"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
1,"""Zelda""",10.0,"""2023-01-20""","""Fun"""
0,"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""


#### Saving modifications

In [21]:
# To keep a modified Polars dataframe, we should save it
# We can save it to a new variable, or to the same variable
# Here we save it to a new variable
video_games_modified = video_games_reviews.with_row_index().reverse()

#### Select and rename columns

##### Select

In [22]:
# Select columns using their names
video_games_reviews.select("title", "review_text")

title,review_text
str,str
"""Super Mario""","""Amazing game!"""
"""Zelda""","""Fun"""
"""Super Mario""","""Amazing game!"""
,
"""Halo Infinite""",
"""Zelda""","""Good game!"""


In [23]:
# Select columns using a list with their names
video_games_reviews.select(["title", "review_text"]).head(2)

title,review_text
str,str
"""Super Mario""","""Amazing game!"""
"""Zelda""","""Fun"""


In [24]:
# Select columns using their names with pl.col()
video_games_reviews.select(pl.col("title"), pl.col("review_text")).head(2)

title,review_text
str,str
"""Super Mario""","""Amazing game!"""
"""Zelda""","""Fun"""


In [78]:
# Add suffix or prefix
video_games_reviews.select(
    pl.col("title").name.prefix("videogame_"),
    pl.col("rating").name.suffix("_out_of_10"),
    pl.col("review_text"),
).head(2)

videogame_title,rating_out_of_10,review_text
str,f64,str
"""Super Mario""",9.5,"""Amazing game!"""
"""Zelda""",10.0,"""Fun"""


In [79]:
# Select columns with pl.col() and rename them using keywords arguments (example: videogame_title = )
# Important: columns selected with keywords arguments should come after columns without keywords arguments
video_games_reviews.select(
    videogame_title=pl.col("title"),
    rating_out_of_10=pl.col("rating"),
    review=pl.col("review_text"),
).head(2)

videogame_title,rating_out_of_10,review
str,f64,str
"""Super Mario""",9.5,"""Amazing game!"""
"""Zelda""",10.0,"""Fun"""


In [80]:
# Select columns with pl.col() and rename them using alias
# alias should always be at the end of the expression
video_games_reviews.select(
    pl.col("title").alias("videogame_title"),
    pl.col("rating").alias("rating_out_of_10"),
    pl.col("review_text").alias("review"),
).head(2)

videogame_title,rating_out_of_10,review
str,f64,str
"""Super Mario""",9.5,"""Amazing game!"""
"""Zelda""",10.0,"""Fun"""


In [28]:
# Select columns based on their data type: select all string columns
video_games_reviews.select(pl.col(pl.String))

title,review_date,review_text
str,str,str
"""Super Mario""","""2023-01-15""","""Amazing game!"""
"""Zelda""","""2023-01-20""","""Fun"""
"""Super Mario""","""2023-01-15""","""Amazing game!"""
,,
"""Halo Infinite""","""2023-02-01""",
"""Zelda""","""2023-03-01""","""Good game!"""


In [30]:
# pl.all() can be used to select all columns
video_games_reviews.select(pl.all())

title,rating,review_date,review_text
str,f64,str,str
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
"""Zelda""",10.0,"""2023-01-20""","""Fun"""
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
,,,
"""Halo Infinite""",9.0,"""2023-02-01""",
"""Zelda""",8.0,"""2023-03-01""","""Good game!"""


In [32]:
# Select columns based on regex: all column names that contain 'review'
video_games_reviews.select(pl.col("^.*review.*$")).head(2)

review_date,review_text
str,str
"""2023-01-15""","""Amazing game!"""
"""2023-01-20""","""Fun"""


In [33]:
# Exclude columns that contain 'review'
# Exclude can be used with regex, or with column names, or with data types
video_games_reviews.select(pl.all().exclude("^.*review.*$")).head(2)

title,rating
str,f64
"""Super Mario""",9.5
"""Zelda""",10.0


##### Using Polars selectors

In [34]:
import polars.selectors as cs

# returns the first column
video_games_reviews.select(cs.first()).head(2)

title
str
"""Super Mario"""
"""Zelda"""


In [35]:
# returns all numeric columns
video_games_reviews.select(cs.numeric()).head(2)

rating
f64
9.5
10.0


In [36]:
# returns all columns containing "date"
video_games_reviews.select(cs.contains("date")).head(2)

review_date
str
"""2023-01-15"""
"""2023-01-20"""


In [37]:
# returns all numeric columns except columns containing "rating"
video_games_reviews.select(cs.numeric() - cs.contains("rating")).head(2)

##### With Columns

In [38]:
# Another way to select columns is using with_columns() instead of select()
# with_columns() includes all columns by default, and
# adds or modifies the specified columns
video_games_reviews.with_columns(
    pl.col("rating") * 20,  # modified column
)

title,rating,review_date,review_text
str,f64,str,str
"""Super Mario""",190.0,"""2023-01-15""","""Amazing game!"""
"""Zelda""",200.0,"""2023-01-20""","""Fun"""
"""Super Mario""",190.0,"""2023-01-15""","""Amazing game!"""
,,,
"""Halo Infinite""",180.0,"""2023-02-01""",
"""Zelda""",160.0,"""2023-03-01""","""Good game!"""


#### Add columns

In [39]:
# Add and modify columns
# if the column has the same name => modifies a column
# if the column has a different name => adds a new column
# Use pl.lit() to add a new column based on a constant value
video_games_reviews.select(
    pl.col("title"), pl.col("rating"), category=pl.lit("videogame")
)

title,rating,category
str,f64,str
"""Super Mario""",9.5,"""videogame"""
"""Zelda""",10.0,"""videogame"""
"""Super Mario""",9.5,"""videogame"""
,,"""videogame"""
"""Halo Infinite""",9.0,"""videogame"""
"""Zelda""",8.0,"""videogame"""


In [40]:
# Add and modify columns
# All columns are calculated in parallel, so rating_out_of_10 depends on the initial rating column
# and not on the modified rating column
video_games_reviews.select(
    pl.col("rating") * 20,  # modified column
    rating_out_of_10=pl.col("rating"),  # new column
).head(2)

rating,rating_out_of_10
f64,f64
190.0,9.5
200.0,10.0


In [41]:
# If we want the new column to depend on the modified column, we can add a new context
# where we do the calculation
# A new context is a new select or with_columns
video_games_reviews.select(
    pl.col("rating") * 20,  # modified column
).with_columns(  # new context
    rating_out_of_10=pl.col("rating")  # new column
).head(2)

rating,rating_out_of_10
f64,f64
190.0,190.0
200.0,200.0


In [42]:
# When adding columns with with_columns(), non-specified columns are included by default
# with_columns() includes all columns by default, and
# adds or modifies the specified columns
video_games_reviews.with_columns(
    pl.col("rating") * 20,  # modified column
    rating_out_of_10=pl.col("rating"),  # new column
).head(2)

title,rating,review_date,review_text,rating_out_of_10
str,f64,str,str,f64
"""Super Mario""",190.0,"""2023-01-15""","""Amazing game!""",9.5
"""Zelda""",200.0,"""2023-01-20""","""Fun""",10.0


In [43]:
# Combine 2 columns together
# We need to cast the rating column to a string data type (we see this later)
video_games_reviews.select(
    review_text_and_score=pl.col("review_text")
    + pl.lit(" - ")
    + pl.col("rating").cast(pl.String)
).head(2)

review_text_and_score
str
"""Amazing game! - 9.5"""
"""Fun - 10.0"""


In [44]:
# Combine columns using pl.format
video_games_reviews.select(
    review_text_and_score=pl.format("review: {}, on {}", "review_text", "review_date")
).head(2)

review_text_and_score
str
"""review: Amazing game!, on 2023…"
"""review: Fun, on 2023-01-20"""


#### Methods on dataframes and expressions

In [45]:
# Methods can be applied on the dataframe
video_games_reviews.min()

title,rating,review_date,review_text
str,f64,str,str
"""Halo Infinite""",8.0,"""2023-01-15""","""Amazing game!"""


In [46]:
# Methods can also be applied on expressions
# In this example, the result is the same
video_games_reviews.select(pl.all().min())

title,rating,review_date,review_text
str,f64,str,str
"""Halo Infinite""",8.0,"""2023-01-15""","""Amazing game!"""


In [47]:
# Sorting the dataframe sorts all rows in the dataframe based on the review date
video_games_reviews.sort(by="review_date")

title,rating,review_date,review_text
str,f64,str,str
,,,
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
"""Zelda""",10.0,"""2023-01-20""","""Fun"""
"""Halo Infinite""",9.0,"""2023-02-01""",
"""Zelda""",8.0,"""2023-03-01""","""Good game!"""


In [48]:
# Sorting the column using an expression sorts only the column
# In this example, the result of applying on the dataframe and on the column is not the same
# With expressions, each column is calculated independently of the other ones
video_games_reviews.with_columns(pl.col("review_date").sort())

title,rating,review_date,review_text
str,f64,str,str
"""Super Mario""",9.5,,"""Amazing game!"""
"""Zelda""",10.0,"""2023-01-15""","""Fun"""
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
,,"""2023-01-20""",
"""Halo Infinite""",9.0,"""2023-02-01""",
"""Zelda""",8.0,"""2023-03-01""","""Good game!"""


#### Advanced column selection

In [49]:
pl.Config.set_tbl_cols(7)

polars.config.Config

In [50]:
# Calculate the quantiles of the rating column: one decile per column
video_games_reviews.select(
    pl.col("rating").quantile(i / 100).alias(f"perc_{i}") for i in range(0, 101, 10)
)

perc_0,perc_10,perc_20,…,perc_80,perc_90,perc_100
f64,f64,f64,…,f64,f64,f64
8.0,8.0,9.0,…,9.5,10.0,10.0


In [51]:
# Count the number of unique values for each column
video_games_reviews.select(pl.all().n_unique())

title,rating,review_date,review_text
u32,u32,u32,u32
4,5,5,4


In [52]:
# Verify which columns have less than 5 unique values
video_games_reviews.select(pl.all().n_unique() < 5)

title,rating,review_date,review_text
bool,bool,bool,bool
True,False,False,True


In [53]:
# Extract the columns with 4 unique values or less
# .any() returns True if any value is True so it filters the columns
[
    column.name
    for column in video_games_reviews.select(pl.all().n_unique() < 5)
    if column.any()
]

['title', 'review_text']

In [54]:
# Select columns that have 4 unique values or less
video_games_reviews.select(
    column.name
    for column in video_games_reviews.select(pl.all().n_unique() <= 4)
    if column.any()
).head(2)

title,review_text
str,str
"""Super Mario""","""Amazing game!"""
"""Zelda""","""Fun"""


#### Filter columns

In [55]:
# Filter dataframe based on one value
video_games_reviews.filter(pl.col("title") == "Zelda")

title,rating,review_date,review_text
str,f64,str,str
"""Zelda""",10.0,"""2023-01-20""","""Fun"""
"""Zelda""",8.0,"""2023-03-01""","""Good game!"""


In [56]:
# Filter dataframe based on multiple values
video_games_reviews.filter(pl.col("title").is_in(["Zelda", "Super Mario"]))

title,rating,review_date,review_text
str,f64,str,str
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
"""Zelda""",10.0,"""2023-01-20""","""Fun"""
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
"""Zelda""",8.0,"""2023-03-01""","""Good game!"""


In [57]:
# Filter dataframe based on multiple values : use AND &
video_games_reviews.filter((pl.col("rating") == 8) & (pl.col("title") == "Zelda"))

title,rating,review_date,review_text
str,f64,str,str
"""Zelda""",8.0,"""2023-03-01""","""Good game!"""


In [58]:
# Filter dataframe based on multiple values : use OR |
video_games_reviews.filter((pl.col("rating") == 9.5) | (pl.col("title") == "Zelda"))

title,rating,review_date,review_text
str,f64,str,str
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
"""Zelda""",10.0,"""2023-01-20""","""Fun"""
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
"""Zelda""",8.0,"""2023-03-01""","""Good game!"""


In [59]:
# Filter dataframe using numerical values: ratings bigger or equal to 8 and less than 10
video_games_reviews.filter((pl.col("rating") >= 8) & (pl.col("rating") < 10))

title,rating,review_date,review_text
str,f64,str,str
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
"""Halo Infinite""",9.0,"""2023-02-01""",
"""Zelda""",8.0,"""2023-03-01""","""Good game!"""


In [60]:
# Filter dataframe using regex: filter all videogame titles that contain 'Super' or 'Infinite'
video_games_reviews.filter(pl.col("title").str.contains("Super|Infinite"))

title,rating,review_date,review_text
str,f64,str,str
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
"""Halo Infinite""",9.0,"""2023-02-01""",


In [61]:
# Filter based on a calculation: show the rating with the latest date
video_games_reviews.filter(pl.col("review_date") == pl.col("review_date").max())

title,rating,review_date,review_text
str,f64,str,str
"""Zelda""",8.0,"""2023-03-01""","""Good game!"""


#### Unique and duplicated values

In [62]:
# For reviews that appear more than once, keep only one review
# I keep only one review for Super Mario that has duplicated reviews
video_games_reviews.unique()

title,rating,review_date,review_text
str,f64,str,str
"""Halo Infinite""",9.0,"""2023-02-01""",
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
"""Zelda""",10.0,"""2023-01-20""","""Fun"""
,,,
"""Zelda""",8.0,"""2023-03-01""","""Good game!"""


In [63]:
# Show only the reviews appearing once
# This fully remove Super Mario who had the same review twice
video_games_reviews.filter(video_games_reviews.is_unique())

title,rating,review_date,review_text
str,f64,str,str
"""Zelda""",10.0,"""2023-01-20""","""Fun"""
,,,
"""Halo Infinite""",9.0,"""2023-02-01""",
"""Zelda""",8.0,"""2023-03-01""","""Good game!"""


In [64]:
# Show only duplicated reviews:
# Show only reviews appearing more than once in the table
video_games_reviews.filter(video_games_reviews.is_duplicated())

title,rating,review_date,review_text
str,f64,str,str
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""


In [65]:
# Show only games with one review
# Applying is_unique() to title gives the rows where the title is unique
video_games_reviews.filter(pl.col("title").is_unique())

title,rating,review_date,review_text
str,f64,str,str
,,,
"""Halo Infinite""",9.0,"""2023-02-01""",


In [66]:
# Show only games with 2 or more reviews
video_games_reviews.filter(pl.col("title").is_duplicated())

title,rating,review_date,review_text
str,f64,str,str
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
"""Zelda""",10.0,"""2023-01-20""","""Fun"""
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
"""Zelda""",8.0,"""2023-03-01""","""Good game!"""


In [67]:
# Count the number of unique reviews
# Excludes the duplicated review for Super Mario
video_games_reviews.n_unique()

5

In [68]:
# Count the number of reviews that are duplicates
video_games_reviews.height - video_games_reviews.n_unique()

1

In [69]:
# Keep only one review for each combination of title and rating
video_games_reviews.unique(subset=["title", "rating"])

title,rating,review_date,review_text
str,f64,str,str
"""Zelda""",8.0,"""2023-03-01""","""Good game!"""
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
,,,
"""Halo Infinite""",9.0,"""2023-02-01""",
"""Zelda""",10.0,"""2023-01-20""","""Fun"""


In [70]:
# Show all unique values for rating
# Equivalent to the SQL command: SELECT DISTINCT rating FROM video_games_reviews
video_games_reviews.select(pl.col("rating").unique())

rating
f64
""
8.0
9.0
9.5
10.0


In [71]:
# Count the unique values for each column
video_games_reviews.select(pl.all().n_unique())

title,rating,review_date,review_text
u32,u32,u32,u32
4,5,5,4


In [72]:
# Show unique value and their count for a column
video_games_reviews.select(pl.col("title").value_counts(sort=True))

title
struct[2]
"{""Super Mario"",2}"
"{""Zelda"",2}"
"{null,1}"
"{""Halo Infinite"",1}"


#### Expression expansion and pl.struct()

In [77]:
# Polars expressions referring to more than one column expand to a list of expressions
# This applies to all expressions
# pl.col('title','rating').is_duplicated() expands to
# [pl.col('title').is_duplicated(), pl.col('rating').is_duplicated()]
video_games_reviews.with_columns(
    pl.col("title", "rating").is_duplicated().name.suffix("_is_duplicated")
)

title,rating,review_date,review_text,title_is_duplicated,rating_is_duplicated
str,f64,str,str,bool,bool
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!""",True,True
"""Zelda""",10.0,"""2023-01-20""","""Fun""",True,False
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!""",True,True
,,,,False,False
"""Halo Infinite""",9.0,"""2023-02-01""",,False,False
"""Zelda""",8.0,"""2023-03-01""","""Good game!""",True,False


In [None]:
# To use a function on the combination of multiple columns, and not on each column separately,
# we can use the pl.struct() function
# pl.struct() creates a new column with a 'struct' that corresponds to the combination of the columns
video_games_reviews.with_columns(
    pl.struct("title", "rating").alias("struct_title_rating")
)

title,rating,review_date,review_text,struct_title_rating
str,f64,str,str,struct[2]
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!""","{""Super Mario"",9.5}"
"""Zelda""",10.0,"""2023-01-20""","""Fun""","{""Zelda"",10.0}"
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!""","{""Super Mario"",9.5}"
,,,,"{null,null}"
"""Halo Infinite""",9.0,"""2023-02-01""",,"{""Halo Infinite"",9.0}"
"""Zelda""",8.0,"""2023-03-01""","""Good game!""","{""Zelda"",8.0}"


In [None]:
# We can apply functions to the struct column
video_games_reviews.with_columns(
    pl.struct("title", "rating").is_duplicated().alias("title_rating_is_duplicated")
)

title,rating,review_date,review_text,title_rating_is_duplicated
str,f64,str,str,bool
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!""",True
"""Zelda""",10.0,"""2023-01-20""","""Fun""",False
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!""",True
,,,,False
"""Halo Infinite""",9.0,"""2023-02-01""",,False
"""Zelda""",8.0,"""2023-03-01""","""Good game!""",False


#### Missing Values

In [None]:
# Count missing values for each column
video_games_reviews.null_count()

title,rating,review_date,review_text
u32,u32,u32,u32
1,1,1,2


In [None]:
# Filter rows with missing values for text column
video_games_reviews.filter(pl.col("review_text").is_null())

title,rating,review_date,review_text
str,f64,str,str
,,,
"""Halo Infinite""",9.0,"""2023-02-01""",


In [None]:
# Drop rows with missing values for review text column
video_games_reviews.filter(pl.col("review_text").is_not_null())

title,rating,review_date,review_text
str,f64,str,str
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
"""Zelda""",10.0,"""2023-01-20""","""Fun"""
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
"""Zelda""",8.0,"""2023-03-01""","""Good game!"""


In [None]:
# Drop all rows where at least one value is missing
video_games_reviews.drop_nulls()

title,rating,review_date,review_text
str,f64,str,str
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
"""Zelda""",10.0,"""2023-01-20""","""Fun"""
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
"""Zelda""",8.0,"""2023-03-01""","""Good game!"""


In [None]:
# Drop rows where either review text and title are null
video_games_reviews.drop_nulls(subset=["title", "review_text"])

title,rating,review_date,review_text
str,f64,str,str
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
"""Zelda""",10.0,"""2023-01-20""","""Fun"""
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
"""Zelda""",8.0,"""2023-03-01""","""Good game!"""


In [None]:
# Fill null values for review_text column with 'No review'
video_games_reviews.with_columns(pl.col("review_text").fill_null("No review"))

title,rating,review_date,review_text
str,f64,str,str
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
"""Zelda""",10.0,"""2023-01-20""","""Fun"""
"""Super Mario""",9.5,"""2023-01-15""","""Amazing game!"""
,,,"""No review"""
"""Halo Infinite""",9.0,"""2023-02-01""","""No review"""
"""Zelda""",8.0,"""2023-03-01""","""Good game!"""


In [None]:
# Create a new dataset to demonstrate additional functions for filling nulls
df_missing_values = pl.DataFrame(
    {"a": [1, None, None, 9], "b": [1, None, 9, 10], "c": [6, None, 6, None]}
)

df_missing_values

a,b,c
i64,i64,i64
1.0,1.0,6.0
,,
,9.0,6.0
9.0,10.0,


In [None]:
# Replace missing values with the median of the column
df_missing_values.with_columns(pl.all().fill_null(pl.all().median()))

a,b,c
f64,f64,f64
1.0,1.0,6.0
5.0,9.0,6.0
5.0,9.0,6.0
9.0,10.0,6.0


In [None]:
# Interpolate missing values for numbers
df_missing_values.interpolate()

a,b,c
f64,f64,f64
1.0,1.0,6.0
3.666667,5.0,6.0
6.333333,9.0,6.0
9.0,10.0,


In [None]:
# Forward fill missing values for a column
df_missing_values.with_columns(pl.col("a").forward_fill())

a,b,c
i64,i64,i64
1,1.0,6.0
1,,
1,9.0,6.0
9,10.0,


In [None]:
# Forward fill missing values for all columns
df_missing_values.fill_null(strategy="forward")

a,b,c
i64,i64,i64
1,1,6
1,1,6
1,9,6
9,10,6


In [None]:
# Backward fill missing values for all columns
df_missing_values.fill_null(strategy="backward")

a,b,c
i64,i64,i64
1,1,6.0
9,9,6.0
9,9,6.0
9,10,


#### Conditions using `all` and `any`

In [None]:
# We use the dataframe missing_values to demonstrate verifying multiple conditions
df_missing_values = pl.DataFrame(
    {"a": [1, None, None, 9], "b": [1, None, 9, 10], "c": [6, None, 6, None]}
)

df_missing_values

a,b,c
i64,i64,i64
1.0,1.0,6.0
,,
,9.0,6.0
9.0,10.0,


In [None]:
# Identify which values are missing for all columns
df_missing_values.with_columns(pl.col("a", "b").is_null().name.suffix("_is_null"))

a,b,c,a_is_null,b_is_null
i64,i64,i64,bool,bool
1.0,1.0,6.0,False,False
,,,True,True
,9.0,6.0,True,False
9.0,10.0,,False,False


In [None]:
df_missing_values.with_columns(
    pl.all_horizontal(pl.col("a", "b").is_null()).alias("cols_a_b_are_null")
)

a,b,c,cols_a_b_are_null
i64,i64,i64,bool
1.0,1.0,6.0,False
,,,True
,9.0,6.0,False
9.0,10.0,,False


In [None]:
# Filter all rows with all values missing
df_missing_values.filter(pl.all_horizontal(pl.col("a", "b").is_null()))

a,b,c
i64,i64,i64
,,


In [None]:
# Filter all rows with at least one missing value
df_missing_values.filter(pl.any_horizontal(pl.all().is_null()))

a,b,c
i64,i64,i64
,,
,9.0,6.0
9.0,10.0,


In [None]:
# Filter all rows with no missing values
df_missing_values.filter(pl.all_horizontal(pl.col("a", "b").is_not_null()))

a,b,c
i64,i64,i64
1,1,6.0
9,10,


#### Slicing and indexing

In [None]:
# Keep 2 rows starting at row 3
video_games_reviews.slice(3, 2)

title,rating,review_date,review_text
str,f64,str,str
,,,
"""Halo Infinite""",9.0,"""2023-02-01""",


In [None]:
# Square brackets indexing should be limited to:
# 1. Extract a scalar value (another option is using .item() )
# 2. Convert a DataFrame to a Series (another option is using .get_column() )
# 3. Inspecting some rows or columns
# In general, select, with_columns, and filter should be used instead of square brackets indexing
# ! Disadvantages: Square brackets selecting only works in eager mode, and is not parallelized
# Select 2 rows starting at row 3
video_games_reviews[3:5]

title,rating,review_date,review_text
str,f64,str,str
,,,
"""Halo Infinite""",9.0,"""2023-02-01""",


In [None]:
# Select 3 rows starting at row 5 for columns title and rating
video_games_reviews[3:5, ["title", "rating"]]

title,rating
str,f64
,
"""Halo Infinite""",9.0


In [None]:
# Select 3 rows starting at row 5 for columns 0 and 1
video_games_reviews[3:5, [0, 1]]

title,rating
str,f64
,
"""Halo Infinite""",9.0


In [None]:
# Select columns title and rating using indexes
video_games_reviews[["title", "rating"]]

title,rating
str,f64
"""Super Mario""",9.5
"""Zelda""",10.0
"""Super Mario""",9.5
,
"""Halo Infinite""",9.0
"""Zelda""",8.0


#### Extracting columns, rows, dataframes and items

In [None]:
# Extract a column from a DataFrame and convert it to a Series
video_games_reviews.get_column("title").head(2)

title
str
"""Super Mario"""
"""Zelda"""


In [None]:
# Extract all columns and convert them to a list of Series
video_games_reviews.head(1).get_columns()

[shape: (1,)
 Series: 'title' [str]
 [
 	"Super Mario"
 ],
 shape: (1,)
 Series: 'rating' [f64]
 [
 	9.5
 ],
 shape: (1,)
 Series: 'review_date' [str]
 [
 	"2023-01-15"
 ],
 shape: (1,)
 Series: 'review_text' [str]
 [
 	"Amazing game!"
 ]]

In [None]:
# Convert a column to a list using .to_list()
video_games_reviews.get_column("title").to_list()

['Super Mario', 'Zelda', 'Super Mario', None, 'Halo Infinite', 'Zelda']

In [None]:
# Extract rows from a dataframe as a list of tuples
# ! This materializes all the rows in memory. It's expensive and should be avoided when possible
video_games_reviews.rows()

[('Super Mario', 9.5, '2023-01-15', 'Amazing game!'),
 ('Zelda', 10.0, '2023-01-20', 'Fun'),
 ('Super Mario', 9.5, '2023-01-15', 'Amazing game!'),
 (None, None, None, None),
 ('Halo Infinite', 9.0, '2023-02-01', None),
 ('Zelda', 8.0, '2023-03-01', 'Good game!')]

In [None]:
# Extract rows from a dataframe as a list of dicts (more expensive)
# ! This materializes all the rows in memory. It's expensive and should be avoided when possible
video_games_reviews.rows(named=True)

[{'title': 'Super Mario',
  'rating': 9.5,
  'review_date': '2023-01-15',
  'review_text': 'Amazing game!'},
 {'title': 'Zelda',
  'rating': 10.0,
  'review_date': '2023-01-20',
  'review_text': 'Fun'},
 {'title': 'Super Mario',
  'rating': 9.5,
  'review_date': '2023-01-15',
  'review_text': 'Amazing game!'},
 {'title': None, 'rating': None, 'review_date': None, 'review_text': None},
 {'title': 'Halo Infinite',
  'rating': 9.0,
  'review_date': '2023-02-01',
  'review_text': None},
 {'title': 'Zelda',
  'rating': 8.0,
  'review_date': '2023-03-01',
  'review_text': 'Good game!'}]

In [None]:
# Extract rows from a dataframe as an iterator
# ! Export methods of Polars should be preferred instead of iterating over rows
video_games_reviews.iter_rows()

<generator object DataFrame.iter_rows at 0x7f0f2c179700>

In [None]:
# Extract slices of 1000 rows from a dataframe as an iterator
video_games_reviews.iter_slices(n_rows=1000)

<generator object DataFrame.iter_slices at 0x7f0f2c1799a0>

In [None]:
# Extract a list of Dataframes partitioned based on the specified column
video_games_reviews.partition_by("title")

[shape: (2, 4)
 ┌─────────────┬────────┬─────────────┬───────────────┐
 │ title       ┆ rating ┆ review_date ┆ review_text   │
 │ ---         ┆ ---    ┆ ---         ┆ ---           │
 │ str         ┆ f64    ┆ str         ┆ str           │
 ╞═════════════╪════════╪═════════════╪═══════════════╡
 │ Super Mario ┆ 9.5    ┆ 2023-01-15  ┆ Amazing game! │
 │ Super Mario ┆ 9.5    ┆ 2023-01-15  ┆ Amazing game! │
 └─────────────┴────────┴─────────────┴───────────────┘,
 shape: (2, 4)
 ┌───────┬────────┬─────────────┬─────────────┐
 │ title ┆ rating ┆ review_date ┆ review_text │
 │ ---   ┆ ---    ┆ ---         ┆ ---         │
 │ str   ┆ f64    ┆ str         ┆ str         │
 ╞═══════╪════════╪═════════════╪═════════════╡
 │ Zelda ┆ 10.0   ┆ 2023-01-20  ┆ Fun         │
 │ Zelda ┆ 8.0    ┆ 2023-03-01  ┆ Good game!  │
 └───────┴────────┴─────────────┴─────────────┘,
 shape: (1, 4)
 ┌───────┬────────┬─────────────┬─────────────┐
 │ title ┆ rating ┆ review_date ┆ review_text │
 │ ---   ┆ ---    ┆ ---  

In [None]:
# Extract a scalar using .item()
# .item() transforms a table of one row and one column into a scalar
video_games_reviews.head(1).select("title").item()

'Super Mario'

In [None]:
# Convert a dataframe to a Python dictionary using .to_dict()
video_games_reviews.to_dict(as_series=False)

{'title': ['Super Mario',
  'Zelda',
  'Super Mario',
  None,
  'Halo Infinite',
  'Zelda'],
 'rating': [9.5, 10.0, 9.5, None, 9.0, 8.0],
 'review_date': ['2023-01-15',
  '2023-01-20',
  '2023-01-15',
  None,
  '2023-02-01',
  '2023-03-01'],
 'review_text': ['Amazing game!',
  'Fun',
  'Amazing game!',
  None,
  None,
  'Good game!']}

In [None]:
# Convert the dataframe to a Pandas using .to_pandas()
video_games_reviews.to_pandas()

Unnamed: 0,title,rating,review_date,review_text
0,Super Mario,9.5,2023-01-15,Amazing game!
1,Zelda,10.0,2023-01-20,Fun
2,Super Mario,9.5,2023-01-15,Amazing game!
3,,,,
4,Halo Infinite,9.0,2023-02-01,
5,Zelda,8.0,2023-03-01,Good game!


In [None]:
# Convert the dataframe to a Numpy array using .to_numpy()
video_games_reviews.to_numpy()

array([['Super Mario', 9.5, '2023-01-15', 'Amazing game!'],
       ['Zelda', 10.0, '2023-01-20', 'Fun'],
       ['Super Mario', 9.5, '2023-01-15', 'Amazing game!'],
       [None, nan, None, None],
       ['Halo Infinite', 9.0, '2023-02-01', None],
       ['Zelda', 8.0, '2023-03-01', 'Good game!']], dtype=object)

#### Mathematical and statistical operations

In [None]:
# Sums all values of a column
video_games_reviews.sum()

title,rating,review_date,review_text
str,f64,str,str
,46.0,,


In [None]:
# Get the maximum values of each column
video_games_reviews.max()

title,rating,review_date,review_text
str,f64,str,str
"""Zelda""",10.0,"""2023-03-01""","""Good game!"""


In [None]:
# Get the minimum values of each column
video_games_reviews.min()

title,rating,review_date,review_text
str,f64,str,str
"""Halo Infinite""",8.0,"""2023-01-15""","""Amazing game!"""


In [None]:
# Get the median value of each column
# Does not work for string or date columns
video_games_reviews.median()

title,rating,review_date,review_text
str,f64,str,str
,9.5,,


In [None]:
# Get the mean value of each column
# Does not work for string or date columns
video_games_reviews.mean()

title,rating,review_date,review_text
str,f64,str,str
,9.2,,


In [None]:
# Get the 10th percentile value of each column
# Does not work for string or date columns
video_games_reviews.quantile(0.1)

title,rating,review_date,review_text
str,f64,str,str
,8.0,,


In [None]:
# Calculates the variance between values in a column
video_games_reviews.var()

title,rating,review_date,review_text
str,f64,str,str
,0.575,,


In [None]:
# Calculates the standard deviation between values in a column
video_games_reviews.std()

title,rating,review_date,review_text
str,f64,str,str
,0.758288,,


In [None]:
import numpy as np

# Use a Numpy universal function: np.exp on a column
video_games_reviews.with_columns(np.exp(pl.col("rating")))

title,rating,review_date,review_text
str,f64,str,str
"""Super Mario""",13359.72683,"""2023-01-15""","""Amazing game!"""
"""Zelda""",22026.465795,"""2023-01-20""","""Fun"""
"""Super Mario""",13359.72683,"""2023-01-15""","""Amazing game!"""
,,,
"""Halo Infinite""",8103.083928,"""2023-02-01""",
"""Zelda""",2980.957987,"""2023-03-01""","""Good game!"""
