### Polars data analysis and transformation

The examples below use `.head()` to reduce the output to a few rows and take up less space.
If you want the full output, remove `.head()` from the code.

This notebook is divided into sections. If your code editor supports it, you can use the **Outline** functionality to easily go to the code section you are interested in.

For more details, check out the Polars API reference: https://pola-rs.github.io/polars/py-polars/html/reference/index.html

In [3]:
import numpy as np
import polars as pl

In [4]:
# Load a CSV file into a Polars DataFrame
# infer_schema_length=10000 : number of rows scanned to infer the column data types
# try_parse_dates=True : try to parse date-like columns as dates (review_date is loaded as a Date)
video_games_reviews = pl.read_csv('../datasets/reviews_videogames_350_simplified.csv', infer_schema_length=10000, try_parse_dates=True)

In [5]:
# Configure the number of characters to show for each string column
pl.Config.set_fmt_str_lengths(15)

polars.config.Config

#### Select a few rows of data

In [6]:
# Show the first 2 rows of the DataFrame (head() without an argument returns 5 rows)
# We will use head extensively to show only a few rows of a dataframe
video_games_reviews.head(2)

title,rating,review_date,review_text
str,f64,date,str
"""Killzone: Shad…",5.0,2016-12-02,"""First time hav…"
"""Resident Evil …",5.0,2014-07-29,"""good"""


In [7]:
# Show the last 2 rows of the DataFrame
video_games_reviews.tail(2)

title,rating,review_date,review_text
str,f64,date,str
"""Grand Theft Au…",5.0,2015-12-03,"""Fantastic game…"
"""Resident Evil …",5.0,2013-03-06,"""muy buen produ…"


In [8]:
# Show a random sample of 2 rows of the DataFrame
# The rows returned change on every run because no seed is specified
video_games_reviews.sample(2)

title,rating,review_date,review_text
str,f64,date,str
"""Mass Effect - …",1.0,2008-10-14,"""Buyer beware -…"
"""Wii Fit Game w…",5.0,2009-01-02,"""Our family has…"


#### Information about the table

In [9]:
# Schema of the table: column names and types
video_games_reviews.schema

{'title': Utf8, 'rating': Float64, 'review_date': Date, 'review_text': Utf8}

In [10]:
# Column names
video_games_reviews.columns

['title', 'rating', 'review_date', 'review_text']

In [11]:
# Column data types
video_games_reviews.dtypes

[Utf8, Float64, Date, Utf8]

In [12]:
# Shape of the DataFrame: number of rows and columns
video_games_reviews.shape

(49577, 4)

In [13]:
# Number of rows
video_games_reviews.height

49577

In [14]:
# Number of columns
video_games_reviews.width

4

In [15]:
# Visualize statistics about the columns: count, mean, std, min, max, etc.
video_games_reviews.describe()

describe,title,rating,review_date,review_text
str,str,f64,str,str
"""count""","""49577""",49577.0,"""49577""","""49577"""
"""null_count""","""418""",0.0,"""0""","""17"""
"""mean""",,4.317345,,
"""std""",,1.16256,,
"""min""","""Alan Wake - Xb…",1.0,"""2001-10-01""",""" batman"""
"""max""","""inFAMOUS: Seco…",5.0,"""2018-08-27""","""~PROS~ +Fantas…"
"""median""",,5.0,,


In [16]:
# Estimate the memory usage of the DataFrame
video_games_reviews.estimated_size('mb')

33.72313117980957

#### Chain methods together

In [17]:
# Visualize statistics, and show only the first 3 rows
video_games_reviews.describe().head(3)

describe,title,rating,review_date,review_text
str,str,f64,str,str
"""count""","""49577""",49577.0,"""49577""","""49577"""
"""null_count""","""418""",0.0,"""0""","""17"""
"""mean""",,4.317345,,


#### Sort and add row numbers

In [18]:
# Sort the DataFrame by the time of review in descending order and show the first 2 rows
video_games_reviews.sort('review_date', descending=True).head(2)

title,rating,review_date,review_text
str,f64,date,str
"""Batman: Arkham…",5.0,2018-08-27,"""Was good but I…"
"""Call of Duty: …",5.0,2018-08-14,"""Great game"""


In [19]:
# Sort the DataFrame by the time of review in ascending order and show the first 2 rows
video_games_reviews.sort('review_date', descending=False).head(2)

title,rating,review_date,review_text
str,f64,date,str
"""Halo - Xbox""",5.0,2001-10-01,"""Forget all you…"
"""Halo - Xbox""",5.0,2001-10-01,"""Forget all you…"


In [20]:
# Sort the DataFrame by the video game title in descending order,
# then by the time of review in descending order (one descending flag per sort column)
video_games_reviews.sort(['title','review_date'], descending=[True, True]).head(2)

title,rating,review_date,review_text
str,f64,date,str
"""inFAMOUS: Seco…",4.0,2018-06-23,"""game case got …"
"""inFAMOUS: Seco…",4.0,2018-06-11,"""The boy played…"


In [21]:
# Add a row number to the dataframe
video_games_reviews.with_row_count().head(2)

row_nr,title,rating,review_date,review_text
u32,str,f64,date,str
0,"""Killzone: Shad…",5.0,2016-12-02,"""First time hav…"
1,"""Resident Evil …",5.0,2014-07-29,"""good"""


In [22]:
# Add a row number to the dataframe, then reverse the order of the rows
# reverse() flips the row order, it does not sort: the last row (row_nr 49576) comes first
video_games_reviews.with_row_count().reverse().head(2)

row_nr,title,rating,review_date,review_text
u32,str,f64,date,str
49576,"""Resident Evil …",5.0,2013-03-06,"""muy buen produ…"
49575,"""Grand Theft Au…",5.0,2015-12-03,"""Fantastic game…"


#### Saving modifications

In [23]:
# To keep a modified Polars dataframe, we should save it
# We can save it to a new variable, or to the same variable
# Here we save it to a new variable 
video_games_modified = video_games_reviews.with_row_count().reverse()

#### Select and rename columns

##### Select

In [24]:
# Select columns using their names
video_games_reviews.select('title', 'review_text').head(2)

title,review_text
str,str
"""Killzone: Shad…","""First time hav…"
"""Resident Evil …","""good"""


In [25]:
# Select columns using a list with their names
video_games_reviews.select(['title', 'review_text']).head(2)

title,review_text
str,str
"""Killzone: Shad…","""First time hav…"
"""Resident Evil …","""good"""


In [26]:
# Select columns using their names with pl.col()
video_games_reviews.select(pl.col('title'), pl.col('review_text')).head(2)

title,review_text
str,str
"""Killzone: Shad…","""First time hav…"
"""Resident Evil …","""good"""


In [27]:
# Add suffix or prefix
video_games_reviews.select(
    pl.col('title').prefix('videogame_'), 
    pl.col('rating').suffix('_out_of_5'), 
    pl.col('review_text')
    ).head(2)

videogame_title,rating_out_of_5,review_text
str,f64,str
"""Killzone: Shad…",5.0,"""First time hav…"
"""Resident Evil …",5.0,"""good"""


In [28]:
# Select columns with pl.col() and rename them using keywords
# Important: columns selected with keywords (example: videogame_title = ) should come after columns without keywords
video_games_reviews.select(
    videogame_title = pl.col('title'), 
    rating_out_of_5 = pl.col('rating'), 
    review = pl.col('review_text')
    ).head(2)

videogame_title,rating_out_of_5,review
str,f64,str
"""Killzone: Shad…",5.0,"""First time hav…"
"""Resident Evil …",5.0,"""good"""


In [29]:
# Select columns with pl.col() and rename them using alias
video_games_reviews.select(
    pl.col('title').alias('videogame_title'), 
    pl.col('rating').alias('rating_out_of_5'), 
    pl.col('review_text').alias('review')
    ).head(2)

videogame_title,rating_out_of_5,review
str,f64,str
"""Killzone: Shad…",5.0,"""First time hav…"
"""Resident Evil …",5.0,"""good"""


In [30]:
# Select columns based on their data type: select all string columns
# pl.col(pl.Utf8) refers to all columns with a string data type
# and is expanded by Polars to the matching columns: pl.col('title'), pl.col('review_text')
video_games_reviews.select(pl.col(pl.Utf8)).head(2)

title,review_text
str,str
"""Killzone: Shad…","""First time hav…"
"""Resident Evil …","""good"""


In [31]:
# Select columns based on their data type: select all numerical columns
video_games_reviews.select(pl.col(pl.NUMERIC_DTYPES)).head(2)

rating
f64
5.0
5.0


In [32]:
# pl.all() can be used to select all columns
video_games_reviews.select(pl.all()).head(2)

title,rating,review_date,review_text
str,f64,date,str
"""Killzone: Shad…",5.0,2016-12-02,"""First time hav…"
"""Resident Evil …",5.0,2014-07-29,"""good"""


In [33]:
# pl.all() and exclude can be used to exclude some columns, for example exclude all numerical columns
video_games_reviews.select(pl.all().exclude(pl.NUMERIC_DTYPES)).head(2)

title,review_date,review_text
str,date,str
"""Killzone: Shad…",2016-12-02,"""First time hav…"
"""Resident Evil …",2014-07-29,"""good"""


In [34]:
# Select columns based on regex: all column names that contain 'review'
video_games_reviews.select(pl.col('^.*review.*$')).head(2)

review_date,review_text
date,str
2016-12-02,"""First time hav…"
2014-07-29,"""good"""


In [35]:
# Exclude certain columns: all columns except those that contain 'review'
# Exclude can be used with regex, or with column names, or with data types
video_games_reviews.select(pl.all().exclude('^.*review.*$')).head(2)

title,rating
str,f64
"""Killzone: Shad…",5.0
"""Resident Evil …",5.0


##### With Columns

In [36]:
# Another way to select columns is using with_columns()
# with_columns() includes all columns by default, and 
# adds or modifies the specified columns
video_games_reviews.with_columns(
    pl.col('rating') * 20, # modified column 
    ).head(2)

title,rating,review_date,review_text
str,f64,date,str
"""Killzone: Shad…",100.0,2016-12-02,"""First time hav…"
"""Resident Evil …",100.0,2014-07-29,"""good"""


#### Add columns

In [37]:
# Add a new column with the string 'videogame'
video_games_reviews.select(
    pl.col('title'),
    pl.col('rating'),
    category = pl.lit('videogame')
).head(2)

title,rating,category
str,f64,str
"""Killzone: Shad…",5.0,"""videogame"""
"""Resident Evil …",5.0,"""videogame"""


In [38]:
# Add and modify columns
# if the column has the same name => modifies a column
# if the column has a different name => adds a new column 
# All columns are calculated in parallel, so rating_out_of_5 depends on the initial rating column
# and not on the modified rating column
video_games_reviews.select(
    pl.col('rating') * 20, # modified column 
    rating_out_of_5 = pl.col('rating'), # new column
    ).head(2)

rating,rating_out_of_5
f64,f64
100.0,5.0
100.0,5.0


In [39]:
# If we want the new column to depend on the modified column, we can add a new context
# where we do the calculation
print(video_games_reviews.select(
    pl.col('rating') * 20, # modified column
).with_columns(
    rating_out_of_5 = pl.col('rating') # new column
).head(2))

shape: (2, 2)
┌────────┬─────────────────┐
│ rating ┆ rating_out_of_5 │
│ ---    ┆ ---             │
│ f64    ┆ f64             │
╞════════╪═════════════════╡
│ 100.0  ┆ 100.0           │
│ 100.0  ┆ 100.0           │
└────────┴─────────────────┘


In [40]:
# When adding columns with with_columns(), non-specified columns are included by default
# with_columns() includes all columns by default, and 
# adds or modifies the specified columns
video_games_reviews.with_columns(
    pl.col('rating') / 5 * 100, # modified column 
    rating_out_of_5 = pl.col('rating'), # new column
    ).head(2)

title,rating,review_date,review_text,rating_out_of_5
str,f64,date,str,f64
"""Killzone: Shad…",100.0,2016-12-02,"""First time hav…",5.0
"""Resident Evil …",100.0,2014-07-29,"""good""",5.0


In [41]:
# Combine 2 columns together
# We need to cast the rating column to a string data type (we see this later)
video_games_reviews.select(
    review_text_and_score = pl.col('review_text') + pl.lit(' - ') + pl.col('rating').cast(pl.Utf8)
    ).head(2)

review_text_and_score
str
"""First time hav…"
"""good - 5.0"""


In [42]:
# Combine columns using pl.format
video_games_reviews.select(
    review_text_and_score = pl.format('review: {}, on {}', 'review_text', 'review_date')
    ).head(2)

review_text_and_score
str
"""review: First …"
"""review: good, …"


#### Functions on dataframes and expressions

In [43]:
# Functions can be applied on the dataframe
video_games_reviews.min()

title,rating,review_date,review_text
str,f64,date,str
"""Alan Wake - Xb…",1.0,2001-10-01,""" batman"""


In [44]:
# Functions can also be applied on expressions
# In this example, the result is the same
video_games_reviews.select(pl.all().min())

title,rating,review_date,review_text
str,f64,date,str
"""Alan Wake - Xb…",1.0,2001-10-01,""" batman"""


In [45]:
# Here the result is different
# Sorting the dataframe sorts all rows in the dataframe based on the review date 
video_games_reviews.sort(by='review_date').head(2)

title,rating,review_date,review_text
str,f64,date,str
"""Halo - Xbox""",5.0,2001-10-01,"""Forget all you…"
"""Halo - Xbox""",5.0,2001-10-01,"""Forget all you…"


In [46]:
# Sorting the column using an expression sorts only the column
# the first review dates are for 'Halo' and not for 'Killzone' and 'Resident Evil'
# With expressions, each column is calculated independently of the other ones
# With dataframes, the columns are calculated together
video_games_reviews.with_columns(pl.col('review_date').sort()).head(2)

title,rating,review_date,review_text
str,f64,date,str
"""Killzone: Shad…",5.0,2001-10-01,"""First time hav…"
"""Resident Evil …",5.0,2001-10-01,"""good"""


In [47]:
# Sorting the non-string columns (which are 'rating' and 'review_date')
# There is no row with a rating of 1 and a date of 2001-10-01 in the dataset:
# what we see here are the columns rating and review_date sorted independently of each other
video_games_reviews.with_columns(pl.all().exclude(pl.Utf8).sort()).head(2)

title,rating,review_date,review_text
str,f64,date,str
"""Killzone: Shad…",1.0,2001-10-01,"""First time hav…"
"""Resident Evil …",1.0,2001-10-01,"""good"""


#### Advanced column selection

In [48]:
# Calculate the quantiles of the rating column: one decile per column
video_games_reviews.select(
    pl.col('rating').quantile(i/100).alias(f'perc_{i}') for i in range(0, 101, 10)
)

perc_0,perc_10,perc_20,perc_30,perc_40,perc_50,perc_60,perc_70,perc_80,perc_90,perc_100
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1.0,2.0,4.0,4.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0


In [49]:
# Count the number of unique values for each column
print(video_games_reviews.select(pl.all().n_unique()))

shape: (1, 4)
┌───────┬────────┬─────────────┬─────────────┐
│ title ┆ rating ┆ review_date ┆ review_text │
│ ---   ┆ ---    ┆ ---         ┆ ---         │
│ u32   ┆ u32    ┆ u32         ┆ u32         │
╞═══════╪════════╪═════════════╪═════════════╡
│ 102   ┆ 5      ┆ 4406        ┆ 35823       │
└───────┴────────┴─────────────┴─────────────┘


In [50]:
# Verify which columns have less than 200 unique values
print(video_games_reviews.select(pl.all().n_unique() < 200))

shape: (1, 4)
┌───────┬────────┬─────────────┬─────────────┐
│ title ┆ rating ┆ review_date ┆ review_text │
│ ---   ┆ ---    ┆ ---         ┆ ---         │
│ bool  ┆ bool   ┆ bool        ┆ bool        │
╞═══════╪════════╪═════════════╪═════════════╡
│ true  ┆ true   ┆ false       ┆ false       │
└───────┴────────┴─────────────┴─────────────┘


In [51]:
# Extract the columns with less than 200 unique values
# .any() returns True if any value is True so it filters the columns
[column.name for column in
    video_games_reviews.select(pl.all().n_unique() < 200)
    if column.any()]

['title', 'rating']

In [52]:
# Select columns that have less than 200 unique values
video_games_reviews.select(
    column.name for column in
    video_games_reviews.select(pl.all().n_unique() < 200)
    if column.any()).head(2)

title,rating
str,f64
"""Killzone: Shad…",5.0
"""Resident Evil …",5.0


#### Filter rows

In [53]:
# Filter dataframe based on one value
video_games_reviews.filter(
    pl.col('title') == 'The Last of Us Remastered - PlayStation 4'
    ).head(2)

title,rating,review_date,review_text
str,f64,date,str
"""The Last of Us…",5.0,2017-01-12,"""brings the DLC…"
"""The Last of Us…",5.0,2015-02-12,"""wow umm yea th…"


In [54]:
# Filter dataframe based on multiple values
video_games_reviews.filter(
    pl.col('title').is_in(['The Last of Us Remastered - PlayStation 4', 'Resident Evil 5 - Xbox 360'])
    ).head(2)


title,rating,review_date,review_text
str,f64,date,str
"""Resident Evil …",5.0,2014-07-29,"""good"""
"""The Last of Us…",5.0,2017-01-12,"""brings the DLC…"


In [55]:
# Filter dataframe based on multiple values : use AND &
video_games_reviews.filter(
    (pl.col('rating') == 1) &
    (pl.col('title') == 'inFAMOUS: Second Son Standard Edition (PlayStation 4)')
    ).head(2)

title,rating,review_date,review_text
str,f64,date,str
"""inFAMOUS: Seco…",1.0,2014-07-12,"""Going through …"
"""inFAMOUS: Seco…",1.0,2014-06-16,"""Horrible game,…"


In [56]:
# Filter dataframe based on multiple values : use OR |
video_games_reviews.filter(
    (pl.col('rating') == 1) |
    (pl.col('title') == 'inFAMOUS: Second Son Standard Edition (PlayStation 4)')
    ).head(2)


title,rating,review_date,review_text
str,f64,date,str
"""Bloodborne""",1.0,2015-07-13,"""Game blowd"""
"""inFAMOUS: Seco…",5.0,2017-01-23,"""Just awesome. …"


In [57]:
# Filter dataframe using numerical values: ratings greater than or equal to 1 and less than 3
video_games_reviews.filter(
    (pl.col('rating') >= 1) &
    (pl.col('rating') < 3)
    ).head(2)

title,rating,review_date,review_text
str,f64,date,str
"""Final Fantasy …",2.0,2014-08-24,"""This game didn…"
"""Bloodborne""",1.0,2015-07-13,"""Game blowd"""


In [58]:
# Filter dataframe using regex: filter all videogame titles that contain 'PlayStation 4' or 'Xbox 360'
video_games_reviews.filter(
    pl.col('title').str.contains('PlayStation 4|Xbox 360')
).head(2)

title,rating,review_date,review_text
str,f64,date,str
"""Killzone: Shad…",5.0,2016-12-02,"""First time hav…"
"""Resident Evil …",5.0,2014-07-29,"""good"""


In [59]:
# Filter based on a calculation: show the review(s) with the latest review date
# pl.col('review_date').max() is computed over the whole column and compared to each row
video_games_reviews.filter(
    pl.col('review_date') == pl.col('review_date').max()
).head(2)

title,rating,review_date,review_text
str,f64,date,str
"""Batman: Arkham…",5.0,2018-08-27,"""Was good but I…"


#### Unique and duplicated values

In [60]:
# Count the reviews that appear exactly once
# is_unique() flags rows without any duplicate, so every review that appears
# more than once is removed entirely; .height returns the number of remaining rows
video_games_reviews.filter(
    video_games_reviews.is_unique()
).height

33999

In [61]:
# Count the duplicated reviews:
# is_duplicated() flags every row that appears more than once in the table
# (all occurrences are kept); .height returns the number of remaining rows
video_games_reviews.filter(
    video_games_reviews.is_duplicated()
).height

15578

In [62]:
# Count the reviews written on a day that has only one review
video_games_reviews.filter(
    pl.col('review_date').is_unique()
).height

317

In [63]:
# Count the reviews written on a day that has 2 or more reviews
video_games_reviews.filter(
    pl.col('review_date').is_duplicated()
).height

49260

In [64]:
# For reviews that appear more than once, keep only one review
video_games_reviews.unique().head(2)

title,rating,review_date,review_text
str,f64,date,str
"""Uncharted 4: A…",5.0,2016-05-14,"""It's the best …"
"""Playstation Pl…",5.0,2017-05-31,"""PSN what else …"


In [65]:
# For reviews that appear more than once, keep only the first review
# Not specifying the keep parameter will result in keeping any review, and will be faster
video_games_reviews.unique(keep='first').head(2)

title,rating,review_date,review_text
str,f64,date,str
"""Assassin's Cre…",2.0,2015-02-19,"""not really the…"
"""Assassin's Cre…",5.0,2014-06-20,"""Arrrrrr matee!…"


In [66]:
# Count the number of unique reviews
video_games_reviews.n_unique()

41786

In [67]:
# Count the number of reviews that are duplicates
video_games_reviews.height - video_games_reviews.n_unique()

7791

In [68]:
# Keep only one review for each combination of title and rating
video_games_reviews.unique(subset=['title','rating']).head(2)

title,rating,review_date,review_text
str,f64,date,str
"""Razer DeathAdd…",3.0,2016-02-04,"""For years I ha…"
"""Borderlands - …",1.0,2013-03-21,"""Waste of time.…"


In [69]:
# Show all unique values for rating
# Equivalent to the SQL command: SELECT DISTINCT rating FROM video_games_reviews
video_games_reviews.select(pl.col('rating').unique())

rating
f64
1.0
2.0
3.0
4.0
5.0


In [70]:
# Count the unique values for each column
video_games_reviews.select(pl.all().n_unique())

title,rating,review_date,review_text
u32,u32,u32,u32
102,5,4406,35823


In [71]:
# Show unique value and their count for a column
print(video_games_reviews.select(pl.col('rating').value_counts(sort=True)))

shape: (5, 1)
┌─────────────┐
│ rating      │
│ ---         │
│ struct[2]   │
╞═════════════╡
│ {5.0,32858} │
│ {4.0,7632}  │
│ {3.0,4018}  │
│ {1.0,2969}  │
│ {2.0,2100}  │
└─────────────┘


#### Missing Values

In [111]:
# Count missing values for each column
video_games_reviews.null_count()

title,rating,review_date,review_text
u32,u32,u32,u32
418,0,0,17


In [113]:
# Filter missing values for text column
video_games_reviews.filter(pl.col('review_text').is_null()).head(2)

title,rating,review_date,review_text
str,f64,date,str
"""Batman: Arkham…",5.0,2016-07-26,
"""Minecraft""",5.0,2015-03-02,


In [74]:
# Drop rows with null values in the review_text column
video_games_reviews.filter(
    pl.col('review_text').is_not_null()
).head(2)

title,rating,review_date,review_text
str,f64,date,str
"""Killzone: Shad…",5.0,2016-12-02,"""First time hav…"
"""Resident Evil …",5.0,2014-07-29,"""good"""


In [115]:
# Verify that dropping nulls for the review_text column worked
# review_text has no nulls left; the title column still has its nulls because it was not filtered
video_games_reviews.filter(
    pl.col('review_text').is_not_null()
).null_count()

title,rating,review_date,review_text
u32,u32,u32,u32
418,0,0,0


In [76]:
# Fill null values for review_text column with '! No review'
video_games_reviews.with_columns(
    pl.col('review_text').fill_null('! No review')
).head(2)

title,rating,review_date,review_text
str,f64,date,str
"""Killzone: Shad…",5.0,2016-12-02,"""First time hav…"
"""Resident Evil …",5.0,2014-07-29,"""good"""


In [117]:
# Verify if filling nulls for review_text column worked
video_games_reviews.with_columns(
    pl.col('review_text').fill_null('! No review')
).filter(
    pl.col('review_text') == '! No review'
).head(2)

title,rating,review_date,review_text
str,f64,date,str
"""Batman: Arkham…",5.0,2016-07-26,"""! No review"""
"""Minecraft""",5.0,2015-03-02,"""! No review"""


In [126]:
# Create a new dataset to demonstrate additional functions for filling nulls
df_missing_values = pl.DataFrame({
    'a': [1, None, None, 9],
    'b': [1, None, 9, 10],
    'c': [6, None, 6, None]
})

df_missing_values

a,b,c
i64,i64,i64
1.0,1.0,6.0
,,
,9.0,6.0
9.0,10.0,


In [135]:
# Replace missing values with the median of the column
df_missing_values.with_columns(
    pl.all().fill_null(pl.all().median())
)

a,b,c
f64,f64,f64
1.0,1.0,6.0
5.0,9.0,6.0
5.0,9.0,6.0
9.0,10.0,6.0


In [137]:
# Interpolate missing values for numbers
df_missing_values.interpolate()

a,b,c
i64,i64,i64
1,1,6.0
3,5,6.0
6,9,6.0
9,10,


In [139]:
# Forward fill missing values for a column
df_missing_values.with_columns(
    pl.col('a').forward_fill()
)

a,b,c
i64,i64,i64
1,1.0,6.0
1,,
1,9.0,6.0
9,10.0,


In [140]:
# Forward fill missing values for all columns
print(df_missing_values.fill_null(strategy='forward'))

shape: (4, 3)
┌─────┬─────┬─────┐
│ a   ┆ b   ┆ c   │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 │
╞═════╪═════╪═════╡
│ 1   ┆ 1   ┆ 6   │
│ 1   ┆ 1   ┆ 6   │
│ 1   ┆ 9   ┆ 6   │
│ 9   ┆ 10  ┆ 6   │
└─────┴─────┴─────┘


In [124]:
# Backward fill missing values for all columns
df_missing_values.fill_null(strategy='backward')

a,b,c
i64,i64,i64
1,1,6.0
9,9,6.0
9,9,6.0
9,10,


#### Conditions using `all` and `any`

In [142]:
# We use the dataframe missing_values to demonstrate verifying multiple conditions
df_missing_values = pl.DataFrame({
    'a': [1, None, None, 9],
    'b': [1, None, 9, 10],
    'c': [6, None, 6, None]
})

df_missing_values

a,b,c
i64,i64,i64
1.0,1.0,6.0
,,
,9.0,6.0
9.0,10.0,


In [143]:
# Identify which values are missing for all columns
print(df_missing_values.select(
    pl.all().is_null()
))

shape: (4, 3)
┌───────┬───────┬───────┐
│ a     ┆ b     ┆ c     │
│ ---   ┆ ---   ┆ ---   │
│ bool  ┆ bool  ┆ bool  │
╞═══════╪═══════╪═══════╡
│ false ┆ false ┆ false │
│ true  ┆ true  ┆ true  │
│ true  ┆ false ┆ false │
│ false ┆ false ┆ true  │
└───────┴───────┴───────┘


In [145]:
# Filter all rows with all values missing
df_missing_values.filter(
    pl.all(pl.all().is_null())
)

a,b,c
i64,i64,i64
,,


In [147]:
# Filter all rows with at least one missing value
df_missing_values.filter(
    pl.any(pl.all().is_null())
)

a,b,c
i64,i64,i64
,,
,9.0,6.0
9.0,10.0,


In [150]:
# Filter all rows with no missing values in columns 'b' and 'c'
# Column 'a' is not checked, so rows where only 'a' is missing are kept
df_missing_values.filter(
    pl.all(pl.col('b','c').is_not_null())
)

a,b,c
i64,i64,i64
1.0,1,6
,9,6


#### Slicing and indexing

In [152]:
# Keep 3 rows starting at row 5
video_games_reviews.slice(5, 3)

title,rating,review_date,review_text
str,f64,date,str
"""Titanfall - Xb…",5.0,2015-03-04,"""As expected an…"
"""Assassin's Cre…",5.0,2012-10-31,"""It's a typical…"
"""Mario Kart 8 -…",5.0,2017-03-24,"""This is withou…"


In [89]:
# Square brackets indexing should be limited to:
# 1. Extract a scalar value (another option is using .item() )
# 2. Convert a DataFrame to a Series (another option is using .get_column() )
# 3. Inspecting some rows or columns
# In general, select, with_columns, and filter should be used instead of square brackets indexing
# ! Disadvantages: Square brackets selecting only works in eager mode, and is not parallelized
# Select 3 rows starting at row 5
video_games_reviews[5:8]

title,rating,review_date,review_text
str,f64,date,str
"""Titanfall - Xb…",5.0,2015-03-04,"""As expected an…"
"""Assassin's Cre…",5.0,2012-10-31,"""It's a typical…"
"""Mario Kart 8 -…",5.0,2017-03-24,"""This is withou…"


In [154]:
# Select 3 rows starting at row 5 for columns title and rating
video_games_reviews[5:8, ['title', 'rating']]

title,rating
str,f64
"""Titanfall - Xb…",5.0
"""Assassin's Cre…",5.0
"""Mario Kart 8 -…",5.0


In [156]:
# Select 3 rows starting at row 5 for columns 0 and 1
video_games_reviews[5:8, [0, 1]]

title,rating
str,f64
"""Titanfall - Xb…",5.0
"""Assassin's Cre…",5.0
"""Mario Kart 8 -…",5.0


In [92]:
# Select columns title and rating using square brackets with a list of column names
video_games_reviews[['title', 'rating']].head(2)

title,rating
str,f64
"""Killzone: Shad…",5.0
"""Resident Evil …",5.0


#### Extracting columns, rows, dataframes and items

In [159]:
# Extract a column from a DataFrame as a Series using .get_column()
# .to_frame() converts the Series back to a one-column DataFrame (used here for display)
video_games_reviews.get_column('title').head(2).to_frame()

title
str
"""Killzone: Shad…"
"""Resident Evil …"


In [94]:
# Extract all columns and convert them to a list of Series
video_games_reviews.head(1).get_columns()

[shape: (1,)
 Series: 'title' [str]
 [
 	"Killzone: Shad…
 ],
 shape: (1,)
 Series: 'rating' [f64]
 [
 	5.0
 ],
 shape: (1,)
 Series: 'review_date' [date]
 [
 	2016-12-02
 ],
 shape: (1,)
 Series: 'review_text' [str]
 [
 	"First time hav…
 ]]

In [161]:
# Convert a column to a list using .to_list()
video_games_reviews.get_column('title').head(5).to_list()

['Killzone: Shadow Fall (PlayStation 4)',
 'Resident Evil 5 - Xbox 360',
 'Bioshock 2 - Xbox 360',
 'Grip-iT Analog Stick Covers, Set of 4',
 'Dead Space (X-BOX 360) Platinum hits']

In [95]:
# Extract rows from a dataframe as a list of tuples
# ! This materializes all the rows in memory. It's expensive and should be avoided when possible
video_games_reviews.head(2).rows()

[('Killzone: Shadow Fall (PlayStation 4)',
  5.0,
  datetime.date(2016, 12, 2),
  'First time having this game! It pretty good and the graphics are very nice'),
 ('Resident Evil 5 - Xbox 360', 5.0, datetime.date(2014, 7, 29), 'good')]

In [96]:
# Extract rows from a dataframe as a list of dicts (more expensive)
# ! This materializes all the rows in memory. It's expensive and should be avoided when possible
video_games_reviews.head(2).rows(named=True)

[{'title': 'Killzone: Shadow Fall (PlayStation 4)',
  'rating': 5.0,
  'review_date': datetime.date(2016, 12, 2),
  'review_text': 'First time having this game! It pretty good and the graphics are very nice'},
 {'title': 'Resident Evil 5 - Xbox 360',
  'rating': 5.0,
  'review_date': datetime.date(2014, 7, 29),
  'review_text': 'good'}]

In [97]:
# Extract rows from a dataframe as an iterator
# ! Export methods of Polars should be preferred instead of iterating over rows
video_games_reviews.iter_rows()

<generator object DataFrame.iter_rows at 0x7f77cc14e400>

In [98]:
# Extract slices of 1000 rows from a dataframe as an iterator
video_games_reviews.iter_slices(n_rows=1000)

<generator object DataFrame.iter_slices at 0x7f77bcf4b010>

In [99]:
# Extract a list of Dataframes partitioned based on the specified column
video_games_reviews.head(2).partition_by('title')

[shape: (1, 4)
 ┌──────────────────┬────────┬─────────────┬──────────────────┐
 │ title            ┆ rating ┆ review_date ┆ review_text      │
 │ ---              ┆ ---    ┆ ---         ┆ ---              │
 │ str              ┆ f64    ┆ date        ┆ str              │
 ╞══════════════════╪════════╪═════════════╪══════════════════╡
 │ Killzone: Shado… ┆ 5.0    ┆ 2016-12-02  ┆ First time havi… │
 └──────────────────┴────────┴─────────────┴──────────────────┘,
 shape: (1, 4)
 ┌──────────────────┬────────┬─────────────┬─────────────┐
 │ title            ┆ rating ┆ review_date ┆ review_text │
 │ ---              ┆ ---    ┆ ---         ┆ ---         │
 │ str              ┆ f64    ┆ date        ┆ str         │
 ╞══════════════════╪════════╪═════════════╪═════════════╡
 │ Resident Evil 5… ┆ 5.0    ┆ 2014-07-29  ┆ good        │
 └──────────────────┴────────┴─────────────┴─────────────┘]

In [162]:
# Extract a scalar using .item()
# .item() transforms a table of one row and one column into a scalar
video_games_reviews.head(1).select('title').item()

'Killzone: Shadow Fall (PlayStation 4)'

In [102]:
# Convert a dataframe to a Python dictionary using .to_dict()
video_games_reviews.head(2).to_dict(as_series=False)

{'title': ['Killzone: Shadow Fall (PlayStation 4)',
  'Resident Evil 5 - Xbox 360'],
 'rating': [5.0, 5.0],
 'review_date': [datetime.date(2016, 12, 2), datetime.date(2014, 7, 29)],
 'review_text': ['First time having this game! It pretty good and the graphics are very nice',
  'good']}

In [173]:
# Convert the dataframe to a Pandas DataFrame using .to_pandas()
video_games_reviews.head(2).to_pandas()

Unnamed: 0,title,rating,review_date,review_text
0,Killzone: Shadow Fall (PlayStation 4),5.0,2016-12-02,First time having this game! It pretty good an...
1,Resident Evil 5 - Xbox 360,5.0,2014-07-29,good


In [172]:
# Convert the dataframe to a Numpy array using .to_numpy()
video_games_reviews.head(2).to_numpy()

array([['Killzone: Shadow Fall (PlayStation 4)', 5.0,
        datetime.date(2016, 12, 2),
        'First time having this game! It pretty good and the graphics are very nice'],
       ['Resident Evil 5 - Xbox 360', 5.0, datetime.date(2014, 7, 29),
        'good']], dtype=object)

#### Mathematical and statistical operations

In [165]:
# Sum the values of each column
# Only the numerical column gets a value: string and date columns return null
video_games_reviews.sum()

title,rating,review_date,review_text
str,f64,date,str
,214041.0,,


In [163]:
# Get the maximum values of each column
video_games_reviews.max()

title,rating,review_date,review_text
str,f64,date,str
"""inFAMOUS: Seco…",5.0,2018-08-27,"""~PROS~ +Fantas…"


In [104]:
# Get the minimum values of each column
video_games_reviews.min()

title,rating,review_date,review_text
str,f64,date,str
"""Alan Wake - Xb…",1.0,2001-10-01,""" batman"""


In [105]:
# Get the median value of each column
# Does not work for string or date columns
video_games_reviews.median()

title,rating,review_date,review_text
str,f64,date,str
,5.0,,


In [106]:
# Get the mean value of each column
# Does not work for string or date columns
video_games_reviews.mean()

title,rating,review_date,review_text
str,f64,date,str
,4.317345,,


In [107]:
# Get the 10th percentile value of each column
# Does not work for string or date columns
video_games_reviews.quantile(0.1)

title,rating,review_date,review_text
str,f64,date,str
,2.0,,


In [109]:
# Calculates the variance between values in a column
video_games_reviews.var()

title,rating,review_date,review_text
str,f64,date,str
,1.351545,,


In [110]:
# Calculates the standard deviation between values in a column
video_games_reviews.std()

title,rating,review_date,review_text
str,f64,date,str
,1.16256,,


In [168]:
# Calculate the correlation between columns
video_games_reviews.select('rating','review_date').corr()

shape: (2, 2)
┌──────────┬─────────────┐
│ rating   ┆ review_date │
│ ---      ┆ ---         │
│ f64      ┆ f64         │
╞══════════╪═════════════╡
│ 1.0      ┆ 0.071473    │
│ 0.071473 ┆ 1.0         │
└──────────┴─────────────┘


In [171]:
# Use a Numpy universal function: np.exp on a column
# Polars expressions accept Numpy ufuncs, so the rating column is replaced by exp(rating)
# numpy is imported as np in the imports cell at the top of the notebook,
# which keeps every dependency visible in one place and survives Restart & Run All
video_games_reviews.with_columns(
    np.exp(pl.col('rating'))
).head(2)

title,rating,review_date,review_text
str,f64,date,str
"""Killzone: Shad…",148.413159,2016-12-02,"""First time hav…"
"""Resident Evil …",148.413159,2014-07-29,"""good"""
