# Chapter 3: Introduction to Data Analysis in Python Polars 

## Inspecting a DataFrame

### How to do it...

In [2]:
import polars as pl

In [3]:
df = pl.read_csv('../data/covid_19_deaths.csv')

In [4]:
df.head(5)

Data As Of,Start Date,End Date,Group,Year,Month,State,Sex,Age Group,COVID-19 Deaths,Total Deaths,Pneumonia Deaths,Pneumonia and COVID-19 Deaths,Influenza Deaths,"Pneumonia, Influenza, or COVID-19 Deaths",Footnote
str,str,str,str,str,str,str,str,str,i64,i64,i64,i64,i64,i64,str
"""09/27/2023""","""01/01/2020""","""09/23/2023""","""By Total""",,,"""United States""","""All Sexes""","""All Ages""",1146774,12303399,1162844,569264,22229,1760095,
"""09/27/2023""","""01/01/2020""","""09/23/2023""","""By Total""",,,"""United States""","""All Sexes""","""Under 1 year""",519,73213,1056,95,64,1541,
"""09/27/2023""","""01/01/2020""","""09/23/2023""","""By Total""",,,"""United States""","""All Sexes""","""0-17 years""",1696,130970,2961,424,509,4716,
"""09/27/2023""","""01/01/2020""","""09/23/2023""","""By Total""",,,"""United States""","""All Sexes""","""1-4 years""",285,14299,692,66,177,1079,
"""09/27/2023""","""01/01/2020""","""09/23/2023""","""By Total""",,,"""United States""","""All Sexes""","""5-14 years""",509,22008,818,143,219,1390,


In [5]:
df.tail(5)

Data As Of,Start Date,End Date,Group,Year,Month,State,Sex,Age Group,COVID-19 Deaths,Total Deaths,Pneumonia Deaths,Pneumonia and COVID-19 Deaths,Influenza Deaths,"Pneumonia, Influenza, or COVID-19 Deaths",Footnote
str,str,str,str,str,str,str,str,str,i64,i64,i64,i64,i64,i64,str
"""09/27/2023""","""09/01/2023""","""09/23/2023""","""By Month""","""2023""","""9""","""Puerto Rico""","""Female""","""50-64 years""",,75,14.0,,0.0,14.0,"""One or more data cells have co…"
"""09/27/2023""","""09/01/2023""","""09/23/2023""","""By Month""","""2023""","""9""","""Puerto Rico""","""Female""","""55-64 years""",0.0,65,10.0,0.0,0.0,10.0,
"""09/27/2023""","""09/01/2023""","""09/23/2023""","""By Month""","""2023""","""9""","""Puerto Rico""","""Female""","""65-74 years""",,91,,,0.0,,"""One or more data cells have co…"
"""09/27/2023""","""09/01/2023""","""09/23/2023""","""By Month""","""2023""","""9""","""Puerto Rico""","""Female""","""75-84 years""",,211,36.0,,0.0,38.0,"""One or more data cells have co…"
"""09/27/2023""","""09/01/2023""","""09/23/2023""","""By Month""","""2023""","""9""","""Puerto Rico""","""Female""","""85 years and over""",,265,42.0,,,44.0,"""One or more data cells have co…"


In [6]:
df.glimpse(max_items_per_column=3)

Rows: 137700
Columns: 16
$ Data As Of                               <str> '09/27/2023', '09/27/2023', '09/27/2023'
$ Start Date                               <str> '01/01/2020', '01/01/2020', '01/01/2020'
$ End Date                                 <str> '09/23/2023', '09/23/2023', '09/23/2023'
$ Group                                    <str> 'By Total', 'By Total', 'By Total'
$ Year                                     <str> None, None, None
$ Month                                    <str> None, None, None
$ State                                    <str> 'United States', 'United States', 'United States'
$ Sex                                      <str> 'All Sexes', 'All Sexes', 'All Sexes'
$ Age Group                                <str> 'All Ages', 'Under 1 year', '0-17 years'
$ COVID-19 Deaths                          <i64> 1146774, 519, 1696
$ Total Deaths                             <i64> 12303399, 73213, 130970
$ Pneumonia Deaths                         <i64> 1162844, 1056, 2961
$ P

In [7]:
df.estimated_size('mb')

26.869319915771484

In [101]:
import polars.selectors as cs
"""
This script imports the `selectors` module from the `polars` library and uses it to select numeric columns from a DataFrame `df`. 
It then generates descriptive statistics for these numeric columns.

Functions:
    cs.numeric(): Returns a selector for numeric columns in the DataFrame.
    df.select(): Selects columns from the DataFrame based on the provided selector.
    df.describe(): Generates descriptive statistics for the selected columns.

Usage:
    Ensure that the DataFrame `df` is defined and contains numeric columns before running this script.
"""
df.select(cs.numeric()).describe()

statistic,sepal_length,sepal_width,petal_length,petal_width,species_id
str,f64,f64,f64,f64,f64
"""count""",150.0,150.0,150.0,150.0,150.0
"""null_count""",0.0,0.0,0.0,0.0,0.0
"""mean""",5.843333,3.054,3.758667,1.198667,2.0
"""std""",0.828066,0.433594,1.76442,0.763161,0.819232
"""min""",4.3,2.0,1.0,0.1,1.0
"""25%""",5.1,2.8,1.6,0.3,1.0
"""50%""",5.8,3.0,4.4,1.3,2.0
"""75%""",6.4,3.3,5.1,1.8,3.0
"""max""",7.9,4.4,6.9,2.5,3.0


In [9]:
df.null_count()

Data As Of,Start Date,End Date,Group,Year,Month,State,Sex,Age Group,COVID-19 Deaths,Total Deaths,Pneumonia Deaths,Pneumonia and COVID-19 Deaths,Influenza Deaths,"Pneumonia, Influenza, or COVID-19 Deaths",Footnote
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,2754,13770,0,0,0,39430,19509,44864,36884,26688,44233,39804


### There is more...

In [10]:
print(df.head())

shape: (5, 16)
┌────────────┬───────────┬───────────┬──────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ Data As Of ┆ Start     ┆ End Date  ┆ Group    ┆ … ┆ Pneumonia ┆ Influenza ┆ Pneumonia ┆ Footnote │
│ ---        ┆ Date      ┆ ---       ┆ ---      ┆   ┆ and       ┆ Deaths    ┆ , Influen ┆ ---      │
│ str        ┆ ---       ┆ str       ┆ str      ┆   ┆ COVID-19  ┆ ---       ┆ za, or    ┆ str      │
│            ┆ str       ┆           ┆          ┆   ┆ Deaths    ┆ i64       ┆ COVID…    ┆          │
│            ┆           ┆           ┆          ┆   ┆ ---       ┆           ┆ ---       ┆          │
│            ┆           ┆           ┆          ┆   ┆ i64       ┆           ┆ i64       ┆          │
╞════════════╪═══════════╪═══════════╪══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 09/27/2023 ┆ 01/01/202 ┆ 09/23/202 ┆ By Total ┆ … ┆ 569264    ┆ 22229     ┆ 1760095   ┆ null     │
│            ┆ 0         ┆ 3         ┆          ┆   ┆           ┆           

In [11]:
with pl.Config() as config:
    """
    This code snippet sets the table column width configuration for displaying DataFrames using the Polars library.
    It sets the number of columns to display to 11 and then prints the first 2 rows of the DataFrame `df`.

    - `pl.Config()`: Context manager for Polars configuration settings.
    - `config.set_tbl_cols(11)`: Sets the number of columns to display in the table to 11.
    - `df.head(2)`: Retrieves the first 2 rows of the DataFrame `df`.
    """
    config.set_tbl_cols(11)
    print(df.head(2))

shape: (2, 16)
┌────────┬────────┬────────┬────────┬──────┬───────┬───┬────────┬────────┬────────┬────────┬───────┐
│ Data   ┆ Start  ┆ End    ┆ Group  ┆ Year ┆ Month ┆ … ┆ Pneumo ┆ Pneumo ┆ Influe ┆ Pneumo ┆ Footn │
│ As Of  ┆ Date   ┆ Date   ┆ ---    ┆ ---  ┆ ---   ┆   ┆ nia    ┆ nia    ┆ nza    ┆ nia,   ┆ ote   │
│ ---    ┆ ---    ┆ ---    ┆ str    ┆ str  ┆ str   ┆   ┆ Deaths ┆ and    ┆ Deaths ┆ Influe ┆ ---   │
│ str    ┆ str    ┆ str    ┆        ┆      ┆       ┆   ┆ ---    ┆ COVID- ┆ ---    ┆ nza,   ┆ str   │
│        ┆        ┆        ┆        ┆      ┆       ┆   ┆ i64    ┆ 19     ┆ i64    ┆ or     ┆       │
│        ┆        ┆        ┆        ┆      ┆       ┆   ┆        ┆ Deaths ┆        ┆ COVID… ┆       │
│        ┆        ┆        ┆        ┆      ┆       ┆   ┆        ┆ ---    ┆        ┆ ---    ┆       │
│        ┆        ┆        ┆        ┆      ┆       ┆   ┆        ┆ i64    ┆        ┆ i64    ┆       │
╞════════╪════════╪════════╪════════╪══════╪═══════╪═══╪════════╪════════╪══

In [12]:
pl.Config.set_tbl_cols(11)
print(df.head(2))

shape: (2, 16)
┌────────┬────────┬────────┬────────┬──────┬───────┬───┬────────┬────────┬────────┬────────┬───────┐
│ Data   ┆ Start  ┆ End    ┆ Group  ┆ Year ┆ Month ┆ … ┆ Pneumo ┆ Pneumo ┆ Influe ┆ Pneumo ┆ Footn │
│ As Of  ┆ Date   ┆ Date   ┆ ---    ┆ ---  ┆ ---   ┆   ┆ nia    ┆ nia    ┆ nza    ┆ nia,   ┆ ote   │
│ ---    ┆ ---    ┆ ---    ┆ str    ┆ str  ┆ str   ┆   ┆ Deaths ┆ and    ┆ Deaths ┆ Influe ┆ ---   │
│ str    ┆ str    ┆ str    ┆        ┆      ┆       ┆   ┆ ---    ┆ COVID- ┆ ---    ┆ nza,   ┆ str   │
│        ┆        ┆        ┆        ┆      ┆       ┆   ┆ i64    ┆ 19     ┆ i64    ┆ or     ┆       │
│        ┆        ┆        ┆        ┆      ┆       ┆   ┆        ┆ Deaths ┆        ┆ COVID… ┆       │
│        ┆        ┆        ┆        ┆      ┆       ┆   ┆        ┆ ---    ┆        ┆ ---    ┆       │
│        ┆        ┆        ┆        ┆      ┆       ┆   ┆        ┆ i64    ┆        ┆ i64    ┆       │
╞════════╪════════╪════════╪════════╪══════╪═══════╪═══╪════════╪════════╪══

## Casting data types

### How to do it...

In [13]:
import polars as pl

In [113]:
df = pl.read_csv('../data/covid_19_deaths.csv')
df.head()

Data As Of,Start Date,End Date,Group,Year,…,Pneumonia Deaths,Pneumonia and COVID-19 Deaths,Influenza Deaths,"Pneumonia, Influenza, or COVID-19 Deaths",Footnote
str,str,str,str,str,…,i64,i64,i64,i64,str
"""09/27/2023""","""01/01/2020""","""09/23/2023""","""By Total""",,…,1162844,569264,22229,1760095,
"""09/27/2023""","""01/01/2020""","""09/23/2023""","""By Total""",,…,1056,95,64,1541,
"""09/27/2023""","""01/01/2020""","""09/23/2023""","""By Total""",,…,2961,424,509,4716,
"""09/27/2023""","""01/01/2020""","""09/23/2023""","""By Total""",,…,692,66,177,1079,
"""09/27/2023""","""01/01/2020""","""09/23/2023""","""By Total""",,…,818,143,219,1390,


In [114]:
# This code snippet performs the following operations on a DataFrame `df` using the Polars library:
# 
# 1. Converts the 'Data As Of' column from a string to a date format using the '%m/%d/%Y' pattern.
# 2. Converts the 'Start Date' column from a string to a date format using the '%m/%d/%Y' pattern.
# 3. Converts the 'End Date' column from a string to a date format using the '%m/%d/%Y' pattern.
# 4. Converts the 'End Date' column from a string to a date format using the '%m/%d/%Y' pattern and renames it to 'End Date 2'.
# 5. Casts the 'Year' column to a 64-bit integer type.
# 
# Finally, it returns the first few rows of the modified DataFrame using the `head()` method.

df = df.rename({col.strip(): col.strip() for col in df.columns})

df = df.with_columns(
    pl.col('Data As Of').cast(pl.Utf8).str.strptime(pl.Date, '%m/%d/%Y'),
    pl.col('Start Date').cast(pl.Utf8).str.strptime(pl.Date, '%m/%d/%Y'),
    pl.col('End Date').cast(pl.Utf8).str.strptime(pl.Date, '%m/%d/%Y'),
    pl.col('End Date').cast(pl.Utf8).str.to_date('%m/%d/%Y').alias('End Date 2'),
    pl.col('Year').cast(pl.Int64)
)

df.head()


Data As Of,Start Date,End Date,Group,Year,…,Pneumonia and COVID-19 Deaths,Influenza Deaths,"Pneumonia, Influenza, or COVID-19 Deaths",Footnote,End Date 2
date,date,date,str,i64,…,i64,i64,i64,str,date
2023-09-27,2020-01-01,2023-09-23,"""By Total""",,…,569264,22229,1760095,,2023-09-23
2023-09-27,2020-01-01,2023-09-23,"""By Total""",,…,95,64,1541,,2023-09-23
2023-09-27,2020-01-01,2023-09-23,"""By Total""",,…,424,509,4716,,2023-09-23
2023-09-27,2020-01-01,2023-09-23,"""By Total""",,…,66,177,1079,,2023-09-23
2023-09-27,2020-01-01,2023-09-23,"""By Total""",,…,143,219,1390,,2023-09-23


In [122]:
"""
Updates the DataFrame by performing the following transformations on specified columns:
- Casts the 'Data As Of' column to a string and then converts it to a date with the format '%Y-%m-%d'.
- Casts the 'Start Date' column to a string and then parses it to a date with the format '%Y-%m-%d' without strict validation.
- Casts the 'End Date' column to a string and then parses it to a date with the format '%m/%d/%Y' without strict validation, renaming the resulting column to 'End Date 2'.
- Casts the 'Year' column to a 64-bit integer.

Returns:
    DataFrame: The updated DataFrame with the transformed columns.
"""
updated_df = df.with_columns(
    pl.col('Data As Of').cast(pl.Utf8).str.to_date('%Y-%m-%d'),
    pl.col('Start Date').cast(pl.Utf8).str.strptime(pl.Date, '%Y-%m-%d', strict=False),
    pl.col('End Date').cast(pl.Utf8).str.strptime(pl.Date, '%m/%d/%Y', strict=False).alias('End Date 2'),
    pl.col('Year').cast(pl.Int64)
)

In [124]:
lf = pl.scan_csv('../data/covid_19_deaths.csv')
"""
This script reads a CSV file containing COVID-19 death data and processes the date columns.

Steps:
1. Reads the CSV file located at '../data/covid_19_deaths.csv' using Polars' lazy frame.
2. Converts the 'Data As Of', 'Start Date', and 'End Date' columns from string to date format using the specified date format '%m/%d/%Y'.
3. Converts the 'End Date' column to date format again and aliases it as 'End Date 2'.
4. Casts the 'Year' column to 64-bit integer type.
5. Collects the processed data and displays the first few rows.

Returns:
        A DataFrame containing the processed data with the first few rows displayed.
"""
lf.with_columns(
        pl.col('Data As Of').str.strptime(pl.Date, '%m/%d/%Y'),
        pl.col('Start Date').str.strptime(pl.Date, '%m/%d/%Y'),
        pl.col('End Date').str.strptime(pl.Date, '%m/%d/%Y'),
        pl.col('End Date').str.to_date('%m/%d/%Y').alias('End Date 2'),
        pl.col('Year').cast(pl.Int64)
).collect().head()

Data As Of,Start Date,End Date,Group,Year,…,Pneumonia and COVID-19 Deaths,Influenza Deaths,"Pneumonia, Influenza, or COVID-19 Deaths",Footnote,End Date 2
date,date,date,str,i64,…,i64,i64,i64,str,date
2023-09-27,2020-01-01,2023-09-23,"""By Total""",,…,569264,22229,1760095,,2023-09-23
2023-09-27,2020-01-01,2023-09-23,"""By Total""",,…,95,64,1541,,2023-09-23
2023-09-27,2020-01-01,2023-09-23,"""By Total""",,…,424,509,4716,,2023-09-23
2023-09-27,2020-01-01,2023-09-23,"""By Total""",,…,66,177,1079,,2023-09-23
2023-09-27,2020-01-01,2023-09-23,"""By Total""",,…,143,219,1390,,2023-09-23


## Finding and removing duplicates values 

### How to do it

In [64]:
import polars as pl

In [65]:
df = pl.read_csv('../data/covid_19_deaths.csv')
df.head()

Data As Of,Start Date,End Date,Group,Year,…,Pneumonia Deaths,Pneumonia and COVID-19 Deaths,Influenza Deaths,"Pneumonia, Influenza, or COVID-19 Deaths",Footnote
str,str,str,str,str,…,i64,i64,i64,i64,str
"""09/27/2023""","""01/01/2020""","""09/23/2023""","""By Total""",,…,1162844,569264,22229,1760095,
"""09/27/2023""","""01/01/2020""","""09/23/2023""","""By Total""",,…,1056,95,64,1541,
"""09/27/2023""","""01/01/2020""","""09/23/2023""","""By Total""",,…,2961,424,509,4716,
"""09/27/2023""","""01/01/2020""","""09/23/2023""","""By Total""",,…,692,66,177,1079,
"""09/27/2023""","""01/01/2020""","""09/23/2023""","""By Total""",,…,818,143,219,1390,


In [66]:
df.shape

(137700, 16)

In [67]:
df.is_duplicated().sum()

0

In [68]:
df.is_unique().sum()

137700

In [69]:
df.n_unique()

137700

In [70]:
df.select(pl.all().n_unique())

Data As Of,Start Date,End Date,Group,Year,…,Pneumonia Deaths,Pneumonia and COVID-19 Deaths,Influenza Deaths,"Pneumonia, Influenza, or COVID-19 Deaths",Footnote
u32,u32,u32,u32,u32,…,u32,u32,u32,u32,u32
1,45,45,3,5,…,3556,2533,493,4264,2


In [71]:
df.n_unique(subset=['Start Date', 'End Date'])

50

In [72]:
(
    df.unique(subset=['Start Date', 'End Date'], keep='first')
    .head()
)

Data As Of,Start Date,End Date,Group,Year,…,Pneumonia Deaths,Pneumonia and COVID-19 Deaths,Influenza Deaths,"Pneumonia, Influenza, or COVID-19 Deaths",Footnote
str,str,str,str,str,…,i64,i64,i64,i64,str
"""09/27/2023""","""04/01/2020""","""04/30/2020""","""By Month""","""2020""",…,46427,28399,1237,84003,
"""09/27/2023""","""01/01/2020""","""12/31/2020""","""By Year""","""2020""",…,352010,180086,8787,565226,
"""09/27/2023""","""01/01/2022""","""12/31/2022""","""By Year""","""2022""",…,267652,110393,8751,411412,
"""09/27/2023""","""03/01/2023""","""03/31/2023""","""By Month""","""2023""",…,17115,2697,240,22222,
"""09/27/2023""","""06/01/2021""","""06/30/2021""","""By Month""","""2021""",…,15624,4361,36,19316,


In [73]:
rows_to_keep = df.select(['Year', 'COVID-19 Deaths']).is_unique()
rows_to_keep.sum()

3940

In [74]:
df.filter(rows_to_keep).shape

(3940, 16)

In [75]:
df.filter(rows_to_keep).head()

Data As Of,Start Date,End Date,Group,Year,…,Pneumonia Deaths,Pneumonia and COVID-19 Deaths,Influenza Deaths,"Pneumonia, Influenza, or COVID-19 Deaths",Footnote
str,str,str,str,str,…,i64,i64,i64,i64,str
"""09/27/2023""","""01/01/2020""","""09/23/2023""","""By Total""",,…,1162844,569264,22229,1760095,
"""09/27/2023""","""01/01/2020""","""09/23/2023""","""By Total""",,…,1056,95,64,1541,
"""09/27/2023""","""01/01/2020""","""09/23/2023""","""By Total""",,…,2961,424,509,4716,
"""09/27/2023""","""01/01/2020""","""09/23/2023""","""By Total""",,…,692,66,177,1079,
"""09/27/2023""","""01/01/2020""","""09/23/2023""","""By Total""",,…,818,143,219,1390,


### There is more...

In [76]:
df.select(pl.all().approx_n_unique())

Data As Of,Start Date,End Date,Group,Year,…,Pneumonia Deaths,Pneumonia and COVID-19 Deaths,Influenza Deaths,"Pneumonia, Influenza, or COVID-19 Deaths",Footnote
u32,u32,u32,u32,u32,…,u32,u32,u32,u32,u32
1,45,45,3,5,…,3544,2539,491,4294,2


## Masking sensitive data

### How to do it...

In [77]:
import polars as pl

In [32]:
df = pl.read_csv('../data/covid_19_deaths.csv')
df.head()

Data As Of,Start Date,End Date,Group,Year,…,Pneumonia Deaths,Pneumonia and COVID-19 Deaths,Influenza Deaths,"Pneumonia, Influenza, or COVID-19 Deaths",Footnote
str,str,str,str,str,…,i64,i64,i64,i64,str
"""09/27/2023""","""01/01/2020""","""09/23/2023""","""By Total""",,…,1162844,569264,22229,1760095,
"""09/27/2023""","""01/01/2020""","""09/23/2023""","""By Total""",,…,1056,95,64,1541,
"""09/27/2023""","""01/01/2020""","""09/23/2023""","""By Total""",,…,2961,424,509,4716,
"""09/27/2023""","""01/01/2020""","""09/23/2023""","""By Total""",,…,692,66,177,1079,
"""09/27/2023""","""01/01/2020""","""09/23/2023""","""By Total""",,…,818,143,219,1390,


In [125]:
import random

def get_random_nums(num_list, length):
    """
    Generate a string of random numbers from a given list.

    Args:
        num_list (list): A list of numbers to sample from.
        length (int): The number of random numbers to select.

    Returns:
        str: A string of randomly selected numbers concatenated together.
    """
    random_nums = ''.join(str(n) for n in random.sample(num_list, length))
    return random_nums

In [126]:
fake_ssns = []
"""
Generates a list of fake Social Security Numbers (SSNs) and creates a Polars DataFrame with these SSNs.

The code performs the following steps:
1. Initializes an empty list `fake_ssns` to store the generated SSNs.
2. Creates a list `nums` containing digits from 0 to 9.
3. Iterates over the range of the DataFrame's height (`df.height`).
4. For each iteration, generates three parts of the SSN:
    - `part_1`: A random 3-digit number.
    - `part_2`: A random 2-digit number.
    - `part_3`: A random 4-digit number.
5. Combines the three parts into a single SSN string in the format `XXX-XX-XXXX`.
6. Appends the generated SSN to the `fake_ssns` list.
7. Seeds the random number generator with a fixed value (10) for reproducibility.
8. Creates a Polars DataFrame `fake_ssns_df` with a single column 'SSN' containing the generated SSNs.
9. Displays the first few rows of the DataFrame using the `head()` method.

Note:
- The function `get_random_nums(nums, n)` is assumed to generate a random number with `n` digits from the list `nums`.
- The DataFrame `df` and the Polars library `pl` should be defined/imported elsewhere in the code.
"""
nums = [n for n in range(10)]

for i in range(df.height):
    part_1 = get_random_nums(nums, 3)
    part_2 = get_random_nums(nums, 2)
    part_3 = get_random_nums(nums, 4)
    fake_ssn = f'{part_1}-{part_2}-{part_3}'
    fake_ssns.append(fake_ssn)

random.seed(10)
fake_ssns_df = pl.DataFrame({'SSN': fake_ssns})
fake_ssns_df.head()

SSN
str
"""906-70-3786"""
"""420-87-5139"""
"""506-25-6948"""
"""472-45-2738"""
"""960-90-3298"""


In [129]:
fake_ssns_df = fake_ssns_df.rename({"SSN": "Fake_SSN"})
df = pl.concat([df, fake_ssns_df], how='horizontal')
"""
Concatenates the existing DataFrame `df` with `fake_ssns_df` horizontally.

Parameters:
    df (pl.DataFrame): The original DataFrame.
    fake_ssns_df (pl.DataFrame): The DataFrame containing fake SSNs to be concatenated.

Returns:
    pl.DataFrame: The concatenated DataFrame with columns from both `df` and `fake_ssns_df`.
"""

'\nConcatenates the existing DataFrame `df` with `fake_ssns_df` horizontally.\n\nParameters:\n    df (pl.DataFrame): The original DataFrame.\n    fake_ssns_df (pl.DataFrame): The DataFrame containing fake SSNs to be concatenated.\n\nReturns:\n    pl.DataFrame: The concatenated DataFrame with columns from both `df` and `fake_ssns_df`.\n'

In [131]:
"""
This code snippet selects and masks the 'SSN' column in a DataFrame using the Polars library.

The 'SSN' column is expected to contain Social Security Numbers (SSNs). The code masks the first five digits of the SSN with 'XXX-XX-XX' and retains the last four digits.

Returns:
    DataFrame: A DataFrame with a single column 'SSN Masked' containing the masked SSNs.
"""
df.select(
    ('XXX-XX-XX' + pl.col('SSN').str.slice(9, 2)).alias('SSN Masked')
).head()

SSN Masked
str
"""XXX-XX-XX86"""
"""XXX-XX-XX39"""
"""XXX-XX-XX48"""
"""XXX-XX-XX38"""
"""XXX-XX-XX98"""


In [133]:
"""
Selects and masks the 'SSN' column in the DataFrame.
This function selects the 'SSN' column from the DataFrame, slices the last two characters,
and concatenates them with the string 'XXX-XX-XX' to create a masked version of the SSN.
The resulting masked SSN is then aliased as 'SSN Masked'.
Returns:
    DataFrame: A DataFrame with the masked SSN column.
"""
df.select(
    ('XXX-XX-XX' + pl.col('SSN').str.slice(9, 2)).alias('SSN Masked')
).head()

SSN Masked
str
"""XXX-XX-XX86"""
"""XXX-XX-XX39"""
"""XXX-XX-XX48"""
"""XXX-XX-XX38"""
"""XXX-XX-XX98"""


In [136]:
"""
Selects the 'SSN' column from the DataFrame and applies a hash function to it.

Returns:
    DataFrame: A DataFrame containing the hashed values of the 'SSN' column.
"""
df.select(
    pl.col('SSN').hash().alias('SSN Hashed')
).head()

SSN Hashed
u64
6883711336300221710
1353787506755705260
582257754172128729
4546623555158951452
16632570998477912075


## Visualizing data using Plotly

### How to do it...

In [84]:
import polars as pl
import plotly.express as px

In [137]:
age_groups = ['0-17 years', '18-29 years', '30-39 years', '40-49 years', '50-64 years', '65-74 years', '75-84 years', '85 years and over', 'All Ages']
"""
This script reads a CSV file containing COVID-19 death data, filters the data to include only rows where the 'Month' column is not null and the 'Age Group' column matches specific age groups, and then displays the first few rows of the filtered DataFrame.

Variables:
    age_groups (list of str): A list of age group categories to filter the data by.

DataFrame Operations:
    - Reads the CSV file located at '../data/covid_19_deaths.csv'.
    - Filters the DataFrame to include only rows where the 'Month' column is not null.
    - Filters the DataFrame to include only rows where the 'Age Group' column is in the specified age_groups list.
    - Displays the first few rows of the filtered DataFrame.
"""

df = (
    pl.read_csv('../data/covid_19_deaths.csv')
    .filter(
        pl.col('Month').is_not_null(),
        pl.col('Age Group').is_in(age_groups),
    )
)
df.head()

Data As Of,Start Date,End Date,Group,Year,…,Pneumonia Deaths,Pneumonia and COVID-19 Deaths,Influenza Deaths,"Pneumonia, Influenza, or COVID-19 Deaths",Footnote
str,str,str,str,str,…,i64,i64,i64,i64,str
"""09/27/2023""","""01/01/2020""","""01/31/2020""","""By Month""","""2020""",…,17909,3,2125,20037,
"""09/27/2023""","""01/01/2020""","""01/31/2020""","""By Month""","""2020""",…,90,0,63,153,
"""09/27/2023""","""01/01/2020""","""01/31/2020""","""By Month""","""2020""",…,114,0,54,168,
"""09/27/2023""","""01/01/2020""","""01/31/2020""","""By Month""","""2020""",…,246,0,112,358,
"""09/27/2023""","""01/01/2020""","""01/31/2020""","""By Month""","""2020""",…,485,0,151,636,


In [86]:
df = (
    df.
    with_columns(
        pl.col('Data As Of').str.strptime(pl.Date, '%m/%d/%Y'),
        pl.col('Start Date').str.strptime(pl.Date, '%m/%d/%Y'),
        pl.col('End Date').str.strptime(pl.Date, '%m/%d/%Y'),
        pl.col('Year').cast(pl.Int64),
        pl.col('Month').cast(pl.Int64)
    )
) 
df.head()

Data As Of,Start Date,End Date,Group,Year,…,Pneumonia Deaths,Pneumonia and COVID-19 Deaths,Influenza Deaths,"Pneumonia, Influenza, or COVID-19 Deaths",Footnote
date,date,date,str,i64,…,i64,i64,i64,i64,str
2023-09-27,2020-01-01,2020-01-31,"""By Month""",2020,…,17909,3,2125,20037,
2023-09-27,2020-01-01,2020-01-31,"""By Month""",2020,…,90,0,63,153,
2023-09-27,2020-01-01,2020-01-31,"""By Month""",2020,…,114,0,54,168,
2023-09-27,2020-01-01,2020-01-31,"""By Month""",2020,…,246,0,112,358,
2023-09-27,2020-01-01,2020-01-31,"""By Month""",2020,…,485,0,151,636,


In [143]:
"""
This script filters and aggregates COVID-19 death data by age group for the year 2023 in the United States,
and then visualizes the results in a bar chart.

Steps:
1. Filter the DataFrame `df` to include only rows where:
    - 'State' is 'United States'
    - 'Year' is 2023 (cast to Int32)
    - 'Age Group' is not 'All Ages'
    - 'Sex' is 'All Sexes'
2. Group the filtered data by 'Age Group' and sum the 'COVID-19 Deaths' for each group.
3. Sort the aggregated data by 'COVID-19 Deaths' in descending order.
4. Create a bar chart using Plotly Express to visualize the number of COVID-19 deaths by age group.
5. Update the layout of the chart to remove the x-axis title.
6. Display the chart.

Variables:
- covid_deaths_by_age: DataFrame containing the aggregated and sorted COVID-19 death data by age group.
- fig: Plotly Express bar chart object visualizing the COVID-19 deaths by age group.
"""
covid_deaths_by_age = (
    df
    .filter(
        pl.col('State') == 'United States',
        pl.col('Year').cast(pl.Int32) == 2023,
        pl.col('Age Group') != 'All Ages',
        pl.col('Sex') == 'All Sexes'
    )
    .group_by('Age Group')
    .agg(pl.col('COVID-19 Deaths').sum())
    .sort(by='COVID-19 Deaths', descending=True)
)

fig = px.bar(
    covid_deaths_by_age, 
    x='Age Group', 
    y='COVID-19 Deaths', 
    title='COVID Deaths 2023 by Age Group - As of 9/27/23'
)

fig.update_layout(xaxis_title=None)
fig.show()


In [88]:
covid_deaths_by_top_5_states = (
    df
    .filter(
        pl.col('State') != 'United States',
        pl.col('Year') == 2023,
        pl.col('Age Group') == 'All Ages',
        pl.col('Sex') == 'All Sexes'
    )
    .group_by('State')
    .agg(pl.col('COVID-19 Deaths').sum())
    .sort(by='COVID-19 Deaths', descending=True)
    .head()
)

fig = px.bar(
    covid_deaths_by_top_5_states, 
    x='State', 
    y='COVID-19 Deaths', 
    title='COVID Deaths 2023 by Top 5 States - As of 9/27/23',
)

fig.update_layout(xaxis_title=None)
fig.show()

In [89]:
covid_deaths_by_sex = (
    df
    .filter(
        pl.col('State') == 'United States',
        pl.col('Year') == 2023,
        pl.col('Age Group') == 'All Ages',
        pl.col('Sex') != 'All Sexes'
    )
    .group_by('Sex')
    .agg(pl.col('COVID-19 Deaths').sum())
    .sort(by='COVID-19 Deaths', descending=True)
    .head()
)

fig = px.bar(
    covid_deaths_by_sex, 
    x='Sex', 
    y='COVID-19 Deaths', 
    title='COVID Deaths 2023 by Sex - As of 9/27/23',
    text_auto='.2s'
)

fig.update_layout(xaxis_title=None)
fig.update_traces(width = 0.3, textfont_size=12, textangle=0, textposition='inside')
fig.show()

In [90]:
from us_state_mappings import us_state_division_dict

covid_deaths_vs_flu_deaths = (
    df
    .with_columns(
        pl.col('State').replace_strict(us_state_division_dict, default='Others').alias('Division')
    )
    .filter(
        pl.col('State') != 'United States',
        pl.col('Age Group') != 'All Ages',
        pl.col('Sex') != 'All Sexes',
        pl.col('Year') == 2023
    )
    .group_by('State', 'Division')
    .agg(
        pl.col('COVID-19 Deaths').sum(),
        pl.col('Influenza Deaths').sum(),
        pl.col('Pneumonia Deaths').sum()
    )
)

fig = px.scatter(
    covid_deaths_vs_flu_deaths, 
    x='COVID-19 Deaths', 
    y='Influenza Deaths', 
    color='Division',
    size='Pneumonia Deaths',
    hover_name='State',
    title='COVID-19, Influenza, and Pneumonia Deaths 2023 by US States and Divisions'
)

fig.show()

In [145]:
"""
This script processes a DataFrame `df` to filter and aggregate COVID-19 death data for the United States,
grouped by year and month. It then creates a line plot to visualize the monthly trend of COVID-19 deaths.

Steps:
1. Filter the DataFrame `df` to include only rows where:
    - 'State' is 'United States'
    - 'Age Group' is 'All Ages'
    - 'Sex' is 'All Sexes'
2. Group the filtered data by 'Year' and 'Month'.
3. Aggregate the grouped data by summing the 'COVID-19 Deaths' column.
4. Sort the aggregated data by 'Month'.
5. Create a line plot using Plotly Express to visualize the monthly trend of COVID-19 deaths:
    - X-axis: 'Month'
    - Y-axis: 'COVID-19 Deaths'
    - Line color: 'Year'
    - Title: 'COVID-19 Deaths Monthly Trend - United States'
    - Line shape: 'spline'
6. Update the x-axis to have a tick interval of 1.
7. Update the layout to reverse the order of the legend traces.
8. Display the plot.
"""
monthly_treand_by_year = (
    df
    .filter(
        pl.col('State') == 'United States',
        pl.col('Age Group') == 'All Ages',
        pl.col('Sex') == 'All Sexes'
    )
    .group_by('Year', 'Month')
    .agg(
        pl.col('COVID-19 Deaths').sum(),
    )
    .sort(by='Month')
)

fig = px.line(
    monthly_treand_by_year, 
    x='Month', 
    y='COVID-19 Deaths', 
    color='Year',
    title='COVID-19 Deaths Monthly Trend - United States',
    line_shape='spline'
)

fig.update_xaxes(dtick = 1)
fig.update_layout(legend_traceorder='reversed')
fig.show()

## Detecting and handling outliers  

### How to do it...

In [146]:
import polars as pl
"""
This script imports the Polars and Plotly libraries, converts the Iris dataset from a Pandas DataFrame to a Polars DataFrame, and displays the first few rows of the Polars DataFrame.

Functions:
    None

Imports:
    polars as pl: Polars library for data manipulation.
    plotly: Plotly library for data visualization.

Usage:
    The script converts the Iris dataset from Plotly to a Polars DataFrame and displays the first few rows using the `head()` method.
"""
import plotly 
df = pl.from_pandas(plotly.data.iris())
df.head()

sepal_length,sepal_width,petal_length,petal_width,species,species_id
f64,f64,f64,f64,str,i64
5.1,3.5,1.4,0.2,"""setosa""",1
4.9,3.0,1.4,0.2,"""setosa""",1
4.7,3.2,1.3,0.2,"""setosa""",1
4.6,3.1,1.5,0.2,"""setosa""",1
5.0,3.6,1.4,0.2,"""setosa""",1


In [93]:
import plotly.express as px

fig = px.box(df, y='sepal_width', width=500)
fig.show()

In [147]:
q1 = pl.col('sepal_width').quantile(0.25)
"""
Filters the DataFrame to find outliers in the 'sepal_width' column based on the IQR method.

This function calculates the first quartile (Q1) and third quartile (Q3) of the 'sepal_width' column,
then computes the interquartile range (IQR) as the difference between Q3 and Q1. It defines the lower
and upper limits for outliers as Q1 - 1.5 * IQR and Q3 + 1.5 * IQR, respectively. The DataFrame is then
filtered to include only the rows where 'sepal_width' is less than the lower limit or greater than the
upper limit, effectively identifying the outliers.

Returns:
    DataFrame: A DataFrame containing the rows where 'sepal_width' is an outlier.
"""
q3 = pl.col('sepal_width').quantile(0.75)
iqr = q3 - q1
threshold = 1.5
lower_limit = q1 - iqr * threshold
upper_limit = q3 + iqr * threshold

df.filter(
    (pl.col('sepal_width') < lower_limit) | (pl.col('sepal_width') > upper_limit)
).head()

sepal_length,sepal_width,petal_length,petal_width,species,species_id
f64,f64,f64,f64,str,i64
5.7,4.4,1.5,0.4,"""setosa""",1
5.2,4.1,1.5,0.1,"""setosa""",1
5.5,4.2,1.4,0.2,"""setosa""",1
5.0,2.0,3.5,1.0,"""versicolor""",2


In [148]:
is_outlier_iqr = (pl.col('sepal_width') < lower_limit) | (pl.col('sepal_width') > upper_limit)
"""
This code snippet identifies and removes outliers from a DataFrame based on the IQR method for the 'sepal_width' column.

Variables:
- is_outlier_iqr: A boolean expression that identifies rows where 'sepal_width' is either below the lower limit or above the upper limit.
- df_iqr_outlier_removed: A DataFrame with outliers removed based on the IQR method.

Steps:
1. Define a boolean condition `is_outlier_iqr` to identify outliers in the 'sepal_width' column.
2. Filter the DataFrame `df` to remove rows where `is_outlier_iqr` is True, resulting in `df_iqr_outlier_removed`.
3. Filter `df_iqr_outlier_removed` to show rows where `is_outlier_iqr` is True, which should return an empty DataFrame if all outliers were successfully removed.
"""
df_iqr_outlier_removed = (
    df
    .filter(is_outlier_iqr.not_())
)
df_iqr_outlier_removed.filter(is_outlier_iqr)

sepal_length,sepal_width,petal_length,petal_width,species,species_id
f64,f64,f64,f64,str,i64


In [153]:
"""
This code snippet replaces outliers in the 'sepal_width' column of a DataFrame using the Interquartile Range (IQR) method.

Steps:
1. Create a new DataFrame `df_iqr_outlier_replaced` by replacing outliers in the 'sepal_width' column with the median value of the column.
2. Use the `is_outlier_iqr` condition to identify outliers.
3. Apply the `pl.when` function to check if a value is an outlier.
4. If the value is an outlier, replace it with the median of the 'sepal_width' column.
5. Otherwise, keep the original value.
6. Alias the modified column as 'sepal_width'.
7. Filter the DataFrame to show only the rows where the 'sepal_width' values were identified as outliers.

NOTE:In the context of data science and machine learning, 
sepal width is often used as a feature in datasets related to plant morphology, 
such as the famous Iris dataset, where it helps in classifying different species of flowers based on their physical characteristics.

Variables:
- df: The original DataFrame.
- is_outlier_iqr: A boolean condition that identifies outliers based on the IQR method.
- df_iqr_outlier_replaced: The DataFrame with outliers in the 'sepal_width' column replaced by the median value.
"""
df_iqr_outlier_replaced = (
    df
    .with_columns(
        pl.when(is_outlier_iqr)
        .then(pl.col('sepal_width').median())
        .otherwise(pl.col('sepal_width'))
        .alias('sepal_width')
    )
)
df_iqr_outlier_replaced.filter(is_outlier_iqr)

sepal_length,sepal_width,petal_length,petal_width,species,species_id
f64,f64,f64,f64,str,i64


In [152]:
"""
Calculate the z-score for the 'sepal_width' column in the DataFrame.

This function computes the z-score for the 'sepal_width' column by subtracting the mean of the column 
and dividing by the standard deviation of the column. The resulting z-score is added as a new column 
'sepal_width_zscore' to the DataFrame.

Returns:
    DataFrame: A DataFrame with an additional column 'sepal_width_zscore' containing the z-scores 
    of the 'sepal_width' values.
"""
df_zscore = df.with_columns(
    sepal_width_zscore=(pl.col('sepal_width') - pl.col('sepal_width').mean()) / pl.col('sepal_width').std()
)
df_zscore.head()

sepal_length,sepal_width,petal_length,petal_width,species,species_id,sepal_width_zscore
f64,f64,f64,f64,str,i64,f64
5.1,3.5,1.4,0.2,"""setosa""",1,1.028611
4.9,3.0,1.4,0.2,"""setosa""",1,-0.12454
4.7,3.2,1.3,0.2,"""setosa""",1,0.33672
4.6,3.1,1.5,0.2,"""setosa""",1,0.10609
5.0,3.6,1.4,0.2,"""setosa""",1,1.259242


In [154]:
is_outlier_z_score = (pl.col('sepal_width_zscore') > 3) | (pl.col('sepal_width_zscore') < -3)
"""
Filters out rows from the DataFrame `df_zscore` where the 'sepal_width_zscore' column
has values greater than 3 or less than -3, indicating outliers based on the z-score.

Returns:
    DataFrame: A new DataFrame `df_zscore_outliers_removed` with outliers removed.
"""
df_zscore_outliers_removed = df_zscore.filter(is_outlier_z_score.not_())

In [54]:
df_zscore.filter(is_outlier_z_score)

sepal_length,sepal_width,petal_length,petal_width,species,species_id,sepal_width_zscore
f64,f64,f64,f64,str,i64,f64
5.7,4.4,1.5,0.4,"""setosa""",1,3.104284


In [99]:
df_zscore_outliers_removed.filter(is_outlier_z_score)

sepal_length,sepal_width,petal_length,petal_width,species,species_id,sepal_width_zscore
f64,f64,f64,f64,str,i64,f64


In [156]:
"""
Replace outliers in the 'sepal_width' column of the DataFrame with the mean value of the 'sepal_width' column.

This function uses the z-score method to identify outliers in the 'sepal_width' column. If a value is identified as an outlier, it is replaced with the mean value of the 'sepal_width' column. Otherwise, the original value is retained.

Returns:
    DataFrame: A new DataFrame with outliers in the 'sepal_width' column replaced by the mean value.
"""
df_zscore_outliers_replaced = (
    df_zscore
    .with_columns(
        pl.when(is_outlier_z_score)
        .then(pl.col('sepal_width').mean())
        .otherwise(pl.col('sepal_width'))
        .alias('sepal_width')
    )
)

In [157]:
df_zscore_outliers_replaced.filter(is_outlier_z_score)

sepal_length,sepal_width,petal_length,petal_width,species,species_id,sepal_width_zscore
f64,f64,f64,f64,str,i64,f64
5.7,3.054,1.5,0.4,"""setosa""",1,3.104284
