# Chapter 3: Introduction to Data Analysis in Python Polars 

## Inspecting a DataFrame

### How to do it...

In [None]:
import polars as pl

In [None]:
# Read the COVID-19 deaths CSV (relative path) into a DataFrame.
df = pl.read_csv('../data/covid_19_dealths.csv')

In [None]:
# Show the first three rows.
df.head(3)

In [None]:
# Show the last five rows.
df.tail(5)

In [None]:
# Column-oriented preview, limited to three sample values per column.
df.glimpse(max_items_per_column=3)

In [None]:
# Estimated size of the DataFrame, reported in megabytes.
df.estimated_size('mb')

In [None]:
import polars.selectors as cs
# Summary statistics restricted to the numeric columns (picked via the selectors module).
df.select(cs.numeric()).describe()

In [None]:
# Number of null values in each column.
df.null_count()

### There is more...

In [None]:
# print() emits the plain-text table rather than the notebook's rich display;
# the Config cells that follow adjust how this text table is rendered.
print(df.head())


In [None]:
# Raise the printed column limit to 11 only for the duration of this block.
with pl.Config() as cfg:
    cfg.set_tbl_cols(11)
    print(df.head(2))

In [None]:
# Same column limit, but set directly on pl.Config instead of inside a `with` block.
# NOTE(review): without the context manager this setting presumably stays in effect
# for every later cell until it is changed or restored — confirm that is intended.
pl.Config.set_tbl_cols(11)
print(df.head(2))

## Casting data types

### How to do it...

In [None]:
import polars as pl

In [None]:
# Reload the raw CSV for this recipe and preview it.
df = pl.read_csv('../data/covid_19_dealths.csv')
df.head()

In [None]:
# Preview the casts without storing them: the three date columns are parsed from
# 'MM/DD/YYYY' strings with strptime, 'End Date 2' shows the equivalent to_date call,
# and 'Year' becomes a 64-bit integer.
date_format = '%m/%d/%Y'

df.with_columns(
    pl.col(['Data As Of', 'Start Date', 'End Date']).str.strptime(pl.Date, date_format),
    pl.col('End Date').str.to_date(date_format).alias('End Date 2'),
    pl.col('Year').cast(pl.Int64),
).head()


In [None]:
# Same casts as the preview cell, this time kept in a new DataFrame.
cast_exprs = [
    pl.col('Data As Of').str.strptime(pl.Date, '%m/%d/%Y'),
    pl.col('Start Date').str.strptime(pl.Date, '%m/%d/%Y'),
    pl.col('End Date').str.strptime(pl.Date, '%m/%d/%Y'),
    pl.col('End Date').str.to_date('%m/%d/%Y').alias('End Date 2'),
    pl.col('Year').cast(pl.Int64),
]
updated_df = df.with_columns(cast_exprs)

In [None]:
# Lazy variant: scan the CSV, declare the casts on the LazyFrame, then collect and preview.
lf = pl.scan_csv('../data/covid_19_dealths.csv')

lf_casted = lf.with_columns(
    pl.col('Data As Of').str.strptime(pl.Date, '%m/%d/%Y'),
    pl.col('Start Date').str.strptime(pl.Date, '%m/%d/%Y'),
    pl.col('End Date').str.strptime(pl.Date, '%m/%d/%Y'),
    pl.col('End Date').str.to_date('%m/%d/%Y').alias('End Date 2'),
    pl.col('Year').cast(pl.Int64),
)
lf_casted.collect().head()

## Finding and removing duplicate values

### How to do it...

In [None]:
import polars as pl

In [None]:
# Reload the raw CSV for this recipe and preview it.
df = pl.read_csv('../data/covid_19_dealths.csv')
df.head()

In [None]:
# (rows, columns) of the frame, as a baseline for the duplicate counts below.
df.shape

In [None]:
# Count the rows that is_duplicated() flags across all columns.
df.is_duplicated().sum()

In [None]:
# Count the rows that is_unique() flags across all columns.
df.is_unique().sum()

In [None]:
# Number of distinct rows in the whole frame.
df.n_unique()

In [None]:
# Number of distinct values in each column.
df.select(pl.all().n_unique())

In [None]:
# Number of distinct ('Start Date', 'End Date') combinations.
df.n_unique(subset=['Start Date', 'End Date'])

In [None]:
# Keep the first row for each ('Start Date', 'End Date') pair and count what remains.
first_per_period = df.unique(subset=['Start Date', 'End Date'], keep='first')
first_per_period.select(row_count=pl.count())

In [None]:
# Boolean mask of rows whose ('Year', 'COVID-19 Deaths') pair is flagged by is_unique();
# the sum is the number of such rows.
year_and_deaths = df.select('Year', 'COVID-19 Deaths')
rows_to_keep = year_and_deaths.is_unique()
rows_to_keep.sum()

In [None]:
# Shape of the frame after keeping only the rows selected by the rows_to_keep mask.
df.filter(rows_to_keep).shape

In [None]:
# Preview the rows selected by the rows_to_keep mask.
df.filter(rows_to_keep).head()

### There is more...

In [None]:
# Approximate distinct-value counts, as an alternative to the exact n_unique above.
df.approx_n_unique()

## Masking sensitive data

### How to do it...

In [None]:
import polars as pl

In [None]:
# Reload the raw CSV for this recipe and preview it.
df = pl.read_csv('../data/covid_19_dealths.csv')
df.head()

In [None]:
import random

def get_random_nums(num_list, length):
    """Return a string of `length` items drawn at random from `num_list`.

    Items are drawn independently (with replacement), so the same item may
    appear more than once and `length` may exceed `len(num_list)`. The earlier
    `random.sample` version could never produce a repeated digit and raised
    ValueError when `length` was larger than the pool.

    Args:
        num_list: Non-empty sequence of values to draw from (each is passed through str()).
        length: Number of items to draw; 0 yields an empty string.

    Returns:
        The drawn items concatenated into one string, in draw order.
    """
    return ''.join(str(n) for n in random.choices(num_list, k=length))

In [None]:
# Generate one fake SSN of the form ddd-dd-dddd for every row of df.
nums = list(range(10))

fake_ssns = []
for _ in range(df.height):
    # Parts are drawn in the order 3, 2, 4 digits.
    area, group, serial = (get_random_nums(nums, size) for size in (3, 2, 4))
    fake_ssns.append(f'{area}-{group}-{serial}')

fake_ssns_df = pl.DataFrame({'SSN': fake_ssns})
fake_ssns_df.head()

In [None]:
# Attach the fake SSN column next to the existing columns.
# NOTE(review): df is rebound here, so re-running this cell on its own presumably
# fails because an 'SSN' column already exists — confirm, and re-run the load cell first.
df = pl.concat([df, fake_ssns_df], how='horizontal')

In [None]:
# Mask everything except the two characters starting at offset 9 of 'ddd-dd-dddd'.
ssn_tail = pl.col('SSN').str.slice(9, 2)

df.select(('XXX-XX-XX' + ssn_tail).alias('SSN Masked')).head()

In [None]:
# Same masking as the previous cell, with the expression named before selecting it.
ssn_masked = ('XXX-XX-XX' + pl.col('SSN').str.slice(9, 2)).alias('SSN Masked')

df.select(ssn_masked).head()

In [None]:
# Alternative to masking: replace each SSN with its hash value.
ssn_hashed = pl.col('SSN').hash()
df.select(ssn_hashed).head()

## Visualizing data using Plotly

### How to do it...

In [1]:
import polars as pl
import plotly.express as px

In [2]:
# Age brackets kept for the charts below.
age_groups = [
    '0-17 years',
    '18-29 years',
    '30-39 years',
    '40-49 years',
    '50-64 years',
    '65-74 years',
    '75-84 years',
    '85 years and over',
    'All Ages',
]

# Keep only rows that have a month and belong to one of the age brackets above.
has_month = pl.col('Month').is_not_null()
in_age_groups = pl.col('Age Group').is_in(age_groups)

df = pl.read_csv('../data/covid_19_dealths.csv').filter(has_month & in_age_groups)
df.head()

Data As Of,Start Date,End Date,Group,Year,Month,State,Sex,Age Group,COVID-19 Deaths,Total Deaths,Pneumonia Deaths,Pneumonia and COVID-19 Deaths,Influenza Deaths,"Pneumonia, Influenza, or COVID-19 Deaths",Footnote
str,str,str,str,str,str,str,str,str,i64,i64,i64,i64,i64,i64,str
"""09/27/2023""","""01/01/2020""","""01/31/2020""","""By Month""","""2020""","""1""","""United States""","""All Sexes""","""All Ages""",6,264677,17909,3,2125,20037,
"""09/27/2023""","""01/01/2020""","""01/31/2020""","""By Month""","""2020""","""1""","""United States""","""All Sexes""","""0-17 years""",0,2966,90,0,63,153,
"""09/27/2023""","""01/01/2020""","""01/31/2020""","""By Month""","""2020""","""1""","""United States""","""All Sexes""","""18-29 years""",0,4426,114,0,54,168,
"""09/27/2023""","""01/01/2020""","""01/31/2020""","""By Month""","""2020""","""1""","""United States""","""All Sexes""","""30-39 years""",0,6475,246,0,112,358,
"""09/27/2023""","""01/01/2020""","""01/31/2020""","""By Month""","""2020""","""1""","""United States""","""All Sexes""","""40-49 years""",0,9792,485,0,151,636,


In [3]:
# Parse the date columns and turn Year/Month into integers for filtering and plotting.
date_columns = ['Data As Of', 'Start Date', 'End Date']

df = df.with_columns(
    pl.col(date_columns).str.strptime(pl.Date, '%m/%d/%Y'),
    pl.col(['Year', 'Month']).cast(pl.Int64),
)
df.head()

Data As Of,Start Date,End Date,Group,Year,Month,State,Sex,Age Group,COVID-19 Deaths,Total Deaths,Pneumonia Deaths,Pneumonia and COVID-19 Deaths,Influenza Deaths,"Pneumonia, Influenza, or COVID-19 Deaths",Footnote
date,date,date,str,i64,i64,str,str,str,i64,i64,i64,i64,i64,i64,str
2023-09-27,2020-01-01,2020-01-31,"""By Month""",2020,1,"""United States""","""All Sexes""","""All Ages""",6,264677,17909,3,2125,20037,
2023-09-27,2020-01-01,2020-01-31,"""By Month""",2020,1,"""United States""","""All Sexes""","""0-17 years""",0,2966,90,0,63,153,
2023-09-27,2020-01-01,2020-01-31,"""By Month""",2020,1,"""United States""","""All Sexes""","""18-29 years""",0,4426,114,0,54,168,
2023-09-27,2020-01-01,2020-01-31,"""By Month""",2020,1,"""United States""","""All Sexes""","""30-39 years""",0,6475,246,0,112,358,
2023-09-27,2020-01-01,2020-01-31,"""By Month""",2020,1,"""United States""","""All Sexes""","""40-49 years""",0,9792,485,0,151,636,


In [4]:
# National 2023 rows for all sexes, broken down by age group (the 'All Ages' total is excluded).
national_2023_by_age = (
    (pl.col('State') == 'United States')
    & (pl.col('Year') == 2023)
    & (pl.col('Age Group') != 'All Ages')
    & (pl.col('Sex') == 'All Sexes')
)

covid_deaths_by_age = (
    df
    .filter(national_2023_by_age)
    .group_by('Age Group')
    .agg(pl.col('COVID-19 Deaths').sum())
    .sort('COVID-19 Deaths', descending=True)
)

# Bar chart, tallest bar first; the x-axis label is blanked because the title covers it.
fig = px.bar(
    data_frame=covid_deaths_by_age,
    x='Age Group',
    y='COVID-19 Deaths',
    labels={'Age Group': ''},
    title='COVID Deaths 2023 by Age Group - As of 9/27/23',
)
fig.show()

In [5]:
# Per-state 2023 totals (all ages, all sexes), excluding the national aggregate row.
state_totals_2023 = (
    (pl.col('State') != 'United States')
    & (pl.col('Year') == 2023)
    & (pl.col('Age Group') == 'All Ages')
    & (pl.col('Sex') == 'All Sexes')
)

covid_deaths_by_top_5_states = (
    df
    .filter(state_totals_2023)
    .group_by('State')
    .agg(pl.col('COVID-19 Deaths').sum())
    .sort('COVID-19 Deaths', descending=True)
    .head(5)  # the five states with the most deaths
)

fig = px.bar(
    data_frame=covid_deaths_by_top_5_states,
    x='State',
    y='COVID-19 Deaths',
    labels={'State': ''},
    title='COVID Deaths 2023 by Top 5 States - As of 9/27/23',
)
fig.show()

In [6]:
# National 2023 totals for all ages, split by sex (the 'All Sexes' total is excluded).
national_2023_by_sex = (
    (pl.col('State') == 'United States')
    & (pl.col('Year') == 2023)
    & (pl.col('Age Group') == 'All Ages')
    & (pl.col('Sex') != 'All Sexes')
)

covid_deaths_by_sex = (
    df
    .filter(national_2023_by_sex)
    .group_by('Sex')
    .agg(pl.col('COVID-19 Deaths').sum())
    .sort('COVID-19 Deaths', descending=True)
    .head(5)
)

fig = px.bar(
    data_frame=covid_deaths_by_sex,
    x='Sex',
    y='COVID-19 Deaths',
    labels={'Sex': ''},
    text_auto='.2s',
    title='COVID Deaths 2023 by Sex - As of 9/27/23',
)

# Narrow bars with the value label drawn horizontally inside each bar.
fig.update_traces(
    width=0.3,
    textfont_size=12,
    textangle=0,
    textposition='inside',
)
fig.show()

In [7]:
from us_state_mappings import us_state_division_dict

# Map each state to its division; states missing from the mapping fall into 'Others'.
division = (
    pl.col('State')
    .map_dict(us_state_division_dict, default='Others')
    .alias('Division')
)

# 2023 rows broken down by state, age group and sex (all aggregate rows excluded).
detailed_2023_rows = (
    (pl.col('State') != 'United States')
    & (pl.col('Age Group') != 'All Ages')
    & (pl.col('Sex') != 'All Sexes')
    & (pl.col('Year') == 2023)
)

covid_deaths_vs_flu_deaths = (
    df
    .with_columns(division)
    .filter(detailed_2023_rows)
    .group_by('State', 'Division')
    .agg(
        pl.col('COVID-19 Deaths').sum(),
        pl.col('Influenza Deaths').sum(),
        pl.col('Pneumonia Deaths').sum(),
    )
)

# One bubble per state: COVID vs. influenza deaths, sized by pneumonia deaths, coloured by division.
fig = px.scatter(
    data_frame=covid_deaths_vs_flu_deaths,
    x='COVID-19 Deaths',
    y='Influenza Deaths',
    size='Pneumonia Deaths',
    color='Division',
    hover_name='State',
    title='COVID-19, Influenza, and Pneumonia Deaths 2023 by US States and Divisions',
)
fig.show()

In [69]:
# Monthly national COVID-19 death totals (all ages, all sexes), one row per (Year, Month).
monthly_treand_by_year = (
    df
    .filter(
        pl.col('State') == 'United States',
        pl.col('Age Group') == 'All Ages',
        pl.col('Sex') == 'All Sexes'
    )
    .group_by('Year', 'Month')
    .agg(
        pl.col('COVID-19 Deaths').sum(),
    )
    # group_by does not guarantee row order. Sorting by Month alone left the
    # years in arbitrary order within each month, so the order in which the
    # chart first met each year (and hence the trace/legend order) could
    # change between runs. Sorting by Year, then Month makes it stable while
    # still keeping every line's points in month order.
    .sort(by=['Year', 'Month'])
)

# One spline per year, months on the x-axis.
fig = px.line(
    monthly_treand_by_year, 
    x='Month', 
    y='COVID-19 Deaths', 
    color='Year',
    title='COVID-19 Deaths Monthly Trend - United States',
    line_shape='spline'
)

# A tick for every month; reverse the legend order.
fig.update_xaxes(dtick=1)
fig.update_layout(legend_traceorder='reversed')
fig.show()

## Detecting and handling outliers  

### How to do it...

In [125]:
import polars as pl
import plotly 
# Load plotly's bundled iris sample (returned as a pandas frame) and convert it to polars.
# Note that this rebinds df, replacing the frame used in the previous recipe.
df = pl.from_pandas(plotly.data.iris())
df.head()

sepal_length,sepal_width,petal_length,petal_width,species,species_id
f64,f64,f64,f64,str,i64
5.1,3.5,1.4,0.2,"""setosa""",1
4.9,3.0,1.4,0.2,"""setosa""",1
4.7,3.2,1.3,0.2,"""setosa""",1
4.6,3.1,1.5,0.2,"""setosa""",1
5.0,3.6,1.4,0.2,"""setosa""",1


In [126]:
import plotly.express as px

# Box plot of sepal_width (500 px wide) to eyeball potential outliers.
fig = px.box(df, y='sepal_width', width=500)
fig.show()

In [142]:
# IQR fences for sepal_width, built as expressions: values further than
# `threshold` IQRs below Q1 or above Q3 are treated as outliers.
threshold = 1.5
q1 = pl.col('sepal_width').quantile(0.25)
q3 = pl.col('sepal_width').quantile(0.75)
iqr = q3 - q1

lower_limit = q1 - iqr * threshold
upper_limit = q3 + iqr * threshold

# Preview the rows that fall outside the fences.
below_lower = pl.col('sepal_width') < lower_limit
above_upper = pl.col('sepal_width') > upper_limit
df.filter(below_lower | above_upper).head()

sepal_length,sepal_width,petal_length,petal_width,species,species_id
f64,f64,f64,f64,str,i64
5.7,4.4,1.5,0.4,"""setosa""",1
5.2,4.1,1.5,0.1,"""setosa""",1
5.5,4.2,1.4,0.2,"""setosa""",1
5.0,2.0,3.5,1.0,"""versicolor""",2


In [146]:
# Flag rows outside the IQR fences, then keep only the rows that are not flagged.
is_outlier_iqr = (
    (pl.col('sepal_width') < lower_limit)
    | (pl.col('sepal_width') > upper_limit)
)
df_iqr_outlier_removed = df.filter(~is_outlier_iqr)

# Sanity check: no flagged rows should remain.
# NOTE(review): the fences are expressions, so they are presumably recomputed from
# the already-filtered frame here rather than from the original df — confirm.
df_iqr_outlier_removed.filter(is_outlier_iqr)

sepal_length,sepal_width,petal_length,petal_width,species,species_id
f64,f64,f64,f64,str,i64


In [151]:
# Replace flagged sepal_width values with the column median; other values are kept.
sepal_width_imputed = (
    pl.when(is_outlier_iqr)
    .then(pl.col('sepal_width').median())
    .otherwise(pl.col('sepal_width'))
    .alias('sepal_width')
)
df_iqr_outlier_replaced = df.with_columns(sepal_width_imputed)

# Sanity check: look for rows that are still flagged after the replacement.
df_iqr_outlier_replaced.filter(is_outlier_iqr)

sepal_length,sepal_width,petal_length,petal_width,species,species_id
f64,f64,f64,f64,str,i64


In [152]:
# Add the z-score of sepal_width: (value - column mean) / column standard deviation.
sepal_width = pl.col('sepal_width')
zscore = (sepal_width - sepal_width.mean()) / sepal_width.std()

df_zscore = df.with_columns(sepal_width_zscore=zscore)
df_zscore.head()

sepal_length,sepal_width,petal_length,petal_width,species,species_id,sepal_width_zscore
f64,f64,f64,f64,str,i64,f64
5.1,3.5,1.4,0.2,"""setosa""",1,1.028611
4.9,3.0,1.4,0.2,"""setosa""",1,-0.12454
4.7,3.2,1.3,0.2,"""setosa""",1,0.33672
4.6,3.1,1.5,0.2,"""setosa""",1,0.10609
5.0,3.6,1.4,0.2,"""setosa""",1,1.259242


In [153]:
# A row is an outlier when its z-score is more than 3 in either direction.
is_outlier_z_score = (
    (pl.col('sepal_width_zscore') > 3)
    | (pl.col('sepal_width_zscore') < -3)
)
df_zscore_outliers_removed = df_zscore.filter(~is_outlier_z_score)

In [154]:
# Rows flagged as z-score outliers.
df_zscore.filter(is_outlier_z_score)

sepal_length,sepal_width,petal_length,petal_width,species,species_id,sepal_width_zscore
f64,f64,f64,f64,str,i64,f64
5.7,4.4,1.5,0.4,"""setosa""",1,3.104284


In [155]:
# Sanity check: the filtered frame should contain no flagged rows.
df_zscore_outliers_removed.filter(is_outlier_z_score)

sepal_length,sepal_width,petal_length,petal_width,species,species_id,sepal_width_zscore
f64,f64,f64,f64,str,i64,f64


In [156]:
# Replace flagged sepal_width values with the column mean; other values are kept.
sepal_width_mean_imputed = (
    pl.when(is_outlier_z_score)
    .then(pl.col('sepal_width').mean())
    .otherwise(pl.col('sepal_width'))
    .alias('sepal_width')
)
df_zscore_outliers_replaced = df_zscore.with_columns(sepal_width_mean_imputed)

In [157]:
# The stored sepal_width_zscore column is not recomputed, so this still selects the
# originally flagged rows and shows their replaced sepal_width values.
df_zscore_outliers_replaced.filter(is_outlier_z_score)

sepal_length,sepal_width,petal_length,petal_width,species,species_id,sepal_width_zscore
f64,f64,f64,f64,str,i64,f64
5.7,3.054,1.5,0.4,"""setosa""",1,3.104284


In [159]:
import polars as pl

# Minimal when/then/otherwise example: label values below 3 as 'low', the rest as 'high'.
df = pl.DataFrame({'value': [1, 2, 3, 4, 5]})

category = (
    pl.when(pl.col('value') < 3)
    .then(pl.lit('low'))
    .otherwise(pl.lit('high'))
)
df.with_columns(category.alias('category'))

value,category
i64,str
1,"""low"""
2,"""low"""
3,"""high"""
4,"""high"""
5,"""high"""
