### Polars SQL and Command-Line Interface (CLI)

The examples below use `.head()` (or `LIMIT` in the SQL queries) to reduce the output to a few rows and take up less space.
If you want the full output, remove `.head()` (or the `LIMIT` clause) from the code.

This notebook is divided into sections. If your code editor supports it, you can use the **Outline** functionality to easily go to the code section you are interested in.

For more details on Polars functions, check out the Polars API reference: https://pola-rs.github.io/polars/py-polars/html/reference/index.html

In [2]:
import polars as pl

In [48]:
# Display settings used by the outputs in this notebook:
# - show up to 30 characters of each string column (longer values are cut off with "…")
# - print float columns using the "full" display format
pl.Config.set_fmt_str_lengths(30)
pl.Config.set_fmt_float("full")

polars.config.Config

In [14]:
# Scan the 3 CSV tables of the job postings dataset: job postings, job skills and companies.
# The variable names matter: they are the table names used in the SQL queries below.
job_postings = pl.scan_csv("../datasets/job_postings/job_postings_simplified.csv")
job_skills = pl.scan_csv("../datasets/job_postings/job_details/job_skills.csv")
companies = pl.scan_csv("../datasets/job_postings/company_details/companies.csv")

#### Create a SQL context and query dataframes

In [23]:
# Create a SQL context:
# - register_globals=True registers all Polars dataframes and lazyframes found in the global variables
# - eager_execution=True automatically collects the result of every query
ctx = pl.SQLContext(register_globals=True, eager_execution=True)

In [24]:
# Query a registered table by name; LIMIT 5 keeps the output to the first 5 rows
ctx.execute("""--sql
    SELECT *
    from job_postings
    LIMIT 5
""") 

job_id,company_id,title,max_salary,min_salary,location
i64,i64,str,f64,f64,str
85008768,,"""Licensed Insurance Agent""",52000.0,45760.0,"""Chico, CA"""
133114754,77766802.0,"""Sales Manager""",,,"""Santa Clarita, CA"""
133196985,1089558.0,"""Model Risk Auditor""",,,"""New York, NY"""
381055942,96654609.0,"""Business Manager""",,,"""Forney, TX"""
529257371,1244539.0,"""NY Studio Assistant""",,,"""New York, NY"""


In [26]:
# We can also register the dataframes one by one when creating the SQL context:
# each keyword argument name becomes the table name used in SQL queries.
# This creates a new context that replaces the previous `ctx`; only the two tables
# passed here are registered (`companies` is added in a later cell).
ctx = pl.SQLContext(
    job_postings=job_postings,
    job_skills=job_skills,
    eager_execution=True
)

In [29]:
# After the SQLContext is initialized, we can register additional tables or unregister tables with:
# register, register_globals, register_many, unregister

# Register the `companies` lazyframe under the table name "companies"
ctx.register("companies", companies)

<SQLContext [tables:3] at 0x7f1c9da8efb0>

In [31]:
# We can list the registered tables from SQL with the SHOW TABLES statement

ctx.execute("""--sql
    SHOW TABLES
""") 

name
str
"""companies"""
"""job_postings"""
"""job_skills"""


In [40]:
# We can also list the registered tables from Python with the `tables` method,
# which returns the table names as a list of strings

ctx.tables()

['companies', 'job_postings', 'job_skills']

In [38]:
# We are not limited to data that starts out in Polars. A Pandas dataframe can be queried as well,
# after converting it to a Polars dataframe with pl.from_pandas and registering the result
import pandas as pd

users = pd.DataFrame({
    "name": ["John", "Jane", "Alice"],
    "age": [24, 25, 26]
})

# Convert to Polars and register it under the table name "users"
ctx.register("users", pl.from_pandas(users))

ctx.execute("""--sql
    SELECT *
    FROM users
""")

name,age
str,i64
"""John""",24
"""Jane""",25
"""Alice""",26


In [39]:
# We remove the `users` table (created from the Pandas dataframe) from the context with unregister

ctx.unregister("users")

<SQLContext [tables:3] at 0x7f1c9da8efb0>

In [41]:
# We choose which columns to return by listing them after SELECT (instead of using *)

ctx.execute("""--sql
    SELECT title, max_salary, min_salary
    FROM job_postings
    LIMIT 5
""")

title,max_salary,min_salary
str,f64,f64
"""Licensed Insurance Agent""",52000.0,45760.0
"""Sales Manager""",,
"""Model Risk Auditor""",,
"""Business Manager""",,
"""NY Studio Assistant""",,


In [43]:
# We filter rows with WHERE; several conditions can be combined with AND

ctx.execute("""--sql
    SELECT title, max_salary, min_salary
    FROM job_postings
    WHERE max_salary > 100000 AND min_salary > 80000
    LIMIT 5
""")

title,max_salary,min_salary
str,f64,f64
"""Manager, Salesforce Platform""",170976.0,98640.0
"""Clinic Managers - Physical Th…",117510.65,91010.65
"""Lead Software Engineer""",190000.0,160000.0
"""Executive Director""",110000.0,100000.0
"""Emergency Veterinarian""",350000.0,200000.0


In [49]:
# We can use aggregate functions such as SUM, AVG, MIN, MAX, COUNT, etc.
# Without a GROUP BY they are computed over the whole table, so the result is a single row;
# AS gives each computed column a name

ctx.execute("""--sql
    SELECT SUM(max_salary) AS sum_max_salary, 
            AVG(min_salary) AS avg_min_salary, 
            MIN(min_salary) AS overall_min_salary,
            MAX(max_salary) AS overall_max_salary, 
            COUNT(*) AS number_of_jobs
    FROM job_postings
""")

sum_max_salary,avg_min_salary,overall_min_salary,overall_max_salary,number_of_jobs
f64,f64,f64,f64,u32
487704282.28,62352.2180728129,10,1300000,15886


In [54]:
# We can group rows with GROUP BY, for example to count the job postings of each company id.
# Rows without a company id are excluded with IS NOT NULL.
# NOTE: there is no ORDER BY here, so which 5 groups LIMIT returns may vary between runs

ctx.execute("""--sql
    SELECT company_id, COUNT(*) AS number_of_jobs
    FROM job_postings
    WHERE company_id IS NOT NULL
    GROUP BY company_id
    LIMIT 5
""")

company_id,number_of_jobs
i64,u32
13664,1
87720672,1
28863552,1
6592,20
44960,3


In [53]:
# And we can use ORDER BY to sort the results: DESC sorts in descending order,
# so combined with LIMIT 5 this returns the 5 companies with the most job postings

ctx.execute("""--sql
    SELECT company_id, COUNT(*) AS number_of_jobs
    FROM job_postings
    WHERE company_id IS NOT NULL
    GROUP BY company_id
    ORDER BY number_of_jobs DESC
    LIMIT 5
""")

company_id,number_of_jobs
i64,u32
3570660,161
1103,113
11056,108
1586,93
1441,93


In [55]:
# We can use Common Table Expressions (CTE) to create temporary tables:
# WITH ... AS (...) names the result of an inner query (here the number of jobs per company),
# and the main query then selects from it (here to average that count across companies)

ctx.execute("""--sql
    WITH number_of_job_postings_by_company AS (
        SELECT company_id, COUNT(*) AS number_of_jobs
        FROM job_postings
        WHERE company_id IS NOT NULL
        GROUP BY company_id
    )
    SELECT AVG(number_of_jobs) AS avg_number_of_jobs_by_company
    FROM number_of_job_postings_by_company
""")

avg_number_of_jobs_by_company
f64
2.57379767827529
