### Polars SQL and Command-Line Interface (CLI)

The examples below use `LIMIT` (or `.head()` for Polars expressions) to reduce the output to a few rows and take up less space.
If you want the full output, remove the `LIMIT` clause (or `.head()`) from the code.

This notebook is divided into sections. If your code editor supports it, you can use the **Outline** functionality to easily go to the code section you are interested in.

For more details on Polars functions, check out the Polars API reference: https://pola-rs.github.io/polars/py-polars/html/reference/index.html

In [1]:
import polars as pl

In [2]:
# Display settings for the rest of the notebook:
# - show up to 30 characters of each string column before truncating
# - use the "full" display format for float columns
pl.Config.set_fmt_str_lengths(30)
pl.Config.set_fmt_float("full")

polars.config.Config

#### Create a SQL context and register tables

In [3]:
# Scan the 3 CSV files of the job postings dataset used throughout this notebook.
# The paths are scanned in order and bound to the matching names on the left.
job_postings, job_skills, companies = (
    pl.scan_csv(csv_path)
    for csv_path in (
        "../datasets/job_postings/job_postings_simplified.csv",
        "../datasets/job_postings/job_details/job_skills.csv",
        "../datasets/job_postings/company_details/companies.csv",
    )
)

In [4]:
# Create a SQL context that registers every Polars dataframe and lazyframe found in the
# global namespace (register_globals=True) and automatically collects query results
# (eager_execution=True)
ctx = pl.SQLContext(register_globals=True, eager_execution=True)

In [5]:
# Verify that the three scanned frames have been registered as tables
ctx.tables()

['companies', 'job_postings', 'job_skills']

In [6]:
# Instead of registering all globals, we can name the frames to register explicitly
# when creating the SQL context.
# Note: this rebinds `ctx`, so at this point only job_postings is registered.
ctx = pl.SQLContext(
    frames={"job_postings": job_postings},
    eager_execution=True,
)

In [7]:
# After the SQLContext is initialized, we can register additional tables or remove
# existing ones with: register, register_globals, register_many, unregister

ctx.register("companies", companies)

<SQLContext [tables:2] at 0x7f7e7c1c3f40>

In [8]:
# We can also register tables directly within SQL, here by creating one from a CSV file
# NOTE(review): the table listings in the following cells also contain an entry named
# after the CSV path itself, apparently registered as a side effect of read_csv — confirm

ctx.execute("""--sql
    CREATE TABLE job_skills
    AS SELECT * FROM read_csv('../datasets/job_postings/job_details/job_skills.csv')
    """)

Response
str
"""Create Table"""


In [9]:
# We can list the registered tables with SQL

ctx.execute("""--sql
    SHOW TABLES
""") 

name
str
"""../datasets/job_postings/job_…"
"""companies"""
"""job_postings"""
"""job_skills"""


In [10]:
# The same list of registered table names is available from Python

ctx.tables()

['../datasets/job_postings/job_details/job_skills.csv',
 'companies',
 'job_postings',
 'job_skills']

In [11]:
# We are not limited to data created with Polars: a Pandas dataframe can be queried too,
# once it has been converted with pl.from_pandas and registered in the context
import pandas as pd

users = pd.DataFrame(
    [("John", 24), ("Jane", 25), ("Alice", 26)],
    columns=["name", "age"],
)

ctx.register("users", pl.from_pandas(users))

ctx.execute("""--sql
    SELECT *
    FROM users
""")

name,age
str,i64
"""John""",24
"""Jane""",25
"""Alice""",26


In [12]:
# Remove the "users" table (the converted Pandas dataframe) from the context with unregister

ctx.unregister("users")

<SQLContext [tables:4] at 0x7f7e7c1c3f40>

#### Query tables

In [36]:
# We choose which columns to return by listing them after SELECT;
# LIMIT caps the number of rows returned

ctx.execute("""--sql
    SELECT title, max_salary, min_salary
    FROM job_postings
    LIMIT 5
""")

title,max_salary,min_salary
str,f64,f64
"""Licensed Insurance Agent""",52000.0,45760.0
"""Sales Manager""",,
"""Model Risk Auditor""",,
"""Business Manager""",,
"""NY Studio Assistant""",,


In [14]:
# We filter rows with WHERE; several conditions can be combined with AND

ctx.execute("""--sql
    SELECT title, max_salary, min_salary
    FROM job_postings
    WHERE max_salary > 100000 AND min_salary > 80000
    LIMIT 5
""")

title,max_salary,min_salary
str,f64,f64
"""Manager, Salesforce Platform""",170976.0,98640.0
"""Clinic Managers - Physical Th…",117510.65,91010.65
"""Lead Software Engineer""",190000.0,160000.0
"""Executive Director""",110000.0,100000.0
"""Emergency Veterinarian""",350000.0,200000.0


#### Calculations and aggregations

In [38]:
# We can use aggregate functions such as SUM, AVG, MIN, MAX, COUNT, etc.
# Without a GROUP BY, each aggregate is computed over the whole table (one result row)

ctx.execute("""--sql
    SELECT AVG(max_salary) AS avg_max_salary, 
            AVG(min_salary) AS avg_min_salary, 
            MIN(min_salary) AS overall_min_salary,
            MAX(max_salary) AS overall_max_salary, 
            COUNT(*) AS number_of_jobs
    FROM job_postings
""")

avg_max_salary,avg_min_salary,overall_min_salary,overall_max_salary,number_of_jobs
f64,f64,f64,f64,u32
88336.22211193624,62352.2180728129,10,1300000,15886


In [39]:
# We can group rows with GROUP BY, for example group by company id
# Note: there is no ORDER BY here, so which five groups LIMIT returns is not guaranteed
# and may differ between runs

ctx.execute("""--sql
    SELECT company_id, 
            AVG(max_salary) AS avg_max_salary, 
            AVG(min_salary) AS avg_min_salary, 
            MIN(min_salary) AS overall_min_salary,
            MAX(max_salary) AS overall_max_salary, 
            COUNT(*) AS number_of_jobs
    FROM job_postings
    WHERE company_id IS NOT NULL
    GROUP BY company_id
    LIMIT 5
""")

company_id,avg_max_salary,avg_min_salary,overall_min_salary,overall_max_salary,number_of_jobs
i64,f64,f64,f64,f64,u32
742016,,,,,1
17120,,,,,1
3738912,16.415,12.625,11.0,18.53,6
87110560,,,,,1
76863872,25.0,22.0,22.0,25.0,1


In [42]:
# And use ORDER BY to sort the results, here by average maximum salary, highest first
# Postings without a max_salary are filtered out before grouping

ctx.execute("""--sql
    SELECT company_id, 
            AVG(max_salary) AS avg_max_salary, 
            AVG(min_salary) AS avg_min_salary, 
            MIN(min_salary) AS overall_min_salary,
            MAX(max_salary) AS overall_max_salary, 
            COUNT(*) AS number_of_jobs
    FROM job_postings
    WHERE company_id IS NOT NULL AND max_salary IS NOT NULL
    GROUP BY company_id
    ORDER BY avg_max_salary DESC
    LIMIT 5
""")

company_id,avg_max_salary,avg_min_salary,overall_min_salary,overall_max_salary,number_of_jobs
i64,f64,f64,f64,f64,u32
92699700,766666.6666666666,466666.6666666667,300000,1300000,3
106584,743750.0,431250.0,175000,1100000,4
78124,675000.0,50000.0,50000,675000,9
28439179,500000.0,90000.0,70000,500000,3
18827874,500000.0,25000.0,25000,500000,1


#### SQL Joins

In [44]:
# We can join tables with the JOIN statement
# Let's first take a look at a few rows of the companies table we are going to join with

ctx.execute("""--sql
    SELECT company_id, name, company_size, city, country
    FROM companies
    LIMIT 2
""")

company_id,name,company_size,city,country
i64,str,i64,str,str
1009,"""IBM""",7,"""Armonk, New York""","""US"""
1016,"""GE HealthCare""",7,"""Chicago""","""US"""


In [53]:
# Join the job_postings table with the companies table on the company_id column,
# then group and label the salary statistics by company name instead of company_id

ctx.execute("""--sql
    SELECT companies.name, 
            AVG(job_postings.max_salary) AS avg_max_salary, 
            AVG(job_postings.min_salary) AS avg_min_salary, 
            MIN(job_postings.min_salary) AS overall_min_salary,
            MAX(job_postings.max_salary) AS overall_max_salary, 
            COUNT(*) AS number_of_jobs
    FROM job_postings
    JOIN companies
    ON job_postings.company_id = companies.company_id
    WHERE job_postings.company_id IS NOT NULL AND job_postings.max_salary IS NOT NULL
    GROUP BY companies.name
    ORDER BY avg_max_salary DESC
    LIMIT 5
""")

name,avg_max_salary,avg_min_salary,overall_min_salary,overall_max_salary,number_of_jobs
str,f64,f64,f64,f64,u32
"""Goliath Partners""",766666.6666666666,466666.6666666667,300000,1300000,3
"""Selby Jennings""",743750.0,431250.0,175000,1100000,4
"""Summit Funding, Inc.""",675000.0,50000.0,50000,675000,9
"""Culver Careers (CulverCareers…",500000.0,50000.0,50000,500000,1
"""Spotter""",500000.0,90000.0,70000,500000,3


In [58]:
# Group the joined table by country to see the rounded average maximum and minimum
# salaries, plus the number of posts, for the five countries with the most posts.
# An inner JOIN is used: the WHERE clause requires a non-null companies.country, which
# discards every posting without a matching company anyway, so a LEFT JOIN would
# return exactly the same rows while suggesting that unmatched postings are kept.
# '0' appears to be a placeholder for a missing country in this dataset — TODO confirm

ctx.execute("""--sql
    SELECT companies.country,
            ROUND(AVG(job_postings.max_salary),0) AS avg_max_salary,
            ROUND(AVG(job_postings.min_salary),0) AS avg_min_salary,
            COUNT(*) AS number_of_jobs
    FROM job_postings
    JOIN companies
    ON job_postings.company_id = companies.company_id
    WHERE companies.country IS NOT NULL AND companies.country <> '0'
    GROUP BY companies.country
    ORDER BY number_of_jobs DESC
    LIMIT 5
""")

country,avg_max_salary,avg_min_salary,number_of_jobs
str,f64,f64,u32
"""US""",87895,61800,14014
"""GB""",117293,88859,407
"""CA""",40847,32098,161
"""IN""",65445,58359,92
"""DE""",174190,94774,85


#### Common Table Expressions (CTE)

In [70]:
# We can use Common Table Expressions (CTE) to define a named, temporary result set
# that can be queried like a table within the same statement
# Here the CTE counts the job postings per company; the outer query then counts the number
# of companies with 1 job posted, the number of companies with 2 jobs posted, etc.

ctx.execute("""--sql
    WITH number_of_job_postings_by_company AS (
            SELECT company_id, COUNT(*) AS number_of_jobs_posted
            FROM job_postings
            WHERE company_id IS NOT NULL
            GROUP BY company_id
            )
    SELECT number_of_jobs_posted, COUNT(company_id) AS number_of_companies
    FROM number_of_job_postings_by_company
    GROUP BY number_of_jobs_posted
    ORDER BY number_of_companies DESC
    LIMIT 5
""")

number_of_jobs_posted,number_of_companies
u32,u32
1,4008
2,815
3,391
4,214
5,115


#### Window functions

In [90]:
# Use a window function (an aggregate followed by an OVER clause) to compare the maximum
# salary of each position to the average maximum salary of its company
# NOTE(review): ratio is rounded to 0 decimals, so it only shows whole numbers;
# consider rounding to 2 decimals for a more informative comparison

ctx.execute("""--sql
    SELECT company_id,
            title, 
            max_salary, 
            ROUND(AVG(max_salary) OVER (PARTITION BY company_id),0) AS avg_max_salary_company,
            ROUND(max_salary / (AVG(max_salary) OVER (PARTITION BY company_id)),0) AS ratio
    FROM job_postings
    WHERE max_salary IS NOT NULL AND company_id IS NOT NULL
    ORDER BY max_salary DESC
    LIMIT 5
""")

company_id,title,max_salary,avg_max_salary_company,ratio
i64,str,f64,f64,f64
92699700,"""Quantitative Developer""",1300000,766667,2
106584,"""Fintech Startup | Tech Lead M…",1100000,743750,1
106584,"""Tech Lead Manager (L6-L8) | D…",1000000,743750,1
1792,"""Faculty Affairs Administrator…",792921,192740,4
106584,"""Quantitative Prop Trading Fir…",700000,743750,1


#### Convert SQL to Polars expressions

In [98]:
# pl.sql_expr converts a SQL expression into a Polars expression, which can then be used
# inside a select or with_columns context alongside regular Polars expressions

avg_max_salary_company = pl.sql_expr(
    "ROUND(AVG(max_salary) OVER (PARTITION BY company_id),0) AS avg_max_salary_company"
)
has_salary_and_company = (
    pl.col("max_salary").is_not_null() & pl.col("company_id").is_not_null()
)

(
    job_postings
    .select("company_id", "title", "max_salary", avg_max_salary_company)
    .filter(has_salary_and_company)
    .sort("max_salary", descending=True)
    .head(5)
    .collect()
)

company_id,title,max_salary,avg_max_salary_company
i64,str,f64,f64
92699700,"""Quantitative Developer""",1300000,766667
106584,"""Fintech Startup | Tech Lead M…",1100000,743750
106584,"""Tech Lead Manager (L6-L8) | D…",1000000,743750
1792,"""Faculty Affairs Administrator…",792921,192740
106584,"""Quantitative Prop Trading Fir…",700000,743750
