# Exploratory Data Analysis for Data Science Salaries in 2023

In [2]:
import numpy as np
import pandas as pd
import polars as pl

In [40]:
# Load the raw salaries CSV into a polars DataFrame.
salary_data = pl.read_csv('./../data/raw/ds_salaries.csv')
# Show the (rows, columns) shape, then preview the first five rows.
print(salary_data.shape)
salary_data.head(5)

(3755, 11)


work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
i64,str,str,str,i64,str,i64,str,i64,str,str
2023,"""SE""","""FT""","""Principal Data…",80000,"""EUR""",85847,"""ES""",100,"""ES""","""L"""
2023,"""MI""","""CT""","""ML Engineer""",30000,"""USD""",30000,"""US""",100,"""US""","""S"""
2023,"""MI""","""CT""","""ML Engineer""",25500,"""USD""",25500,"""US""",100,"""US""","""S"""
2023,"""SE""","""FT""","""Data Scientist…",175000,"""USD""",175000,"""CA""",100,"""CA""","""M"""
2023,"""SE""","""FT""","""Data Scientist…",120000,"""USD""",120000,"""CA""",100,"""CA""","""M"""


## Experience Level
The `experience_level` column has four categorical values:

- EN, which refers to Entry-level / Junior.

- MI, which refers to Mid-level / Intermediate.

- SE, which refers to Senior-level / Expert.

- EX, which refers to Executive-level / Director.


In [41]:
## Mapper and value counts
# Map the dataset's two-letter experience codes to readable labels.
experience_level_dict = {
    'EN': 'Entry-level/Junior',
    'MI': 'Mid-level/Intermediate',
    'SE': 'Senior-level/Expert',
    'EX': 'Executive-level/Director',
}
# The column is overwritten in place, so re-running this cell would look up
# the already-mapped labels, find no match and null out the whole column.
# `default=pl.first()` keeps any value absent from the dict unchanged, which
# makes the cell safe to re-run.
salary_data = salary_data.with_columns(
    pl.col('experience_level').map_dict(experience_level_dict, default=pl.first())
)
# Count rows per experience level, most frequent first.
salary_data['experience_level'].value_counts().sort(by='counts', descending=True)

experience_level,counts
str,u32
"""Senior-level/E…",2516
"""Mid-level/Inte…",805
"""Entry-level/Ju…",320
"""Executive-leve…",114


In [53]:
## Group-by syntax: mean of `salary` per experience level
# NOTE(review): `salary` is expressed in each row's `salary_currency`, so this
# mean mixes currencies; `salary_in_usd` looks like the comparable column --
# confirm before drawing conclusions from these numbers.
(
    salary_data
    .groupby(by='experience_level')
    .agg(pl.col('salary').mean())
)

experience_level,salary
str,f64
"""Mid-level/Inte…",248200.306832
"""Senior-level/E…",170048.965421
"""Entry-level/Ju…",188381.178125
"""Executive-leve…",246802.201754


In [58]:
## Keep only the rows whose employee residence is Colombia ('CO')
salary_data.filter(pl.col('employee_residence') == 'CO')

work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
i64,str,str,str,i64,str,i64,str,i64,str,str
2023,"""Senior-level/E…","""FT""","""Data Manager""",65000,"""USD""",65000,"""CO""",0,"""CO""","""M"""
2023,"""Senior-level/E…","""FT""","""Data Manager""",48000,"""USD""",48000,"""CO""",0,"""CO""","""M"""
2022,"""Senior-level/E…","""FT""","""AI Scientist""",125000,"""USD""",125000,"""CO""",100,"""CO""","""L"""
2021,"""Entry-level/Ju…","""FT""","""Machine Learni…",21844,"""USD""",21844,"""CO""",50,"""CO""","""M"""
