In [1]:
import numpy as np
import numpy.random as npr
import pandas as pd
import altair as alt

In [2]:
df = pd.read_csv("../data/raw/ds_salaries.csv")
df = df[['experience_level', 'employment_type', 'job_title', 'salary_in_usd', 'company_location','remote_ratio', 'company_size']]
df.shape

(607, 7)

In [3]:
df.head()

Unnamed: 0,experience_level,employment_type,job_title,salary_in_usd,company_location,remote_ratio,company_size
0,MI,FT,Data Scientist,79833,DE,0,L
1,SE,FT,Machine Learning Scientist,260000,JP,0,S
2,SE,FT,Big Data Engineer,109024,GB,50,M
3,MI,FT,Product Data Analyst,20000,HN,0,S
4,SE,FT,Machine Learning Engineer,150000,US,50,L


In [4]:
df

Unnamed: 0,experience_level,employment_type,job_title,salary_in_usd,company_location,remote_ratio,company_size
0,MI,FT,Data Scientist,79833,DE,0,L
1,SE,FT,Machine Learning Scientist,260000,JP,0,S
2,SE,FT,Big Data Engineer,109024,GB,50,M
3,MI,FT,Product Data Analyst,20000,HN,0,S
4,SE,FT,Machine Learning Engineer,150000,US,50,L
...,...,...,...,...,...,...,...
602,SE,FT,Data Engineer,154000,US,100,M
603,SE,FT,Data Engineer,126000,US,100,M
604,SE,FT,Data Analyst,129000,US,0,M
605,SE,FT,Data Analyst,150000,US,100,M


In [5]:
# Data Wrangling
iso3166_to_continent = {
    'DE': 'Europe',
    'JP': 'Asia',
    'GB': 'Europe',
    'HN': 'North America',
    'US': 'North America',
    'HU': 'Europe',
    'NZ': 'Oceania',
    'FR': 'Europe',
    'IN': 'Asia',
    'PK': 'Asia',
    'CN': 'Asia',
    'GR': 'Europe',
    'AE': 'Asia',
    'NL': 'Europe',
    'MX': 'North America',
    'CA': 'North America',
    'AT': 'Europe',
    'NG': 'Africa',
    'ES': 'Europe',
    'PT': 'Europe',
    'DK': 'Europe',
    'IT': 'Europe',
    'HR': 'Europe',
    'LU': 'Europe',
    'PL': 'Europe',
    'SG': 'Asia',
    'RO': 'Europe',
    'IQ': 'Asia',
    'BR': 'South America',
    'BE': 'Europe',
    'UA': 'Europe',
    'IL': 'Asia',
    'RU': 'Europe',
    'MT': 'Europe',
    'CL': 'South America',
    'IR': 'Asia',
    'CO': 'South America',
    'MD': 'Europe',
    'KE': 'Africa',
    'SI': 'Europe',
    'CH': 'Europe',
    'VN': 'Asia',
    'AS': 'Oceania',
    'TR': 'Asia',
    'CZ': 'Europe',
    'DZ': 'Africa',
    'EE': 'Europe',
    'MY': 'Asia',
    'AU': 'Oceania',
    'IE': 'Europe'
}

# data_mapping_replace(df, "remote_ratio", {100: 'Full-Remote', 50: 'Hybrid', 0:'In-Person'})
# data_mapping_replace(df, "experience_level", {'EN': 'Entry-Level', 'SE': 'Lower-Middle', 'MI':'Mid-Level', 'EX': 'Executive-Level'})
# # data_mapping_replace(df, "employment_type", {'FT': 'Full-Time', 'PT': 'Part-Time', "FL":'Freelance', "CT": "Contract"})


In [6]:
df['continent'] = df['company_location'].map(iso3166_to_continent)

df['remote_ratio'] = df['remote_ratio'].replace({100: 'Full-Remote', 50: 'Hybrid', 0:'In-Person'})

df['experience_level'] = df['experience_level'].replace({'EN': 'Entry-Level', 'SE': 'Lower-Middle', 'MI':'Mid-Level', 'EX': 'Executive-Level'})

df['employment_type'] = df['employment_type'].replace({'FT': 'Full-Time', 'PT': 'Part-Time', "FL":'Freelance', "CT": "Contract"})

In [7]:

# def data_mapping_replace(df, col_name, dict):
#     df[col_name] = df[col_name].replace(dict)
    

df

Unnamed: 0,experience_level,employment_type,job_title,salary_in_usd,company_location,remote_ratio,company_size,continent
0,Mid-Level,Full-Time,Data Scientist,79833,DE,In-Person,L,Europe
1,Lower-Middle,Full-Time,Machine Learning Scientist,260000,JP,In-Person,S,Asia
2,Lower-Middle,Full-Time,Big Data Engineer,109024,GB,Hybrid,M,Europe
3,Mid-Level,Full-Time,Product Data Analyst,20000,HN,In-Person,S,North America
4,Lower-Middle,Full-Time,Machine Learning Engineer,150000,US,Hybrid,L,North America
...,...,...,...,...,...,...,...,...
602,Lower-Middle,Full-Time,Data Engineer,154000,US,Full-Remote,M,North America
603,Lower-Middle,Full-Time,Data Engineer,126000,US,Full-Remote,M,North America
604,Lower-Middle,Full-Time,Data Analyst,129000,US,In-Person,M,North America
605,Lower-Middle,Full-Time,Data Analyst,150000,US,Full-Remote,M,North America


In [33]:

selected_continent = ['Asia']
selected_top_n = 10

filtered_df = df.copy()
if selected_continent:
    filtered_df = filtered_df[filtered_df['continent'].isin(selected_continent)]
    average_salaries = pd.DataFrame(filtered_df.groupby('job_title')['salary_in_usd'].mean())
    sorted_df = average_salaries.sort_values(by='salary_in_usd', ascending=False)
    top_n = sorted_df.head(selected_top_n).reset_index()

bar_plot = alt.Chart(top_n).mark_bar().encode(
    y= alt.Y('job_title:N', sort='-x', title='Job Title'),  
    x= alt.X('salary_in_usd:Q', title='Average Salary (USD)'),
    tooltip=[
        alt.Tooltip('salary_in_usd:Q', title='Average Salaries (USD)'),
        alt.Tooltip('job_title:N', title='Job Title')
    ] 
).properties(
    title=f'Top {selected_top_n} Paid Jobs')

bar_plot

In [None]:
df.info()

In [None]:
df.nunique()

In [None]:
df['employment_type'].value_counts()

In [None]:
df['job_title'].value_counts()

In [None]:
df['company_location'].value_counts()

In [None]:
alt.Chart(df).mark_bar().encode(
    x='experience_level',
    y='mean(salary_in_usd)'
)

The dataset we are going to be visualizing includes 607 jobs in the field of data science. Each job has seven key variables that contain detailed information about the position, employer, and compensation. We assume this information could help MDS graduate students explore the job markets and narrow their job search efforts. These variables include:

- Level of experience of the role (experience_level e.g. Entry level(EN), Mid-Level(ML), Senior(SE), Experience(EX))
- Type of employment (employment_type e.g. Full Time(FT), Part Time(PT), Contract(CT), Freelance(FL))
- Specific position or role within the data science field (job_title, e.g. Data Scientist, Data Analyst, etc)
- Salaries measured in USD (salary_in_usd)
- Geographical locations of the company recorded in country code (company_location)
- Size of the employing company (company_size e.g. Large(L), Median(M), Small(S))

We will also derive a new variable (work_arrangement) from the existing variable (remote_ratio) to explore whether the position is remote (remote_ratio = 100), hybrid (0 < remote_ratio < 100), or onsite (remote_ratio = 0). Given that students might have varied preferences regarding work arrangements, this new variable could be beneficial for them to explore job opportunities based on their preferred work styles.