Import the data 

In [2]:
import pandas as pd
import numpy as np
import country_converter as coco
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')
import nltk

%matplotlib inline


In [3]:

# Location of the salaries dataset, relative to the notebook's working directory.
data = './data/ds_salaries.csv'

# Parse the CSV into the working DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer = data)
df.head(n = 5)

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


In [4]:
# Drop the local-currency columns; `salary_in_usd` is the only salary figure
# used from here on.
# - `columns=` names the labels directly instead of passing a DataFrame slice.
# - Rebinding `df` (rather than `inplace=True`) together with `errors='ignore'`
#   lets this cell be re-run without raising KeyError once the columns are gone.
df = df.drop(columns = ['salary', 'salary_currency'], errors = 'ignore')
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,CA,100,CA,M


No missing values to deal with!

In [5]:
# Count the missing entries in each column.
df.isna().sum()

work_year             0
experience_level      0
employment_type       0
job_title             0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64

# 2. Univariate Analysis
## Dealing with Categorical features
Experience Level

There are 4 categorical values in the 'experience_level' column:
1. EN, which refers to Entry-level / Junior.
2. MI, which refers to Mid-level / Intermediate.
3. SE, which refers to Senior-level / Expert.
4. EX, which refers to Executive-level / Director.

From the treemap below, we observe that Senior-level/Expert accounts for the largest share, with Mid-level/Intermediate ranked next.

These are followed by Entry-level and, surprisingly, Executive-level at the end.

In [6]:
# Expand the experience-level codes into readable labels.
experience_labels = {
    'EN': 'Entry-level/Junior',
    'MI': 'Mid-level/Intermediate',
    'SE': 'Senior-level/Expert',
    'EX': 'Executive-level/Director'
}
df['experience_level'] = df['experience_level'].replace(experience_labels)

# Count how many rows fall under each label.
ex_level = df['experience_level'].value_counts()

# Build the treemap: one tile per experience level, sized by its row count.
fig = px.treemap(
    ex_level,
    path = [ex_level.index],
    values = ex_level.values,
    title = 'Experience Level',
)

# Render the figure.
fig.show()

## Job Designation

Data Engineer and Data Scientist are the top two as usual, followed by Data Analyst and Machine Learning Engineer.

In [7]:
# Number of distinct job titles present in the dataset.
n_job_titles = len(set(df['job_title']))
print('Different job designations altogether :', n_job_titles)

Different job designations altogether : 93


In [8]:
# The fifteen most frequent job titles, most common first.
top15_job_titles = df['job_title'].value_counts().head(15)

# One bar per title, annotated with its count.
fig = px.bar(
    x = top15_job_titles.index,
    y = top15_job_titles.values,
    text = top15_job_titles.values,
    title = 'Top 15 Job Designations',
)
fig.update_layout(xaxis_title = "Job Designations", yaxis_title = "Count")
fig.show()

## Employment Type
There are 4 employment types here :

1. PT : Part-time
2. FT : Full-time
3. CT : Contract
4. FL : Freelance


In [9]:
# Readable labels for the employment-type codes used in the dataset.
emp_type_labels = {
    'FT': 'Full-Time',
    'PT': 'Part-Time',
    'CT': 'Contract',
    'FL': 'Freelance',
}

# value_counts() orders its rows by frequency, so each label is looked up from
# the code in the index instead of being assumed to arrive in a fixed order.
# Codes without a known label are shown as-is.
group = df['employment_type'].value_counts()
emp_type = [emp_type_labels.get(code, code) for code in group.index]

fig = px.bar(x = emp_type, y = group.values,
       color = group.index, text = group.values,
       title = 'Employment Type Distribution')

fig.update_layout( xaxis_title = "Employment Type", yaxis_title = "count")
fig.show()

Almost all employees are full-time.

### Relation between Employee Residence and Company Location

In [10]:
# Convert the employee residence country codes to ISO3 and store them back
# in the same column.
residence_codes = df['employee_residence']
country = coco.convert(names = residence_codes, to = "ISO3")
df['employee_residence'] = country

Most of the employees are from the USA, as can be seen from the choropleth above.

In [11]:
# Per-location salary table. It is built here because the `means` cell further
# down displays it before the choropleth cell recomputes it.
salary_location = df.groupby(['salary_in_usd', 'company_location']).size().reset_index()
means = salary_location.groupby('company_location').mean().reset_index()

# Number of employees per country of residence, most common first.
residence = df['employee_residence'].value_counts()
top_15_emp_locations = residence.head(15)

# One bar per country, coloured by country and annotated with its count.
fig = px.bar(y = top_15_emp_locations.values, x = top_15_emp_locations.index,
            color = top_15_emp_locations.index, text = top_15_emp_locations.values,
            title = 'Top 15 Locations of Employees')

fig.update_layout( xaxis_title = "Location of Employees", yaxis_title = "count")
fig.show()

Most companies are medium-sized, followed by large ones and then small startups.

## Average Salary based on Company Location

In [15]:
# Inspect the per-location salary table that is passed to the choropleth.
means

Unnamed: 0,company_location,salary_in_usd,0
0,AE,100000.000000,1.00000
1,AL,10000.000000,1.00000
2,AM,50000.000000,1.00000
3,AR,25000.000000,1.00000
4,AS,29351.000000,1.00000
...,...,...,...
67,TH,23064.333333,1.00000
68,TR,19058.000000,1.00000
69,UA,57850.000000,1.00000
70,US,162183.287202,4.52381


In [16]:


# Count of rows per (salary, location) pair. Not used below; retained unchanged
# in case later cells reference it -- TODO confirm and remove if unused.
salary_location = df.groupby(['salary_in_usd', 'company_location']).size().reset_index()

# Average salary per company location, taken over every row of `df`.
# Averaging `salary_location` instead would collapse repeated salaries and give
# the mean of the distinct salary values rather than the mean salary.
means = df.groupby('company_location', as_index = False)['salary_in_usd'].mean()

# `company_location` holds two-letter country codes, while px.choropleth
# matches locations against ISO-3 codes by default, so convert before plotting.
iso3_locations = coco.convert(names = means['company_location'].tolist(), to = 'ISO3')

fig = px.choropleth(locations = iso3_locations, color = means['salary_in_usd'],
                    title = 'Average Salary by Company Location')
fig.show()