In [3]:
# data
import pandas as pd
import numpy as np

# visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go

# styling
%matplotlib inline
sns.set_theme(style="dark")
mpl.rcParams['axes.unicode_minus'] = False
pd.set_option('display.max_columns',None)
plt.style.use('seaborn-dark-palette')
plt.style.use('dark_background')

In [4]:
# Load the salaries dataset; the path is relative, so the CSV must sit in the
# notebook's working directory.
df=pd.read_csv('ds_salaries.csv')

In [6]:
# Drop the columns this analysis does not use: the salary in its original
# currency (the frame keeps 'salary_in_usd') and 'Unnamed: 0' (presumably the
# row index written out when the CSV was exported -- confirm against the file).
# Rebinding instead of inplace=True, together with errors='ignore', lets this
# cell be re-run without raising KeyError once the columns are already gone.
df = df.drop(columns=['salary_currency', 'salary', 'Unnamed: 0'], errors='ignore')

In [7]:
# Preview the first three rows of the frame.
df.head(3)

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2020,MI,FT,Data Scientist,79833,DE,0,DE,L
1,2020,SE,FT,Machine Learning Scientist,260000,JP,0,JP,S
2,2020,SE,FT,Big Data Engineer,109024,GB,50,GB,M


In [8]:
# Count the missing values in each column.
df.isnull().sum()

work_year             0
experience_level      0
employment_type       0
job_title             0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64

## 2. Univariate Analysis 

### 2.1 Categorical Columns

### 2.1.1. Experience Level

In [9]:
# List the distinct experience-level values present in the data.
df['experience_level'].unique()

array(['MI', 'SE', 'EN', 'EX'], dtype=object)

EN: Entry-level/Junior  
MI: Mid-level/Intermediate  
SE: Senior-level/Expert  
EX: Executive-level/Director

In [10]:
# Expand the experience-level codes into readable labels.
# The mid-level code in this dataset is 'MI' (not 'ME'), so it must be the key
# here or those rows keep their raw code. A single mapping keeps all four codes
# in one place, and re-running the cell is harmless because already-expanded
# labels match none of the keys.
experience_level_labels = {
    'EN': 'Entry-level/Junior',
    'MI': 'Mid-level/Intermediate',
    'SE': 'Senior-level/Expert',
    'EX': 'Executive-level/Director',
}
df['experience_level'] = df['experience_level'].replace(experience_level_labels)


In [13]:
# Treemap of how many records fall into each experience level.
ex_level = df['experience_level'].value_counts()
fig = px.treemap(
    values=ex_level.values,
    path=[ex_level.index],
    title='Experience Level',
    color=ex_level.index,
    color_discrete_sequence=px.colors.sequential.PuBuGn,
    template='plotly_dark',
    width=1000, height=500)

# Share of each level, in percent, in the same order as `ex_level`.
percent = np.round(100 * ex_level.values / ex_level.values.sum(), 2)

# Attach the percentages to the tiles by label rather than by position:
# the trace is not guaranteed to keep the value_counts() order, and values
# typed in by hand would go stale as soon as the data changes.
percent_by_label = dict(zip(ex_level.index, percent.tolist()))
fig.data[0].customdata = [percent_by_label[label] for label in fig.data[0].labels]
fig.data[0].texttemplate = '%{label}<br>%{value}<br>%{customdata}%'

fig.update_layout(font=dict(size=19, family='Franklin Gothic'))
fig.show()


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



### 2.1.2. Job Titles

__How many job titles are in the dataset?__

In [20]:
# List every distinct job title present in the data.
df['job_title'].unique()

array(['Data Scientist', 'Machine Learning Scientist',
       'Big Data Engineer', 'Product Data Analyst',
       'Machine Learning Engineer', 'Data Analyst', 'Lead Data Scientist',
       'Business Data Analyst', 'Lead Data Engineer', 'Lead Data Analyst',
       'Data Engineer', 'Data Science Consultant', 'BI Data Analyst',
       'Director of Data Science', 'Research Scientist',
       'Machine Learning Manager', 'Data Engineering Manager',
       'Machine Learning Infrastructure Engineer', 'ML Engineer',
       'AI Scientist', 'Computer Vision Engineer',
       'Principal Data Scientist', 'Data Science Manager', 'Head of Data',
       '3D Computer Vision Researcher', 'Data Analytics Engineer',
       'Applied Data Scientist', 'Marketing Data Analyst',
       'Cloud Data Engineer', 'Financial Data Analyst',
       'Computer Vision Software Engineer',
       'Director of Data Engineering', 'Data Science Engineer',
       'Principal Data Engineer', 'Machine Learning Developer',
       

In [24]:
# Count the distinct job titles.
df['job_title'].nunique()

50

There are 50 job titles in this dataset.

In [29]:
# Bar chart of the ten most frequent job titles.
# value_counts() sorts by descending frequency, so the first ten entries
# are the ten most common titles.
top10_job_titles = df['job_title'].value_counts().head(10)

# Options shared by the whole chart, kept apart from the x/y data.
bar_options = dict(
    color=top10_job_titles.index,
    color_discrete_sequence=px.colors.sequential.PuBuGn,
    text=top10_job_titles.values,
    template='plotly_dark',
    title='Top 10 Job Title',
)
fig = px.bar(x=top10_job_titles.index, y=top10_job_titles.values, **bar_options)

fig.update_layout(xaxis_title='Job Titles', yaxis_title='Count')
fig.show()

### 2.1.3. Employment Type

In [31]:
df['employment_type'].unique()

array(['FT', 'CT', 'PT', 'FL'], dtype=object)

There are 4 employment types here:  
- PT: Part-time,
- FT: Full-time,
- CT: Contract,
- FL: Freelance

In [34]:
# Bar chart of how many records fall into each employment type.
type_group = df['employment_type'].value_counts()

# Build the axis labels from the codes themselves rather than from a list in a
# fixed order: value_counts() orders by frequency, so a positional list would
# silently mislabel the bars if the counts ever ranked differently.
employment_type_labels = {
    'FT': 'Full-time',
    'PT': 'Part-time',
    'CT': 'Contract',
    'FL': 'Freelance',
}
# Unknown codes fall back to the raw code instead of raising.
e_type = [employment_type_labels.get(code, code) for code in type_group.index]

fig = px.bar(
    y=type_group.values,
    x=e_type,
    color=type_group.index,
    color_discrete_sequence=px.colors.sequential.PuBuGn,
    text=type_group.values,
    template='plotly_dark',
    title='Employment Type Distribution')

fig.update_traces(width=0.3)
fig.update_layout(
    xaxis_title='Employment Type',
    yaxis_title='Count'
)

fig.show()

### 2.1.4. Employee Residence & Company Location