# Data Science Salaries analysis

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the salaries CSV (expected next to the notebook) into a DataFrame
data = pd.read_csv(filepath_or_buffer="ds_salaries.csv")

The Data Science Job Salaries dataset contains 11 columns:

* work_year: The year the salary was paid.
* experience_level: The experience level in the job during the year
* employment_type: The type of employment for the role
* job_title: The role worked in during the year.
* salary: The total gross salary amount paid.
* salary_currency: The currency of the salary paid as an ISO 4217 currency code.
* salary_in_usd: The salary converted to USD.
* employee_residence: Employee's primary country of residence during the work year, as an ISO 3166 country code.
* remote_ratio: The overall amount of work done remotely
* company_location: The country of the employer's main office or contracting branch
* company_size: The median number of people that worked for the company during the year

## Basic Checks

In [3]:
# Preview the first five records
data.head(n=5)

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


In [4]:
# Preview the last five records
data.tail(n=5)

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
3750,2020,SE,FT,Data Scientist,412000,USD,412000,US,100,US,L
3751,2021,MI,FT,Principal Data Scientist,151000,USD,151000,US,100,US,L
3752,2020,EN,FT,Data Scientist,105000,USD,105000,US,100,US,S
3753,2020,EN,CT,Business Data Analyst,100000,USD,100000,US,100,US,L
3754,2021,SE,FT,Data Science Manager,7000000,INR,94665,IN,50,IN,L


In [5]:
# Dimensions of the dataset as a (rows, columns) tuple
data.shape

(3755, 11)

* Dataset contains 11 features with 3755 observations.

In [6]:
# Column labels of the DataFrame (displayed as a pandas Index)
data.columns

Index(['work_year', 'experience_level', 'employment_type', 'job_title',
       'salary', 'salary_currency', 'salary_in_usd', 'employee_residence',
       'remote_ratio', 'company_location', 'company_size'],
      dtype='object')

In [7]:
# Summary statistics of the numeric columns, transposed so each column is a row
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
work_year,3755.0,2022.373635,0.691448,2020.0,2022.0,2022.0,2023.0,2023.0
salary,3755.0,190695.571771,671676.500508,6000.0,100000.0,138000.0,180000.0,30400000.0
salary_in_usd,3755.0,137570.38988,63055.625278,5132.0,95000.0,135000.0,175000.0,450000.0
remote_ratio,3755.0,46.271638,48.58905,0.0,0.0,0.0,100.0,100.0


In [8]:
# Summary (count / unique / top / freq) of the object-dtype (categorical) columns
data.describe(include=[object])

Unnamed: 0,experience_level,employment_type,job_title,salary_currency,employee_residence,company_location,company_size
count,3755,3755,3755,3755,3755,3755,3755
unique,4,4,93,20,78,72,3
top,SE,FT,Data Engineer,USD,US,US,M
freq,2516,3718,1040,3224,3004,3040,3153


In [9]:
# Overview of the DataFrame: index range, per-column non-null counts and dtypes, memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           3755 non-null   int64 
 1   experience_level    3755 non-null   object
 2   employment_type     3755 non-null   object
 3   job_title           3755 non-null   object
 4   salary              3755 non-null   int64 
 5   salary_currency     3755 non-null   object
 6   salary_in_usd       3755 non-null   int64 
 7   employee_residence  3755 non-null   object
 8   remote_ratio        3755 non-null   int64 
 9   company_location    3755 non-null   object
 10  company_size        3755 non-null   object
dtypes: int64(4), object(7)
memory usage: 322.8+ KB


## Visualization

In [35]:
# Popular roles in Data Science
import plotly.express as px

# Build a tidy two-column frame (job_title, count) so the chart refers to
# explicit column names rather than to the names plotly auto-generates for
# raw index/array arguments ('index', 'y', ...), which made the previous
# `labels` mapping fragile.
top_roles = (
    data['job_title']
    .value_counts()
    .head(10)
    .rename_axis('job_title')
    .reset_index(name='count')
)

# One bar per role, coloured by role, with the count printed on each bar.
fig = px.bar(
    top_roles,
    x='job_title',
    y='count',
    color='job_title',
    text='count',
    template='seaborn',
    title='<b>Top 10 Popular Data Science Roles',
)
fig.show()

* The above bar plot shows us the top 10 data science roles. We can see that the Data Engineer is the most popular role

In [37]:
# Highest paid roles in Data science
# Take the single highest USD salary recorded per job title, then keep the ten largest.
max_salary_by_role = data.groupby('job_title', as_index=False)['salary_in_usd'].max()
top_paid_roles = max_salary_by_role.sort_values(by='salary_in_usd', ascending=False).head(10)

fig = px.bar(
    top_paid_roles,
    x='job_title',
    y='salary_in_usd',
    color='job_title',
    text='salary_in_usd',
    labels=dict(job_title='job title', salary_in_usd='salary in usd'),
    template='ggplot2',
    title='<b>Top 10 highest paid roles in Data Science',
)
fig.show()

* In the above bar graph we can see that the highest-paid Research Scientist earns up to 450k USD a year, which is an amazing amount. For the other nine roles in the top 10, the maximum salary is above 300k USD, which is again a very good salary.

In [39]:
# Data Science roles based on average pay
# Mean USD salary per job title, highest first, rounded to two decimals for display.
z = (
    data.groupby('job_title', as_index=False)['salary_in_usd']
    .mean()
    .sort_values(by='salary_in_usd', ascending=False)
    .assign(salary_in_usd=lambda frame: frame['salary_in_usd'].round(2))
)

fig = px.bar(
    z.head(10),
    x='job_title',
    y='salary_in_usd',
    color='job_title',
    text='salary_in_usd',
    labels=dict(job_title='job title', salary_in_usd='avg salary in usd'),
    template='seaborn',
    title='<b>Top 10 Roles in Data Science based on Average Pay',
)
# Shrink the bar text so the rounded averages fit inside the bars.
fig.update_traces(textfont_size=8)
fig.show()

* Here we can see that based on Average pay Data Science Tech Lead is the highest paying job in Data Science.

In [44]:
# Data Science jobs based on Experience level
# Count the rows (jobs) in each experience level. The count gets its own
# `job_count` column so the hover text describes a number of jobs; previously
# the count was stored in `salary_in_usd` and labelled 'salary'.
jobs_by_experience = (
    data.groupby('experience_level')
    .size()
    .reset_index(name='job_count')
    .sort_values(by='job_count', ascending=False)
)

fig = px.pie(
    jobs_by_experience,
    names='experience_level',
    values='job_count',
    color='experience_level',
    hole=0.7,
    labels={'experience_level': 'Experience Level', 'job_count': 'Jobs'},
    title='<b> Total Jobs based on Experience Level',
)
# Centre the title and place the legend above the top-right of the chart.
# `update_layout` returns the figure, so it is displayed as the cell output.
fig.update_layout(
    title_x=0.5,
    legend=dict(orientation='v', yanchor='bottom', y=1.02, xanchor='right', x=1),
)

* We can see from the above pie chart that senior-level (SE) roles make up the largest share of jobs: 2,516 of the 3,755 records, roughly 67%.

In [45]:
# Data science jobs based on employee type
# Count the rows (jobs) per employment type. The count gets its own
# `job_count` column so the hover text describes a number of jobs; previously
# the count was stored in `salary_in_usd` and labelled 'salary'.
jobs_by_employment_type = (
    data.groupby('employment_type')
    .size()
    .reset_index(name='job_count')
    .sort_values(by='job_count')
)

fig = px.pie(
    jobs_by_employment_type,
    names='employment_type',
    values='job_count',
    color='employment_type',
    hole=0.5,
    # Display label spelling corrected from 'Employement Type'.
    labels={'employment_type': 'Employment Type', 'job_count': 'Jobs'},
    template='seaborn',
    title='<b> Data Science Jobs based on Employment Type',
)
# Centre the title; `update_layout` returns the figure, so it is displayed as the cell output.
fig.update_layout(title_x=0.5)

* We can see that 99% of the jobs are full-time (FT). Contract and freelance jobs make up only a tiny share of data science roles, and part-time jobs are also scarce.

In [47]:
# Remote Ratio
# Count the rows (jobs) for each remote_ratio value. The count gets its own
# `job_count` column so the hover text describes a number of jobs; previously
# the count was stored in `salary_in_usd` and labelled 'salary'.
jobs_by_remote_ratio = (
    data.groupby('remote_ratio')
    .size()
    .reset_index(name='job_count')
    .sort_values(by='job_count')
)

fig = px.pie(
    jobs_by_remote_ratio,
    names='remote_ratio',
    values='job_count',
    color='remote_ratio',
    hole=0.5,
    labels={'remote_ratio': 'Remote Ratio', 'job_count': 'Jobs'},
    template='plotly',
    title='<b> Remote Ratio',
)
# Centre the title; `update_layout` returns the figure, so it is displayed as the cell output.
fig.update_layout(title_x=0.5)

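* The 51.2% slice is the group with a remote ratio of 0, i.e. jobs with no remote work — consistent with the median `remote_ratio` of 0 in the summary statistics above. Fully remote jobs (remote ratio 100) still account for a little under half of the records (about 44%, as implied by the mean remote ratio of 46.3), which shows that work-from-home culture is very common among data science jobs.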

In [51]:
# Company Sizes
# Count the rows (jobs) per company size. The count gets its own `job_count`
# column so the hover text describes a number of jobs; previously the count
# was stored in `salary_in_usd` and labelled 'salary'.
jobs_by_company_size = (
    data.groupby('company_size')
    .size()
    .reset_index(name='job_count')
    .sort_values(by='job_count', ascending=False)
)

fig = px.pie(
    jobs_by_company_size,
    names='company_size',
    values='job_count',
    color='company_size',
    hole=0.6,
    labels={'company_size': 'Company Size', 'job_count': 'Jobs'},
    title='<b> Company Sizes in Data Science Field',
)
# Centre the title; `update_layout` returns the figure, so it is displayed as the cell output.
fig.update_layout(title_x=0.5)

* We can observe from the above pie chart that 84% of the jobs are at medium-sized (M) companies. A possible explanation is that these medium-sized companies often manage databases for large companies, although the dataset itself does not show this.

In [53]:
# Top 15 countries having maximum Data Science Jobs
# Count the rows (jobs) per company location and keep the 15 largest.
# After the aggregation the `employment_type` column holds that count.
jobs_by_country = (
    data.groupby('company_location', as_index=False)['employment_type']
    .count()
    .sort_values(by='employment_type', ascending=False)
    .head(15)
)

# The `labels` key must match the column name exactly; it was previously
# misspelled as 'employement_type', so the 'employment' label was never applied.
px.funnel(
    jobs_by_country,
    y='company_location',
    x='employment_type',
    labels={'employment_type': 'employment'},
    template='seaborn',
    title='<b> Top 15 countries having maximum Data Science Jobs',
)

* We can observe that US has the maximum employment in the field of Data Science.

In [54]:
# Salary Distribution
# Histogram of USD salaries with a rug plot in the margin.
px.histogram(
    data_frame=data,
    x='salary_in_usd',
    marginal='rug',
    labels=dict(salary_in_usd='salary'),
    title='<b> Salary Distribution',
)

In [55]:
# Distribution of USD salaries for each work year, one violin per year
px.violin(
    data_frame=data,
    x='work_year',
    y='salary_in_usd',
    color='work_year',
    labels=dict(work_year='year', salary_in_usd='salary in usd'),
    template='seaborn',
    title='<b>Data Science Salaries by year',
)

In [57]:
# Box plot of USD salaries grouped by experience level
px.box(
    data_frame=data,
    x='experience_level',
    y='salary_in_usd',
    color='experience_level',
    labels=dict(experience_level='Experience Level', salary_in_usd='salary in usd'),
    title='<b>Data Science Salaries by Experience',
)

In [62]:
# Box plot of USD salaries grouped by employment type
px.box(
    data_frame=data,
    x='employment_type',
    y='salary_in_usd',
    color='employment_type',
    labels=dict(employment_type='Employment Type', salary_in_usd='salary in usd'),
    template='seaborn',
    title='<b>Data Science Salaries by type of employee',
)

In [63]:
# Box plot of USD salaries grouped by company size
px.box(
    data_frame=data,
    x='company_size',
    y='salary_in_usd',
    color='company_size',
    labels=dict(company_size='Company Size', salary_in_usd='salary in usd'),
    template='ggplot2',
    title='<b>Data Science Salaries by Company Size',
)