# 1. Loading Data and Importing Libraries

In [60]:
import pandas as pd
import numpy as np
import country_converter as coco
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')
import nltk

%matplotlib inline

In [61]:
# Load the salaries dataset and discard the 'salary' and 'salary_currency'
# columns; the salary analysis cells in this notebook use 'salary_in_usd'.
# The columns are named explicitly through `columns=` (rather than passing a
# DataFrame slice as the labels) and `df` is rebuilt from the file in a single
# expression, so re-running the cell never depends on an earlier state of `df`.
df = pd.read_csv('./data/ds_salaries.csv').drop(columns=['salary', 'salary_currency'])

In [62]:
# Print the (rows, columns) shape of the frame, then preview its first rows.
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.head(5)

(3755, 9)


Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,CA,100,CA,M


So, we have 9 columns with 3755 rows:

3 numeric columns: work_year, salary_in_usd, remote_ratio.

6 categorical columns: experience_level, employment_type, job_title, employee_residence, company_location, company_size.

In [63]:
# Count the missing entries in every column (`isna` is the same check as `isnull`).
df.isna().sum()

work_year             0
experience_level      0
employment_type       0
job_title             0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64

No missing values to deal with!

# 2. Univariate Analysis

## Dealing with Categorical features

### Experience Level

The `experience_level` column takes 4 categorical values:


*  EN, which refers to Entry-level / Junior.

*   MI, which refers to Mid-level / Intermediate.

*   SE, which refers to Senior-level / Expert.

*   EX, which refers to Executive-level / Director.


In [64]:
# Expand the two-letter experience codes into readable labels with a single
# mapping; values that are not keys of the mapping are left unchanged.
experience_labels = {
    'EN': 'Entry-level/Junior',
    'MI': 'Mid-level/Intermediate',
    'SE': 'Senior-level/Expert',
    'EX': 'Executive-level/Director',
}
df['experience_level'] = df['experience_level'].replace(experience_labels)

# Number of rows per experience level, drawn as a treemap whose tiles are
# sized by those counts.
ex_level = df['experience_level'].value_counts()
fig = px.treemap(
    ex_level,
    path=[ex_level.index],
    values=ex_level.values,
    title='Experience Level',
)
fig.show()

In [65]:
# Report how many distinct job titles occur in the dataset.
n_job_titles = len(df['job_title'].unique())
print('Different job designations altogether :', n_job_titles)

Different job designations altogether : 93


In [66]:
# Convert the employee residence country codes to ISO3.
# Each distinct code is converted once and the result is mapped back onto the
# column, instead of sending every row through the converter.
# Assumes the column has no missing values (the null check earlier in this
# notebook reports none).
residence_codes = df['employee_residence'].unique().tolist()
residence_iso3 = coco.convert(names=residence_codes, to="ISO3")
if isinstance(residence_iso3, str):
    # coco.convert returns a bare string when it is given a single name.
    residence_iso3 = [residence_iso3]

# `country` stays a plain list with one converted entry per row, as before.
country = df['employee_residence'].map(dict(zip(residence_codes, residence_iso3))).tolist()
df['employee_residence'] = country

In [67]:
# Convert the company location country codes to ISO3.
# Each distinct code is converted once and the result is mapped back onto the
# column, instead of sending every row through the converter.
# Assumes the column has no missing values (the null check earlier in this
# notebook reports none).
location_codes = df['company_location'].unique().tolist()
location_iso3 = coco.convert(names=location_codes, to="ISO3")
if isinstance(location_iso3, str):
    # coco.convert returns a bare string when it is given a single name.
    location_iso3 = [location_iso3]

# `country` stays a plain list with one converted entry per row, as before.
country = df['company_location'].map(dict(zip(location_codes, location_iso3))).tolist()
df['company_location'] = country

# 6. Salary Analysis

## Average Salary based on Company Location

In [68]:
# Mean salary (USD) per company location, shown on a choropleth map.
# The mean is taken over every row of a location. Grouping on
# (salary, location) pairs first would keep a single row per distinct salary,
# so salaries shared by several employees would count only once and the
# "average" would be skewed.
salary_location = df[['company_location', 'salary_in_usd']]
means = salary_location.groupby('company_location', as_index=False)['salary_in_usd'].mean()

# company_location holds the ISO3 codes produced by the conversion cell above.
fig = px.choropleth(
    locations=means['company_location'],
    color=means['salary_in_usd'],
    title='Average Salary by Company Location',
)
fig.show()

## Highest salaries based on Designation

In [69]:

# Highest salary recorded for each job title, limited to the 25 best-paid
# titles. Taking the per-title maximum yields exactly one bar per designation;
# plotting raw (salary, title) pairs can repeat a title on the x axis, and
# px.bar stacks bars that share an x value, which inflates the bar height.
salary_designation = (
    df.groupby('job_title', as_index=False)['salary_in_usd']
    .max()
    .nlargest(25, 'salary_in_usd')
    .sort_values('salary_in_usd')  # ascending, so the largest bar is drawn last
)
fig = px.bar(
    x=salary_designation['job_title'],
    y=salary_designation['salary_in_usd'],
    text=salary_designation['salary_in_usd'],
    color=salary_designation['salary_in_usd'],
)

# update_layout returns the figure, so as the last expression of the cell it
# also renders the chart.
fig.update_layout(
    xaxis_title="Job Designation",
    yaxis_title="Salaries ",
    xaxis_tickangle=-45,
    title='Top 25 Highest Salary by Designation',
)