# Day 6 Pandas Exploration

In [4]:
import pandas as pd
import numpy as np
import re
import math
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [10]:
# Load the salaries dataset; the first CSV column is used as the row index.
df = pd.read_csv("ds_salaries.csv", index_col=[0])
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L


In [19]:
# What is the average salary (in USD) of a Data Scientist per work year?
# numeric_only=True restricts the mean to numeric columns; without it,
# pandas >= 2.0 raises TypeError on the string columns (job_title,
# salary_currency, ...) instead of silently dropping them.
df1 = (
    df[df["job_title"] == "Data Scientist"]
    .groupby("work_year", as_index=False)
    .mean(numeric_only=True)
)
work_years = df1["work_year"].tolist()
mean_salary_usd = df1["salary_in_usd"].tolist()
# Map each work year to its mean USD salary.
work_to_salary = dict(zip(work_years, mean_salary_usd))
print("Average salary for data scientist")
for year, avg_usd in work_to_salary.items():
    print(year, avg_usd)

Average salary for data scientist
2020 85970.52380952382
2021 70671.73333333334
2022 136172.0909090909


We notice an interesting decline in 2021. Could this be due to jobs coming back after quarantine?

In [32]:
# What is the average starting (entry-level, "EN") Data Scientist salary in 2022,
# per company country?
entry_ds_2022 = df[
    (df["job_title"] == "Data Scientist")
    & (df["work_year"] == 2022)
    & (df["experience_level"] == "EN")
]
# numeric_only=True: average only numeric columns; a bare mean() raises
# TypeError on the string columns under pandas >= 2.0.
df2 = entry_ds_2022.groupby("company_location", as_index=False).mean(numeric_only=True)

print("Average salary for entry level data scientist by country in 2022")
print(df2[["company_location", "salary_in_usd"]])

Average salary for entry level data scientist by country in 2022
  company_location  salary_in_usd
0               AU        86703.0
1               CA        52396.0
2               DZ       100000.0
3               IN        18442.0
4               MY        40000.0


We notice that DZ has the highest entry-level salary. Let's check how many data points we have for each country to get a better understanding.

In [33]:
# Number of non-null values per column, per company country, for the
# entry-level ("EN") Data Scientist rows from 2022.
count_df = (
    df.loc[
        df["job_title"].eq("Data Scientist")
        & df["work_year"].eq(2022)
        & df["experience_level"].eq("EN")
    ]
    .groupby(["company_location"], as_index=False)
    .count()
)
count_df

Unnamed: 0,company_location,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_size
0,AU,1,1,1,1,1,1,1,1,1,1
1,CA,1,1,1,1,1,1,1,1,1,1
2,DZ,1,1,1,1,1,1,1,1,1,1
3,IN,1,1,1,1,1,1,1,1,1,1
4,MY,1,1,1,1,1,1,1,1,1,1


There is only one data point for each of only 5 countries, which is not nearly enough for a meaningful analysis.

In [38]:
# Average Data Scientist salary by employment type (FT / PT / FL in this data).
# numeric_only=True: a bare mean() raises TypeError on the string columns
# under pandas >= 2.0; restricting to numeric columns keeps the same output.
df3 = (
    df[df["job_title"] == "Data Scientist"]
    .groupby("employment_type")
    .mean(numeric_only=True)
)
df3

Unnamed: 0_level_0,work_year,salary,salary_in_usd,remote_ratio
employment_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FL,2022.0,100000.0,100000.0,100.0
FT,2021.392857,517676.071429,108922.792857,63.928571
PT,2021.0,59500.0,60834.5,50.0


As expected, comparing `salary_in_usd`, full-time data scientists are paid more than part-time ones, and slightly more than freelancers.

In [56]:
# Which is the best place to get a data-science job, per experience level?
# Group once by (experience_level, company_location) and reuse the grouping
# for the mean / max / min tables, each sorted by USD salary, highest first.
level_country_groups = df.groupby(["experience_level", "company_location"], as_index=False)

# numeric_only=True: the mean is only defined for numeric columns; a bare
# mean() raises TypeError on the string columns under pandas >= 2.0.
df4 = level_country_groups.mean(numeric_only=True).sort_values(by="salary_in_usd", ascending=False)
df5 = level_country_groups.max().sort_values(by="salary_in_usd", ascending=False)
df6 = level_country_groups.min().sort_values(by="salary_in_usd", ascending=False)


# For entry level
df4[df4["experience_level"] == "EN"]

Unnamed: 0,experience_level,company_location,work_year,salary,salary_in_usd,remote_ratio
1,EN,AU,2022.0,135000.0,118351.5,75.0
9,EN,DZ,2022.0,100000.0,100000.0,50.0
4,EN,CN,2021.0,100000.0,100000.0,0.0
14,EN,IQ,2021.0,100000.0,100000.0,50.0
25,EN,US,2020.935484,93112.9,93112.903226,85.483871
12,EN,GB,2021.6,48700.0,65604.6,70.0
7,EN,DE,2020.727273,50727.27,57551.818182,63.636364
2,EN,CA,2022.0,61833.33,57132.0,33.333333
11,EN,FR,2020.6,40600.0,47325.6,30.0
21,EN,NL,2020.0,42000.0,42000.0,50.0


In [76]:
# With the US average as the zero baseline, how far is each country's average
# entry-level ("EN") Data Scientist salary from the US one?
# Positive values pay more than the US average, negative values pay less.

entry_ds = df[(df["job_title"] == "Data Scientist") & (df["experience_level"] == "EN")]
# numeric_only=True: a bare mean() raises TypeError on the string columns
# under pandas >= 2.0.
df7 = entry_ds.groupby("company_location", as_index=False).mean(numeric_only=True)
df7 = df7[["company_location", "salary_in_usd"]]

# Look up the US baseline explicitly so a missing US row fails with a clear
# message instead of an IndexError on an empty list.
us_rows = df7.loc[df7["company_location"] == "US", "salary_in_usd"]
if us_rows.empty:
    raise ValueError("No entry-level Data Scientist rows with company_location == 'US'")
US = float(us_rows.iloc[0])

# assign() builds a new frame rather than writing into a column slice.
df7 = (
    df7.assign(salary_in_usd=df7["salary_in_usd"] - US)
    .sort_values(by="salary_in_usd", ascending=False)
)
df7

Unnamed: 0,company_location,salary_in_usd
3,DZ,11166.666667
9,US,0.0
0,AU,-2130.333333
2,DE,-32836.333333
1,CA,-36437.333333
4,FR,-44451.833333
7,MY,-48833.333333
5,IN,-63302.666667
6,IT,-67164.333333
8,UA,-75433.333333


As we can see, DZ is the only country above the US, but leaving it out for lack of sufficient data, we can conclude that the next best place to work as an entry-level data scientist after the US is Australia (AU), followed by DE, CA, FR, MY, IN, IT and UA.