In [26]:
import pandas as pd
import numpy as np
# Render every float in displayed frames/series with exactly two decimals.
pd.options.display.float_format = '{:.2f}'.format

In [6]:
# Load the salaries dataset. The CSV's 'Unnamed: 0' column is used as the
# frame's index instead of being kept as a data column.
salaries_csv = "ds_salaries.csv"
df = pd.read_csv(salaries_csv, index_col='Unnamed: 0')
df

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L
...,...,...,...,...,...,...,...,...,...,...,...
602,2022,SE,FT,Data Engineer,154000,USD,154000,US,100,US,M
603,2022,SE,FT,Data Engineer,126000,USD,126000,US,100,US,M
604,2022,SE,FT,Data Analyst,129000,USD,129000,US,0,US,M
605,2022,SE,FT,Data Analyst,150000,USD,150000,US,100,US,M


In [39]:
# Number of rows per job title (non-null `work_year` values are counted).
rows_by_title = df.groupby("job_title")
rows_by_title["work_year"].count()

job_title
3D Computer Vision Researcher                 1
AI Scientist                                  7
Analytics Engineer                            4
Applied Data Scientist                        5
Applied Machine Learning Scientist            4
BI Data Analyst                               6
Big Data Architect                            1
Big Data Engineer                             8
Business Data Analyst                         5
Cloud Data Engineer                           2
Computer Vision Engineer                      6
Computer Vision Software Engineer             3
Data Analyst                                 97
Data Analytics Engineer                       4
Data Analytics Lead                           1
Data Analytics Manager                        7
Data Architect                               11
Data Engineer                               132
Data Engineering Manager                      5
Data Science Consultant                       7
Data Science Engineer         

### Сгруппируйте вакансии по направлениям (DS, DE, Software Engineering, etc.)

In [51]:
# Each direction label is paired with the job-title substring that selects it.
# Order matters: np.select assigns the label of the first matching condition.
direction_patterns = {
    'DS': "Data Scientist",
    'DE': "Data Engineer",
    'Software Engenering': "Software Engineer",
}
choices = list(direction_patterns)
conditions = [
    df.job_title.str.contains(pattern)
    for pattern in direction_patterns.values()
]
# Titles matching none of the patterns fall into the catch-all 'etc' group.
df['direction'] = np.select(conditions, choices, default='etc')

In [52]:
# Number of rows that ended up in each direction group.
rows_by_direction = df.groupby("direction")
rows_by_direction["work_year"].count()

direction
DE                     158
DS                     159
Software Engenering      3
etc                    287
Name: work_year, dtype: int64

### Какая средняя и медианная зарплата по группам вакансий?

In [53]:
# Mean and median USD salary for every direction group.
salary_stats = {'salary_in_usd': ['mean', 'median']}
df.groupby(['direction']).agg(salary_stats)

Unnamed: 0_level_0,salary_in_usd,salary_in_usd
Unnamed: 0_level_1,mean,median
direction,Unnamed: 1_level_2,Unnamed: 2_level_2
DE,115808.51,108912.0
DS,115134.6,109000.0
Software Engenering,105248.67,95746.0
etc,108867.3,98000.0


### Какая средняя и медианная зарплата по каждому региону?

In [54]:
# Mean and median USD salary for every company location.
salary_stats = {'salary_in_usd': ['mean', 'median']}
df.groupby(['company_location']).agg(salary_stats)

Unnamed: 0_level_0,salary_in_usd,salary_in_usd
Unnamed: 0_level_1,mean,median
company_location,Unnamed: 1_level_2,Unnamed: 2_level_2
AE,100000.0,115000.0
AS,18053.0,18053.0
AT,72920.75,69489.5
AU,108042.67,87425.0
BE,85699.0,85699.0
BR,18602.67,18907.0
CA,99823.73,81895.5
CH,64114.0,64114.0
CL,40038.0,40038.0
CN,71665.5,71665.5


### Какая самая высокооплачиваемая из групп вакансий, исходя из их средних зарплат?

In [67]:
# Mean USD salary per direction group. idxmax() returns the index label
# (the direction name) of the largest mean directly, so the
# reset_index / sort_values / head / tolist round-trip is not needed.
mean_salary_by_direction = df.groupby("direction")["salary_in_usd"].mean()
print('Самая оплачиваемая группа вакансий исходя из средних зарплат это ' +
      mean_salary_by_direction.idxmax())

Самая оплачиваемая группа вакансий исходя из средних зарплат это DE


### Какую долю (в процентах) от всех вакансий составляет каждый регион?

In [73]:
# Count rows per company location. The nested-list spec produces two-level
# columns, so the counts live under ('company_location', 'count').
agg = {'company_location': ['count']}
grouped = df.groupby('company_location').agg(agg).reset_index()

# Share of all rows per location, formatted as e.g. '58.48%'.
# Computed on the whole column at once: the previous per-row loop read the
# count with `row[1]`, i.e. positional integer indexing on a label-indexed
# Series, which pandas has deprecated.
location_counts = grouped[('company_location', 'count')]
location_share = (location_counts * 100 / len(df)).round(2).astype(str) + '%'
grouped.insert(2, 'procent', location_share, allow_duplicates=True)

# Largest locations first.
grouped.sort_values([('company_location', 'count')], ascending=False)

Unnamed: 0_level_0,company_location,company_location,procent
Unnamed: 0_level_1,Unnamed: 1_level_1,count,Unnamed: 3_level_1
48,US,355,58.48%
18,GB,47,7.74%
6,CA,30,4.94%
12,DE,28,4.61%
25,IN,24,3.95%
17,FR,15,2.47%
16,ES,14,2.31%
19,GR,11,1.81%
29,JP,6,0.99%
40,PL,4,0.66%


### Какова корреляция между уровнем опыта и зарплатой?

In [78]:
# Ordinal encoding of the experience-level codes, ranked 1..4 in the order
# EN < MI < SE < EX.
# NOTE(review): presumably entry / mid / senior / executive — confirm against
# the dataset description.
level_rank = {"EN": 1, "MI": 2, "SE": 3, "EX": 4}

# .map leaves any code that is not in `level_rank` (or a missing value) as
# NaN. np.select without a default would have encoded such rows as 0, i.e.
# as a rank below "EN", and silently skewed the correlation.
df['level'] = df['experience_level'].map(level_rank)

# Pearson correlation between salary and the ordinal level; rows where either
# value is NaN are excluded by Series.corr.
df['salary_in_usd'].corr(df['level'])

0.4842328861730616

### Сколько должностей в наборе данных?

In [79]:
# Count of distinct job titles in the dataset.
distinct_titles = df['job_title'].nunique()
print(f'Число должностей в датасете: {distinct_titles}')

Число должностей в датасете: 50


### Какие 10 наиболее часто встречающихся должностей?

In [91]:
# value_counts() is already sorted by frequency (descending), so the first
# ten entries are the ten most common titles; reset_index turns them into a
# two-column frame.
title_frequency = df['job_title'].value_counts()
title_frequency.head(10).reset_index()

Unnamed: 0,index,job_title
0,Data Scientist,143
1,Data Engineer,132
2,Data Analyst,97
3,Machine Learning Engineer,41
4,Research Scientist,16
5,Data Science Manager,12
6,Data Architect,11
7,Big Data Engineer,8
8,Machine Learning Scientist,8
9,Principal Data Scientist,7
