In [40]:
import pandas as pd

In [41]:
# Read the salaries-by-major CSV into a DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer='salaries_by_college_major.csv')
df.head(n=5)

Unnamed: 0,Undergraduate Major,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 90th Percentile Salary,Group
0,Accounting,46000.0,77100.0,42200.0,152000.0,Business
1,Aerospace Engineering,57700.0,101000.0,64300.0,161000.0,STEM
2,Agriculture,42600.0,71900.0,36300.0,150000.0,Business
3,Anthropology,36800.0,61500.0,33800.0,138000.0,HASS
4,Architecture,41600.0,76800.0,50600.0,136000.0,Business


## Data Inspection

In [42]:
# Dimensions of the raw frame as a (rows, columns) tuple.
df.shape

(51, 6)

In [43]:
# Show the column labels; later cells select columns by these exact strings.
df.columns

Index(['Undergraduate Major', 'Starting Median Salary',
       'Mid-Career Median Salary', 'Mid-Career 10th Percentile Salary',
       'Mid-Career 90th Percentile Salary', 'Group'],
      dtype='object')

In [44]:
# Look at the last rows of the raw frame to spot missing data: a trailing
# non-data row (e.g. a source/credit line in the CSV) shows up here as a row
# whose salary columns are all NaN.
# Note: a bare `df.isna()` before this line would be discarded, because a
# notebook cell only renders its final expression.
df.tail()

Unnamed: 0,Undergraduate Major,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 90th Percentile Salary,Group
46,Psychology,35900.0,60400.0,31600.0,127000.0,HASS
47,Religion,34100.0,52000.0,29700.0,96400.0,HASS
48,Sociology,36500.0,58200.0,30700.0,118000.0,HASS
49,Spanish,34000.0,53100.0,31000.0,96400.0,HASS
50,Source: PayScale Inc.,,,,,


In [45]:
# Keep only rows without any missing value, then check the new last rows to
# confirm the incomplete trailing row is gone.
clean_df = df.dropna(axis='index', how='any')
clean_df.tail(n=5)

Unnamed: 0,Undergraduate Major,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 90th Percentile Salary,Group
45,Political Science,40800.0,78200.0,41200.0,168000.0,HASS
46,Psychology,35900.0,60400.0,31600.0,127000.0,HASS
47,Religion,34100.0,52000.0,29700.0,96400.0,HASS
48,Sociology,36500.0,58200.0,30700.0,118000.0,HASS
49,Spanish,34000.0,53100.0,31000.0,96400.0,HASS


## Questions

#### Find the college major with the highest starting salary

In [46]:
# idxmax() yields the index label of the largest starting median salary;
# .loc then reads the major name stored under that same label.
clean_df.loc[
    clean_df['Starting Median Salary'].idxmax(),
    'Undergraduate Major',
]

'Physician Assistant'

#### What college major has the highest mid-career salary? How much do graduates with this major earn? (Mid-career is defined as having 10+ years of experience).



In [47]:
# idxmax() returns an index *label*, so the lookup must be label-based (.loc).
# Positional .iloc only lands on the right row while labels happen to equal
# positions; dropna() can leave gaps in the index, after which .iloc would
# return a different major or raise IndexError.
major_with_high_mid_career_salary = clean_df.loc[
    clean_df['Mid-Career Median Salary'].idxmax(),
    'Undergraduate Major',
]
major_with_high_mid_career_salary

'Chemical Engineering'

In [48]:
# Select the rows for the top mid-career major with a boolean mask and
# average their mid-career median salary (a single value when one row matches).
clean_df.loc[
    clean_df['Undergraduate Major'] == major_with_high_mid_career_salary,
    'Mid-Career Median Salary',
].mean()

107000.0

#### Which college major has the lowest starting salary and how much do graduates earn after university?

In [49]:
# idxmin() returns an index *label*, so use label-based .loc for the lookup.
# Positional .iloc is only correct while labels equal row positions, which
# is not guaranteed once dropna() has removed rows.
major_with_low_starting_career_salary = clean_df.loc[
    clean_df['Starting Median Salary'].idxmin(),
    'Undergraduate Major',
]
major_with_low_starting_career_salary

'Spanish'

In [50]:
# Starting median salary for the lowest-starting major, returned as a Series
# indexed by the matching row label(s).
clean_df.loc[
    clean_df['Undergraduate Major'] == major_with_low_starting_career_salary,
    'Starting Median Salary',
]

49    34000.0
Name: Starting Median Salary, dtype: float64

#### Which college major has the lowest mid-career salary and how much can people expect to earn with this degree? 

In [51]:
# idxmin() returns an index *label*, so use label-based .loc for the lookup.
# Positional .iloc is only correct while labels equal row positions, which
# is not guaranteed once dropna() has removed rows.
major_with_low_mid_career_salary = clean_df.loc[
    clean_df['Mid-Career Median Salary'].idxmin(),
    'Undergraduate Major',
]
major_with_low_mid_career_salary

'Education'

In [52]:
# Mid-career median salary for the lowest mid-career major, returned as a
# Series indexed by the matching row label(s).
clean_df.loc[
    clean_df['Undergraduate Major'] == major_with_low_mid_career_salary,
    'Mid-Career Median Salary',
]

18    52000.0
Name: Mid-Career Median Salary, dtype: float64

#### Find the top 5 degrees with the highest values in the 90th percentile. 

In [53]:
# Order majors from highest to lowest 90th-percentile mid-career salary and
# keep the first five rows.
(
    clean_df
    .sort_values('Mid-Career 90th Percentile Salary', ascending=False)
    .head(n=5)
)

Unnamed: 0,Undergraduate Major,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 90th Percentile Salary,Group
17,Economics,50100.0,98600.0,50600.0,210000.0,Business
22,Finance,47900.0,88300.0,47200.0,195000.0,Business
8,Chemical Engineering,63200.0,107000.0,71900.0,194000.0,STEM
37,Math,45400.0,92400.0,45200.0,183000.0,STEM
44,Physics,50300.0,97300.0,56000.0,178000.0,STEM


#### Find the degrees with the greatest spread in salaries. Which majors have the largest difference between high and low earners after graduation?

In [56]:
# spread = 90th-percentile minus 10th-percentile mid-career salary.
# Build the column with assign() and rebind clean_df rather than writing into
# the frame returned by dropna(): item assignment on such a derived frame can
# emit SettingWithCopyWarning, while assign() returns a new frame and leaves
# the previous object untouched. clean_df still ends up with a 'spread' column.
clean_df = clean_df.assign(
    spread=clean_df['Mid-Career 90th Percentile Salary']
    - clean_df['Mid-Career 10th Percentile Salary']
)
clean_df.sort_values(by='spread', ascending=False).head(5)

Unnamed: 0,Undergraduate Major,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 90th Percentile Salary,Group,spread
17,Economics,50100.0,98600.0,50600.0,210000.0,Business,159400.0
22,Finance,47900.0,88300.0,47200.0,195000.0,Business,147800.0
37,Math,45400.0,92400.0,45200.0,183000.0,STEM,137800.0
36,Marketing,40800.0,79600.0,42100.0,175000.0,Business,132900.0
42,Philosophy,39900.0,81200.0,35500.0,168000.0,HASS,132500.0


In [55]:
# Average starting median salary within each value of the 'Group' column.
(
    clean_df
    .groupby(by='Group')['Starting Median Salary']
    .mean()
)

Group
Business    44633.333333
HASS        37186.363636
STEM        53862.500000
Name: Starting Median Salary, dtype: float64