# Creating Columns

In [7]:
import numpy as np
import pandas as pd

recent_grads = pd.read_csv("recent_grads.csv")

In [10]:
# Add sharemen column
recent_grads['sharemen'] = recent_grads["men"] / recent_grads["total"]

In [12]:
# Add gender_diff column
# Add a column named gender_diff that reports how much higher the rate of women is than the rate of men
recent_grads['gender_diff'] = recent_grads["sharewomen"] - recent_grads['sharemen']

# Select Row with Highest Value

In [11]:
# Find the maximum percentage value of men
max_men = np.max(recent_grads["sharemen"])
 
# Output the row with the highest percentage of men
print(recent_grads[recent_grads["sharemen"] == max_men])

     rank  major_code                                       major  \
120   121        2301  EDUCATIONAL ADMINISTRATION AND SUPERVISION   

    major_category  total  sample_size    men  women  sharewomen  employed  \
120      Education    804            5  78253  63698    0.448732       703   

       ...      full_time_year_round  unemployed  unemployment_rate  median  \
120    ...                       504           0                0.0   34000   

     p25th  p75th college_jobs non_college_jobs  low_wage_jobs   sharemen  
120  29000  35000          346              206            111  97.329602  

[1 rows x 22 columns]


# Updating columns

In [13]:
# Make all gender difference values positive
recent_grads['gender_diff'] = np.abs(recent_grads['gender_diff'])

# Find the 5 rows with lowest gender rate difference
print(recent_grads.nsmallest(5, 'gender_diff'))

    rank  major_code                                   major  \
8      9        2414                  MECHANICAL ENGINEERING   
74    75        5003                               CHEMISTRY   
37    38        6205                      BUSINESS ECONOMICS   
65    66        2599  MISCELLANEOUS ENGINEERING TECHNOLOGIES   
78    79        5506        POLITICAL SCIENCE AND GOVERNMENT   

       major_category   total  sample_size    men  women  sharewomen  \
8         Engineering   91227         1029  12953   2105    0.139793   
74  Physical Sciences   66530          353  32923  33607    0.505141   
37           Business   13302          199   3477   1154    0.249190   
65        Engineering    8804          125    124      0    0.000000   
78     Social Science  182621         1387  93880  88741    0.485930   

    employed     ...       unemployed  unemployment_rate  median  p25th  \
8      76442     ...             4650           0.057342   60000  48000   
74     48535     ...            

# Filtering rows

In [16]:
# Rows where gender rate difference is greater than .30 
diff_30 = recent_grads['gender_diff'] > .30

# Rows with more men
more_men = recent_grads['sharemen'] > recent_grads['sharewomen']

# Combine more_men and diff_30
more_men_and_diff_30 = np.logical_and(more_men, diff_30)

# Find rows with more men and and gender rate difference greater than .30
fewer_women = recent_grads[more_men_and_diff_30]

# Grouping with Counts

In [19]:
# Group by major category and count
print(recent_grads.groupby(['major_category']).major_category.count())
#print(recent_grads.groupby(['major_category']).count())


major_category
Agriculture & Natural Resources        10
Arts                                    8
Biology & Life Science                 14
Business                               13
Communications & Journalism             4
Computers & Mathematics                11
Education                              16
Engineering                            29
Health                                 12
Humanities & Liberal Arts              15
Industrial Arts & Consumer Services     7
Interdisciplinary                       1
Law & Public Policy                     5
Physical Sciences                      10
Psychology & Social Work                9
Social Science                          9
Name: major_category, dtype: int64


In [20]:
# Group departments that have less women by category and count
print(fewer_women.groupby(['major_category']).major_category.count())

major_category
Agriculture & Natural Resources         6
Biology & Life Science                  4
Business                                1
Computers & Mathematics                 5
Education                               5
Engineering                            20
Health                                  1
Humanities & Liberal Arts               1
Industrial Arts & Consumer Services     3
Interdisciplinary                       1
Law & Public Policy                     1
Physical Sciences                       6
Psychology & Social Work                2
Social Science                          4
Name: major_category, dtype: int64


# Grouping One Column with Means

In [21]:
# Report average gender difference by major category
print(recent_grads.groupby(['major_category']).gender_diff.mean())

major_category
Agriculture & Natural Resources        4.511111
Arts                                   0.308747
Biology & Life Science                 4.785277
Business                               0.318304
Communications & Journalism            0.483868
Computers & Mathematics                0.700728
Education                              6.869082
Engineering                            1.296387
Health                                 0.450257
Humanities & Liberal Arts              0.540437
Industrial Arts & Consumer Services    2.474325
Interdisciplinary                      0.320397
Law & Public Policy                    0.281993
Physical Sciences                      1.556373
Psychology & Social Work               5.719239
Social Science                         1.695479
Name: gender_diff, dtype: float64


In [22]:
# Find average number of low wage jobs and unemployment rate of each major category
dept_stats = recent_grads.groupby(['major_category'])['low_wage_jobs', 'unemployment_rate'].mean()
print(dept_stats)

                                     low_wage_jobs  unemployment_rate
major_category                                                       
Agriculture & Natural Resources         789.900000           0.056328
Arts                                   7514.500000           0.090173
Biology & Life Science                 3053.000000           0.060918
Business                               9752.923077           0.071064
Communications & Journalism           12398.750000           0.075538
Computers & Mathematics                1466.909091           0.084256
Education                              2554.375000           0.051702
Engineering                             864.793103           0.063334
Health                                 2605.833333           0.065920
Humanities & Liberal Arts              6282.666667           0.081008
Industrial Arts & Consumer Services    3798.571429           0.056083
Interdisciplinary                      1061.000000           0.070861
Law & Public Policy 