In [5]:
import pandas as pd
import numpy as np

There have long been accusations that the SAT isn’t a fair test for college admissions, 
because wealthier students generally do better than poorer students. Given the data we have about the SAT, 
can we conclude that wealthier students do indeed, on average, score better? We will examine the math portion of the SAT 
to see whether any such pattern shows up in the data.

In [6]:
# Relative path to the SAT scores CSV file.
path = '../../pandas-workout-data/data/sat-scores.csv'

# Columns to load: the year, the state code, the overall math score,
# and the math score for each family-income bracket.
columns = [
    'Year',
    'State.Code',
    'Total.Math',
    'Family Income.Less than 20k.Math',
    'Family Income.Between 20-40k.Math',
    'Family Income.Between 40-60k.Math',
    'Family Income.Between 60-80k.Math',
    'Family Income.Between 80-100k.Math',
    'Family Income.More than 100k.Math',
]

In [7]:
# Read only the selected columns from the CSV, then preview the first five rows.
df = pd.read_csv(path, usecols=columns)
df.head()

Unnamed: 0,Year,State.Code,Total.Math,Family Income.Between 20-40k.Math,Family Income.Between 40-60k.Math,Family Income.Between 60-80k.Math,Family Income.Between 80-100k.Math,Family Income.Less than 20k.Math,Family Income.More than 100k.Math
0,2005,AL,559,513,539,550,566,462,588
1,2005,AK,519,492,517,513,528,464,541
2,2005,AZ,530,498,520,524,534,485,554
3,2005,AR,552,513,543,553,570,489,572
4,2005,CA,522,477,506,521,535,451,566


In [8]:
# Replace the verbose income-bracket column names with short labels.
# Columns not listed in the mapping keep their current names.
df = df.rename(
    {
        'Family Income.Less than 20k.Math': 'income<20k',
        'Family Income.Between 20-40k.Math': '20k<income<40k',
        'Family Income.Between 40-60k.Math': '40k<income<60k',
        'Family Income.Between 60-80k.Math': '60k<income<80k',
        'Family Income.Between 80-100k.Math': '80k<income<100k',
        'Family Income.More than 100k.Math': 'income>100k',
    },
    axis='columns',
)
df.head(5)

Unnamed: 0,Year,State.Code,Total.Math,20k<income<40k,40k<income<60k,60k<income<80k,80k<income<100k,income<20k,income>100k
0,2005,AL,559,513,539,550,566,462,588
1,2005,AK,519,492,517,513,528,464,541
2,2005,AZ,530,498,520,524,534,485,554
3,2005,AR,552,513,543,553,570,489,572
4,2005,CA,522,477,506,521,535,451,566


Find the average SAT math score for each income level, grouped and then sorted by year.

In [9]:
# Mean math score per year for the overall total and for each income bracket.
# groupby sorts the group keys, so the result is ordered by year.
(
    df
    .groupby('Year')
    [[
        'Total.Math',
        '20k<income<40k',
        '40k<income<60k',
        '60k<income<80k',
        '80k<income<100k',
        'income<20k',
        'income>100k',
    ]]
    .mean()
)

Unnamed: 0_level_0,Total.Math,20k<income<40k,40k<income<60k,60k<income<80k,80k<income<100k,income<20k,income>100k
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2005,535.653846,488.653846,522.673077,536.076923,548.942308,427.596154,572.173077
2006,537.480769,502.923077,523.769231,534.903846,550.461538,461.019231,572.519231
2007,535.339623,494.849057,519.490566,533.188679,545.698113,457.924528,565.169811
2008,535.981132,523.622642,547.471698,549.188679,557.641509,478.641509,564.566038
2009,540.803922,527.823529,550.980392,553.941176,565.333333,482.058824,585.784314
2010,540.843137,499.27451,522.0,534.235294,547.627451,477.039216,569.27451
2011,533.226415,494.886792,513.415094,528.660377,541.849057,460.45283,563.245283
2012,533.603774,492.056604,512.45283,525.773585,538.301887,458.773585,557.320755
2013,532.622642,490.132075,511.377358,520.320755,537.396226,469.358491,556.339623
2014,534.283019,497.641509,514.943396,527.169811,543.132075,459.415094,555.433962


In [10]:
# A more concise approach to the query above: average every numeric
# column per year instead of listing the columns one by one.
(
    df
    .groupby('Year')
    .mean(numeric_only=True)
)

Unnamed: 0_level_0,Total.Math,20k<income<40k,40k<income<60k,60k<income<80k,80k<income<100k,income<20k,income>100k
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2005,535.653846,488.653846,522.673077,536.076923,548.942308,427.596154,572.173077
2006,537.480769,502.923077,523.769231,534.903846,550.461538,461.019231,572.519231
2007,535.339623,494.849057,519.490566,533.188679,545.698113,457.924528,565.169811
2008,535.981132,523.622642,547.471698,549.188679,557.641509,478.641509,564.566038
2009,540.803922,527.823529,550.980392,553.941176,565.333333,482.058824,585.784314
2010,540.843137,499.27451,522.0,534.235294,547.627451,477.039216,569.27451
2011,533.226415,494.886792,513.415094,528.660377,541.849057,460.45283,563.245283
2012,533.603774,492.056604,512.45283,525.773585,538.301887,458.773585,557.320755
2013,532.622642,490.132075,511.377358,520.320755,537.396226,469.358491,556.339623
2014,534.283019,497.641509,514.943396,527.169811,543.132075,459.415094,555.433962


For each year in the data set, determine how much better each income group did, on average, than the next-poorer group of students. Do you see (just by looking at the data) any income group that did worse, in any year, than the next-poorer students?

We want to compare the scores by year and income bracket. But `pct_change` compares each row with the row before it, not each column with the column before it, and right now our data frame has the brackets as columns. We thus need to transpose the data frame so that the years are the columns and the income brackets are the rows.


In [11]:
# Yearly mean per income bracket (poorest first), transposed so that
# brackets are the rows and years are the columns.
(
    df
    .groupby('Year')
    [[
        'income<20k',
        '20k<income<40k',
        '40k<income<60k',
        '60k<income<80k',
        '80k<income<100k',
        'income>100k',
    ]]
    .mean()
    .transpose()
)

Year,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
income<20k,427.596154,461.019231,457.924528,478.641509,482.058824,477.039216,460.45283,458.773585,469.358491,459.415094,447.490566
20k<income<40k,488.653846,502.923077,494.849057,523.622642,527.823529,499.27451,494.886792,492.056604,490.132075,497.641509,491.603774
40k<income<60k,522.673077,523.769231,519.490566,547.471698,550.980392,522.0,513.415094,512.45283,511.377358,514.943396,513.754717
60k<income<80k,536.076923,534.903846,533.188679,549.188679,553.941176,534.235294,528.660377,525.773585,520.320755,527.169811,527.132075
80k<income<100k,548.942308,550.461538,545.698113,557.641509,565.333333,547.627451,541.849057,538.301887,537.396226,543.132075,542.037736
income>100k,572.173077,572.519231,565.169811,564.566038,585.784314,569.27451,563.245283,557.320755,556.339623,555.433962,563.433962


In [12]:
# Fractional change of each bracket's yearly mean relative to the bracket
# in the row above it (the next-poorer one). The first row has no
# predecessor, so it is all NaN.
(
    df
    .groupby('Year')
    [[
        'income<20k',
        '20k<income<40k',
        '40k<income<60k',
        '60k<income<80k',
        '80k<income<100k',
        'income>100k',
    ]]
    .mean()
    .transpose()
    .pct_change()
)

Year,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
income<20k,,,,,,,,,,,
20k<income<40k,0.142793,0.090894,0.080635,0.093977,0.094936,0.046611,0.074783,0.072548,0.04426,0.083207,0.098579
40k<income<60k,0.069618,0.04145,0.049796,0.045546,0.043872,0.045517,0.037439,0.041451,0.043346,0.034768,0.045059
60k<income<80k,0.025645,0.021259,0.026368,0.003136,0.005374,0.023439,0.029694,0.025994,0.017489,0.023743,0.026038
80k<income<100k,0.023999,0.029085,0.023462,0.015391,0.020566,0.025068,0.024947,0.023828,0.032817,0.030279,0.028277
income>100k,0.042319,0.040071,0.035682,0.012418,0.036175,0.039529,0.039487,0.035331,0.03525,0.02265,0.039474


In [13]:
# Absolute difference in points between each bracket's yearly mean and
# that of the bracket in the row above it (the next-poorer one).
(
    df
    .groupby('Year')
    [[
        'income<20k',
        '20k<income<40k',
        '40k<income<60k',
        '60k<income<80k',
        '80k<income<100k',
        'income>100k',
    ]]
    .mean()
    .transpose()
    .diff()
)

Year,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
income<20k,,,,,,,,,,,
20k<income<40k,61.057692,41.903846,36.924528,44.981132,45.764706,22.235294,34.433962,33.283019,20.773585,38.226415,44.113208
40k<income<60k,34.019231,20.846154,24.641509,23.849057,23.156863,22.72549,18.528302,20.396226,21.245283,17.301887,22.150943
60k<income<80k,13.403846,11.134615,13.698113,1.716981,2.960784,12.235294,15.245283,13.320755,8.943396,12.226415,13.377358
80k<income<100k,12.865385,15.557692,12.509434,8.45283,11.392157,13.392157,13.188679,12.528302,17.075472,15.962264,14.90566
income>100k,23.230769,22.057692,19.471698,6.924528,20.45098,21.647059,21.396226,19.018868,18.943396,12.301887,21.396226


Which income bracket, on average, had the greatest advantage over the next-poorer income bracket?

In [20]:
# Average, across all years, of each bracket's fractional advantage over
# the next-poorer bracket.
(
    df
    .groupby('Year')
    [[
        'income<20k',
        '20k<income<40k',
        '40k<income<60k',
        '60k<income<80k',
        '80k<income<100k',
        'income>100k',
    ]]
    .mean()
    .transpose()    # brackets become rows so pct_change compares brackets
    .pct_change()
    .transpose()    # back to years as rows so mean() averages over years
    .mean()
)

income<20k              NaN
20k<income<40k     0.083929
40k<income<60k     0.045260
60k<income<80k     0.020744
80k<income<100k    0.025247
income>100k        0.034399
dtype: float64

If you feel more comfortable passing the `axis` keyword argument, or if your data set is large enough that transposing would take too much time or memory, you can pass `axis='columns'` to `mean` instead of transposing a second time.

In [22]:
# Alternative to the second transpose: average across the year columns
# directly by giving mean() an axis, then rank the brackets from the
# largest average advantage to the smallest.
(
    df
    .groupby('Year')
    [[
        'income<20k',
        '20k<income<40k',
        '40k<income<60k',
        '60k<income<80k',
        '80k<income<100k',
        'income>100k',
    ]]
    .mean()
    .transpose()
    .pct_change()
    .mean(axis=1)
    .sort_values(ascending=False)
)

20k<income<40k     0.083929
40k<income<60k     0.045260
income>100k        0.034399
80k<income<100k    0.025247
60k<income<80k     0.020744
income<20k              NaN
dtype: float64

Can we find, in a calculated and automated way, which income levels consistently (i.e., across all years) do worse than the next-poorer group?

All this is fine, but relying on a visual scan of the data is not a good way to go about things. Rather, we’d like an automated way to find which, if any, of the income brackets did worse than the next-lower bracket. How can we do that?

In [23]:
# Fractional change of each bracket's yearly mean score relative to the
# next-poorer bracket: brackets are the rows, years are the columns.
change = (
    df
    .groupby('Year')
    [[
        'income<20k',
        '20k<income<40k',
        '40k<income<60k',
        '60k<income<80k',
        '80k<income<100k',
        'income>100k',
    ]]
    .mean()
    .transpose()
    .pct_change()
)

In [24]:
# Boolean mask: True wherever a bracket did no better than the next-poorer
# one. NaN cells compare as False.
change.le(0)

Year,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
income<20k,False,False,False,False,False,False,False,False,False,False,False
20k<income<40k,False,False,False,False,False,False,False,False,False,False,False
40k<income<60k,False,False,False,False,False,False,False,False,False,False,False
60k<income<80k,False,False,False,False,False,False,False,False,False,False,False
80k<income<100k,False,False,False,False,False,False,False,False,False,False,False
income>100k,False,False,False,False,False,False,False,False,False,False,False


In [25]:
# Keep only the non-positive changes; every other cell becomes NaN.
change.where(change <= 0)

Year,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
income<20k,,,,,,,,,,,
20k<income<40k,,,,,,,,,,,
40k<income<60k,,,,,,,,,,,
60k<income<80k,,,,,,,,,,,
80k<income<100k,,,,,,,,,,,
income>100k,,,,,,,,,,,


In [26]:
# Mask out the positive changes, then drop every row that contains a NaN.
# Only brackets whose change was non-positive in every year would remain.
change.where(change <= 0).dropna(how='any')

Year,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015


Sure enough, we see that every single income bracket did better, on average, than the income bracket below it.

# Beyond the exercise