In [1]:
import pandas as pd
import numpy as np

Read in the scores file, only keeping the Year, State.Code, Total.Math, Total.Test-takers, and Total.Verbal columns.

In [2]:
file_path = '../../pandas-workout-data/data/sat-scores.csv'
columns = ['Year', 'State.Code', 'Total.Math', 'Total.Test-takers', 'Total.Verbal']

In [3]:
df = pd.read_csv(filepath_or_buffer=file_path, usecols=columns)
df.head(5)

Unnamed: 0,Year,State.Code,Total.Math,Total.Test-takers,Total.Verbal
0,2005,AL,559,3985,567
1,2005,AK,519,3996,523
2,2005,AZ,530,18184,526
3,2005,AR,552,1600,563
4,2005,CA,522,186552,504


Create a multi-index based on the year and the two-letter state code.

In [4]:
df = df.set_index(['Year', 'State.Code'])
df.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Total.Math,Total.Test-takers,Total.Verbal
Year,State.Code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2005,AL,559,3985,567
2005,AK,519,3996,523
2005,AZ,530,18184,526
2005,AR,552,1600,563
2005,CA,522,186552,504


Determine how many people took the SAT in 2005.

In [5]:
with pd.option_context('display.max_rows', 8):
    print(df.loc[2005, 'Total.Test-takers'])

State.Code
AL     3985
AK     3996
AZ    18184
AR     1600
      ...  
VA     3480
WA    35020
WI     4230
WY      656
Name: Total.Test-takers, Length: 52, dtype: int64


In [6]:
print(df.loc[2005, 'Total.Test-takers'].sum())

1344824


Determine the average SAT math score in 2010 from New York (NY), New Jersey (NJ), Massachusetts (MA), and Illinois (IL).

In [7]:
df.loc[([2010], ['NY', 'NJ', 'MA', 'IL']), 'Total.Math']

Year  State.Code
2010  NY            499
      NJ            514
      MA            527
      IL            601
Name: Total.Math, dtype: int64

In [8]:
print(df.loc[([2010], ['NY', 'NJ', 'MA', 'IL']), 'Total.Math'].mean())

535.25


Determine the average SAT verbal score in 2012–2015 from Arizona (AZ), California (CA), and Texas (TX).

In [9]:
df.loc[([2012, 2013, 2014, 2015], ['AZ', 'CA', 'TX']), 'Total.Verbal']

Year  State.Code
2012  AZ            517
      CA            496
      TX            474
2013  AZ            522
      CA            498
      TX            476
2014  AZ            522
      CA            498
      TX            476
2015  AZ            523
      CA            496
      TX            470
Name: Total.Verbal, dtype: int64

In [10]:
print(df.loc[([2012, 2015], ['AZ', 'CA', 'TX']), 'Total.Verbal'].mean())

496.0


### Beyond the exercise

What were the average math and verbal scores for Florida, Indiana, and Idaho across all years? (Don’t break out the values by state.)

In [11]:
df.index

MultiIndex([(2005, 'AL'),
            (2005, 'AK'),
            (2005, 'AZ'),
            (2005, 'AR'),
            (2005, 'CA'),
            (2005, 'CO'),
            (2005, 'CT'),
            (2005, 'DE'),
            (2005, 'DC'),
            (2005, 'FL'),
            ...
            (2015, 'TN'),
            (2015, 'TX'),
            (2015, 'UT'),
            (2015, 'VT'),
            (2015, 'VI'),
            (2015, 'VA'),
            (2015, 'WA'),
            (2015, 'WV'),
            (2015, 'WI'),
            (2015, 'WY')],
           names=['Year', 'State.Code'], length=577)

In [12]:
df.loc[(slice(None), ['FL', 'IN', 'ID']), ['Total.Math', 'Total.Verbal']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Total.Math,Total.Verbal
Year,State.Code,Unnamed: 2_level_1,Unnamed: 3_level_1
2005,FL,498,498
2006,FL,497,496
2007,FL,498,497
2008,FL,499,496
2009,FL,499,497
2010,FL,499,496
2011,FL,490,487
2012,FL,494,492
2013,FL,491,492
2014,FL,485,491


In [13]:
print(df.loc[(slice(None), ['FL', 'IN', 'ID']), ['Total.Math', 'Total.Verbal']].mean())

Total.Math      507.090909
Total.Verbal    504.606061
dtype: float64


Which state received the highest verbal score, and in which year?

In [18]:
df.loc[df['Total.Verbal'] == df['Total.Verbal'].max()]

Unnamed: 0_level_0,Unnamed: 1_level_0,Total.Math,Total.Test-takers,Total.Verbal
Year,State.Code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,ND,613,174,612


In [19]:
# but we can also use idxmax to get the index of the highest score
df['Total.Verbal'].idxmax()

(np.int64(2013), 'ND')

Was the average math score in 2005 higher or lower than that in 2015?

In [15]:
df.loc[2005, 'Total.Math'].mean() - df.loc[2015, 'Total.Math'].mean()

np.float64(2.559506531204647)

It was higher

In [20]:
df.index.is_monotonic_increasing

False