# Indexing and selecting data

http://pandas.pydata.org/pandas-docs/stable/indexing.html 

In [2]:
# Import the libraries and build the sample data frame from an old MTH 225 gradebook
# (Student last names redacted to first two letters for privacy)

import numpy as np
import pandas as pd

# index_col=0 makes the leftmost CSV column (the redacted names) the row index;
# without it the index would be the default 0, 1, 2, ...
# The path is relative, so the CSV must sit in the notebook's working directory.
grades = pd.read_csv('225-gradebook.csv', index_col=0)

# Preview the first five rows
grades.head()

Unnamed: 0,hw_pass,hw_e,lt_pass,lt_e,pp_pass,pp_e,gp
Am,5,3,27,17,5,5,26
Bl,2,1,21,12,3,3,18
Br,4,2,26,17,4,4,28
Co,4,2,21,14,0,0,24
Cr,4,1,23,17,3,3,20


## Different ways to index and select

Using `.loc`:

- Single label (`df.loc['Label']`)
- List or array of labels (`df.loc[['a','b','c']]`)
- Slice object (`df.loc['a':'f']` -- note both beginning and end are included unlike usual slicing) (BUT see below)
- Boolean array (`df.loc[[True, True, False]]` -- a single list with one boolean per row)
- Callable function 

In [3]:
# Single label: returns the row labelled 'Dy' as a Series indexed by column name
grades.loc['Dy']

hw_pass     0
hw_e        0
lt_pass    11
lt_e        4
pp_pass     1
pp_e        0
gp         21
Name: Dy, dtype: int64

In [4]:
# List of labels: returns those rows, in the order listed, as a DataFrame
grades.loc[['Dy', 'Ya']]

Unnamed: 0,hw_pass,hw_e,lt_pass,lt_e,pp_pass,pp_e,gp
Dy,0,0,11,4,1,0,21
Ya,5,4,27,21,5,5,25


In [5]:
# Label slice: rows from 'Fl' through 'Va', with BOTH endpoints included
grades.loc['Fl':'Va']

Unnamed: 0,hw_pass,hw_e,lt_pass,lt_e,pp_pass,pp_e,gp
Fl,5,4,28,24,5,5,24
Fr,0,0,2,2,0,0,9
Ga,5,3,25,17,5,5,22
He,4,3,23,17,3,3,24
Li,5,4,26,17,5,5,19
Mc,0,0,9,1,2,2,22
Ma,5,2,24,11,5,5,23
Ng,4,2,23,16,3,3,19
Sc,5,3,24,19,4,4,26
Sh,4,4,23,15,2,1,25


In [6]:
# Boolean array: grades.hw_e < 3 is a True/False Series with one entry per row,
# and .loc keeps the rows where it is True

# All students with fewer than 3 E's on homework
grades.loc[grades.hw_e < 3]

Unnamed: 0,hw_pass,hw_e,lt_pass,lt_e,pp_pass,pp_e,gp
Bl,2,1,21,12,3,3,18
Br,4,2,26,17,4,4,28
Co,4,2,21,14,0,0,24
Cr,4,1,23,17,3,3,20
Di,4,2,26,20,3,3,28
Dy,0,0,11,4,1,0,21
Fr,0,0,2,2,0,0,9
Mc,0,0,9,1,2,2,22
Ma,5,2,24,11,5,5,23
Ng,4,2,23,16,3,3,19


In [7]:
# A boolean Series inside plain [] filters rows as well; no .loc needed
grades[grades.gp == 20]  # Exactly 20 points on Guided Practice

Unnamed: 0,hw_pass,hw_e,lt_pass,lt_e,pp_pass,pp_e,gp
Cr,4,1,23,17,3,3,20
Va,3,1,13,4,0,0,20


Using `.iloc`: 

- Single integer
- List or array of integers
- Slice object with ints
- Boolean array
- Callable function 

In [8]:
# Single integer: the row at position 10 (counting from 0), returned as a Series
grades.iloc[10]

hw_pass     5
hw_e        3
lt_pass    25
lt_e       17
pp_pass     5
pp_e        5
gp         22
Name: Ga, dtype: int64

In [9]:
# Integer slice: positions 10 through 19 -- the end position is excluded, as in ordinary Python slicing
grades.iloc[10:20]

Unnamed: 0,hw_pass,hw_e,lt_pass,lt_e,pp_pass,pp_e,gp
Ga,5,3,25,17,5,5,22
He,4,3,23,17,3,3,24
Li,5,4,26,17,5,5,19
Mc,0,0,9,1,2,2,22
Ma,5,2,24,11,5,5,23
Ng,4,2,23,16,3,3,19
Sc,5,3,24,19,4,4,26
Sh,4,4,23,15,2,1,25
St,0,0,2,0,0,0,17
Va,3,1,13,4,0,0,20


## Basics

In [10]:
# Ten consecutive daily dates starting 1 January 2017 serve as the row index
# (ISO format avoids any day/month ambiguity).
dates = pd.date_range('2017-01-01', periods = 10)

# Draw the values from a seeded generator so that re-running the notebook
# reproduces the same frame; the seed value itself is arbitrary.
df = pd.DataFrame(np.random.RandomState(2017).randn(10,4), index = dates, columns = ['A', 'B', 'C', 'D'])

df

Unnamed: 0,A,B,C,D
2017-01-01,0.958136,1.938955,-1.655659,1.372844
2017-01-02,1.956253,-0.543414,1.574555,-1.063261
2017-01-03,-0.949197,-1.258297,1.242756,0.084331
2017-01-04,-0.877255,-1.226532,0.16067,1.614422
2017-01-05,0.667658,0.335953,-0.796132,-0.002407
2017-01-06,-0.525167,1.244997,0.977835,0.976571
2017-01-07,0.672892,0.095643,0.113786,0.514653
2017-01-08,1.525857,-0.729438,0.567731,1.181354
2017-01-09,-1.435654,-0.867045,-0.016776,-1.033167
2017-01-10,0.867699,-0.014276,0.009655,0.606679


In [11]:
# Indexing with a single column name returns that column as a Series
df['A']   

2017-01-01    0.958136
2017-01-02    1.956253
2017-01-03   -0.949197
2017-01-04   -0.877255
2017-01-05    0.667658
2017-01-06   -0.525167
2017-01-07    0.672892
2017-01-08    1.525857
2017-01-09   -1.435654
2017-01-10    0.867699
Freq: D, Name: A, dtype: float64

In [12]:
# Item in position 5 (the sixth row) of column A.
# Use .iloc for positional access: plain [] with an integer on a Series whose
# index holds dates relies on a positional fallback that newer pandas deprecates.
df['A'].iloc[5]

-0.52516708568713011

In [13]:
# Can also pass a list of column names -- still not using .loc.
# The result is a DataFrame holding just those columns, in the order listed.

df[['B', 'D']]

Unnamed: 0,B,D
2017-01-01,1.938955,1.372844
2017-01-02,-0.543414,-1.063261
2017-01-03,-1.258297,0.084331
2017-01-04,-1.226532,1.614422
2017-01-05,0.335953,-0.002407
2017-01-06,1.244997,0.976571
2017-01-07,0.095643,0.514653
2017-01-08,-0.729438,1.181354
2017-01-09,-0.867045,-1.033167
2017-01-10,-0.014276,0.606679


## Attribute access

In [14]:
# Columns can be read directly as attributes: grades.hw_pass is the same
# column as grades['hw_pass'], and df.B the same as df['B']

print(grades.hw_pass)
print(df.B)

Am    5
Bl    2
Br    4
Co    4
Cr    4
Di    4
Du    4
Dy    0
Fl    5
Fr    0
Ga    5
He    4
Li    5
Mc    0
Ma    5
Ng    4
Sc    5
Sh    4
St    0
Va    3
Ve    4
Wa    5
Ya    5
Zi    1
Name: hw_pass, dtype: int64
2017-01-01    1.938955
2017-01-02   -0.543414
2017-01-03   -1.258297
2017-01-04   -1.226532
2017-01-05    0.335953
2017-01-06    1.244997
2017-01-07    0.095643
2017-01-08   -0.729438
2017-01-09   -0.867045
2017-01-10   -0.014276
Freq: D, Name: B, dtype: float64


In [15]:
# Attribute access is fine for reading a column, but assigning through it
# (df.B[1] = ...) is chained assignment and is not guaranteed to write to df.
# Set the value with one indexer call instead: the row at position 1, column 'B'.
df.loc[df.index[1], 'B'] = 3.14159
df.B

2017-01-01    1.938955
2017-01-02    3.141590
2017-01-03   -1.258297
2017-01-04   -1.226532
2017-01-05    0.335953
2017-01-06    1.244997
2017-01-07    0.095643
2017-01-08   -0.729438
2017-01-09   -0.867045
2017-01-10   -0.014276
Freq: D, Name: B, dtype: float64

In [16]:
# A dictionary keyed by column name can be assigned to a row.
# Note this OVERWRITES the existing row at position 1; it does not add a new row.

df.iloc[1] = dict(A = 1, B = 2, C = 3, D = 99)
df

Unnamed: 0,A,B,C,D
2017-01-01,0.958136,1.938955,-1.655659,1.372844
2017-01-02,1.0,2.0,3.0,99.0
2017-01-03,-0.949197,-1.258297,1.242756,0.084331
2017-01-04,-0.877255,-1.226532,0.16067,1.614422
2017-01-05,0.667658,0.335953,-0.796132,-0.002407
2017-01-06,-0.525167,1.244997,0.977835,0.976571
2017-01-07,0.672892,0.095643,0.113786,0.514653
2017-01-08,1.525857,-0.729438,0.567731,1.181354
2017-01-09,-1.435654,-0.867045,-0.016776,-1.033167
2017-01-10,0.867699,-0.014276,0.009655,0.606679


## Slicing ranges

In a DataFrame, slicing inside `[]` slices __rows__ --- even though putting in just a single value without slicing would give columns. 

In [17]:
# A slice inside [] selects rows: here the first three
grades[:3]

# Note `grades[3]` throws an error: a single value inside [] is looked up as a column label

Unnamed: 0,hw_pass,hw_e,lt_pass,lt_e,pp_pass,pp_e,gp
Am,5,3,27,17,5,5,26
Bl,2,1,21,12,3,3,18
Br,4,2,26,17,4,4,28


In [18]:
# Rows at positions 10 through 19
grades[10:20]

Unnamed: 0,hw_pass,hw_e,lt_pass,lt_e,pp_pass,pp_e,gp
Ga,5,3,25,17,5,5,22
He,4,3,23,17,3,3,24
Li,5,4,26,17,5,5,19
Mc,0,0,9,1,2,2,22
Ma,5,2,24,11,5,5,23
Ng,4,2,23,16,3,3,19
Sc,5,3,24,19,4,4,26
Sh,4,4,23,15,2,1,25
St,0,0,2,0,0,0,17
Va,3,1,13,4,0,0,20


So, for integer slices, this is basically an abbreviation for `.iloc`:

`df[a:b]` selects the same rows as `df.iloc[a:b]`

## Selection by label

Literally "label" = the row label. This is done with `.loc`; `.iloc` is its position-based counterpart (see "Selection by position" below).

In [19]:
# One row label gives that student's row as a Series
grades.loc['Ng']

hw_pass     4
hw_e        2
lt_pass    23
lt_e       16
pp_pass     3
pp_e        3
gp         19
Name: Ng, dtype: int64

In [20]:
# Label slice from 'Ng' through 'St'; the end label is included
grades.loc['Ng':'St']

Unnamed: 0,hw_pass,hw_e,lt_pass,lt_e,pp_pass,pp_e,gp
Ng,4,2,23,16,3,3,19
Sc,5,3,24,19,4,4,26
Sh,4,4,23,15,2,1,25
St,0,0,2,0,0,0,17


In [21]:
# Open-ended label slice: every row from the top through 'St' (inclusive)
grades.loc[:'St']

Unnamed: 0,hw_pass,hw_e,lt_pass,lt_e,pp_pass,pp_e,gp
Am,5,3,27,17,5,5,26
Bl,2,1,21,12,3,3,18
Br,4,2,26,17,4,4,28
Co,4,2,21,14,0,0,24
Cr,4,1,23,17,3,3,20
Di,4,2,26,20,3,3,28
Du,4,3,22,15,4,4,22
Dy,0,0,11,4,1,0,21
Fl,5,4,28,24,5,5,24
Fr,0,0,2,2,0,0,9


In [22]:
# Can also set values with .loc.
# Pass the row and column labels to a single .loc call: the chained form
# grades2.loc['St']['gp'] = 999 assigns into an intermediate Series, which is
# not guaranteed to write through to grades2.

grades2 = grades.copy()  # work on a copy so the original gradebook is untouched
grades2.loc['St', 'gp'] = 999
grades2[15:]

Unnamed: 0,hw_pass,hw_e,lt_pass,lt_e,pp_pass,pp_e,gp
Ng,4,2,23,16,3,3,19
Sc,5,3,24,19,4,4,26
Sh,4,4,23,15,2,1,25
St,0,0,2,0,0,0,999
Va,3,1,13,4,0,0,20
Ve,4,3,25,22,3,3,25
Wa,5,4,25,22,5,4,25
Ya,5,4,27,21,5,5,25
Zi,1,0,23,11,3,3,22


In [23]:
# Row and column selectors together, separated by a comma
# First selector: a list of labels from the index
# Second selector: a slice of the column labels (end label included)

grades.loc[['Ng', 'Ve', 'Ya'], 'hw_pass':'lt_e']

Unnamed: 0,hw_pass,hw_e,lt_pass,lt_e
Ng,4,2,23,16
Ve,4,3,25,22
Ya,5,4,27,21


In [24]:
# Slices in both directions:
# rows from 'Ng' to the end, columns from 'hw_e' to the end

grades.loc['Ng':, 'hw_e':]

Unnamed: 0,hw_e,lt_pass,lt_e,pp_pass,pp_e,gp
Ng,2,23,16,3,3,19
Sc,3,24,19,4,4,26
Sh,4,23,15,2,1,25
St,0,2,0,0,0,17
Va,1,13,4,0,0,20
Ve,3,25,22,3,3,25
Wa,4,25,22,5,4,25
Ya,4,27,21,5,5,25
Zi,0,23,11,3,3,22


In [25]:
# With a boolean array

# Comparing student Ng's row with 20 gives a boolean Series over the COLUMNS:
# True for each column in which Ng scored more than 20
grades.loc['Ng'] > 20

hw_pass    False
hw_e       False
lt_pass     True
lt_e       False
pp_pass    False
pp_e       False
gp         False
Name: Ng, dtype: bool

In [26]:
# Comparing the gp column with 20 gives a boolean Series over the ROWS:
# True for each student whose gp value is greater than 20.
# Passing this Series to grades[...] or grades.loc[...] selects those rows.
grades['gp'] > 20

Am     True
Bl    False
Br     True
Co     True
Cr    False
Di     True
Du     True
Dy     True
Fl     True
Fr    False
Ga     True
He     True
Li    False
Mc     True
Ma     True
Ng    False
Sc     True
Sh     True
St    False
Va    False
Ve     True
Wa     True
Ya     True
Zi     True
Name: gp, dtype: bool

In [27]:
# Summary stats for each graded item, for students with GP scores higher than 20
# (filter the rows first, then describe() each remaining column)

grades[grades['gp'] > 20].describe()

Unnamed: 0,hw_pass,hw_e,lt_pass,lt_e,pp_pass,pp_e,gp
count,17.0,17.0,17.0,17.0,17.0,17.0,17.0
mean,3.764706,2.470588,22.882353,15.705882,3.470588,3.294118,24.235294
std,1.714986,1.374666,5.206755,6.192381,1.545867,1.686887,2.077541
min,0.0,0.0,9.0,1.0,0.0,0.0,21.0
25%,4.0,2.0,23.0,14.0,3.0,3.0,22.0
50%,4.0,3.0,24.0,17.0,4.0,4.0,24.0
75%,5.0,3.0,26.0,20.0,5.0,5.0,25.0
max,5.0,4.0,28.0,24.0,5.0,5.0,28.0


In [28]:
# Compare the average number of homeworks passed for students whose Guided Practice
# scores are above 20 with those whose scores are below 20 (a score of exactly 20
# falls in neither group).
# Select the hw_pass column together with the row mask in one .loc call, so only
# that column's mean is computed instead of averaging every column and discarding the rest.

print(grades.loc[grades['gp'] > 20, 'hw_pass'].mean())
print(grades.loc[grades['gp'] < 20, 'hw_pass'].mean())

3.76470588235
2.2


In [29]:
# Getting a specific value: row label first, then column label

grades.loc['Ng', 'gp']

19

## Selection by position 

This is what `.iloc` is for. 

In [30]:
# Rows at positions 10 through 14 (end position excluded)
grades.iloc[10:15]

Unnamed: 0,hw_pass,hw_e,lt_pass,lt_e,pp_pass,pp_e,gp
Ga,5,3,25,17,5,5,22
He,4,3,23,17,3,3,24
Li,5,4,26,17,5,5,19
Mc,0,0,9,1,2,2,22
Ma,5,2,24,11,5,5,23


In [31]:
# Rows at positions 1-4 and columns at positions 2-3
grades.iloc[1:5, 2:4]

Unnamed: 0,lt_pass,lt_e
Bl,21,12
Br,26,17
Co,21,14
Cr,23,17


In [32]:
# A list of row positions combined with a column slice (position 3 onward)
grades.iloc[[0,10,20], 3:]

Unnamed: 0,lt_e,pp_pass,pp_e,gp
Am,17,5,5,26
Ga,17,5,5,22
Ve,22,3,3,25


In [33]:
# Cross section of a single student: the row at position 1, as a Series
grades.iloc[1]

hw_pass     2
hw_e        1
lt_pass    21
lt_e       12
pp_pass     3
pp_e        3
gp         18
Name: Bl, dtype: int64

## Selection by callable

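You can index a data frame with a callable (such as a lambda): it is called with the data frame as its argument and must return something valid for indexing, such as a boolean Series.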

In [34]:
# Toy example of indexing with a callable: the frame is handed to the function,
# and the boolean Series it returns is used as the row mask.
# It does the same thing as `grades[grades.gp > 20]`
grades[lambda frame: frame.gp > 20]

Unnamed: 0,hw_pass,hw_e,lt_pass,lt_e,pp_pass,pp_e,gp
Am,5,3,27,17,5,5,26
Br,4,2,26,17,4,4,28
Co,4,2,21,14,0,0,24
Di,4,2,26,20,3,3,28
Du,4,3,22,15,4,4,22
Dy,0,0,11,4,1,0,21
Fl,5,4,28,24,5,5,24
Ga,5,3,25,17,5,5,22
He,4,3,23,17,3,3,24
Mc,0,0,9,1,2,2,22


In [35]:
# Return data for students whose "E" scores (homework plus learning targets)
# total up to more than 10. + binds more tightly than >, so the sum is formed first.

grades[lambda frame: frame.hw_e + frame.lt_e > 10]

Unnamed: 0,hw_pass,hw_e,lt_pass,lt_e,pp_pass,pp_e,gp
Am,5,3,27,17,5,5,26
Bl,2,1,21,12,3,3,18
Br,4,2,26,17,4,4,28
Co,4,2,21,14,0,0,24
Cr,4,1,23,17,3,3,20
Di,4,2,26,20,3,3,28
Du,4,3,22,15,4,4,22
Fl,5,4,28,24,5,5,24
Ga,5,3,25,17,5,5,22
He,4,3,23,17,3,3,24


In [36]:
# Same selection as the callable in the previous cell, written with the boolean mask directly
grades[grades.hw_e + grades.lt_e > 10]

Unnamed: 0,hw_pass,hw_e,lt_pass,lt_e,pp_pass,pp_e,gp
Am,5,3,27,17,5,5,26
Bl,2,1,21,12,3,3,18
Br,4,2,26,17,4,4,28
Co,4,2,21,14,0,0,24
Cr,4,1,23,17,3,3,20
Di,4,2,26,20,3,3,28
Du,4,3,22,15,4,4,22
Fl,5,4,28,24,5,5,24
Ga,5,3,25,17,5,5,22
He,4,3,23,17,3,3,24


## Selecting random samples 

Use the `.sample()` method on Series or DataFrames. It samples rows by default.

In [37]:
# One shared seeded generator makes every draw below reproducible on a fresh
# run while still giving each call different rows; the seed value is arbitrary.
sample_rng = np.random.RandomState(225)

# With no size argument, sample() returns a single randomly selected row
print(grades.sample(random_state=sample_rng))

# This randomly selects 5 rows
print(grades.sample(n=5, random_state=sample_rng))

# This randomly samples 1/10 of the rows
print(grades.sample(frac=0.10, random_state=sample_rng))

# This samples with replacement, so the same row can appear more than once
print(grades.sample(n = 10, replace=True, random_state=sample_rng))

    hw_pass  hw_e  lt_pass  lt_e  pp_pass  pp_e  gp
Sc        5     3       24    19        4     4  26
    hw_pass  hw_e  lt_pass  lt_e  pp_pass  pp_e  gp
Fl        5     4       28    24        5     5  24
Ng        4     2       23    16        3     3  19
Dy        0     0       11     4        1     0  21
Va        3     1       13     4        0     0  20
Li        5     4       26    17        5     5  19
    hw_pass  hw_e  lt_pass  lt_e  pp_pass  pp_e  gp
Di        4     2       26    20        3     3  28
Ng        4     2       23    16        3     3  19
    hw_pass  hw_e  lt_pass  lt_e  pp_pass  pp_e  gp
Ma        5     2       24    11        5     5  23
Bl        2     1       21    12        3     3  18
Li        5     4       26    17        5     5  19
Co        4     2       21    14        0     0  24
Fl        5     4       28    24        5     5  24
Ma        5     2       24    11        5     5  23
Ya        5     4       27    21        5     5  25
Ya        5 

In [38]:
# Sample columns instead of rows by changing the axis.
# random_state fixes the draw so a re-run picks the same two columns; the value is arbitrary.

grades.sample(n=2, axis = 1, random_state=225)

Unnamed: 0,hw_e,gp
Am,3,26
Bl,1,18
Br,2,28
Co,2,24
Cr,1,20
Di,2,28
Du,3,22
Dy,0,21
Fl,4,24
Fr,0,9


## Boolean indexing

Some of this we already saw:

In [39]:
# Reminder of what the data looks like
grades.head()

Unnamed: 0,hw_pass,hw_e,lt_pass,lt_e,pp_pass,pp_e,gp
Am,5,3,27,17,5,5,26
Bl,2,1,21,12,3,3,18
Br,4,2,26,17,4,4,28
Co,4,2,21,14,0,0,24
Cr,4,1,23,17,3,3,20


In [40]:
# Shows the students with fewer than 10 E's on learning target assessments (column lt_e)
grades[grades.lt_e < 10]

Unnamed: 0,hw_pass,hw_e,lt_pass,lt_e,pp_pass,pp_e,gp
Dy,0,0,11,4,1,0,21
Fr,0,0,2,2,0,0,9
Mc,0,0,9,1,2,2,22
St,0,0,2,0,0,0,17
Va,3,1,13,4,0,0,20


But we can also throw in Boolean operators

In [41]:
# | is the element-wise "or" (Python's `or` keyword does not work on Series)
# Mind the parentheses: | binds more tightly than <, so each comparison must be wrapped

grades[(grades.lt_e < 10) | (grades.hw_e < 2)]

Unnamed: 0,hw_pass,hw_e,lt_pass,lt_e,pp_pass,pp_e,gp
Bl,2,1,21,12,3,3,18
Cr,4,1,23,17,3,3,20
Dy,0,0,11,4,1,0,21
Fr,0,0,2,2,0,0,9
Mc,0,0,9,1,2,2,22
St,0,0,2,0,0,0,17
Va,3,1,13,4,0,0,20
Zi,1,0,23,11,3,3,22


In [42]:
# & is the element-wise "and": rows must satisfy both conditions

grades[(grades.hw_e >= 2) & (grades.lt_e >= 3) ]

Unnamed: 0,hw_pass,hw_e,lt_pass,lt_e,pp_pass,pp_e,gp
Am,5,3,27,17,5,5,26
Br,4,2,26,17,4,4,28
Co,4,2,21,14,0,0,24
Di,4,2,26,20,3,3,28
Du,4,3,22,15,4,4,22
Fl,5,4,28,24,5,5,24
Ga,5,3,25,17,5,5,22
He,4,3,23,17,3,3,24
Li,5,4,26,17,5,5,19
Ma,5,2,24,11,5,5,23


In [43]:
# ~ is "not": it flips each True/False in the mask that follows it

grades[(grades.lt_e > 10) & ~(grades.hw_e == 2)]

Unnamed: 0,hw_pass,hw_e,lt_pass,lt_e,pp_pass,pp_e,gp
Am,5,3,27,17,5,5,26
Bl,2,1,21,12,3,3,18
Cr,4,1,23,17,3,3,20
Du,4,3,22,15,4,4,22
Fl,5,4,28,24,5,5,24
Ga,5,3,25,17,5,5,22
He,4,3,23,17,3,3,24
Li,5,4,26,17,5,5,19
Sc,5,3,24,19,4,4,26
Sh,4,4,23,15,2,1,25


## The `where` method and masking

`where` does pretty much the same thing as the boolean selection but it keeps the shape of the series/DataFrame the same and sticks `NaN` anywhere that wouldn't have been selected.

In [44]:
# A per-row condition: rows where it is False become all NaN
grades.where(grades.lt_e < 2)

Unnamed: 0,hw_pass,hw_e,lt_pass,lt_e,pp_pass,pp_e,gp
Am,,,,,,,
Bl,,,,,,,
Br,,,,,,,
Co,,,,,,,
Cr,,,,,,,
Di,,,,,,,
Du,,,,,,,
Dy,,,,,,,
Fl,,,,,,,
Fr,,,,,,,


In [45]:
# An element-wise condition: each individual value of 10 or more becomes NaN
grades.where(grades < 10)

Unnamed: 0,hw_pass,hw_e,lt_pass,lt_e,pp_pass,pp_e,gp
Am,5,3,,,5,5,
Bl,2,1,,,3,3,
Br,4,2,,,4,4,
Co,4,2,,,0,0,
Cr,4,1,,,3,3,
Di,4,2,,,3,3,
Du,4,3,,,4,4,
Dy,0,0,,4.0,1,0,
Fl,5,4,,,5,5,
Fr,0,0,2.0,2.0,0,0,9.0


`mask` is the inverse of `where`:

In [46]:
# mask blanks out the rows where the condition is True and keeps the rest
grades.mask(grades.lt_e < 2)

Unnamed: 0,hw_pass,hw_e,lt_pass,lt_e,pp_pass,pp_e,gp
Am,5.0,3.0,27.0,17.0,5.0,5.0,26.0
Bl,2.0,1.0,21.0,12.0,3.0,3.0,18.0
Br,4.0,2.0,26.0,17.0,4.0,4.0,28.0
Co,4.0,2.0,21.0,14.0,0.0,0.0,24.0
Cr,4.0,1.0,23.0,17.0,3.0,3.0,20.0
Di,4.0,2.0,26.0,20.0,3.0,3.0,28.0
Du,4.0,3.0,22.0,15.0,4.0,4.0,22.0
Dy,0.0,0.0,11.0,4.0,1.0,0.0,21.0
Fl,5.0,4.0,28.0,24.0,5.0,5.0,24.0
Fr,0.0,0.0,2.0,2.0,0.0,0.0,9.0


## Index objects

Indexes are actual objects in a class `Index`. They behave like multisets (duplicates allowed). 

In [49]:
# Index objects can be built directly and handed to the DataFrame constructor.
my_index = pd.Index(['a','b','c','d','f'])
# Print the type: a bare expression in the middle of a cell is evaluated and
# discarded, so it would never be displayed.
print(type(my_index))

columns = pd.Index(['Homework', 'Tests', 'Final Exam'])

# Seeded values keep the frame identical across re-runs; the seed value is arbitrary.
# NOTE: this rebinds `df`, replacing the date-indexed frame created earlier.
df = pd.DataFrame(np.random.RandomState(225).randn(5,3), index = my_index, columns = columns)

df

Unnamed: 0,Homework,Tests,Final Exam
a,-1.574986,0.150297,0.68608
b,0.315615,-1.664547,-1.757779
c,1.173149,-0.941015,0.057408
d,0.043962,-1.173853,-0.350653
f,-0.912411,1.003285,0.228033


## Set/reset index

In [50]:
# Look at the current frame: the student initials are the index
grades.head()

Unnamed: 0,hw_pass,hw_e,lt_pass,lt_e,pp_pass,pp_e,gp
Am,5,3,27,17,5,5,26
Bl,2,1,21,12,3,3,18
Br,4,2,26,17,4,4,28
Co,4,2,21,14,0,0,24
Cr,4,1,23,17,3,3,20
