# Selecting Subsets of Data 

In [1]:
import pandas as pd
import numpy as np
# Keep rendered output compact. Option names are fully qualified ('display.*')
# because the short forms 'max_columns' / 'max_rows' are pattern-matched and
# also match 'styler.render.*' options in newer pandas, which raises
# OptionError as an ambiguous key. One call per option keeps each setting
# independent.
pd.set_option('display.max_columns', 4)
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_colwidth', 12)

## Introduction

## Selecting Series data

### How to do it...

In [2]:
# Read the college data with institution names (INSTNM) as the index, then
# keep the CITY column as a Series for the selection examples that follow.
college = pd.read_csv(
    'data/college.csv',
    index_col='INSTNM'
)
city = college['CITY']
city

INSTNM
Alabama A & M University                                       Normal
University of Alabama at Birmingham                        Birmingham
Amridge University                                         Montgomery
University of Alabama in Huntsville                        Huntsville
Alabama State University                                   Montgomery
                                                             ...     
SAE Institute of Technology  San Francisco                 Emeryville
Rasmussen College - Overland Park                         Overland...
National Personal Training Institute of Cleveland         Highland...
Bay Area Medical Academy - San Jose Satellite Location       San Jose
Excel Learning Center-San Antonio South                   San Antonio
Name: CITY, Length: 7535, dtype: object

In [3]:
# Plain [] with a single index label returns the scalar stored under it.
city['Alabama A & M University']

'Normal'

In [4]:
# Same lookup through the explicit label-based indexer, .loc.
city.loc['Alabama A & M University']

'Normal'

In [5]:
# Same value again, selected by integer position (first row) with .iloc.
city.iloc[0]

'Normal'

In [6]:
# Passing a list of labels to [] returns a Series rather than a scalar.
city[['Alabama A & M University', 'Alabama State University']]

INSTNM
Alabama A & M University        Normal
Alabama State University    Montgomery
Name: CITY, dtype: object

In [7]:
# Explicit .loc form of the list-of-labels selection; same two rows.
city.loc[['Alabama A & M University', 'Alabama State University']]

INSTNM
Alabama A & M University        Normal
Alabama State University    Montgomery
Name: CITY, dtype: object

In [8]:
# Positions 0 and 4 pick out the same two institutions as the label lists.
city.iloc[[0, 4]]

INSTNM
Alabama A & M University        Normal
Alabama State University    Montgomery
Name: CITY, dtype: object

In [9]:
# A label slice through [] includes the stop label: five rows come back.
city['Alabama A & M University': 'Alabama State University']

INSTNM
Alabama A & M University                   Normal
University of Alabama at Birmingham    Birmingham
Amridge University                     Montgomery
University of Alabama in Huntsville    Huntsville
Alabama State University               Montgomery
Name: CITY, dtype: object

In [10]:
# An integer slice through [] selects by position here (the index holds
# strings) and excludes the stop, giving the first five rows.
city[0:5]

INSTNM
Alabama A & M University                   Normal
University of Alabama at Birmingham    Birmingham
Amridge University                     Montgomery
University of Alabama in Huntsville    Huntsville
Alabama State University               Montgomery
Name: CITY, dtype: object

In [11]:
# Explicit .loc label slice; the stop label is part of the result.
city.loc['Alabama A & M University': 'Alabama State University']

INSTNM
Alabama A & M University                   Normal
University of Alabama at Birmingham    Birmingham
Amridge University                     Montgomery
University of Alabama in Huntsville    Huntsville
Alabama State University               Montgomery
Name: CITY, dtype: object

In [12]:
# Explicit .iloc positional slice; position 5 is excluded.
city.iloc[0:5]

INSTNM
Alabama A & M University                   Normal
University of Alabama at Birmingham    Birmingham
Amridge University                     Montgomery
University of Alabama in Huntsville    Huntsville
Alabama State University               Montgomery
Name: CITY, dtype: object

In [13]:
# Build a boolean Series that is True where the city is one of the two listed
# names, then use it inside [] to keep only the matching rows.
alabama_mask = city.isin(['Birmingham', 'Montgomery'])
city[alabama_mask]

INSTNM
University of Alabama at Birmingham    Birmingham
Amridge University                     Montgomery
Alabama State University               Montgomery
Auburn University at Montgomery        Montgomery
Birmingham Southern College            Birmingham
                                          ...    
Fortis Institute-Birmingham            Birmingham
Hair Academy                           Montgomery
Brown Mackie College-Birmingham        Birmingham
Nunation School of Cosmetology         Birmingham
Troy University-Montgomery Campus      Montgomery
Name: CITY, Length: 26, dtype: object

### How it works...

In [14]:
# Small Series with a shuffled integer index, used in the next cells to
# contrast how [] treats integer slices versus single integers.
s = pd.Series(
    data=[10, 20, 35, 28],
    index=[5, 2, 3, 1],
)
s

5    10
2    20
3    35
1    28
dtype: int64

In [15]:
# An integer slice through [] is positional: all four rows come back even
# though neither 0 nor 4 is a label in this index.
s[0:4]

5    10
2    20
3    35
1    28
dtype: int64

In [16]:
# A single integer through [] is looked up as a label on this integer index:
# label 5 is the first row, whose value is 10.
s[5]

10

In [17]:
# Again a label lookup, not position 1: label 1 is the last row (28).
s[1]

28

### There's more...

In [18]:
# On the DataFrame, .loc with a row label and a column label returns the
# scalar at that cell.
college.loc['Alabama A & M University', 'CITY']

'Normal'

In [19]:
# Positional equivalent: first row, first column (CITY).
college.iloc[0, 0]

'Normal'

In [20]:
# A list of row labels plus a single column label gives a Series of CITY
# values for just those institutions.
college.loc[
    ['Alabama A & M University', 'Alabama State University'],
    'CITY'
]

INSTNM
Alabama A & M University        Normal
Alabama State University    Montgomery
Name: CITY, dtype: object

In [21]:
# Positional equivalent: rows 0 and 4 of column 0.
college.iloc[[0, 4], 0]

INSTNM
Alabama A & M University        Normal
Alabama State University    Montgomery
Name: CITY, dtype: object

In [22]:
# Label slice on the rows (stop label included) combined with one column
# label; the result is a Series of CITY values.
college.loc[
    'Alabama A & M University':'Alabama State University',
    'CITY'
]

INSTNM
Alabama A & M University                   Normal
University of Alabama at Birmingham    Birmingham
Amridge University                     Montgomery
University of Alabama in Huntsville    Huntsville
Alabama State University               Montgomery
Name: CITY, dtype: object

In [23]:
# Positional equivalent: the first five rows of column 0.
college.iloc[0:5, 0]

INSTNM
Alabama A & M University                   Normal
University of Alabama at Birmingham    Birmingham
Amridge University                     Montgomery
University of Alabama in Huntsville    Huntsville
Alabama State University               Montgomery
Name: CITY, dtype: object

In [24]:
# This label slice comes back empty (see the output below) -- presumably
# because the start label sits after the stop label in the index order;
# confirm against the data.
city.loc['Reid State Technical College':'Alabama State University']

Series([], Name: CITY, dtype: object)

## Selecting DataFrame rows

In [25]:
# Reload the data for this recipe with institution names as the index, then
# preview five rows drawn with a fixed seed so the sample is repeatable.
college = pd.read_csv(
    'data/college.csv',
    index_col='INSTNM'
)
college.sample(n=5, random_state=42)

Unnamed: 0_level_0,CITY,STABBR,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Career Point College,San Antonio,TX,...,20700,14977
Ner Israel Rabbinical College,Baltimore,MD,...,PrivacyS...,PrivacyS...
Reflections Academy of Beauty,Decatur,IL,...,,PrivacyS...
Capital Area Technical College,Baton Rouge,LA,...,26400,PrivacyS...
West Virginia University Institute of Technology,Montgomery,WV,...,43400,23969


In [26]:
# A single integer to .iloc returns that row as a Series; the Series name is
# the row's index label.
college.iloc[60]

CITY                  Anchorage
STABBR                       AK
HBCU                          0
MENONLY                       0
WOMENONLY                     0
                        ...    
PCTPELL                  0.2385
PCTFLOAN                 0.2647
UG25ABV                  0.4386
MD_EARN_WNE_P10           42500
GRAD_DEBT_MDN_SUPP      19449.5
Name: University of Alaska Anchorage, Length: 26, dtype: object

In [27]:
# The same row selected by its index label.
college.loc['University of Alaska Anchorage']

CITY                  Anchorage
STABBR                       AK
HBCU                          0
MENONLY                       0
WOMENONLY                     0
                        ...    
PCTPELL                  0.2385
PCTFLOAN                 0.2647
UG25ABV                  0.4386
MD_EARN_WNE_P10           42500
GRAD_DEBT_MDN_SUPP      19449.5
Name: University of Alaska Anchorage, Length: 26, dtype: object

In [28]:
# A list of positions returns a DataFrame, with rows in the order requested.
college.iloc[[60, 99, 3]]

Unnamed: 0_level_0,CITY,STABBR,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
University of Alaska Anchorage,Anchorage,AK,...,42500,19449.5
International Academy of Hair Design,Tempe,AZ,...,22200,10556.0
University of Alabama in Huntsville,Huntsville,AL,...,45500,24097.0


In [29]:
# Label-based version of the previous selection: the three institution names
# are listed in the order the rows should appear.
labels = [
    'University of Alaska Anchorage',
    'International Academy of Hair Design',
    'University of Alabama in Huntsville',
]
college.loc[labels]

Unnamed: 0_level_0,CITY,STABBR,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
University of Alaska Anchorage,Anchorage,AK,...,42500,19449.5
International Academy of Hair Design,Tempe,AZ,...,22200,10556.0
University of Alabama in Huntsville,Huntsville,AL,...,45500,24097.0


In [30]:
# Positional slice: rows 99, 100 and 101 (the stop position is excluded).
college.iloc[99:102]

Unnamed: 0_level_0,CITY,STABBR,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
International Academy of Hair Design,Tempe,AZ,...,22200,10556
GateWay Community College,Phoenix,AZ,...,29800,7283
Mesa Community College,Mesa,AZ,...,35200,8000


In [31]:
# Label slice between two institution names; unlike the positional slice,
# the stop label itself is included in the result.
start, stop = (
    'International Academy of Hair Design',
    'Mesa Community College',
)
college.loc[start:stop]

Unnamed: 0_level_0,CITY,STABBR,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
International Academy of Hair Design,Tempe,AZ,...,22200,10556
GateWay Community College,Phoenix,AZ,...,29800,7283
Mesa Community College,Mesa,AZ,...,35200,8000


### How it works...

### There's more...

In [32]:
# Recover the index labels for a set of integer positions as a plain list.
college.iloc[[60, 99, 3]].index.tolist()

['University of Alaska Anchorage',
 'International Academy of Hair Design',
 'University of Alabama in Huntsville']

## Selecting DataFrame rows and columns simultaneously

### How to do it...

In [33]:
# Reload the data for this recipe, then take the first three rows and the
# first four columns by position.
college = pd.read_csv('data/college.csv', index_col='INSTNM')
college.iloc[:3, :4]

Unnamed: 0_level_0,CITY,STABBR,HBCU,MENONLY
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alabama A & M University,Normal,AL,1.0,0.0
University of Alabama at Birmingham,Birmingham,AL,0.0,0.0
Amridge University,Montgomery,AL,0.0,0.0


In [34]:
# Label slices on both axes; both stop labels are included, so the result
# matches the positional selection above.
college.loc[:'Amridge University', :'MENONLY']

Unnamed: 0_level_0,CITY,STABBR,HBCU,MENONLY
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alabama A & M University,Normal,AL,1.0,0.0
University of Alabama at Birmingham,Birmingham,AL,0.0,0.0
Amridge University,Montgomery,AL,0.0,0.0


In [35]:
# All rows, columns at positions 4 and 6; head() limits the display to the
# first five rows.
college.iloc[:, [4,6]].head()

Unnamed: 0_level_0,WOMENONLY,SATVRMID
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1
Alabama A & M University,0.0,424.0
University of Alabama at Birmingham,0.0,570.0
Amridge University,0.0,
University of Alabama in Huntsville,0.0,595.0
Alabama State University,0.0,425.0


In [36]:
# The same two columns selected by label.
college.loc[:, ['WOMENONLY', 'SATVRMID']].head()

Unnamed: 0_level_0,WOMENONLY,SATVRMID
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1
Alabama A & M University,0.0,424.0
University of Alabama at Birmingham,0.0,570.0
Amridge University,0.0,
University of Alabama in Huntsville,0.0,595.0
Alabama State University,0.0,425.0


In [37]:
# Lists on both axes: rows at positions 100 and 200, columns at 7 and 15.
college.iloc[[100, 200], [7, 15]]

Unnamed: 0_level_0,SATMTMID,UGDS_NHPI
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1
GateWay Community College,,0.0029
American Baptist Seminary of the West,,


In [38]:
# Label-based version of the previous cell: explicit lists of row labels and
# column labels passed together to .loc.
rows, columns = (
    ['GateWay Community College', 'American Baptist Seminary of the West'],
    ['SATMTMID', 'UGDS_NHPI'],
)
college.loc[rows, columns]

Unnamed: 0_level_0,SATMTMID,UGDS_NHPI
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1
GateWay Community College,,0.0029
American Baptist Seminary of the West,,


In [39]:
# Scalar by position: row 5, fourth column counting from the end.
college.iloc[5, -4]

0.401

In [40]:
# The same scalar selected by row label and column label.
college.loc['The University of Alabama', 'PCTFLOAN']

0.401

In [41]:
# Step backwards by 2 from position 90 towards 80 (80 itself is excluded),
# taking the column at position 5.
college.iloc[90:80:-2, 5]

INSTNM
Empire Beauty School-Flagstaff     0
Charles of Italy Beauty College    0
Central Arizona College            0
University of Arizona              0
Arizona State University-Tempe     0
Name: RELAFFIL, dtype: int64

In [42]:
# Label-based version of the reverse-stepping slice above: walk from the
# start label back to the stop label, taking every second row of RELAFFIL.
start, stop = (
    'Empire Beauty School-Flagstaff',
    'Arizona State University-Tempe',
)
college.loc[start:stop:-2, 'RELAFFIL']

INSTNM
Empire Beauty School-Flagstaff     0
Charles of Italy Beauty College    0
Central Arizona College            0
University of Arizona              0
Arizona State University-Tempe     0
Name: RELAFFIL, dtype: int64

### How it works...

### There's more...

## Selecting data with both integers and labels

### How to do it...

In [43]:
# Reload the data for this recipe with institution names as the index.
college = pd.read_csv('data/college.csv', index_col='INSTNM')

In [44]:
# Translate the two column labels into integer positions so they can be used
# with .iloc. One is added to the end position because positional slices
# exclude their stop.
col_start, col_end = (
    college.columns.get_loc('UGDS_WHITE'),
    college.columns.get_loc('UGDS_UNKN') + 1,
)
col_start, col_end

(10, 19)

In [45]:
# First five rows by position, with the column range given by the integer
# positions computed from the column labels.
college.iloc[:5, col_start:col_end]

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,...,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama A & M University,0.0333,0.9353,...,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,...,0.0179,0.01
Amridge University,0.299,0.4192,...,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,...,0.0332,0.035
Alabama State University,0.0158,0.9208,...,0.0243,0.0137


### How it works...

### There's more...

In [46]:
# The opposite translation: turn integer row positions into index labels so
# the whole selection can go through .loc. Position 15 is used for the end
# because label slices include their stop.
row_start, row_end = college.index[10], college.index[15]
college.loc[row_start:row_end, 'UGDS_WHITE':'UGDS_UNKN']

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,...,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Birmingham Southern College,0.7983,0.1102,...,0.0,0.0051
Chattahoochee Valley Community College,0.4661,0.4372,...,0.0,0.0139
Concordia College Alabama,0.028,0.8758,...,0.0466,0.0
South University-Montgomery,0.3046,0.6054,...,0.0019,0.0326
Enterprise State Community College,0.6408,0.2435,...,0.0012,0.0069
James H Faulkner State Community College,0.6979,0.2259,...,0.0007,0.0009


In [47]:
# NOTE(review): .ix is deprecated -- the warning printed below says to use
# .loc for label-based or .iloc for positional indexing. Here it mixes integer
# row positions with column labels in a single call; presumably kept only to
# illustrate the legacy indexer, so do not copy this into new code.
college.ix[10:16, 'UGDS_WHITE':'UGDS_UNKN']

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  retval = getattr(retval, self.name)._getitem_axis(key, axis=i)


Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,...,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Birmingham Southern College,0.7983,0.1102,...,0.0,0.0051
Chattahoochee Valley Community College,0.4661,0.4372,...,0.0,0.0139
Concordia College Alabama,0.028,0.8758,...,0.0466,0.0
South University-Montgomery,0.3046,0.6054,...,0.0019,0.0326
Enterprise State Community College,0.6408,0.2435,...,0.0012,0.0069
James H Faulkner State Community College,0.6979,0.2259,...,0.0007,0.0009


In [48]:
# Chain the two indexers instead: .iloc selects rows 10-15 by position, then
# .loc selects the column range by label.
college.iloc[10:16].loc[:, 'UGDS_WHITE':'UGDS_UNKN']

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,...,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Birmingham Southern College,0.7983,0.1102,...,0.0,0.0051
Chattahoochee Valley Community College,0.4661,0.4372,...,0.0,0.0139
Concordia College Alabama,0.028,0.8758,...,0.0466,0.0
South University-Montgomery,0.3046,0.6054,...,0.0019,0.0326
Enterprise State Community College,0.6408,0.2435,...,0.0012,0.0069
James H Faulkner State Community College,0.6979,0.2259,...,0.0007,0.0009


## Slicing lexicographically

### How to do it...

In [49]:
# Reload the data for this recipe; the index is in file order at this point.
college = pd.read_csv('data/college.csv', index_col='INSTNM')

In [50]:
# Slicing with the partial strings 'Sp' and 'Su' fails on this index, which
# has not been sorted yet: neither string is an actual label, so pandas
# raises KeyError. The error is the point of this cell, so it is caught and
# printed instead of being left to stop a Restart & Run All.
try:
    college.loc['Sp':'Su']
except KeyError as err:
    print(f'KeyError: {err}')

KeyError: 'Sp'

In [51]:
# Sort the rows by index label; the same slice is retried in the next cell.
college = college.sort_index()

In [52]:
# On the sorted index the slice succeeds even though 'Sp' and 'Su' are not
# labels themselves: the displayed rows run from 'Spa Tech ...' through
# 'Styles and Profiles ...'.
college.loc['Sp':'Su']

Unnamed: 0_level_0,CITY,STABBR,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Spa Tech Institute-Ipswich,Ipswich,MA,...,21500,6333
Spa Tech Institute-Plymouth,Plymouth,MA,...,21500,6333
Spa Tech Institute-Westboro,Westboro,MA,...,21500,6333
Spa Tech Institute-Westbrook,Westbrook,ME,...,21500,6333
Spalding University,Louisville,KY,...,41700,25000
...,...,...,...,...,...
Studio Academy of Beauty,Chandler,AZ,...,,6333
Studio Jewelers,New York,NY,...,PrivacyS...,PrivacyS...
Stylemaster College of Hair Design,Longview,WA,...,17000,13320
Styles and Profiles Beauty College,Selmer,TN,...,PrivacyS...,PrivacyS...


### How it works...

### There's more...

In [53]:
# Re-sort the index in descending order and confirm that the index now
# reports itself as monotonically decreasing.
college = college.sort_index(ascending=False)
college.index.is_monotonic_decreasing

True

In [54]:
# With the index in descending order the slice bounds are given high-to-low;
# the displayed rows run from 'Dyersburg ...' down to 'BIR Training Center'.
college.loc['E':'B']

Unnamed: 0_level_0,CITY,STABBR,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Dyersburg State Community College,Dyersburg,TN,...,26800,7475
Dutchess Community College,Poughkee...,NY,...,32500,10250
Dutchess BOCES-Practical Nursing Program,Poughkee...,NY,...,36500,9500
Durham Technical Community College,Durham,NC,...,27200,11069.5
Durham Beauty Academy,Durham,NC,...,PrivacyS...,15332
...,...,...,...,...,...
Bacone College,Muskogee,OK,...,29700,26350
Babson College,Wellesley,MA,...,86700,27000
BJ's Beauty & Barber College,Auburn,WA,...,,PrivacyS...
BIR Training Center,Chicago,IL,...,PrivacyS...,15394
