In [3]:
import pandas as pd

# Load the employee table, order it alphabetically by name, and make the
# name column the row index so rows can be selected by label below.
path = 'datasets/employee_list.parquet'
df = (
    pd.read_parquet(path)
    .sort_values('name', ascending=True)
    .set_index('name')
)
df.head()

Unnamed: 0_level_0,employee_number,company,country,dob,age,department,salary,has_parking_space
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alex Rodriguez,901656,"Hernandez, Cunningham and Clark",Israel,1975-11-08,46,System Architect,132988,True
Allison Hall,120088,Wilson and Sons,Israel,1987-02-04,34,Consulting,113536,False
Amber Sloan,414171,"Hernandez, Cunningham and Clark",Germany,1982-11-08,39,Consulting,100644,False
Amy Mitchell,425959,"Hernandez, Cunningham and Clark",Suriname,1975-09-30,46,System Architect,99776,False
Andrew Hodge,379723,"Hernandez, Cunningham and Clark",India,1983-10-09,38,Consulting,124667,True


### Selecting subsets of a Series
- Remember that when you slice by position, pandas uses the half-open interval.
- The half-open interval includes the first index, but not the end index.
- However, when you slice by label, pandas uses the closed interval and includes both the start and end index. 
- This behavior is inconsistent with Python in general, but is practical for labels.

In [5]:
# Keep the salary column as a Series; it carries the same employee-name index as df
salary = df['salary']

In [6]:
# Pull out a scalar from the Series directly, using an index label (an employee name)
salary['Amber Sloan']

100644

In [7]:
# Pull out the same scalar value by label, using the explicit .loc indexer
salary.loc['Amber Sloan']

100644

In [8]:
# Pull out several values by indexing with a list of labels; the result is a Series
salary[['Amber Sloan', 'Andrew Hodge']]

name
Amber Sloan     100644
Andrew Hodge    124667
Name: salary, dtype: int64

In [9]:
# Pull out the same values by passing the list of labels to .loc
salary.loc[['Amber Sloan', 'Andrew Hodge']]

name
Amber Sloan     100644
Andrew Hodge    124667
Name: salary, dtype: int64

In [17]:
# Pull out several values by using .loc and a label slice.
# Label slices are closed: both 'Amber Sloan' and 'Andrew Hodge' are included.
salary.loc['Amber Sloan':'Andrew Hodge']

name
Amber Sloan     100644
Amy Mitchell     99776
Andrew Hodge    124667
Name: salary, dtype: int64

In [14]:
 # Pull out several values by using .iloc with a list of zero-based positions
 # (positions 2 and 5, i.e. the 3rd and 6th entries)
 salary.iloc[[2, 5]]

name
Amber Sloan       100644
Andrew Johnson    128330
Name: salary, dtype: int64

In [16]:
 # Pull out several values by using .iloc and a positional slice.
 # Positional slices are half-open: positions 2, 3 and 4 are returned, 5 is excluded.
 salary.iloc[2:5]

name
Amber Sloan     100644
Amy Mitchell     99776
Andrew Hodge    124667
Name: salary, dtype: int64

In [19]:
# Use the isin method to build a boolean mask marking the entries whose value
# appears in to_find, then keep only the matching entries
to_find = [100644, 99776, 124667]
salary[salary.isin(to_find)]

name
Amber Sloan     100644
Amy Mitchell     99776
Andrew Hodge    124667
Name: salary, dtype: int64

### Selecting subsets of a DataFrame

In [21]:
# Pull out a scalar value using .loc with a row label and a column label
df.loc['Amber Sloan', 'salary']

100644

In [23]:
# Pull out several values using .iloc: rows at positions 2 and 4, column at
# position 6 (salary). Passing a list of rows returns a Series, not a scalar.
df.iloc[[2, 4], 6]

name
Amber Sloan     100644
Andrew Hodge    124667
Name: salary, dtype: int64

In [24]:
# Pull out several values by using .loc with a list of row labels and one column label
df.loc[['Amber Sloan', 'Andrew Hodge'], 'salary']

name
Amber Sloan     100644
Andrew Hodge    124667
Name: salary, dtype: int64

In [25]:
# Use a label slice to pull out many values with .loc (both end labels are included)
df.loc['Amber Sloan':'Andrew Hodge', 'salary']

name
Amber Sloan     100644
Amy Mitchell     99776
Andrew Hodge    124667
Name: salary, dtype: int64

In [27]:
# Use a positional slice to pull out many values with .iloc
# (rows 2-4 of the column at position 6; the end position 5 is excluded)
df.iloc[2:5, 6]

name
Amber Sloan     100644
Amy Mitchell     99776
Andrew Hodge    124667
Name: salary, dtype: int64

### Selecting DataFrame rows

In [28]:
# Select an entire row at a certain position using .iloc; the row comes back as a Series
df.iloc[2] # positions are zero-based, so this is actually the 3rd row

employee_number                               414171
company              Hernandez, Cunningham and Clark
country                                      Germany
dob                              1982-11-08 00:00:00
age                                               39
department                                Consulting
salary                                        100644
has_parking_space                              False
Name: Amber Sloan, dtype: object

In [31]:
 # Select two disjoint rows (positions 2 and 4) using .iloc
 df.iloc[[2, 4]]

Unnamed: 0_level_0,employee_number,company,country,dob,age,department,salary,has_parking_space
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Amber Sloan,414171,"Hernandez, Cunningham and Clark",Germany,1982-11-08,39,Consulting,100644,False
Andrew Hodge,379723,"Hernandez, Cunningham and Clark",India,1983-10-09,38,Consulting,124667,True


In [32]:
# Select two disjoint rows by label using .loc
df.loc[['Amber Sloan', 'Andrew Hodge']]

Unnamed: 0_level_0,employee_number,company,country,dob,age,department,salary,has_parking_space
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Amber Sloan,414171,"Hernandez, Cunningham and Clark",Germany,1982-11-08,39,Consulting,100644,False
Andrew Hodge,379723,"Hernandez, Cunningham and Clark",India,1983-10-09,38,Consulting,124667,True


In [33]:
# Slice notation with .iloc selects contiguous rows by position (here rows 2, 3 and 4)
df.iloc[2:5]

Unnamed: 0_level_0,employee_number,company,country,dob,age,department,salary,has_parking_space
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Amber Sloan,414171,"Hernandez, Cunningham and Clark",Germany,1982-11-08,39,Consulting,100644,False
Amy Mitchell,425959,"Hernandez, Cunningham and Clark",Suriname,1975-09-30,46,System Architect,99776,False
Andrew Hodge,379723,"Hernandez, Cunningham and Clark",India,1983-10-09,38,Consulting,124667,True


In [34]:
# Slice notation with .loc selects contiguous rows by label; both start and stop are included
start = 'Amber Sloan'
stop = 'Andrew Hodge'
df.loc[start:stop]

Unnamed: 0_level_0,employee_number,company,country,dob,age,department,salary,has_parking_space
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Amber Sloan,414171,"Hernandez, Cunningham and Clark",Germany,1982-11-08,39,Consulting,100644,False
Amy Mitchell,425959,"Hernandez, Cunningham and Clark",Suriname,1975-09-30,46,System Architect,99776,False
Andrew Hodge,379723,"Hernandez, Cunningham and Clark",India,1983-10-09,38,Consulting,124667,True


### Select DataFrame rows and columns simultaneously
One of the keys to selecting rows and columns at the same time is to understand the use of the comma in the brackets. The selection to the left of the comma always selects rows based on the row index. The selection to the right of the comma always selects columns based on the column index.

In [35]:
# Use .iloc to select the first 3 rows and the first 4 columns
# (row selection goes to the left of the comma, column selection to the right)
df.iloc[:3, :4]

Unnamed: 0_level_0,employee_number,company,country,dob
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alex Rodriguez,901656,"Hernandez, Cunningham and Clark",Israel,1975-11-08
Allison Hall,120088,Wilson and Sons,Israel,1987-02-04
Amber Sloan,414171,"Hernandez, Cunningham and Clark",Germany,1982-11-08


In [36]:
# Use .iloc to select all rows and two different columns (positions 4 and 6: age and salary);
# .head() limits the display to the first five rows
df.iloc[:, [4, 6]].head()

Unnamed: 0_level_0,age,salary
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Alex Rodriguez,46,132988
Allison Hall,34,113536
Amber Sloan,39,100644
Amy Mitchell,46,99776
Andrew Hodge,38,124667


In [39]:
# Use .iloc to select disjoint rows (positions 2 and 4) and columns (positions 4 and 6)
df.iloc[[2, 4], [4, 6]]

Unnamed: 0_level_0,age,salary
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Amber Sloan,39,100644
Andrew Hodge,38,124667


In [40]:
# Use .iloc to select a single scalar value (row position 2, column position 6)
df.iloc[2, 6]

100644

In [41]:
# Use .loc to select a slice of rows (both end labels included) and a single column;
# the result is a Series
start = 'Amber Sloan'
stop = 'Andrew Hodge'
df.loc[start:stop, 'salary']

name
Amber Sloan     100644
Amy Mitchell     99776
Andrew Hodge    124667
Name: salary, dtype: int64

### Selecting data with both integers and labels
Sometimes, you want the functionality of both .iloc and .loc, to select data by both position and label. In earlier versions of pandas, .ix was available to select data by both position and label. While this conveniently worked for those specific situations, it was ambiguous by nature and was a source of confusion for many pandas users. The .ix indexer was subsequently deprecated (and removed entirely in pandas 1.0), so it should be avoided.

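Before the .ix deprecation, it was possible to mix both styles in a single call. For example, with a college dataset you could select the first five rows (by position) and the columns from UGDS_WHITE through UGDS_UNKN (by label) using college.ix[:5, 'UGDS_WHITE':'UGDS_UNKN']. This is no longer possible in a single step, because .loc accepts only labels and .iloc accepts only positions. The following recipe shows how to find the integer location of the columns and then use .iloc to complete the selection; here it is applied to the employee data, selecting the first five rows and the columns from age through salary.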

In [44]:
# Translate the column labels into integer positions so that .iloc can be used.
col_start = df.columns.get_loc('age')
# .iloc slices exclude the end position, so add 1 to keep the 'salary' column
col_end = df.columns.get_loc('salary') + 1
# First five rows (by position), columns 'age' through 'salary' (by looked-up position)
df.iloc[:5, col_start: col_end]

Unnamed: 0_level_0,age,department,salary
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alex Rodriguez,46,System Architect,132988
Allison Hall,34,Consulting,113536
Amber Sloan,39,Consulting,100644
Amy Mitchell,46,System Architect,99776
Andrew Hodge,38,Consulting,124667


In [45]:
# The reverse approach: translate row positions into index labels so that .loc can be used.
row_start = df.index[2]
row_end = df.index[5]
# .loc slices include the end label, so the row at position 5 is part of the result
df.loc[row_start:row_end, 'age':'salary']

Unnamed: 0_level_0,age,department,salary
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Amber Sloan,39,Consulting,100644
Amy Mitchell,46,System Architect,99776
Andrew Hodge,38,Consulting,124667
Andrew Johnson,46,System Architect,128330


### Slicing lexicographically

In [51]:
# Confirm the index is sorted in increasing order before slicing it alphabetically.
print(df.index.is_monotonic_increasing)
# On a sorted index the slice bounds need not be existing labels: this selects the
# names that fall alphabetically between 'Am' and 'Ao'. .head() shows the first five.
df.loc['Am':'Ao'].head()

True


Unnamed: 0_level_0,employee_number,company,country,dob,age,department,salary,has_parking_space
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Amber Sloan,414171,"Hernandez, Cunningham and Clark",Germany,1982-11-08,39,Consulting,100644,False
Amy Mitchell,425959,"Hernandez, Cunningham and Clark",Suriname,1975-09-30,46,System Architect,99776,False
Andrew Hodge,379723,"Hernandez, Cunningham and Clark",India,1983-10-09,38,Consulting,124667,True
Andrew Johnson,133141,Wilson and Sons,Japan,1975-05-29,46,System Architect,128330,False
Andrew Rivera,791480,"Hernandez, Cunningham and Clark",Israel,1969-01-22,53,Finance,86820,True
