# Index Alignment

In [3]:
import pandas as pd
import numpy as np
# Use fully qualified option names: bare patterns such as 'max_rows' and
# 'max_columns' also match the 'styler.render.*' options in newer pandas
# releases, which makes set_option raise OptionError for an ambiguous key.
pd.set_option('display.max_columns', 4,
              'display.max_rows', 10,
              'display.max_colwidth', 12)

## Introduction

## Examining the Index object

### How to do it...

In [12]:
college = pd.read_csv('data/college.csv')
columns = college.columns
columns

Index(['INSTNM', 'CITY', 'STABBR', 'HBCU', 'MENONLY', 'WOMENONLY', 'RELAFFIL',
       'SATVRMID', 'SATMTMID', 'DISTANCEONLY', 'UGDS', 'UGDS_WHITE',
       'UGDS_BLACK', 'UGDS_HISP', 'UGDS_ASIAN', 'UGDS_AIAN', 'UGDS_NHPI',
       'UGDS_2MOR', 'UGDS_NRA', 'UGDS_UNKN', 'PPTUG_EF', 'CURROPER', 'PCTPELL',
       'PCTFLOAN', 'UG25ABV', 'MD_EARN_WNE_P10', 'GRAD_DEBT_MDN_SUPP'],
      dtype='object')

In [13]:
columns.values

array(['INSTNM', 'CITY', 'STABBR', 'HBCU', 'MENONLY', 'WOMENONLY',
       'RELAFFIL', 'SATVRMID', 'SATMTMID', 'DISTANCEONLY', 'UGDS',
       'UGDS_WHITE', 'UGDS_BLACK', 'UGDS_HISP', 'UGDS_ASIAN', 'UGDS_AIAN',
       'UGDS_NHPI', 'UGDS_2MOR', 'UGDS_NRA', 'UGDS_UNKN', 'PPTUG_EF',
       'CURROPER', 'PCTPELL', 'PCTFLOAN', 'UG25ABV', 'MD_EARN_WNE_P10',
       'GRAD_DEBT_MDN_SUPP'], dtype=object)

In [14]:
columns[5]

'WOMENONLY'

In [15]:
columns[[1,8,10]]

Index(['CITY', 'SATMTMID', 'UGDS'], dtype='object')

In [16]:
columns[-7:-4]

Index(['PPTUG_EF', 'CURROPER', 'PCTPELL'], dtype='object')

In [17]:
columns.min(), columns.max(), columns.isnull().sum()

('CITY', 'WOMENONLY', 0)

In [18]:
columns + '_A'

Index(['INSTNM_A', 'CITY_A', 'STABBR_A', 'HBCU_A', 'MENONLY_A', 'WOMENONLY_A',
       'RELAFFIL_A', 'SATVRMID_A', 'SATMTMID_A', 'DISTANCEONLY_A', 'UGDS_A',
       'UGDS_WHITE_A', 'UGDS_BLACK_A', 'UGDS_HISP_A', 'UGDS_ASIAN_A',
       'UGDS_AIAN_A', 'UGDS_NHPI_A', 'UGDS_2MOR_A', 'UGDS_NRA_A',
       'UGDS_UNKN_A', 'PPTUG_EF_A', 'CURROPER_A', 'PCTPELL_A', 'PCTFLOAN_A',
       'UG25ABV_A', 'MD_EARN_WNE_P10_A', 'GRAD_DEBT_MDN_SUPP_A'],
      dtype='object')

In [19]:
columns > 'G'

array([ True, False,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True])

In [20]:
# Index objects are immutable, so item assignment raises TypeError.
# The error is the point of this cell; catch and print it so that
# "Restart & Run All" can continue past the demonstration.
try:
    columns[1] = 'city'
except TypeError as err:
    print(f'TypeError: {err}')

TypeError: Index does not support mutable operations

### How it works...

### There's more...

In [61]:
c1 = columns[:4]
c1

Index(['INSTNM', 'CITY', 'STABBR', 'HBCU'], dtype='object')

In [62]:
c2 = columns[2:6]
c2

Index(['STABBR', 'HBCU', 'MENONLY', 'WOMENONLY'], dtype='object')

In [63]:
# NOTE(review): `c1 | c2` used to be an alias for union, but operator-based
# Index set operations are deprecated/removed in recent pandas — confirm
# against the installed version and prefer the explicit method.
c1.union(c2)

Index(['CITY', 'HBCU', 'INSTNM', 'MENONLY', 'STABBR', 'WOMENONLY'], dtype='object')

In [64]:
# NOTE(review): `c1 ^ c2` used to be an alias for symmetric_difference, but
# operator-based Index set operations are deprecated/removed in recent
# pandas — confirm against the installed version and prefer the method.
c1.symmetric_difference(c2)

Index(['CITY', 'INSTNM', 'MENONLY', 'WOMENONLY'], dtype='object')

## Producing Cartesian products

### How to do it...

In [65]:
s1 = pd.Series(index=list('aaab'), data=np.arange(4))
s1

a    0
a    1
a    2
b    3
dtype: int64

In [66]:
s2 = pd.Series(index=list('cababb'), data=np.arange(6))
s2

c    0
a    1
b    2
a    3
b    4
b    5
dtype: int64

In [67]:
s1 + s2

a    1.0
a    3.0
a    2.0
a    4.0
a    3.0
a    5.0
b    5.0
b    7.0
b    8.0
c    NaN
dtype: float64

### How it works...

### There's more...

In [68]:
s1 = pd.Series(index=list('aaabb'), data=np.arange(5))
s2 = pd.Series(index=list('aaabb'), data=np.arange(5))
s1 + s2

a    0
a    2
a    4
b    6
b    8
dtype: int64

In [69]:
s1 = pd.Series(index=list('aaabb'), data=np.arange(5))
s2 = pd.Series(index=list('bbaaa'), data=np.arange(5))
s1 + s2

a    2
a    3
a    4
a    3
a    4
    ..
a    6
b    3
b    4
b    4
b    5
Length: 13, dtype: int64

In [70]:
s3 = pd.Series(index=list('ab'), data=np.arange(2))
s4 = pd.Series(index=list('ba'), data=np.arange(2))
s3 + s4

a    1
b    1
dtype: int64

## Exploding indexes

### How to do it...

In [71]:
employee = pd.read_csv('data/employee.csv', index_col='RACE')
employee.head()

Unnamed: 0_level_0,UNIQUE_ID,POSITION_TITLE,...,HIRE_DATE,JOB_DATE
RACE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Hispanic/Latino,0,ASSISTAN...,...,2006-06-12,2012-10-13
Hispanic/Latino,1,LIBRARY ...,...,2000-07-19,2010-09-18
White,2,POLICE O...,...,2015-02-03,2015-02-03
White,3,ENGINEER...,...,1982-02-08,1991-05-25
White,4,ELECTRICIAN,...,1989-06-19,1994-10-22


In [72]:
salary1 = employee['BASE_SALARY']
salary2 = employee['BASE_SALARY']
salary1 is salary2

True

In [73]:
salary2 = employee['BASE_SALARY'].copy()
salary1 is salary2

False

In [74]:
salary1 = salary1.sort_index()
salary1.head()

RACE
American Indian or Alaskan Native    78355.0
American Indian or Alaskan Native    26125.0
American Indian or Alaskan Native    98536.0
American Indian or Alaskan Native        NaN
American Indian or Alaskan Native    55461.0
Name: BASE_SALARY, dtype: float64

In [75]:
salary2.head()

RACE
Hispanic/Latino    121862.0
Hispanic/Latino     26125.0
White               45279.0
White               63166.0
White               56347.0
Name: BASE_SALARY, dtype: float64

In [76]:
salary_add = salary1 + salary2

In [77]:
salary_add.head()

RACE
American Indian or Alaskan Native    138702.0
American Indian or Alaskan Native    156710.0
American Indian or Alaskan Native    176891.0
American Indian or Alaskan Native    159594.0
American Indian or Alaskan Native    127734.0
Name: BASE_SALARY, dtype: float64

In [78]:
salary_add1 = salary1 + salary1
len(salary1), len(salary2), len(salary_add), len(salary_add1)

(2000, 2000, 1175424, 2000)

### How it works...

### There's more...

In [79]:
index_vc = salary1.index.value_counts(dropna=False)
index_vc

Black or African American            700
White                                665
Hispanic/Latino                      480
Asian/Pacific Islander               107
NaN                                   35
American Indian or Alaskan Native     11
Others                                 2
Name: RACE, dtype: int64

In [80]:
index_vc.pow(2).sum()

1175424

## Filling values with unequal indexes

In [4]:
baseball_14 = pd.read_csv('data/baseball14.csv',
   index_col='playerID')
baseball_15 = pd.read_csv('data/baseball15.csv',
   index_col='playerID')
baseball_16 = pd.read_csv('data/baseball16.csv',
   index_col='playerID')
baseball_14.head()

Unnamed: 0_level_0,yearID,stint,...,SF,GIDP
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
altuvjo01,2014,1,...,5.0,20.0
cartech02,2014,1,...,4.0,12.0
castrja01,2014,1,...,3.0,11.0
corpoca01,2014,1,...,2.0,3.0
dominma01,2014,1,...,7.0,23.0


In [82]:
baseball_14.index.difference(baseball_15.index)

Index(['corpoca01', 'dominma01', 'fowlede01', 'grossro01', 'guzmaje01',
       'hoeslj01', 'krausma01', 'preslal01', 'singljo02'],
      dtype='object', name='playerID')

In [83]:
baseball_14.index.difference(baseball_16.index)

Index(['cartech02', 'corpoca01', 'dominma01', 'fowlede01', 'grossro01',
       'guzmaje01', 'hoeslj01', 'krausma01', 'preslal01', 'singljo02',
       'villajo01'],
      dtype='object', name='playerID')

In [84]:
hits_14 = baseball_14['H']
hits_15 = baseball_15['H']
hits_16 = baseball_16['H']
hits_14.head()

playerID
altuvjo01    225
cartech02    115
castrja01    103
corpoca01     40
dominma01    121
Name: H, dtype: int64

In [85]:
(hits_14 + hits_15).head()

playerID
altuvjo01    425.0
cartech02    193.0
castrja01    174.0
congeha01      NaN
corpoca01      NaN
Name: H, dtype: float64

In [86]:
hits_14.add(hits_15, fill_value=0).head()

playerID
altuvjo01    425.0
cartech02    193.0
castrja01    174.0
congeha01     46.0
corpoca01     40.0
Name: H, dtype: float64

In [87]:
hits_total = (hits_14
   .add(hits_15, fill_value=0)
   .add(hits_16, fill_value=0)
)
hits_total.head()

playerID
altuvjo01    641.0
bregmal01     53.0
cartech02    193.0
castrja01    243.0
congeha01     46.0
Name: H, dtype: float64

In [88]:
hits_total.hasnans

False

### How it works...

In [89]:
s = pd.Series(index=['a', 'b', 'c', 'd'],
              data=[np.nan, 3, np.nan, 1])
s

a    NaN
b    3.0
c    NaN
d    1.0
dtype: float64

In [90]:
s1 = pd.Series(index=['a', 'b', 'c'], data=[np.nan, 6, 10])
s1

a     NaN
b     6.0
c    10.0
dtype: float64

In [91]:
s.add(s1, fill_value=5)

a     NaN
b     9.0
c    15.0
d     6.0
dtype: float64

### There's more...

In [5]:
df_14 = baseball_14[['G','AB', 'R', 'H']]
df_14.head()

Unnamed: 0_level_0,G,AB,R,H
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
altuvjo01,158,660,85,225
cartech02,145,507,68,115
castrja01,126,465,43,103
corpoca01,55,170,22,40
dominma01,157,564,51,121


In [6]:
df_15 = baseball_15[['AB', 'R', 'H', 'HR']]
df_15.head()

Unnamed: 0_level_0,AB,R,H,HR
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
altuvjo01,638,86,200,15
cartech02,391,50,78,24
castrja01,337,38,71,11
congeha01,201,25,46,11
correca01,387,52,108,22


In [7]:
(df_14 + df_15).head(10).style.highlight_null('yellow')


Unnamed: 0_level_0,AB,G,H,HR,R
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
altuvjo01,1298.0,,425.0,,171.0
cartech02,898.0,,193.0,,118.0
castrja01,802.0,,174.0,,81.0
congeha01,,,,,
corpoca01,,,,,
correca01,,,,,
dominma01,,,,,
fowlede01,,,,,
gattiev01,,,,,
gomezca01,,,,,


In [8]:
(df_14
.add(df_15, fill_value=0)
.head(10)
.style.highlight_null('yellow')
)

Unnamed: 0_level_0,AB,G,H,HR,R
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
altuvjo01,1298,158.0,425,15.0,171
cartech02,898,145.0,193,24.0,118
castrja01,802,126.0,174,11.0,81
congeha01,201,,46,11.0,25
corpoca01,170,55.0,40,,22
correca01,387,,108,22.0,52
dominma01,564,157.0,121,,51
fowlede01,434,116.0,120,,61
gattiev01,566,,139,27.0,66
gomezca01,149,,36,4.0,19


## Adding columns from different DataFrames

### How to do it...

In [94]:
employee = pd.read_csv('data/employee.csv')
dept_sal = employee[['DEPARTMENT', 'BASE_SALARY']]

In [95]:
dept_sal = dept_sal.sort_values(['DEPARTMENT', 'BASE_SALARY'],
    ascending=[True, False])

In [96]:
max_dept_sal = dept_sal.drop_duplicates(subset='DEPARTMENT')
max_dept_sal.head()

Unnamed: 0,DEPARTMENT,BASE_SALARY
1494,Admn. & ...,140416.0
149,City Con...,64251.0
236,City Cou...,100000.0
647,Conventi...,38397.0
1500,Dept of ...,89221.0


In [97]:
max_dept_sal = max_dept_sal.set_index('DEPARTMENT')
employee = employee.set_index('DEPARTMENT')

In [98]:
employee = (employee
   .assign(MAX_DEPT_SALARY=max_dept_sal['BASE_SALARY'])
)
employee

Unnamed: 0_level_0,UNIQUE_ID,POSITION_TITLE,...,JOB_DATE,MAX_DEPT_SALARY
DEPARTMENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Municipal Courts Department,0,ASSISTAN...,...,2012-10-13,121862.0
Library,1,LIBRARY ...,...,2010-09-18,107763.0
Houston Police Department-HPD,2,POLICE O...,...,2015-02-03,199596.0
Houston Fire Department (HFD),3,ENGINEER...,...,1991-05-25,210588.0
General Services Department,4,ELECTRICIAN,...,1994-10-22,89194.0
...,...,...,...,...,...
Houston Police Department-HPD,1995,POLICE O...,...,2015-06-09,199596.0
Houston Fire Department (HFD),1996,COMMUNIC...,...,2013-10-06,210588.0
Houston Police Department-HPD,1997,POLICE O...,...,2015-10-13,199596.0
Houston Police Department-HPD,1998,POLICE O...,...,2011-07-02,199596.0


In [99]:
employee.query('BASE_SALARY > MAX_DEPT_SALARY')

Unnamed: 0_level_0,UNIQUE_ID,POSITION_TITLE,...,JOB_DATE,MAX_DEPT_SALARY
DEPARTMENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


In [100]:
employee = pd.read_csv('data/employee.csv')
max_dept_sal = (employee
    [['DEPARTMENT', 'BASE_SALARY']]
    .sort_values(['DEPARTMENT', 'BASE_SALARY'],
        ascending=[True, False])
    .drop_duplicates(subset='DEPARTMENT')
    .set_index('DEPARTMENT')
)

In [101]:
(employee
   .set_index('DEPARTMENT')
   .assign(MAX_DEPT_SALARY=max_dept_sal['BASE_SALARY'])
)

Unnamed: 0_level_0,UNIQUE_ID,POSITION_TITLE,...,JOB_DATE,MAX_DEPT_SALARY
DEPARTMENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Municipal Courts Department,0,ASSISTAN...,...,2012-10-13,121862.0
Library,1,LIBRARY ...,...,2010-09-18,107763.0
Houston Police Department-HPD,2,POLICE O...,...,2015-02-03,199596.0
Houston Fire Department (HFD),3,ENGINEER...,...,1991-05-25,210588.0
General Services Department,4,ELECTRICIAN,...,1994-10-22,89194.0
...,...,...,...,...,...
Houston Police Department-HPD,1995,POLICE O...,...,2015-06-09,199596.0
Houston Fire Department (HFD),1996,COMMUNIC...,...,2013-10-06,210588.0
Houston Police Department-HPD,1997,POLICE O...,...,2015-10-13,199596.0
Houston Police Department-HPD,1998,POLICE O...,...,2011-07-02,199596.0


### How it works...

In [102]:
random_salary = (dept_sal
    .sample(n=10, random_state=42)
    .set_index('DEPARTMENT')
)
random_salary

Unnamed: 0_level_0,BASE_SALARY
DEPARTMENT,Unnamed: 1_level_1
Public Works & Engineering-PWE,34861.0
Houston Airport System (HAS),29286.0
Houston Police Department-HPD,31907.0
Houston Police Department-HPD,66614.0
Houston Police Department-HPD,42000.0
Houston Police Department-HPD,43443.0
Houston Police Department-HPD,66614.0
Public Works & Engineering-PWE,52582.0
Finance,93168.0
Houston Police Department-HPD,35318.0


In [103]:
# random_salary has repeated DEPARTMENT labels in its index, so pandas cannot
# align it to employee's index and raises ValueError. The error is the point
# of this cell; catch and print it so "Restart & Run All" can continue.
try:
    employee['RANDOM_SALARY'] = random_salary['BASE_SALARY']
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: cannot reindex from a duplicate axis

### There's more...

In [104]:
(employee
    .set_index('DEPARTMENT')
    .assign(MAX_SALARY2=max_dept_sal['BASE_SALARY'].head(3))
    .MAX_SALARY2
    .value_counts()
)

140416.0    29
100000.0    11
64251.0      5
Name: MAX_SALARY2, dtype: int64

In [105]:
max_sal = (employee
    .groupby('DEPARTMENT')
    .BASE_SALARY
    .transform('max')
)

In [106]:
(employee
    .assign(MAX_DEPT_SALARY=max_sal)
)

Unnamed: 0,UNIQUE_ID,POSITION_TITLE,...,JOB_DATE,MAX_DEPT_SALARY
0,0,ASSISTAN...,...,2012-10-13,121862.0
1,1,LIBRARY ...,...,2010-09-18,107763.0
2,2,POLICE O...,...,2015-02-03,199596.0
3,3,ENGINEER...,...,1991-05-25,210588.0
4,4,ELECTRICIAN,...,1994-10-22,89194.0
...,...,...,...,...,...
1995,1995,POLICE O...,...,2015-06-09,199596.0
1996,1996,COMMUNIC...,...,2013-10-06,210588.0
1997,1997,POLICE O...,...,2015-10-13,199596.0
1998,1998,POLICE O...,...,2011-07-02,199596.0


In [107]:
max_sal = (employee
    .groupby('DEPARTMENT')
    .BASE_SALARY
    .max()
)

In [108]:
(employee
    .merge(max_sal.rename('MAX_DEPT_SALARY'),
           how='left', left_on='DEPARTMENT',
           right_index=True)
)

Unnamed: 0,UNIQUE_ID,POSITION_TITLE,...,JOB_DATE,MAX_DEPT_SALARY
0,0,ASSISTAN...,...,2012-10-13,121862.0
1,1,LIBRARY ...,...,2010-09-18,107763.0
2,2,POLICE O...,...,2015-02-03,199596.0
3,3,ENGINEER...,...,1991-05-25,210588.0
4,4,ELECTRICIAN,...,1994-10-22,89194.0
...,...,...,...,...,...
1995,1995,POLICE O...,...,2015-06-09,199596.0
1996,1996,COMMUNIC...,...,2013-10-06,210588.0
1997,1997,POLICE O...,...,2015-10-13,199596.0
1998,1998,POLICE O...,...,2011-07-02,199596.0


## Highlighting the maximum value from each column

### How to do it...

In [9]:
college = pd.read_csv('data/college.csv', index_col='INSTNM')
college.dtypes

CITY                   object
STABBR                 object
HBCU                  float64
MENONLY               float64
WOMENONLY             float64
                       ...   
PCTPELL               float64
PCTFLOAN              float64
UG25ABV               float64
MD_EARN_WNE_P10        object
GRAD_DEBT_MDN_SUPP     object
Length: 26, dtype: object

In [110]:
college.MD_EARN_WNE_P10.sample(10, random_state=42)

INSTNM
Career Point College                                      20700
Ner Israel Rabbinical College                       PrivacyS...
Reflections Academy of Beauty                               NaN
Capital Area Technical College                            26400
West Virginia University Institute of Technology          43400
Mid-State Technical College                               32000
Strayer University-Huntsville Campus                      49200
National Aviation Academy of Tampa Bay                    45000
University of California-Santa Cruz                       43000
Lexington Theological Seminary                              NaN
Name: MD_EARN_WNE_P10, dtype: object

In [111]:
college.GRAD_DEBT_MDN_SUPP.sample(10, random_state=42)

INSTNM
Career Point College                                      14977
Ner Israel Rabbinical College                       PrivacyS...
Reflections Academy of Beauty                       PrivacyS...
Capital Area Technical College                      PrivacyS...
West Virginia University Institute of Technology          23969
Mid-State Technical College                                8025
Strayer University-Huntsville Campus                    36173.5
National Aviation Academy of Tampa Bay                    22778
University of California-Santa Cruz                       19884
Lexington Theological Seminary                      PrivacyS...
Name: GRAD_DEBT_MDN_SUPP, dtype: object

In [112]:
college.MD_EARN_WNE_P10.value_counts()

PrivacySuppressed    822
38800                151
21500                 97
49200                 78
27400                 46
                    ... 
68200                  1
51300                  1
54700                  1
83800                  1
54400                  1
Name: MD_EARN_WNE_P10, Length: 598, dtype: int64

In [113]:
set(college.MD_EARN_WNE_P10.apply(type))

{float, str}

In [114]:
college.GRAD_DEBT_MDN_SUPP.value_counts()

PrivacySuppressed    1510
9500                  514
27000                 306
25827.5               136
25000                 124
                     ... 
23185                   1
12908.5                 1
10100                   1
6200                    1
12112                   1
Name: GRAD_DEBT_MDN_SUPP, Length: 2038, dtype: int64

In [115]:
# Both columns were read as object dtype because they mix numeric values with
# the string 'PrivacySuppressed'. errors='coerce' converts anything that
# cannot be parsed as a number to NaN, so the columns become float64.
cols = ['MD_EARN_WNE_P10', 'GRAD_DEBT_MDN_SUPP']
for col in cols:
    college[col] = pd.to_numeric(college[col], errors='coerce')

In [116]:
college.dtypes.loc[cols]

MD_EARN_WNE_P10       float64
GRAD_DEBT_MDN_SUPP    float64
dtype: object

In [11]:
college_n = college.select_dtypes('number')
college_n.head()

Unnamed: 0_level_0,HBCU,MENONLY,...,PCTFLOAN,UG25ABV
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama A & M University,1.0,0.0,...,0.8284,0.1049
University of Alabama at Birmingham,0.0,0.0,...,0.5214,0.2422
Amridge University,0.0,0.0,...,0.7795,0.854
University of Alabama in Huntsville,0.0,0.0,...,0.4596,0.264
Alabama State University,1.0,0.0,...,0.7554,0.127


In [13]:
binary_only = college_n.nunique() == 2
binary_only.head()

HBCU          True
MENONLY       True
WOMENONLY     True
RELAFFIL      True
SATVRMID     False
dtype: bool

In [14]:
binary_cols = binary_only[binary_only].index.tolist()
binary_cols

['HBCU', 'MENONLY', 'WOMENONLY', 'RELAFFIL', 'DISTANCEONLY', 'CURROPER']

In [15]:
college_n2 = college_n.drop(columns=binary_cols)
college_n2.head()

Unnamed: 0_level_0,SATVRMID,SATMTMID,...,PCTFLOAN,UG25ABV
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama A & M University,424.0,420.0,...,0.8284,0.1049
University of Alabama at Birmingham,570.0,565.0,...,0.5214,0.2422
Amridge University,,,...,0.7795,0.854
University of Alabama in Huntsville,595.0,590.0,...,0.4596,0.264
Alabama State University,425.0,430.0,...,0.7554,0.127


In [16]:
max_cols = college_n2.idxmax()
max_cols

SATVRMID      Californ...
SATMTMID      Californ...
UGDS          Universi...
UGDS_WHITE    Mr Leon'...
UGDS_BLACK    Velvatex...
                 ...     
UGDS_UNKN     Le Cordo...
PPTUG_EF      Thunderb...
PCTPELL       MTI Busi...
PCTFLOAN      ABC Beau...
UG25ABV       Dongguk ...
Length: 16, dtype: object

In [17]:
unique_max_cols = max_cols.unique()
unique_max_cols[:5]

array(['California Institute of Technology',
       'University of Phoenix-Arizona',
       "Mr Leon's School of Hair Design-Moscow",
       'Velvatex College of Beauty Culture',
       'Thunderbird School of Global Management'], dtype=object)

In [123]:
college_n2.loc[unique_max_cols]  # rows holding at least one column maximum, shown unstyled

Unnamed: 0_level_0,SATVRMID,SATMTMID,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
California Institute of Technology,765.0,785.0,...,77800.0,11812.5
University of Phoenix-Arizona,,,...,,33000.0
Mr Leon's School of Hair Design-Moscow,,,...,,15710.0
Velvatex College of Beauty Culture,,,...,,
Thunderbird School of Global Management,,,...,118900.0,
...,...,...,...,...,...
MTI Business College Inc,,,...,23000.0,9500.0
ABC Beauty College Inc,,,...,,16500.0
Dongguk University-Los Angeles,,,...,,
Medical College of Wisconsin,,,...,233100.0,


In [18]:
college_n2.loc[unique_max_cols].style.highlight_max()

Unnamed: 0_level_0,SATVRMID,SATMTMID,UGDS,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,PCTPELL,PCTFLOAN,UG25ABV
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
California Institute of Technology,765.0,785.0,983,0.2787,0.0153,0.1221,0.4385,0.001,0.0,0.057,0.0875,0.0,0.0,0.1126,0.2303,0.0082
University of Phoenix-Arizona,,,151558,0.3098,0.1555,0.076,0.0082,0.0042,0.005,0.1131,0.0131,0.3152,0.0,0.6009,0.592,
Mr Leon's School of Hair Design-Moscow,,,16,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.625,0.625,0.2
Velvatex College of Beauty Culture,,,25,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.7692,0.0,0.52
Thunderbird School of Global Management,,,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
Cosmopolitan Beauty and Tech School,,,110,0.0091,0.0,0.0182,0.9727,0.0,0.0,0.0,0.0,0.0,0.3182,0.7761,0.1244,0.9545
Haskell Indian Nations University,430.0,440.0,805,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0224,0.8396,0.0,0.2089
Palau Community College,,,602,0.0,0.0017,0.0,0.0,0.0,0.9983,0.0,0.0,0.0,0.3887,0.856,0.0,0.2616
LIU Brentwood,,,15,0.0,0.1333,0.2667,0.0,0.0,0.0,0.5333,0.0,0.0667,0.4,0.5652,0.7826,0.7826
California University of Management and Sciences,,,98,0.0102,0.0204,0.0,0.0408,0.0,0.0,0.0,0.9286,0.0,0.0,0.0926,0.0556,0.6852


In [124]:
def remove_binary_cols(df):
    """Return a copy of ``df`` without its two-valued columns.

    A column is dropped when ``df.nunique()`` (called with its defaults)
    reports exactly 2 distinct values for it. ``df`` itself is not modified.
    """
    distinct_counts = df.nunique()
    two_valued = distinct_counts[distinct_counts == 2].index.tolist()
    return df.drop(columns=two_valued)

In [125]:
def select_rows_with_max_cols(df):
    """Return the rows of ``df`` whose label is the ``idxmax`` of some column.

    Labels are taken from ``df.idxmax()`` and de-duplicated with ``unique()``
    before being passed to ``df.loc``, so a row that holds the maximum of
    several columns is selected only once.
    """
    return df.loc[df.idxmax().unique()]

In [126]:
(college
   .assign(
       MD_EARN_WNE_P10=pd.to_numeric(college.MD_EARN_WNE_P10, errors='coerce'),
       GRAD_DEBT_MDN_SUPP=pd.to_numeric(college.GRAD_DEBT_MDN_SUPP, errors='coerce'))
   .select_dtypes('number')
   .pipe(remove_binary_cols)
   .pipe(select_rows_with_max_cols)
)

Unnamed: 0_level_0,SATVRMID,SATMTMID,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
California Institute of Technology,765.0,785.0,...,77800.0,11812.5
University of Phoenix-Arizona,,,...,,33000.0
Mr Leon's School of Hair Design-Moscow,,,...,,15710.0
Velvatex College of Beauty Culture,,,...,,
Thunderbird School of Global Management,,,...,118900.0,
...,...,...,...,...,...
MTI Business College Inc,,,...,23000.0,9500.0
ABC Beauty College Inc,,,...,,16500.0
Dongguk University-Los Angeles,,,...,,
Medical College of Wisconsin,,,...,233100.0,


### How it works...

### There's more...

In [19]:
college = pd.read_csv('data/college.csv', index_col='INSTNM')
college_ugds = college.filter(like='UGDS_').head()

In [20]:
college_ugds.style.highlight_max(axis='columns')

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


## Replicating idxmax with method chaining

### How to do it...

In [128]:
def remove_binary_cols(df):
    """Drop every column of ``df`` that has exactly two distinct values.

    Distinct values are counted with ``df.nunique()`` using its defaults.
    A new DataFrame is returned; ``df`` is left untouched.
    """
    is_binary = df.nunique().eq(2)
    to_drop = is_binary.index[is_binary.to_numpy()].tolist()
    return df.drop(columns=to_drop)

In [129]:
college_n = (college
   .assign(
       MD_EARN_WNE_P10=pd.to_numeric(college.MD_EARN_WNE_P10, errors='coerce'),
       GRAD_DEBT_MDN_SUPP=pd.to_numeric(college.GRAD_DEBT_MDN_SUPP, errors='coerce'))
   .select_dtypes('number')
   .pipe(remove_binary_cols)
)

In [130]:
college_n.max().head()

SATVRMID         765.0
SATMTMID         785.0
UGDS          151558.0
UGDS_WHITE         1.0
UGDS_BLACK         1.0
dtype: float64

In [131]:
college_n.eq(college_n.max()).head()

Unnamed: 0_level_0,SATVRMID,SATMTMID,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama A & M University,False,False,...,False,False
University of Alabama at Birmingham,False,False,...,False,False
Amridge University,False,False,...,False,False
University of Alabama in Huntsville,False,False,...,False,False
Alabama State University,False,False,...,False,False


In [132]:
has_row_max = (college_n
    .eq(college_n.max())
    .any(axis='columns')
)
has_row_max.head()

INSTNM
Alabama A & M University               False
University of Alabama at Birmingham    False
Amridge University                     False
University of Alabama in Huntsville    False
Alabama State University               False
dtype: bool

In [133]:
college_n.shape

(7535, 18)

In [134]:
has_row_max.sum()

401

In [135]:
college_n.eq(college_n.max()).cumsum()

Unnamed: 0_level_0,SATVRMID,SATMTMID,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama A & M University,0,0,...,0,0
University of Alabama at Birmingham,0,0,...,0,0
Amridge University,0,0,...,0,0
University of Alabama in Huntsville,0,0,...,0,0
Alabama State University,0,0,...,0,0
...,...,...,...,...,...
SAE Institute of Technology San Francisco,1,1,...,1,2
Rasmussen College - Overland Park,1,1,...,1,2
National Personal Training Institute of Cleveland,1,1,...,1,2
Bay Area Medical Academy - San Jose Satellite Location,1,1,...,1,2


In [136]:
(college_n
    .eq(college_n.max())
    .cumsum()
    .cumsum()
)

Unnamed: 0_level_0,SATVRMID,SATMTMID,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama A & M University,0,0,...,0,0
University of Alabama at Birmingham,0,0,...,0,0
Amridge University,0,0,...,0,0
University of Alabama in Huntsville,0,0,...,0,0
Alabama State University,0,0,...,0,0
...,...,...,...,...,...
SAE Institute of Technology San Francisco,7305,7305,...,3445,10266
Rasmussen College - Overland Park,7306,7306,...,3446,10268
National Personal Training Institute of Cleveland,7307,7307,...,3447,10270
Bay Area Medical Academy - San Jose Satellite Location,7308,7308,...,3448,10272


In [137]:
has_row_max2 = (college_n
    .eq(college_n.max()) 
    .cumsum() 
    .cumsum() 
    .eq(1) 
    .any(axis='columns')
)

In [138]:
has_row_max2.head()

INSTNM
Alabama A & M University               False
University of Alabama at Birmingham    False
Amridge University                     False
University of Alabama in Huntsville    False
Alabama State University               False
dtype: bool

In [139]:
has_row_max2.sum()

16

In [140]:
idxmax_cols = has_row_max2[has_row_max2].index
idxmax_cols

Index(['Thunderbird School of Global Management',
       'Southwest University of Visual Arts-Tucson', 'ABC Beauty College Inc',
       'Velvatex College of Beauty Culture',
       'California Institute of Technology',
       'Le Cordon Bleu College of Culinary Arts-San Francisco',
       'MTI Business College Inc', 'Dongguk University-Los Angeles',
       'Mr Leon's School of Hair Design-Moscow',
       'Haskell Indian Nations University', 'LIU Brentwood',
       'Medical College of Wisconsin', 'Palau Community College',
       'California University of Management and Sciences',
       'Cosmopolitan Beauty and Tech School', 'University of Phoenix-Arizona'],
      dtype='object', name='INSTNM')

In [141]:
set(college_n.idxmax().unique()) == set(idxmax_cols)

True

In [142]:
def idx_max(df):
    """Return the index labels of rows holding the first maximum of a column.

    Equivalent in content (not in order) to the unique values of
    ``df.idxmax()``: labels come back in ``df``'s row order.
    """
    # True wherever a cell equals its column's maximum.
    is_col_max = df.eq(df.max())
    # After two cumulative sums, only the first True in each column equals 1:
    # rows before it are 0 and every later row is already greater than 1.
    first_hit = is_col_max.cumsum().cumsum().eq(1)
    row_flags = first_hit.any(axis='columns')
    return row_flags[row_flags].index

In [143]:
idx_max(college_n)

Index(['Thunderbird School of Global Management',
       'Southwest University of Visual Arts-Tucson', 'ABC Beauty College Inc',
       'Velvatex College of Beauty Culture',
       'California Institute of Technology',
       'Le Cordon Bleu College of Culinary Arts-San Francisco',
       'MTI Business College Inc', 'Dongguk University-Los Angeles',
       'Mr Leon's School of Hair Design-Moscow',
       'Haskell Indian Nations University', 'LIU Brentwood',
       'Medical College of Wisconsin', 'Palau Community College',
       'California University of Management and Sciences',
       'Cosmopolitan Beauty and Tech School', 'University of Phoenix-Arizona'],
      dtype='object', name='INSTNM')

### How it works...

### There's more...

In [144]:
def idx_max(df):
    """Fully chained variant: index labels of rows with a first column maximum.

    The double ``cumsum`` leaves the value 1 only at the first occurrence of
    each column's maximum; rows flagged in any column are kept and their
    index is returned in ``df``'s row order.
    """
    return (df
        .eq(df.max())
        .cumsum()
        .cumsum()
        .eq(1)
        .any(axis='columns')
        .loc[lambda flags: flags]  # keep only the rows flagged True
        .index
    )

In [145]:
%timeit college_n.idxmax().values

1.62 ms ± 80.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [146]:
%timeit idx_max(college_n)

29.9 ms ± 11.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Finding the most common maximum of columns

### How to do it...

In [147]:
college = pd.read_csv('data/college.csv', index_col='INSTNM')
college_ugds = college.filter(like='UGDS_')
college_ugds.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,...,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama A & M University,0.0333,0.9353,...,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,...,0.0179,0.01
Amridge University,0.299,0.4192,...,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,...,0.0332,0.035
Alabama State University,0.0158,0.9208,...,0.0243,0.0137


In [148]:
highest_percentage_race = college_ugds.idxmax(axis='columns')
highest_percentage_race.head()

INSTNM
Alabama A & M University               UGDS_BLACK
University of Alabama at Birmingham    UGDS_WHITE
Amridge University                     UGDS_BLACK
University of Alabama in Huntsville    UGDS_WHITE
Alabama State University               UGDS_BLACK
dtype: object

In [149]:
highest_percentage_race.value_counts(normalize=True)

UGDS_WHITE    0.670352
UGDS_BLACK    0.151586
UGDS_HISP     0.129473
UGDS_UNKN     0.023422
UGDS_ASIAN    0.012074
UGDS_AIAN     0.006110
UGDS_NRA      0.004073
UGDS_NHPI     0.001746
UGDS_2MOR     0.001164
dtype: float64

### How it works...

### There's more...

In [150]:
(college_ugds
    [highest_percentage_race == 'UGDS_BLACK']
    .drop(columns='UGDS_BLACK')
    .idxmax(axis='columns')
    .value_counts(normalize=True)
)

UGDS_WHITE    0.661228
UGDS_HISP     0.230326
UGDS_UNKN     0.071977
UGDS_NRA      0.018234
UGDS_ASIAN    0.009597
UGDS_2MOR     0.006718
UGDS_AIAN     0.000960
UGDS_NHPI     0.000960
dtype: float64