In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [65]:
movie = pd.read_csv('../../datasets/pandas-cookbook/movie.csv')
college = pd.read_csv('../../datasets/pandas-cookbook/college.csv')
employee = pd.read_csv('../../datasets/pandas-cookbook/employee.csv')
baseball_14 = pd.read_csv('../../datasets/pandas-cookbook/baseball14.csv')
baseball_15 = pd.read_csv('../../datasets/pandas-cookbook/baseball15.csv')
baseball_16 = pd.read_csv('../../datasets/pandas-cookbook/baseball16.csv')

#### 1. Explore the column

In [11]:
columns = college.columns
print(columns, end='\n\n')
print(columns.values)

Index(['INSTNM', 'CITY', 'STABBR', 'HBCU', 'MENONLY', 'WOMENONLY', 'RELAFFIL',
       'SATVRMID', 'SATMTMID', 'DISTANCEONLY', 'UGDS', 'UGDS_WHITE',
       'UGDS_BLACK', 'UGDS_HISP', 'UGDS_ASIAN', 'UGDS_AIAN', 'UGDS_NHPI',
       'UGDS_2MOR', 'UGDS_NRA', 'UGDS_UNKN', 'PPTUG_EF', 'CURROPER', 'PCTPELL',
       'PCTFLOAN', 'UG25ABV', 'MD_EARN_WNE_P10', 'GRAD_DEBT_MDN_SUPP'],
      dtype='object')

['INSTNM' 'CITY' 'STABBR' 'HBCU' 'MENONLY' 'WOMENONLY' 'RELAFFIL'
 'SATVRMID' 'SATMTMID' 'DISTANCEONLY' 'UGDS' 'UGDS_WHITE' 'UGDS_BLACK'
 'UGDS_HISP' 'UGDS_ASIAN' 'UGDS_AIAN' 'UGDS_NHPI' 'UGDS_2MOR' 'UGDS_NRA'
 'UGDS_UNKN' 'PPTUG_EF' 'CURROPER' 'PCTPELL' 'PCTFLOAN' 'UG25ABV'
 'MD_EARN_WNE_P10' 'GRAD_DEBT_MDN_SUPP']


#### 2. Select items from the index by position with a scalar, list, or slice

In [8]:
print(columns[5])
print(columns[[1, 8, 10]])
print(columns[-7:-4])

WOMENONLY
Index(['CITY', 'SATMTMID', 'UGDS'], dtype='object')
Index(['PPTUG_EF', 'CURROPER', 'PCTPELL'], dtype='object')


#### 3. Explore set operations on columns

In [19]:
c1 = columns[:4]
c2 = columns[2:6]

print('c1: ', c1)
print('c2: ', c2)
print('union', c1.union(c2)) # c1 | c2
print('intersection: ', c1.intersection(c2))
print('difference: ', c1.difference(c2))
print('symmetric difference: ', c1.symmetric_difference(c2))


c1:  Index(['INSTNM', 'CITY', 'STABBR', 'HBCU'], dtype='object')
c2:  Index(['STABBR', 'HBCU', 'MENONLY', 'WOMENONLY'], dtype='object')
union Index(['CITY', 'HBCU', 'INSTNM', 'MENONLY', 'STABBR', 'WOMENONLY'], dtype='object')
intersection:  Index(['STABBR', 'HBCU'], dtype='object')
difference:  Index(['CITY', 'INSTNM'], dtype='object')
symmetric difference:  Index(['CITY', 'INSTNM', 'MENONLY', 'WOMENONLY'], dtype='object')


#### 4. Cartesian product
when combining two series or two dataframes it is important that:
1. both series/dataframes contain indices with only unique values

OR

2. that the indices of both series/dataframes are in the same order
<br><br>


If not a Cartesian product is made, which means that the new index
contains all combinations of pairs of both indices. A LOT of extra rows
can mistakenly be added this way. 

When two Series are added together using the plus operator and one of the index labels does not appear in the other, the resulting value is always missing. pandas has the .add method, which provides an option to fill the missing value. Note that these Series do not include duplicate entries, hence there is no need to worry about a Cartesian product exploding the number of entries.

In [38]:
employee = pd.read_csv('../../datasets/pandas-cookbook/employee.csv', index_col='RACE')

salary_1 = employee['BASE_SALARY']
salary_2 = employee['BASE_SALARY'].copy()
salary_1 = salary_1.sort_index()

new_series = salary_1 + salary_2
print(salary_1.size)
print(new_series.size)
print(salary_1.index.value_counts().pow(2).sum())

2000
1175424
1174199


#### 5.Adding/combining series and DataFrames

In [46]:
bball_14 = baseball_14.set_index('playerID')
bball_15 = baseball_15.set_index('playerID')
bball_16 = baseball_16.set_index('playerID')

In [47]:
bball_14.index.difference(bball_15.index)

Index(['corpoca01', 'dominma01', 'fowlede01', 'grossro01', 'guzmaje01',
       'hoeslj01', 'krausma01', 'preslal01', 'singljo02'],
      dtype='object', name='playerID')

In [49]:
# add homeruns from different dataframes
hits_14 = bball_14.H
hits_15 = bball_15.H
hits_16 = bball_16.H

# since there are players who are not present in all dataframes 
# we use the fill_value=0 to avoid them getting a NaN value
hits_total = hits_14.add(hits_15, fill_value=0).add(hits_16, fill_value=0)
hits_total.head()

playerID
altuvjo01    641.0
bregmal01     53.0
cartech02    193.0
castrja01    243.0
congeha01     46.0
Name: H, dtype: float64

#### 6. Employee dataset: find max salary for each department, and then add the corresponding value to each row

In [69]:
# solution 1
max_dept_sal = (
    employee[['DEPARTMENT', 'BASE_SALARY']]
    .sort_values(['DEPARTMENT', 'BASE_SALARY'],
        ascending=[True, False])
    .drop_duplicates(subset='DEPARTMENT')
    .set_index('DEPARTMENT')
)

(
employee
.set_index('DEPARTMENT')
.assign(MAX_DEPT_SALARY=max_dept_sal['BASE_SALARY'])
)

Unnamed: 0_level_0,UNIQUE_ID,POSITION_TITLE,BASE_SALARY,RACE,EMPLOYMENT_TYPE,GENDER,EMPLOYMENT_STATUS,HIRE_DATE,JOB_DATE,MAX_DEPT_SALARY
DEPARTMENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Municipal Courts Department,0,ASSISTANT DIRECTOR (EX LVL),121862.0,Hispanic/Latino,Full Time,Female,Active,2006-06-12,2012-10-13,121862.0
Library,1,LIBRARY ASSISTANT,26125.0,Hispanic/Latino,Full Time,Female,Active,2000-07-19,2010-09-18,107763.0
Houston Police Department-HPD,2,POLICE OFFICER,45279.0,White,Full Time,Male,Active,2015-02-03,2015-02-03,199596.0
Houston Fire Department (HFD),3,ENGINEER/OPERATOR,63166.0,White,Full Time,Male,Active,1982-02-08,1991-05-25,210588.0
General Services Department,4,ELECTRICIAN,56347.0,White,Full Time,Male,Active,1989-06-19,1994-10-22,89194.0
...,...,...,...,...,...,...,...,...,...,...
Houston Police Department-HPD,1995,POLICE OFFICER,43443.0,White,Full Time,Male,Active,2014-06-09,2015-06-09,199596.0
Houston Fire Department (HFD),1996,COMMUNICATIONS CAPTAIN,66523.0,Black or African American,Full Time,Male,Active,2003-09-02,2013-10-06,210588.0
Houston Police Department-HPD,1997,POLICE OFFICER,43443.0,White,Full Time,Male,Active,2014-10-13,2015-10-13,199596.0
Houston Police Department-HPD,1998,POLICE OFFICER,55461.0,Asian/Pacific Islander,Full Time,Male,Active,2009-01-20,2011-07-02,199596.0


In [73]:
# solution 2
max_sal = (
employee
.groupby('DEPARTMENT')
.BASE_SALARY
.transform('max')
)

employee.assign(MAX_DEPT_SALARY=max_sal)

Unnamed: 0,UNIQUE_ID,POSITION_TITLE,DEPARTMENT,BASE_SALARY,RACE,EMPLOYMENT_TYPE,GENDER,EMPLOYMENT_STATUS,HIRE_DATE,JOB_DATE,MAX_DEPT_SALARY
0,0,ASSISTANT DIRECTOR (EX LVL),Municipal Courts Department,121862.0,Hispanic/Latino,Full Time,Female,Active,2006-06-12,2012-10-13,121862.0
1,1,LIBRARY ASSISTANT,Library,26125.0,Hispanic/Latino,Full Time,Female,Active,2000-07-19,2010-09-18,107763.0
2,2,POLICE OFFICER,Houston Police Department-HPD,45279.0,White,Full Time,Male,Active,2015-02-03,2015-02-03,199596.0
3,3,ENGINEER/OPERATOR,Houston Fire Department (HFD),63166.0,White,Full Time,Male,Active,1982-02-08,1991-05-25,210588.0
4,4,ELECTRICIAN,General Services Department,56347.0,White,Full Time,Male,Active,1989-06-19,1994-10-22,89194.0
...,...,...,...,...,...,...,...,...,...,...,...
1995,1995,POLICE OFFICER,Houston Police Department-HPD,43443.0,White,Full Time,Male,Active,2014-06-09,2015-06-09,199596.0
1996,1996,COMMUNICATIONS CAPTAIN,Houston Fire Department (HFD),66523.0,Black or African American,Full Time,Male,Active,2003-09-02,2013-10-06,210588.0
1997,1997,POLICE OFFICER,Houston Police Department-HPD,43443.0,White,Full Time,Male,Active,2014-10-13,2015-10-13,199596.0
1998,1998,POLICE OFFICER,Houston Police Department-HPD,55461.0,Asian/Pacific Islander,Full Time,Male,Active,2009-01-20,2011-07-02,199596.0


In [74]:
# solution 3
max_sal = (
employee
.groupby('DEPARTMENT')
.BASE_SALARY
.max()
)

(
employee.merge(
    max_sal.rename('MAX_DEPT_SALARY'),
    how='left',
    left_on='DEPARTMENT',
    right_index=True)
)


Unnamed: 0,UNIQUE_ID,POSITION_TITLE,DEPARTMENT,BASE_SALARY,RACE,EMPLOYMENT_TYPE,GENDER,EMPLOYMENT_STATUS,HIRE_DATE,JOB_DATE,MAX_DEPT_SALARY
0,0,ASSISTANT DIRECTOR (EX LVL),Municipal Courts Department,121862.0,Hispanic/Latino,Full Time,Female,Active,2006-06-12,2012-10-13,121862.0
1,1,LIBRARY ASSISTANT,Library,26125.0,Hispanic/Latino,Full Time,Female,Active,2000-07-19,2010-09-18,107763.0
2,2,POLICE OFFICER,Houston Police Department-HPD,45279.0,White,Full Time,Male,Active,2015-02-03,2015-02-03,199596.0
3,3,ENGINEER/OPERATOR,Houston Fire Department (HFD),63166.0,White,Full Time,Male,Active,1982-02-08,1991-05-25,210588.0
4,4,ELECTRICIAN,General Services Department,56347.0,White,Full Time,Male,Active,1989-06-19,1994-10-22,89194.0
...,...,...,...,...,...,...,...,...,...,...,...
1995,1995,POLICE OFFICER,Houston Police Department-HPD,43443.0,White,Full Time,Male,Active,2014-06-09,2015-06-09,199596.0
1996,1996,COMMUNICATIONS CAPTAIN,Houston Fire Department (HFD),66523.0,Black or African American,Full Time,Male,Active,2003-09-02,2013-10-06,210588.0
1997,1997,POLICE OFFICER,Houston Police Department-HPD,43443.0,White,Full Time,Male,Active,2014-10-13,2015-10-13,199596.0
1998,1998,POLICE OFFICER,Houston Police Department-HPD,55461.0,Asian/Pacific Islander,Full Time,Male,Active,2009-01-20,2011-07-02,199596.0


#### 7. College dataset: highlight the maximum value from each column

In [113]:
college = pd.read_csv('../../datasets/pandas-cookbook/college.csv', index_col='INSTNM')

In [77]:
# identify non-numeric colums
college.dtypes

CITY                   object
STABBR                 object
HBCU                  float64
MENONLY               float64
WOMENONLY             float64
RELAFFIL                int64
SATVRMID              float64
SATMTMID              float64
DISTANCEONLY          float64
UGDS                  float64
UGDS_WHITE            float64
UGDS_BLACK            float64
UGDS_HISP             float64
UGDS_ASIAN            float64
UGDS_AIAN             float64
UGDS_NHPI             float64
UGDS_2MOR             float64
UGDS_NRA              float64
UGDS_UNKN             float64
PPTUG_EF              float64
CURROPER                int64
PCTPELL               float64
PCTFLOAN              float64
UG25ABV               float64
MD_EARN_WNE_P10        object
GRAD_DEBT_MDN_SUPP     object
dtype: object

In [130]:
f = college.MD_EARN_WNE_P10.str.contains('[a-zA-Z]', na=False)
college.MD_EARN_WNE_P10[f].value_counts()

PrivacySuppressed    822
Name: MD_EARN_WNE_P10, dtype: int64

In [129]:
f = college.GRAD_DEBT_MDN_SUPP.str.contains('[a-zA-Z]', na=False)
college.GRAD_DEBT_MDN_SUPP[f].value_counts()

PrivacySuppressed    1510
Name: GRAD_DEBT_MDN_SUPP, dtype: int64

In [85]:
# Change dtype to float and change strings to NaNs
cols = ['MD_EARN_WNE_P10', 'GRAD_DEBT_MDN_SUPP']
for col in cols:
    college[col] = pd.to_numeric(college[col], errors='coerce')

In [95]:
# now we can select the columns that contain numbers
college_n = college.select_dtypes('number')

# find columns that contain boolean values
binary_only = college_n.nunique() == 2

# use the Boolean array to create a list of binary colums
binary_cols = binary_only[binary_only].index

# drop the binary columns
college_n2 = college_n.drop(columns=binary_cols)

In [96]:
college_n2.head()

Unnamed: 0_level_0,SATVRMID,SATMTMID,UGDS,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Alabama A & M University,424.0,420.0,4206.0,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138,0.0656,0.7356,0.8284,0.1049,30300.0,33888.0
University of Alabama at Birmingham,570.0,565.0,11383.0,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01,0.2607,0.346,0.5214,0.2422,39700.0,21941.5
Amridge University,,,291.0,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715,0.4536,0.6801,0.7795,0.854,40100.0,23370.0
University of Alabama in Huntsville,595.0,590.0,5451.0,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035,0.2146,0.3072,0.4596,0.264,45500.0,24097.0
Alabama State University,425.0,430.0,4811.0,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137,0.0892,0.7347,0.7554,0.127,26600.0,33118.5


In [99]:
# find the index label of the maximum value for each column
max_cols = college_n2.idxmax()
max_cols

SATVRMID                             California Institute of Technology
SATMTMID                             California Institute of Technology
UGDS                                      University of Phoenix-Arizona
UGDS_WHITE                       Mr Leon's School of Hair Design-Moscow
UGDS_BLACK                           Velvatex College of Beauty Culture
UGDS_HISP                       Thunderbird School of Global Management
UGDS_ASIAN                          Cosmopolitan Beauty and Tech School
UGDS_AIAN                             Haskell Indian Nations University
UGDS_NHPI                                       Palau Community College
UGDS_2MOR                                                 LIU Brentwood
UGDS_NRA               California University of Management and Sciences
UGDS_UNKN             Le Cordon Bleu College of Culinary Arts-San Fr...
PPTUG_EF                        Thunderbird School of Global Management
PCTPELL                                        MTI Business Coll

In [101]:
unique_max_cols = max_cols.unique()
unique_max_cols

array(['California Institute of Technology',
       'University of Phoenix-Arizona',
       "Mr Leon's School of Hair Design-Moscow",
       'Velvatex College of Beauty Culture',
       'Thunderbird School of Global Management',
       'Cosmopolitan Beauty and Tech School',
       'Haskell Indian Nations University', 'Palau Community College',
       'LIU Brentwood',
       'California University of Management and Sciences',
       'Le Cordon Bleu College of Culinary Arts-San Francisco',
       'MTI Business College Inc', 'ABC Beauty College Inc',
       'Dongguk University-Los Angeles', 'Medical College of Wisconsin',
       'Southwest University of Visual Arts-Tucson'], dtype=object)

In [110]:
# Use the values of max_cols to select only those rows that have schools
# with a maximum value and then use the .style attribute to highlight
# those values

college_n2.loc[unique_max_cols].style.highlight_max(color='royalblue').format('{:.2f}')

Unnamed: 0_level_0,SATVRMID,SATMTMID,UGDS,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
California Institute of Technology,765.0,785.0,983.0,0.28,0.02,0.12,0.44,0.0,0.0,0.06,0.09,0.0,0.0,0.11,0.23,0.01,77800.0,11812.5
University of Phoenix-Arizona,,,151558.0,0.31,0.16,0.08,0.01,0.0,0.01,0.11,0.01,0.32,0.0,0.6,0.59,,,33000.0
Mr Leon's School of Hair Design-Moscow,,,16.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.62,0.62,0.2,,15710.0
Velvatex College of Beauty Culture,,,25.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.77,0.0,0.52,,
Thunderbird School of Global Management,,,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,118900.0,
Cosmopolitan Beauty and Tech School,,,110.0,0.01,0.0,0.02,0.97,0.0,0.0,0.0,0.0,0.0,0.32,0.78,0.12,0.95,,
Haskell Indian Nations University,430.0,440.0,805.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.02,0.84,0.0,0.21,22800.0,
Palau Community College,,,602.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.39,0.86,0.0,0.26,24700.0,
LIU Brentwood,,,15.0,0.0,0.13,0.27,0.0,0.0,0.0,0.53,0.0,0.07,0.4,0.57,0.78,0.78,44600.0,25499.0
California University of Management and Sciences,,,98.0,0.01,0.02,0.0,0.04,0.0,0.0,0.0,0.93,0.0,0.0,0.09,0.06,0.69,,


In [116]:
# refactored code
def remove_binary_cols(df):
    binary_only = df.nunique() == 2
    cols = binary_only[binary_only].index.tolist()
    return df.drop(columns=cols)

def select_rows_with_max_cols(df):
    max_cols = df.idxmax()
    unique = max_cols.unique()
    return df.loc[unique]

(
college
    .assign(
        MD_EARN_WNE_P10=pd.to_numeric(
        college.MD_EARN_WNE_P10, errors='coerce'
        ),
        GRAD_DEBT_MDN_SUPP=pd.to_numeric(
        college.GRAD_DEBT_MDN_SUPP, errors='coerce'
        )
    )
    .select_dtypes('number')
    .pipe(remove_binary_cols)
    .pipe(select_rows_with_max_cols)
    .style.highlight_max(color='royalblue').format('{:.2f}')
)

Unnamed: 0_level_0,SATVRMID,SATMTMID,UGDS,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
California Institute of Technology,765.0,785.0,983.0,0.28,0.02,0.12,0.44,0.0,0.0,0.06,0.09,0.0,0.0,0.11,0.23,0.01,77800.0,11812.5
University of Phoenix-Arizona,,,151558.0,0.31,0.16,0.08,0.01,0.0,0.01,0.11,0.01,0.32,0.0,0.6,0.59,,,33000.0
Mr Leon's School of Hair Design-Moscow,,,16.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.62,0.62,0.2,,15710.0
Velvatex College of Beauty Culture,,,25.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.77,0.0,0.52,,
Thunderbird School of Global Management,,,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,118900.0,
Cosmopolitan Beauty and Tech School,,,110.0,0.01,0.0,0.02,0.97,0.0,0.0,0.0,0.0,0.0,0.32,0.78,0.12,0.95,,
Haskell Indian Nations University,430.0,440.0,805.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.02,0.84,0.0,0.21,22800.0,
Palau Community College,,,602.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.39,0.86,0.0,0.26,24700.0,
LIU Brentwood,,,15.0,0.0,0.13,0.27,0.0,0.0,0.0,0.53,0.0,0.07,0.4,0.57,0.78,0.78,44600.0,25499.0
California University of Management and Sciences,,,98.0,0.01,0.02,0.0,0.04,0.0,0.0,0.0,0.93,0.0,0.0,0.09,0.06,0.69,,


#### Replicate idxmax with method chaining

In [133]:
def remove_binary_cols(df):
    binary_only = df.nunique() == 2
    cols = binary_only[binary_only].index.tolist()
    return df.drop(columns=cols)

college_n = (
college
    .assign(
        MD_EARN_WNE_P10=pd.to_numeric(
        college.MD_EARN_WNE_P10, errors='coerce'
        ),
        GRAD_DEBT_MDN_SUPP=pd.to_numeric(
        college_n.GRAD_DEBT_MDN_SUPP,errors='coerce'
        ),
    )
    .select_dtypes('number')
    .pipe(remove_binary_cols)
)

In [135]:
college_n.head()

Unnamed: 0_level_0,SATVRMID,SATMTMID,UGDS,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Alabama A & M University,424.0,420.0,4206.0,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138,0.0656,0.7356,0.8284,0.1049,30300.0,33888.0
University of Alabama at Birmingham,570.0,565.0,11383.0,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01,0.2607,0.346,0.5214,0.2422,39700.0,21941.5
Amridge University,,,291.0,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715,0.4536,0.6801,0.7795,0.854,40100.0,23370.0
University of Alabama in Huntsville,595.0,590.0,5451.0,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035,0.2146,0.3072,0.4596,0.264,45500.0,24097.0
Alabama State University,425.0,430.0,4811.0,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137,0.0892,0.7347,0.7554,0.127,26600.0,33118.5


In [136]:
# find the maximum of each column with the .max method
college_n.max().head()

SATVRMID         765.0
SATMTMID         785.0
UGDS          151558.0
UGDS_WHITE         1.0
UGDS_BLACK         1.0
dtype: float64

In [137]:
college_n.eq(college_n.max()).head()

Unnamed: 0_level_0,SATVRMID,SATMTMID,UGDS,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Alabama A & M University,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
University of Alabama at Birmingham,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
Amridge University,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
University of Alabama in Huntsville,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
Alabama State University,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [148]:
def idx_max(df):
    has_row_max = (
    college_n
        .eq(college_n.max())
        .cumsum()
        .cumsum()
        .eq(1)
        .any(axis='columns')
    )
    return has_row_max[has_row_max].index

In [149]:
idx_max(college_n)

Index(['Thunderbird School of Global Management',
       'Southwest University of Visual Arts-Tucson', 'ABC Beauty College Inc',
       'Velvatex College of Beauty Culture',
       'California Institute of Technology',
       'Le Cordon Bleu College of Culinary Arts-San Francisco',
       'MTI Business College Inc', 'Dongguk University-Los Angeles',
       'Mr Leon's School of Hair Design-Moscow',
       'Haskell Indian Nations University', 'LIU Brentwood',
       'Medical College of Wisconsin', 'Palau Community College',
       'California University of Management and Sciences',
       'Cosmopolitan Beauty and Tech School', 'University of Phoenix-Arizona'],
      dtype='object', name='INSTNM')