# Essential DataFrame Operations

In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

## __Selecting multiple DataFrame columns__

### Step1: Read in the movie dataset and pass a list of the desired columns to the indexing operator

In [3]:
movies = pd.read_csv('movie.csv')
movies.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [4]:
movies.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [5]:
movies_actor_director = movies[['actor_1_name','actor_2_name','actor_3_name','director_name']]
movies_actor_director.head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,director_name
0,CCH Pounder,Joel David Moore,Wes Studi,James Cameron
1,Johnny Depp,Orlando Bloom,Jack Davenport,Gore Verbinski
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Sam Mendes
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Christopher Nolan
4,Doug Walker,Rob Walker,,Doug Walker


### Step2: Compare the types returned by indexing with a list (DataFrame) and with a single label (Series)

In [6]:
print(type(movies[['director_name']]))
print(type(movies['director_name']))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


### Step3: Use the .loc indexer to select columns by name

In [7]:
print(type(movies.loc[:,['director_name']]))
print(type(movies.loc[:,'director_name']))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


## __Selecting columns with methods__

### Step1: Write a function to shorten the column names for displays

In [8]:
def shorten(col):
    """Return a compact display version of a column label.

    Every occurrence of ``facebook_likes`` is abbreviated to ``fb`` and
    every occurrence of ``_for_reviews`` is removed. Labels containing
    neither substring are returned unchanged.
    """
    abbreviated = col.replace("facebook_likes", "fb")
    return abbreviated.replace("_for_reviews", "")

In [9]:
movies = movies.rename(columns = shorten)
movies.columns

Index(['color', 'director_name', 'num_critic', 'duration', 'director_fb',
       'actor_3_fb', 'actor_2_name', 'actor_1_fb', 'gross', 'genres',
       'actor_1_name', 'movie_title', 'num_voted_users', 'cast_total_fb',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user', 'language', 'country', 'content_rating',
       'budget', 'title_year', 'actor_2_fb', 'imdb_score', 'aspect_ratio',
       'movie_fb'],
      dtype='object')

In [10]:
movies.dtypes.value_counts()

float64    13
object     12
int64       3
dtype: int64

### Step2: Use the .select_dtypes method to select only the integer columns

In [11]:
movies.select_dtypes(include = 'int').head()

Unnamed: 0,num_voted_users,cast_total_fb,movie_fb
0,886204,4834,33000
1,471220,48350,0
2,275868,11700,85000
3,1144337,106759,164000
4,8,143,0


### Step3: To select all numeric columns we can pass include = 'number'

In [12]:
movies.select_dtypes(include = 'number').head()

Unnamed: 0,num_critic,duration,director_fb,actor_3_fb,actor_1_fb,gross,num_voted_users,cast_total_fb,facenumber_in_poster,num_user,budget,title_year,actor_2_fb,imdb_score,aspect_ratio,movie_fb
0,723.0,178.0,0.0,855.0,1000.0,760505847.0,886204,4834,0.0,3054.0,237000000.0,2009.0,936.0,7.9,1.78,33000
1,302.0,169.0,563.0,1000.0,40000.0,309404152.0,471220,48350,0.0,1238.0,300000000.0,2007.0,5000.0,7.1,2.35,0
2,602.0,148.0,0.0,161.0,11000.0,200074175.0,275868,11700,1.0,994.0,245000000.0,2015.0,393.0,6.8,2.35,85000
3,813.0,164.0,22000.0,23000.0,27000.0,448130642.0,1144337,106759,0.0,2701.0,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,,131.0,,131.0,,8,143,0.0,,,,12.0,7.1,,0


### Step4: To select columns of several data types, pass a list to include

In [13]:
movies.select_dtypes(include = ['int','object']).head()

Unnamed: 0,color,director_name,actor_2_name,genres,actor_1_name,movie_title,num_voted_users,cast_total_fb,actor_3_name,plot_keywords,movie_imdb_link,language,country,content_rating,movie_fb
0,Color,James Cameron,Joel David Moore,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,886204,4834,Wes Studi,avatar|future|marine|native|paraplegic,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,English,USA,PG-13,33000
1,Color,Gore Verbinski,Orlando Bloom,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,471220,48350,Jack Davenport,goddess|marriage ceremony|marriage proposal|pi...,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,English,USA,PG-13,0
2,Color,Sam Mendes,Rory Kinnear,Action|Adventure|Thriller,Christoph Waltz,Spectre,275868,11700,Stephanie Sigman,bomb|espionage|sequel|spy|terrorist,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,English,UK,PG-13,85000
3,Color,Christopher Nolan,Christian Bale,Action|Thriller,Tom Hardy,The Dark Knight Rises,1144337,106759,Joseph Gordon-Levitt,deception|imprisonment|lawlessness|police offi...,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,English,USA,PG-13,164000
4,,Doug Walker,Rob Walker,Documentary,Doug Walker,Star Wars: Episode VII - The Force Awakens,8,143,,,http://www.imdb.com/title/tt5289954/?ref_=fn_t...,,,,0


### Step5: To exclude the floating point columns use the exclude option

In [14]:
movies.select_dtypes(exclude = 'float').head()

Unnamed: 0,color,director_name,actor_2_name,genres,actor_1_name,movie_title,num_voted_users,cast_total_fb,actor_3_name,plot_keywords,movie_imdb_link,language,country,content_rating,movie_fb
0,Color,James Cameron,Joel David Moore,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,886204,4834,Wes Studi,avatar|future|marine|native|paraplegic,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,English,USA,PG-13,33000
1,Color,Gore Verbinski,Orlando Bloom,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,471220,48350,Jack Davenport,goddess|marriage ceremony|marriage proposal|pi...,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,English,USA,PG-13,0
2,Color,Sam Mendes,Rory Kinnear,Action|Adventure|Thriller,Christoph Waltz,Spectre,275868,11700,Stephanie Sigman,bomb|espionage|sequel|spy|terrorist,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,English,UK,PG-13,85000
3,Color,Christopher Nolan,Christian Bale,Action|Thriller,Tom Hardy,The Dark Knight Rises,1144337,106759,Joseph Gordon-Levitt,deception|imprisonment|lawlessness|police offi...,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,English,USA,PG-13,164000
4,,Doug Walker,Rob Walker,Documentary,Doug Walker,Star Wars: Episode VII - The Force Awakens,8,143,,,http://www.imdb.com/title/tt5289954/?ref_=fn_t...,,,,0


### Step6: Use the filter command

In [15]:
movies.filter(like = 'fb').head()

Unnamed: 0,director_fb,actor_3_fb,actor_1_fb,cast_total_fb,actor_2_fb,movie_fb
0,0.0,855.0,1000.0,4834,936.0,33000
1,563.0,1000.0,40000.0,48350,5000.0,0
2,0.0,161.0,11000.0,11700,393.0,85000
3,22000.0,23000.0,27000.0,106759,23000.0,164000
4,131.0,,131.0,143,12.0,0


### Step 7: Use the items parameter with filter

In [16]:
cols = ['actor_1_name','actor_2_name','actor_3_name','director_name']
movies.filter(items = cols).head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,director_name
0,CCH Pounder,Joel David Moore,Wes Studi,James Cameron
1,Johnny Depp,Orlando Bloom,Jack Davenport,Gore Verbinski
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Sam Mendes
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Christopher Nolan
4,Doug Walker,Rob Walker,,Doug Walker


### Step8: Using Regex parameter with filter

In [17]:
movies.filter(regex = r'\d').head()

Unnamed: 0,actor_3_fb,actor_2_name,actor_1_fb,actor_1_name,actor_3_name,actor_2_fb
0,855.0,Joel David Moore,1000.0,CCH Pounder,Wes Studi,936.0
1,1000.0,Orlando Bloom,40000.0,Johnny Depp,Jack Davenport,5000.0
2,161.0,Rory Kinnear,11000.0,Christoph Waltz,Stephanie Sigman,393.0
3,23000.0,Christian Bale,27000.0,Tom Hardy,Joseph Gordon-Levitt,23000.0
4,,Rob Walker,131.0,Doug Walker,,12.0


## __Ordering Column names__

### Step 1: Define the function that shortens the column names and apply it to the DataFrame

In [18]:
def shorten(col):
    """Return a compact display version of a column label.

    Applies two substring replacements in order: ``facebook_likes``
    becomes ``fb`` and ``_for_reviews`` is removed.
    """
    # (old, new) pairs, applied in this order.
    for old, new in (("facebook_likes", "fb"), ("_for_reviews", "")):
        col = col.replace(old, new)
    return col


# rename calls shorten once per column label and returns a new frame.
movies = movies.rename(columns=shorten)

### Step2: Read the columns

In [19]:
movies.columns

Index(['color', 'director_name', 'num_critic', 'duration', 'director_fb',
       'actor_3_fb', 'actor_2_name', 'actor_1_fb', 'gross', 'genres',
       'actor_1_name', 'movie_title', 'num_voted_users', 'cast_total_fb',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user', 'language', 'country', 'content_rating',
       'budget', 'title_year', 'actor_2_fb', 'imdb_score', 'aspect_ratio',
       'movie_fb'],
      dtype='object')

### Step3: Arrange the columns as per logic

In [20]:
cat_core = ['movie_title','title_year','content_rating','genres']
cat_people = ['director_name','actor_1_name','actor_2_name','actor_3_name']
cat_other = ['color','country','language','plot_keywords','movie_imdb_link']
cont_fb = ['director_fb','actor_1_fb','actor_2_fb','actor_3_fb','cast_total_fb','movie_fb']
cont_finance = ['budget','gross']
cont_num_reviews = ['num_voted_users','num_user','num_critic']
cont_other = ['imdb_score','duration','aspect_ratio','facenumber_in_poster']

### Step 4: Concatenate all the lists together

In [21]:
# Build the final column order by joining the thematic groups in the order
# they should be displayed.
new_col_order = (
    cat_core
    + cat_people
    + cat_other
    + cont_fb
    + cont_finance
    + cont_num_reviews
    + cont_other
)
# Sanity check: the new order must contain exactly the existing columns.
# A set comparison alone cannot notice a label that was listed twice in the
# groups, so the lengths are compared as well. The cell still evaluates to a
# single boolean.
(
    len(new_col_order) == len(movies.columns)
    and set(new_col_order) == set(movies.columns)
)

True

In [22]:
movies[new_col_order].head()

Unnamed: 0,movie_title,title_year,content_rating,genres,director_name,actor_1_name,actor_2_name,actor_3_name,color,country,...,movie_fb,budget,gross,num_voted_users,num_user,num_critic,imdb_score,duration,aspect_ratio,facenumber_in_poster
0,Avatar,2009.0,PG-13,Action|Adventure|Fantasy|Sci-Fi,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Color,USA,...,33000,237000000.0,760505847.0,886204,3054.0,723.0,7.9,178.0,1.78,0.0
1,Pirates of the Caribbean: At World's End,2007.0,PG-13,Action|Adventure|Fantasy,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Color,USA,...,0,300000000.0,309404152.0,471220,1238.0,302.0,7.1,169.0,2.35,0.0
2,Spectre,2015.0,PG-13,Action|Adventure|Thriller,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Color,UK,...,85000,245000000.0,200074175.0,275868,994.0,602.0,6.8,148.0,2.35,1.0
3,The Dark Knight Rises,2012.0,PG-13,Action|Thriller,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Color,USA,...,164000,250000000.0,448130642.0,1144337,2701.0,813.0,8.5,164.0,2.35,0.0
4,Star Wars: Episode VII - The Force Awakens,,,Documentary,Doug Walker,Doug Walker,Rob Walker,,,,...,0,,,8,,,7.1,,,0.0


## __Summarizing a DataFrame__

### Step1: Read the movies dataset and examine the basic descriptive properties

In [23]:
movies.shape

(4916, 28)

In [24]:
movies.size

137648

In [25]:
movies.ndim

2

In [26]:
len(movies)

4916

### Step2: The .count method shows the number of non-missing values for each column

In [27]:
movies.count().head()

color            4897
director_name    4814
num_critic       4867
duration         4901
director_fb      4814
dtype: int64

### Step3: Compute summary statistics

In [28]:
# Column-wise minimum over the whole frame, which holds both numeric and
# object (string) columns.
# NOTE(review): newer pandas releases (2.0+) are expected to raise TypeError
# for object columns that mix strings with NaN instead of silently dropping
# them from the result - confirm against the pandas version this notebook
# targets.
movies.min()

num_critic                                                              1
duration                                                                7
director_fb                                                             0
actor_3_fb                                                              0
actor_1_fb                                                              0
gross                                                                 162
genres                                                             Action
movie_title                                                       #Horror
num_voted_users                                                         5
cast_total_fb                                                           0
facenumber_in_poster                                                    0
movie_imdb_link         http://www.imdb.com/title/tt0006864/?ref_=fn_t...
num_user                                                                1
budget                                

In [29]:
# Column-wise maximum over the whole frame, which holds both numeric and
# object (string) columns.
# NOTE(review): newer pandas releases (2.0+) are expected to raise TypeError
# for object columns that mix strings with NaN instead of silently dropping
# them from the result - confirm against the pandas version this notebook
# targets.
movies.max()

num_critic                                                            813
duration                                                              511
director_fb                                                         23000
actor_3_fb                                                          23000
actor_1_fb                                                         640000
gross                                                         7.60506e+08
genres                                                            Western
movie_title                                                      Æon Flux
num_voted_users                                                   1689764
cast_total_fb                                                      656730
facenumber_in_poster                                                   43
movie_imdb_link         http://www.imdb.com/title/tt5574490/?ref_=fn_t...
num_user                                                             5060
budget                                

In [30]:
# Column-wise mean, rounded to two decimals.
# numeric_only=True restricts the reduction to the numeric columns. Without
# it, the object (string) columns are only skipped by older pandas versions;
# pandas 2.0+ raises TypeError when asked for the mean of string data.
movies.mean(numeric_only=True).round(2)

num_critic                   137.99
duration                     107.09
director_fb                  691.01
actor_3_fb                   631.28
actor_1_fb                  6494.49
gross                   47644514.53
num_voted_users            82644.92
cast_total_fb               9579.82
facenumber_in_poster           1.38
num_user                     267.67
budget                  36547486.03
title_year                  2002.45
actor_2_fb                  1621.92
imdb_score                     6.44
aspect_ratio                   2.22
movie_fb                    7348.29
dtype: float64

### Step4: Use the .describe method to view the summary statistics

In [31]:
movies.describe()

Unnamed: 0,num_critic,duration,director_fb,actor_3_fb,actor_1_fb,gross,num_voted_users,cast_total_fb,facenumber_in_poster,num_user,budget,title_year,actor_2_fb,imdb_score,aspect_ratio,movie_fb
count,4867.0,4901.0,4814.0,4893.0,4909.0,4054.0,4916.0,4916.0,4903.0,4895.0,4432.0,4810.0,4903.0,4916.0,4590.0,4916.0
mean,137.988905,107.090798,691.014541,631.276313,6494.488491,47644510.0,82644.92,9579.815907,1.37732,267.668846,36547490.0,2002.447609,1621.923516,6.437429,2.222349,7348.294142
std,120.239379,25.286015,2832.954125,1625.874802,15106.986884,67372550.0,138322.2,18164.31699,2.023826,372.934839,100242700.0,12.453977,4011.299523,1.127802,1.40294,19206.016458
min,1.0,7.0,0.0,0.0,0.0,162.0,5.0,0.0,0.0,1.0,218.0,1916.0,0.0,1.6,1.18,0.0
25%,49.0,93.0,7.0,132.0,607.0,5019656.0,8361.75,1394.75,0.0,64.0,6000000.0,1999.0,277.0,5.8,1.85,0.0
50%,108.0,103.0,48.0,366.0,982.0,25043960.0,33132.5,3049.0,1.0,153.0,19850000.0,2005.0,593.0,6.6,2.35,159.0
75%,191.0,118.0,189.75,633.0,11000.0,61108410.0,93772.75,13616.75,2.0,320.5,43000000.0,2011.0,912.0,7.2,2.35,2000.0
max,813.0,511.0,23000.0,23000.0,640000.0,760505800.0,1689764.0,656730.0,43.0,5060.0,4200000000.0,2016.0,137000.0,9.5,16.0,349000.0


In [32]:
movies.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
num_critic,4867.0,137.9889,120.2394,1.0,49.0,108.0,191.0,813.0
duration,4901.0,107.0908,25.28602,7.0,93.0,103.0,118.0,511.0
director_fb,4814.0,691.0145,2832.954,0.0,7.0,48.0,189.75,23000.0
actor_3_fb,4893.0,631.2763,1625.875,0.0,132.0,366.0,633.0,23000.0
actor_1_fb,4909.0,6494.488,15106.99,0.0,607.0,982.0,11000.0,640000.0
gross,4054.0,47644510.0,67372550.0,162.0,5019656.25,25043962.0,61108412.75,760505800.0
num_voted_users,4916.0,82644.92,138322.2,5.0,8361.75,33132.5,93772.75,1689764.0
cast_total_fb,4916.0,9579.816,18164.32,0.0,1394.75,3049.0,13616.75,656730.0
facenumber_in_poster,4903.0,1.37732,2.023826,0.0,0.0,1.0,2.0,43.0
num_user,4895.0,267.6688,372.9348,1.0,64.0,153.0,320.5,5060.0


## __Chaining DataFrame methods__

### Step 1: Use the .isnull method to flag the missing values

In [33]:
movies.isnull().head()

Unnamed: 0,color,director_name,num_critic,duration,director_fb,actor_3_fb,actor_2_name,actor_1_fb,gross,genres,...,num_user,language,country,content_rating,budget,title_year,actor_2_fb,imdb_score,aspect_ratio,movie_fb
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,True,False,True,True,False,True,False,False,True,False,...,True,True,True,True,True,True,False,False,True,False


### Step2: We will use the .sum method

In [34]:
movies.isnull().sum().head()

color             19
director_name    102
num_critic        49
duration          15
director_fb      102
dtype: int64

### Step3: We can go a step further and sum again to get the total number of missing values

In [35]:
movies.isnull().sum().sum()

2654

### Step4: To determine if there are any missing values

In [36]:
movies.isnull().any().any()

True

In [37]:
movies.isnull().dtypes.value_counts()

bool    28
dtype: int64

#### Most of the missing values are in object-dtype columns, so we fill the missing values before aggregating

In [38]:
# Maximum of two object columns. Each column is selected once: the list
# previously repeated 'color', which builds a frame with a duplicated
# column label.
movies[['color', 'movie_title']].max()

Series([], dtype: float64)

In [39]:
movies.select_dtypes(['object']).fillna("").max()

color                                                          Color
director_name                                          Étienne Faure
actor_2_name                                           Zubaida Sahar
genres                                                       Western
actor_1_name                                           Óscar Jaenada
movie_title                                                 Æon Flux
actor_3_name                                           Óscar Jaenada
plot_keywords                                    zombie|zombie spoof
movie_imdb_link    http://www.imdb.com/title/tt5574490/?ref_=fn_t...
language                                                        Zulu
country                                                 West Germany
content_rating                                                     X
dtype: object

## __DataFrame operations__

### Read the dataset

In [40]:
colleges = pd.read_csv('college.csv')
colleges.head()

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
0,Alabama A & M University,Normal,AL,1.0,0.0,0.0,0,424.0,420.0,0.0,...,0.0,0.0059,0.0138,0.0656,1,0.7356,0.8284,0.1049,30300,33888.0
1,University of Alabama at Birmingham,Birmingham,AL,0.0,0.0,0.0,0,570.0,565.0,0.0,...,0.0368,0.0179,0.01,0.2607,1,0.346,0.5214,0.2422,39700,21941.5
2,Amridge University,Montgomery,AL,0.0,0.0,0.0,1,,,1.0,...,0.0,0.0,0.2715,0.4536,1,0.6801,0.7795,0.854,40100,23370.0
3,University of Alabama in Huntsville,Huntsville,AL,0.0,0.0,0.0,0,595.0,590.0,0.0,...,0.0172,0.0332,0.035,0.2146,1,0.3072,0.4596,0.264,45500,24097.0
4,Alabama State University,Montgomery,AL,1.0,0.0,0.0,0,425.0,430.0,0.0,...,0.0098,0.0243,0.0137,0.0892,1,0.7347,0.7554,0.127,26600,33118.5


### Set the index as INSTNM

In [41]:
colleges = colleges.set_index('INSTNM')
colleges.head()

Unnamed: 0_level_0,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,UGDS,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama A & M University,Normal,AL,1.0,0.0,0.0,0,424.0,420.0,0.0,4206.0,...,0.0,0.0059,0.0138,0.0656,1,0.7356,0.8284,0.1049,30300,33888.0
University of Alabama at Birmingham,Birmingham,AL,0.0,0.0,0.0,0,570.0,565.0,0.0,11383.0,...,0.0368,0.0179,0.01,0.2607,1,0.346,0.5214,0.2422,39700,21941.5
Amridge University,Montgomery,AL,0.0,0.0,0.0,1,,,1.0,291.0,...,0.0,0.0,0.2715,0.4536,1,0.6801,0.7795,0.854,40100,23370.0
University of Alabama in Huntsville,Huntsville,AL,0.0,0.0,0.0,0,595.0,590.0,0.0,5451.0,...,0.0172,0.0332,0.035,0.2146,1,0.3072,0.4596,0.264,45500,24097.0
Alabama State University,Montgomery,AL,1.0,0.0,0.0,0,425.0,430.0,0.0,4811.0,...,0.0098,0.0243,0.0137,0.0892,1,0.7347,0.7554,0.127,26600,33118.5


### Select Columns with UGDS

In [42]:
college_ugds = colleges.filter(like = 'UGDS_')
college_ugds.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


### Step1: Look up the row for Northwest-Shoals Community College

In [47]:
name = 'Northwest-Shoals Community College'
college_ugds.loc[name]

UGDS_WHITE    0.7912
UGDS_BLACK    0.1250
UGDS_HISP     0.0339
UGDS_ASIAN    0.0036
UGDS_AIAN     0.0088
UGDS_NHPI     0.0006
UGDS_2MOR     0.0012
UGDS_NRA      0.0033
UGDS_UNKN     0.0324
Name: Northwest-Shoals Community College, dtype: float64

#### Round the values

In [48]:
college_ugds.loc[name].round(2)

UGDS_WHITE    0.79
UGDS_BLACK    0.12
UGDS_HISP     0.03
UGDS_ASIAN    0.00
UGDS_AIAN     0.01
UGDS_NHPI     0.00
UGDS_2MOR     0.00
UGDS_NRA      0.00
UGDS_UNKN     0.03
Name: Northwest-Shoals Community College, dtype: float64

#### We can also add 0.001 and then use the round function

In [49]:
(college_ugds.loc[name] + 0.001).round(2)

UGDS_WHITE    0.79
UGDS_BLACK    0.13
UGDS_HISP     0.03
UGDS_ASIAN    0.00
UGDS_AIAN     0.01
UGDS_NHPI     0.00
UGDS_2MOR     0.00
UGDS_NRA      0.00
UGDS_UNKN     0.03
Name: Northwest-Shoals Community College, dtype: float64

### Step 2: Let's do this on the entire college_ugds DataFrame by first adding 0.00501

In [50]:
college_ugds + 0.00501

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.03831,0.94031,0.01051,0.00691,0.00741,0.00691,0.00501,0.01091,0.01881
University of Alabama at Birmingham,0.59721,0.26501,0.03331,0.05681,0.00721,0.00571,0.04181,0.02291,0.01501
Amridge University,0.30401,0.42421,0.01191,0.00841,0.00501,0.00501,0.00501,0.00501,0.27651
University of Alabama in Huntsville,0.70381,0.13051,0.04321,0.04261,0.01931,0.00521,0.02221,0.03821,0.04001
Alabama State University,0.02081,0.92581,0.01711,0.00691,0.00601,0.00561,0.01481,0.02931,0.01871
...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,,,,,,,,,
Rasmussen College - Overland Park,,,,,,,,,
National Personal Training Institute of Cleveland,,,,,,,,,
Bay Area Medical Academy - San Jose Satellite Location,,,,,,,,,


### Step 3: Use floor division operator, //, to round down to the nearest whole number percentage

In [51]:
(college_ugds + 0.00501)//0.01

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,3.0,94.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
University of Alabama at Birmingham,59.0,26.0,3.0,5.0,0.0,0.0,4.0,2.0,1.0
Amridge University,30.0,42.0,1.0,0.0,0.0,0.0,0.0,0.0,27.0
University of Alabama in Huntsville,70.0,13.0,4.0,4.0,1.0,0.0,2.0,3.0,4.0
Alabama State University,2.0,92.0,1.0,0.0,0.0,0.0,1.0,2.0,1.0
...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,,,,,,,,,
Rasmussen College - Overland Park,,,,,,,,,
National Personal Training Institute of Cleveland,,,,,,,,,
Bay Area Medical Academy - San Jose Satellite Location,,,,,,,,,


### Step 4: To complete the rounding exercise divide by 100

In [52]:
college_ugds_op_round = ((college_ugds + 0.00501)//0.01/100)
college_ugds_op_round.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.03,0.94,0.01,0.0,0.0,0.0,0.0,0.01,0.01
University of Alabama at Birmingham,0.59,0.26,0.03,0.05,0.0,0.0,0.04,0.02,0.01
Amridge University,0.3,0.42,0.01,0.0,0.0,0.0,0.0,0.0,0.27
University of Alabama in Huntsville,0.7,0.13,0.04,0.04,0.01,0.0,0.02,0.03,0.04
Alabama State University,0.02,0.92,0.01,0.0,0.0,0.0,0.01,0.02,0.01


### Step 5: Use the DataFrame round method. Note: due to bankers rounding we add 0.00001

In [55]:
college_ugds_round = (college_ugds + 0.00001).round(2)
college_ugds_round.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.03,0.94,0.01,0.0,0.0,0.0,0.0,0.01,0.01
University of Alabama at Birmingham,0.59,0.26,0.03,0.05,0.0,0.0,0.04,0.02,0.01
Amridge University,0.3,0.42,0.01,0.0,0.0,0.0,0.0,0.0,0.27
University of Alabama in Huntsville,0.7,0.13,0.04,0.04,0.01,0.0,0.02,0.03,0.04
Alabama State University,0.02,0.92,0.01,0.0,0.0,0.0,0.01,0.02,0.01


### Step 6: Use the equals method to test the equality of two DataFrames

In [56]:
college_ugds_op_round.equals(college_ugds_round)

True

## __Comparing Missing Values__

### Play around with np.nan

In [58]:
print(np.nan == np.nan)
print(None == None)

False
True


In [59]:
np.nan > 5

False

In [60]:
np.nan != 5

True

### Load the Dataset

In [61]:
colleges = pd.read_csv('college.csv',index_col = 'INSTNM')
colleges.head()

Unnamed: 0_level_0,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,UGDS,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama A & M University,Normal,AL,1.0,0.0,0.0,0,424.0,420.0,0.0,4206.0,...,0.0,0.0059,0.0138,0.0656,1,0.7356,0.8284,0.1049,30300,33888.0
University of Alabama at Birmingham,Birmingham,AL,0.0,0.0,0.0,0,570.0,565.0,0.0,11383.0,...,0.0368,0.0179,0.01,0.2607,1,0.346,0.5214,0.2422,39700,21941.5
Amridge University,Montgomery,AL,0.0,0.0,0.0,1,,,1.0,291.0,...,0.0,0.0,0.2715,0.4536,1,0.6801,0.7795,0.854,40100,23370.0
University of Alabama in Huntsville,Huntsville,AL,0.0,0.0,0.0,0,595.0,590.0,0.0,5451.0,...,0.0172,0.0332,0.035,0.2146,1,0.3072,0.4596,0.264,45500,24097.0
Alabama State University,Montgomery,AL,1.0,0.0,0.0,0,425.0,430.0,0.0,4811.0,...,0.0098,0.0243,0.0137,0.0892,1,0.7347,0.7554,0.127,26600,33118.5


### Step 1: To get an idea of how the equality operator works, let's compare each element to a scalar value

In [62]:
college_ugds == 0.0019

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,False,False,False,True,False,True,False,False,False
University of Alabama at Birmingham,False,False,False,False,False,False,False,False,False
Amridge University,False,False,False,False,False,False,False,False,False
University of Alabama in Huntsville,False,False,False,False,False,False,False,False,False
Alabama State University,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,False,False,False,False,False,False,False,False,False
Rasmussen College - Overland Park,False,False,False,False,False,False,False,False,False
National Personal Training Institute of Cleveland,False,False,False,False,False,False,False,False,False
Bay Area Medical Academy - San Jose Satellite Location,False,False,False,False,False,False,False,False,False


### Step 2: The == operator has a drawback when the data contains missing values

In [63]:
college_self_compare = (college_ugds == college_ugds)
college_self_compare.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,True,True,True,True,True,True,True,True,True
University of Alabama at Birmingham,True,True,True,True,True,True,True,True,True
Amridge University,True,True,True,True,True,True,True,True,True
University of Alabama in Huntsville,True,True,True,True,True,True,True,True,True
Alabama State University,True,True,True,True,True,True,True,True,True


### Step 3: Use the .all method to get a better answer

In [64]:
college_self_compare.all()

UGDS_WHITE    False
UGDS_BLACK    False
UGDS_HISP     False
UGDS_ASIAN    False
UGDS_AIAN     False
UGDS_NHPI     False
UGDS_2MOR     False
UGDS_NRA      False
UGDS_UNKN     False
dtype: bool

### Step 4: Try summing up

In [66]:
(college_ugds == np.nan).sum()

UGDS_WHITE    0
UGDS_BLACK    0
UGDS_HISP     0
UGDS_ASIAN    0
UGDS_AIAN     0
UGDS_NHPI     0
UGDS_2MOR     0
UGDS_NRA      0
UGDS_UNKN     0
dtype: int64

### Step 5: Instead of using ==, try the .isna method

In [67]:
# .isna() flags missing cells as True; summing down each column counts them.
college_ugds.isna().sum(axis='index')

UGDS_WHITE    661
UGDS_BLACK    661
UGDS_HISP     661
UGDS_ASIAN    661
UGDS_AIAN     661
UGDS_NHPI     661
UGDS_2MOR     661
UGDS_NRA      661
UGDS_UNKN     661
dtype: int64

### Step 6: Use the .equals method

In [68]:
# .equals compares two whole frames and returns a single boolean. Unlike ==,
# it treats missing values in the same position as equal.
college_ugds.equals(college_ugds)

True

## __Transposing the direction of a DataFrame operation__

### Note: 
    👉🏽 Many DataFrame methods have an axis parameter
    👉🏽 This parameter controls the direction in which the operation takes place
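    👉🏽 Axis can be 0 / 'index' (operate down the rows, giving one result per column) or 1 / 'columns' (operate across the columns, giving one result per row)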
    👉🏽 By default axis value is 0

### Step 1: Read the college dataset and perform the necessary actions

In [69]:
# Load the college data with the institution name as the row index, then keep
# only the columns whose label contains 'UGDS_'.
college = pd.read_csv('college.csv', index_col='INSTNM')
college_ugds = college.filter(like='UGDS_', axis='columns')
college_ugds.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


### Step 2: Use the .count method which returns the number of non-missing values

In [70]:
# Number of non-missing values in each column (counting down the index).
college_ugds.count(axis='index')

UGDS_WHITE    6874
UGDS_BLACK    6874
UGDS_HISP     6874
UGDS_ASIAN    6874
UGDS_AIAN     6874
UGDS_NHPI     6874
UGDS_2MOR     6874
UGDS_NRA      6874
UGDS_UNKN     6874
dtype: int64

#### Note: The axis parameter almost always defaults to 0, so Step 2 is equivalent to:
        👉🏽 college_ugds.count(axis = 0)
        👉🏽 college_ugds.count(axis = 'index')
        👉🏽 college_ugds.count(0)

### Step 3: Change the axis parameter to columns

In [74]:
# Number of non-missing values in each row (counting across the columns);
# show the first five schools.
college_ugds.count(axis='columns').head(n=5)

INSTNM
Alabama A & M University               9
University of Alabama at Birmingham    9
Amridge University                     9
University of Alabama in Huntsville    9
Alabama State University               9
dtype: int64

### Step 4: Instead of counting non-missing values we can sum all the values in each row.

In [75]:
# Add up the values across the columns for every row (axis=1 is 'columns');
# show the first five schools.
college_ugds.sum(axis=1).head(n=5)

INSTNM
Alabama A & M University               1.0000
University of Alabama at Birmingham    0.9999
Amridge University                     1.0000
University of Alabama in Huntsville    1.0000
Alabama State University               1.0000
dtype: float64

### Step 5: To get an idea of the distribution of each column, the .median method can be used

In [77]:
# Median of each column, computed down the index (axis 0).
college_ugds.median(axis='index')

UGDS_WHITE    0.55570
UGDS_BLACK    0.10005
UGDS_HISP     0.07140
UGDS_ASIAN    0.01290
UGDS_AIAN     0.00260
UGDS_NHPI     0.00000
UGDS_2MOR     0.01750
UGDS_NRA      0.00000
UGDS_UNKN     0.01430
dtype: float64

#### More: cumulative sum across the columns of each row

In [80]:
# Running total across the columns of each row (axis 1), left to right.
college_ugds_cumsum = college_ugds.cumsum(axis='columns')
college_ugds_cumsum.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9686,0.9741,0.976,0.9784,0.9803,0.9803,0.9862,1.0
University of Alabama at Birmingham,0.5922,0.8522,0.8805,0.9323,0.9345,0.9352,0.972,0.9899,0.9999
Amridge University,0.299,0.7182,0.7251,0.7285,0.7285,0.7285,0.7285,0.7285,1.0
University of Alabama in Huntsville,0.6988,0.8243,0.8625,0.9001,0.9144,0.9146,0.9318,0.965,1.0
Alabama State University,0.0158,0.9366,0.9487,0.9506,0.9516,0.9522,0.962,0.9863,1.0


## __Determining College Campus Diversity__

### Step 1: Read the dataset

In [82]:
# Reload the college data indexed by institution name and keep only the
# columns whose label contains 'UGDS_'.
college = pd.read_csv('college.csv', index_col='INSTNM')
college_ugds = college.filter(like='UGDS_', axis='columns')


### Step 2: Check for missing values across rows

In [84]:
# Count the missing values in each row, then list the rows with the most
# missing values first.
(
    college_ugds
    .isnull()
    .sum(axis='columns')
    .sort_values(ascending=False)
    .head()
)

INSTNM
Excel Learning Center-San Antonio South         9
Philadelphia College of Osteopathic Medicine    9
Assemblies of God Theological Seminary          9
Episcopal Divinity School                       9
Phillips Graduate Institute                     9
dtype: int64

### Step 3: Drop the rows in which every value is missing

In [87]:
# Remove only the rows in which every column is missing, then confirm how many
# missing values remain in each column.
college_ugds = college_ugds.dropna(axis='index', how='all')
college_ugds.isnull().sum(axis='index')

UGDS_WHITE    0
UGDS_BLACK    0
UGDS_HISP     0
UGDS_ASIAN    0
UGDS_AIAN     0
UGDS_NHPI     0
UGDS_2MOR     0
UGDS_NRA      0
UGDS_UNKN     0
dtype: int64

### Step 4: Use the .ge method to return a boolean DataFrame that checks for diversity (values of at least 0.15)

In [88]:
# Element-wise "greater than or equal": True wherever a value is at least 0.15.
college_ugds.ge(0.15)

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,False,True,False,False,False,False,False,False,False
University of Alabama at Birmingham,True,True,False,False,False,False,False,False,False
Amridge University,True,True,False,False,False,False,False,False,True
University of Alabama in Huntsville,True,False,False,False,False,False,False,False,False
Alabama State University,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
Hollywood Institute of Beauty Careers-West Palm Beach,True,True,True,False,False,False,False,False,False
Hollywood Institute of Beauty Careers-Casselberry,False,True,True,False,False,False,False,False,False
Coachella Valley Beauty College-Beaumont,True,False,True,False,False,False,False,False,False
Dewey University-Mayaguez,False,False,True,False,False,False,False,False,False


### Step 5: We can use the .sum method to count the true values

In [90]:
# For each school (row), count how many columns are at or above 0.15.
# Summing booleans counts the True values.
diversity_metric = (college_ugds >= 0.15).sum(axis=1)
diversity_metric.head()

INSTNM
Alabama A & M University               1
University of Alabama at Birmingham    2
Amridge University                     3
University of Alabama in Huntsville    1
Alabama State University               1
dtype: int64

### Step 6: To get an idea of distribution we use value_counts method

In [91]:
# How many schools have each diversity score, most common score first.
diversity_metric.value_counts(sort=True, ascending=False)

1    3042
2    2884
3     876
4      63
0       7
5       2
dtype: int64

### Step 7: Sort the values by school to see how many schools have higher diversity

In [92]:
# Highest diversity scores first; show the top five schools.
(
    diversity_metric
    .sort_values(ascending=False)
    .head()
)

INSTNM
Regency Beauty Institute-Austin          5
Central Texas Beauty College-Temple      5
Sullivan and Cogliano Training Center    4
Ambria College of Nursing                4
Berkeley College-New York                4
dtype: int64

### Step 8: It seems a little suspicious that schools can be that diverse. Let's take a look at the raw percentages

In [94]:
# Pull the raw values (all columns) for the two schools with the highest score.
college_ugds.loc[
    [
        'Regency Beauty Institute-Austin',
        'Central Texas Beauty College-Temple',
    ],
    :,
]

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Regency Beauty Institute-Austin,0.1867,0.2133,0.16,0.0,0.0,0.0,0.1733,0.0,0.2667
Central Texas Beauty College-Temple,0.1616,0.2323,0.2626,0.0202,0.0,0.0,0.1717,0.0,0.1515


### Step 9: Check for top 5 US schools 

In [95]:
# Schools to look up by name in the diversity metric.
us_new_top = [
    'Rutgers University-Newark',
    'Andrews University',
    'Stanford University',
    'University of Houston',
    'University of Nevada-Las Vegas',
]
# Label-based selection returns the scores in the order listed above.
diversity_metric.loc[us_new_top]

INSTNM
Rutgers University-Newark         4
Andrews University                3
Stanford University               3
University of Houston             3
University of Nevada-Las Vegas    3
dtype: int64

## __What did I learn?__

### _Python in use_