In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [4]:
movie = pd.read_csv("data/movie.csv")
pd.set_option('display.max_rows', 10, 'display.max_columns' , len(movie.columns))

# Selecting multiple DataFrame columns

In [3]:
movie_actor_director = movie[['actor_1_name', 'actor_2_name','actor_2_name','director_name']]
movie_actor_director.head() 
# If a string is passed, it will return a single-dimensional Series. If a list is
# passed to the indexing operator, it returns a DataFrame of all the columns in the list in the
# specified order

Unnamed: 0,actor_1_name,actor_2_name,actor_2_name.1,director_name
0,CCH Pounder,Joel David Moore,Joel David Moore,James Cameron
1,Johnny Depp,Orlando Bloom,Orlando Bloom,Gore Verbinski
2,Christoph Waltz,Rory Kinnear,Rory Kinnear,Sam Mendes
3,Tom Hardy,Christian Bale,Christian Bale,Christopher Nolan
4,Doug Walker,Rob Walker,Rob Walker,Doug Walker


# Selecting columns with methods

In [5]:
movie.dtypes.value_counts()

float64    13
object     12
int64       3
dtype: int64

In [6]:
movie.select_dtypes(['int']).head() #int,float,object, bool - pass all requied as elements of list
# Use the select_dtypes method to select only the integer columns:

Unnamed: 0,num_voted_users,cast_total_facebook_likes,movie_facebook_likes
0,886204,4834,33000
1,471220,48350,0
2,275868,11700,85000
3,1144337,106759,164000
4,8,143,0


In [3]:
movie.select_dtypes(['int','float'])

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,723.0,178.0,0.0,855.0,1000.0,760505847.0,886204,4834,0.0,3054.0,237000000.0,2009.0,936.0,7.9,1.78,33000
1,302.0,169.0,563.0,1000.0,40000.0,309404152.0,471220,48350,0.0,1238.0,300000000.0,2007.0,5000.0,7.1,2.35,0
2,602.0,148.0,0.0,161.0,11000.0,200074175.0,275868,11700,1.0,994.0,245000000.0,2015.0,393.0,6.8,2.35,85000
3,813.0,164.0,22000.0,23000.0,27000.0,448130642.0,1144337,106759,0.0,2701.0,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,,131.0,,131.0,,8,143,0.0,,,,12.0,7.1,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4911,1.0,87.0,2.0,318.0,637.0,,629,2283,2.0,6.0,,2013.0,470.0,7.7,,84
4912,43.0,43.0,,319.0,841.0,,73839,1753,1.0,359.0,,,593.0,7.5,16.00,32000
4913,13.0,76.0,0.0,0.0,0.0,,38,0,0.0,3.0,1400.0,2013.0,0.0,6.3,,16
4914,14.0,100.0,0.0,489.0,946.0,10443.0,1255,2386,5.0,9.0,,2012.0,719.0,6.3,2.35,660


In [11]:
# If you would like to select all the numeric columns, you may simply pass the
# string number to the include parameter:

movie.select_dtypes(include=['integer']).head() #integer, float, int/integer, object, datetime, category (categorical)

Unnamed: 0,num_voted_users,cast_total_facebook_likes,movie_facebook_likes
0,886204,4834,33000
1,471220,48350,0
2,275868,11700,85000
3,1144337,106759,164000
4,8,143,0


In [8]:
# The filter method allows columns to be searched through regular expressions
# with the regex parameter. Here, we search for all columns that have a digit
# somewhere in their name

movie.filter(regex="\d")

Unnamed: 0,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,actor_1_name,actor_3_name,actor_2_facebook_likes
0,855.0,Joel David Moore,1000.0,CCH Pounder,Wes Studi,936.0
1,1000.0,Orlando Bloom,40000.0,Johnny Depp,Jack Davenport,5000.0
2,161.0,Rory Kinnear,11000.0,Christoph Waltz,Stephanie Sigman,393.0
3,23000.0,Christian Bale,27000.0,Tom Hardy,Joseph Gordon-Levitt,23000.0
4,,Rob Walker,131.0,Doug Walker,,12.0
...,...,...,...,...,...,...
4911,318.0,Daphne Zuniga,637.0,Eric Mabius,Crystal Lowe,470.0
4912,319.0,Valorie Curry,841.0,Natalie Zea,Sam Underwood,593.0
4913,0.0,Maxwell Moody,0.0,Eva Boehnke,David Chandler,0.0
4914,489.0,Daniel Henney,946.0,Alan Ruck,Eliza Coupe,719.0


In [9]:
# The filter method comes with another parameter, items, which takes a list of exact
# column names. This is nearly an exact duplication of the indexing operator, except that a
# KeyError will not be raised if one of the strings does not match a column name. For
# instance, movie.filter(items=['actor_1_name', 'asdf']) runs without error and
# returns a single column DataFrame.
movie.filter(items = ['actor_1_name','vineed']) #Vineed not present in DF. No error and returned the one which was present

Unnamed: 0,actor_1_name
0,CCH Pounder
1,Johnny Depp
2,Christoph Waltz
3,Tom Hardy
4,Doug Walker
...,...
4911,Eric Mabius
4912,Natalie Zea
4913,Eva Boehnke
4914,Alan Ruck


# Ordering column names sensibly

In [12]:
 #Getting the list of all columns
print(movie.columns)

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')


In [15]:
movie.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,actor_1_name,movie_title,num_voted_users,cast_total_facebook_likes,actor_3_name,facenumber_in_poster,plot_keywords,movie_imdb_link,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,886204,4834,Wes Studi,0.0,avatar|future|marine|native|paraplegic,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,471220,48350,Jack Davenport,0.0,goddess|marriage ceremony|marriage proposal|pi...,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,Christoph Waltz,Spectre,275868,11700,Stephanie Sigman,1.0,bomb|espionage|sequel|spy|terrorist,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,Tom Hardy,The Dark Knight Rises,1144337,106759,Joseph Gordon-Levitt,0.0,deception|imprisonment|lawlessness|police offi...,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,Doug Walker,Star Wars: Episode VII - The Force Awakens,8,143,,0.0,,http://www.imdb.com/title/tt5289954/?ref_=fn_t...,,,,,,,12.0,7.1,,0


In [20]:
disc_core = ['movie_title','language','country','title_year','content_rating']
disc_people = ['director_name','actor_2_name','actor_1_name','actor_3_name']
disc_other = ['color','genres','plot_keywords','movie_imdb_link']
cont_fb = (['director_facebook_likes', 'actor_1_facebook_likes', 
            'actor_2_facebook_likes', 'actor_3_facebook_likes',
            'cast_total_facebook_likes', 'movie_facebook_likes'])
cont_finance = ['budget', 'gross']
cont_num_reviews = (['num_voted_users', 'num_user_for_reviews',
                     'num_critic_for_reviews'])
cont_other = ['imdb_score', 'duration', 'aspect_ratio', 'facenumber_in_poster']

new_col_order = disc_core+disc_people+disc_other+cont_fb+cont_finance+cont_num_reviews+cont_other

In [21]:
# Python sets are unordered and the
# equality statement checks whether each member of one set is a member of the other.
set(movie.columns) == set(new_col_order)

True

In [25]:
movie2 = movie[new_col_order]


Unnamed: 0,movie_title,language,country,title_year,content_rating,director_name,actor_2_name,actor_1_name,actor_3_name,color,genres,plot_keywords,movie_imdb_link,director_facebook_likes,actor_1_facebook_likes,actor_2_facebook_likes,actor_3_facebook_likes,cast_total_facebook_likes,movie_facebook_likes,budget,gross,num_voted_users,num_user_for_reviews,num_critic_for_reviews,imdb_score,duration,aspect_ratio,facenumber_in_poster
0,Avatar,English,USA,2009.0,PG-13,James Cameron,Joel David Moore,CCH Pounder,Wes Studi,Color,Action|Adventure|Fantasy|Sci-Fi,avatar|future|marine|native|paraplegic,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,0.0,1000.0,936.0,855.0,4834,33000,237000000.0,760505847.0,886204,3054.0,723.0,7.9,178.0,1.78,0.0
1,Pirates of the Caribbean: At World's End,English,USA,2007.0,PG-13,Gore Verbinski,Orlando Bloom,Johnny Depp,Jack Davenport,Color,Action|Adventure|Fantasy,goddess|marriage ceremony|marriage proposal|pi...,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,563.0,40000.0,5000.0,1000.0,48350,0,300000000.0,309404152.0,471220,1238.0,302.0,7.1,169.0,2.35,0.0
2,Spectre,English,UK,2015.0,PG-13,Sam Mendes,Rory Kinnear,Christoph Waltz,Stephanie Sigman,Color,Action|Adventure|Thriller,bomb|espionage|sequel|spy|terrorist,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,0.0,11000.0,393.0,161.0,11700,85000,245000000.0,200074175.0,275868,994.0,602.0,6.8,148.0,2.35,1.0
3,The Dark Knight Rises,English,USA,2012.0,PG-13,Christopher Nolan,Christian Bale,Tom Hardy,Joseph Gordon-Levitt,Color,Action|Thriller,deception|imprisonment|lawlessness|police offi...,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,22000.0,27000.0,23000.0,23000.0,106759,164000,250000000.0,448130642.0,1144337,2701.0,813.0,8.5,164.0,2.35,0.0
4,Star Wars: Episode VII - The Force Awakens,,,,,Doug Walker,Rob Walker,Doug Walker,,,Documentary,,http://www.imdb.com/title/tt5289954/?ref_=fn_t...,131.0,131.0,12.0,,143,0,,,8,,,7.1,,,0.0


# Operating on the entire DataFrame

In [30]:
movie.shape #number of rows and columns
movie.size #total number of elements in the dataframe
movie.ndim # number of dimensions - axis 0 and 1 || Always 2 for DFs. 
len(movie) #nmber of rows

4916

In [37]:
movie.count() #gives length of each columns 

color                       19
director_name              102
num_critic_for_reviews      49
duration                    15
director_facebook_likes    102
                          ... 
title_year                 106
actor_2_facebook_likes      13
imdb_score                   0
aspect_ratio               326
movie_facebook_likes         0
Length: 28, dtype: int64

In [16]:
movie.min() #by defalt, skips NaN

  movie.min()


num_critic_for_reviews        1.0
duration                      7.0
director_facebook_likes       0.0
actor_3_facebook_likes        0.0
actor_1_facebook_likes        0.0
                            ...  
title_year                 1916.0
actor_2_facebook_likes        0.0
imdb_score                    1.6
aspect_ratio                 1.18
movie_facebook_likes            0
Length: 19, dtype: object

In [15]:
movie.min(skipna = False) #returns NaN if even one element is missing

  movie.min(skipna = False)


num_critic_for_reviews     NaN
duration                   NaN
director_facebook_likes    NaN
actor_3_facebook_likes     NaN
actor_1_facebook_likes     NaN
                          ... 
title_year                 NaN
actor_2_facebook_likes     NaN
imdb_score                 1.6
aspect_ratio               NaN
movie_facebook_likes         0
Length: 19, dtype: object

In [10]:
movie.describe()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
count,4867.0,4901.0,4814.0,4893.0,4909.0,4054.0,4916.0,4916.0,4903.0,4895.0,4432.0,4810.0,4903.0,4916.0,4590.0,4916.0
mean,137.988905,107.090798,691.014541,631.276313,6494.488491,47644510.0,82644.92,9579.815907,1.37732,267.668846,36547490.0,2002.447609,1621.923516,6.437429,2.222349,7348.294142
std,120.239379,25.286015,2832.954125,1625.874802,15106.986884,67372550.0,138322.2,18164.31699,2.023826,372.934839,100242700.0,12.453977,4011.299523,1.127802,1.40294,19206.016458
min,1.0,7.0,0.0,0.0,0.0,162.0,5.0,0.0,0.0,1.0,218.0,1916.0,0.0,1.6,1.18,0.0
25%,49.0,93.0,7.0,132.0,607.0,5019656.0,8361.75,1394.75,0.0,64.0,6000000.0,1999.0,277.0,5.8,1.85,0.0
50%,108.0,103.0,48.0,366.0,982.0,25043960.0,33132.5,3049.0,1.0,153.0,19850000.0,2005.0,593.0,6.6,2.35,159.0
75%,191.0,118.0,189.75,633.0,11000.0,61108410.0,93772.75,13616.75,2.0,320.5,43000000.0,2011.0,912.0,7.2,2.35,2000.0
max,813.0,511.0,23000.0,23000.0,640000.0,760505800.0,1689764.0,656730.0,43.0,5060.0,4200000000.0,2016.0,137000.0,9.5,16.0,349000.0


In [13]:
# It is possible to specify exact quantiles in the describe method using the percentiles parameter:
movie.describe(percentiles = [0.01,.5,.99])

# The numeric columns have missing values as well but have a result returned. By default,
# pandas handles missing values in numeric columns by skipping them. It is possible to
# change this behavior by setting the skipna parameter to False. This will cause pandas to
# return NaN for all these aggregation methods if there exists at least a single missing value.

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
count,4867.0,4901.0,4814.0,4893.0,4909.0,4054.0,4916.0,4916.0,4903.0,4895.0,4432.0,4810.0,4903.0,4916.0,4590.0,4916.0
mean,137.988905,107.090798,691.014541,631.276313,6494.488491,47644510.0,82644.92,9579.815907,1.37732,267.668846,36547490.0,2002.447609,1621.923516,6.437429,2.222349,7348.294142
std,120.239379,25.286015,2832.954125,1625.874802,15106.986884,67372550.0,138322.2,18164.31699,2.023826,372.934839,100242700.0,12.453977,4011.299523,1.127802,1.40294,19206.016458
min,1.0,7.0,0.0,0.0,0.0,162.0,5.0,0.0,0.0,1.0,218.0,1916.0,0.0,1.6,1.18,0.0
1%,2.0,43.0,0.0,0.0,6.08,8474.8,53.0,6.0,0.0,1.94,60000.0,1951.0,0.0,3.1,1.33,0.0
50%,108.0,103.0,48.0,366.0,982.0,25043960.0,33132.5,3049.0,1.0,153.0,19850000.0,2005.0,593.0,6.6,2.35,159.0
99%,546.68,189.0,16000.0,11000.0,44920.0,326412800.0,681584.6,62413.9,8.0,1999.24,200000000.0,2016.0,17000.0,8.5,4.0,93850.0
max,813.0,511.0,23000.0,23000.0,640000.0,760505800.0,1689764.0,656730.0,43.0,5060.0,4200000000.0,2016.0,137000.0,9.5,16.0,349000.0


In [19]:
#get sum of all missing values in the DF - Method 1
movie.isnull().sum().sum()

2654

In [20]:
#Method 2
(len(movie) - movie.count()).sum()

2654

In [26]:
#Check if any missing numbers in the DF
movie.isnull().any().any()

True

In [None]:
#Strings/object columns with missing values will not return max,min,sum. Explicitly, fillna needs to be used to remove missng

# Working with operators on a DataFrame

In [52]:
college = pd.read_csv("data/college.csv",index_col = 'INSTNM')
college #error
# Typically, when an operator is
# used with a DataFrame, the columns are either all numeric or all object (usually strings). If
# the DataFrame does not contain homogeneous data, then the operation is likely to fail.

Unnamed: 0_level_0,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,UGDS,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
Alabama A & M University,Normal,AL,1.0,0.0,0.0,0,424.0,420.0,0.0,4206.0,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0000,0.0059,0.0138,0.0656,1,0.7356,0.8284,0.1049,30300,33888
University of Alabama at Birmingham,Birmingham,AL,0.0,0.0,0.0,0,570.0,565.0,0.0,11383.0,0.5922,0.2600,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.0100,0.2607,1,0.3460,0.5214,0.2422,39700,21941.5
Amridge University,Montgomery,AL,0.0,0.0,0.0,1,,,1.0,291.0,0.2990,0.4192,0.0069,0.0034,0.0000,0.0000,0.0000,0.0000,0.2715,0.4536,1,0.6801,0.7795,0.8540,40100,23370
University of Alabama in Huntsville,Huntsville,AL,0.0,0.0,0.0,0,595.0,590.0,0.0,5451.0,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.0350,0.2146,1,0.3072,0.4596,0.2640,45500,24097
Alabama State University,Montgomery,AL,1.0,0.0,0.0,0,425.0,430.0,0.0,4811.0,0.0158,0.9208,0.0121,0.0019,0.0010,0.0006,0.0098,0.0243,0.0137,0.0892,1,0.7347,0.7554,0.1270,26600,33118.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,Emeryville,CA,,,,1,,,,,,,,,,,,,,,1,,,,,9500
Rasmussen College - Overland Park,Overland Park,KS,,,,1,,,,,,,,,,,,,,,1,,,,,21163
National Personal Training Institute of Cleveland,Highland Heights,OH,,,,1,,,,,,,,,,,,,,,1,,,,,6333
Bay Area Medical Academy - San Jose Satellite Location,San Jose,CA,,,,1,,,,,,,,,,,,,,,1,,,,,PrivacySuppressed


In [53]:
college_ugds_= college.filter(like="UGDS_")
college_ugds_.head() #missing vales stay missing even after the addition

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


In [43]:
college_ugds_+.0051 #missing vales shay missing even after the addition


Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0384,0.9404,0.0106,0.0070,0.0075,0.0070,0.0051,0.0110,0.0189
University of Alabama at Birmingham,0.5973,0.2651,0.0334,0.0569,0.0073,0.0058,0.0419,0.0230,0.0151
Amridge University,0.3041,0.4243,0.0120,0.0085,0.0051,0.0051,0.0051,0.0051,0.2766
University of Alabama in Huntsville,0.7039,0.1306,0.0433,0.0427,0.0194,0.0053,0.0223,0.0383,0.0401
Alabama State University,0.0209,0.9259,0.0172,0.0070,0.0061,0.0057,0.0149,0.0294,0.0188
...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,,,,,,,,,
Rasmussen College - Overland Park,,,,,,,,,
National Personal Training Institute of Cleveland,,,,,,,,,
Bay Area Medical Academy - San Jose Satellite Location,,,,,,,,,


In [45]:
(college_ugds_+.0051)//.01/100

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.03,0.94,0.01,0.00,0.00,0.0,0.00,0.01,0.01
University of Alabama at Birmingham,0.59,0.26,0.03,0.05,0.00,0.0,0.04,0.02,0.01
Amridge University,0.30,0.42,0.01,0.00,0.00,0.0,0.00,0.00,0.27
University of Alabama in Huntsville,0.70,0.13,0.04,0.04,0.01,0.0,0.02,0.03,0.04
Alabama State University,0.02,0.92,0.01,0.00,0.00,0.0,0.01,0.02,0.01
...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,,,,,,,,,
Rasmussen College - Overland Park,,,,,,,,,
National Personal Training Institute of Cleveland,,,,,,,,,
Bay Area Medical Academy - San Jose Satellite Location,,,,,,,,,


In [58]:
college_ugds_op_round = (college_ugds_+.00501)//.01/100
college_ugds_round = (college_ugds_ + .00001).round(2)
# print(college_ugds_round)
college_ugds_op_round.equals(college_ugds_round)


True

# Comparing missing values

In [63]:
np.nan ==np.nan
college_ugds_compare = college_ugds_op_round ==college_ugds_round
college_ugds_compare.all() 

UGDS_WHITE    False
UGDS_BLACK    False
UGDS_HISP     False
UGDS_ASIAN    False
UGDS_AIAN     False
UGDS_NHPI     False
UGDS_2MOR     False
UGDS_NRA      False
UGDS_UNKN     False
dtype: bool

In [66]:
#Equals method to be used to compare two data frames
college_ugds_op_round.equals(college_ugds_round)


True

In [69]:
college_ugds_.isnull().sum() #counting number of missing values
(len(college_ugds_) - college_ugds_.count()).equals(college_ugds_.isnull().sum()) #Comparing two methods

True

# Transposing the direction of a DataFrame operation

In [73]:
college_ugds_.count(axis=1)
college_ugds_.sum(axis=1)

INSTNM
Alabama A & M University                                  1.0000
University of Alabama at Birmingham                       0.9999
Amridge University                                        1.0000
University of Alabama in Huntsville                       1.0000
Alabama State University                                  1.0000
                                                           ...  
SAE Institute of Technology  San Francisco                0.0000
Rasmussen College - Overland Park                         0.0000
National Personal Training Institute of Cleveland         0.0000
Bay Area Medical Academy - San Jose Satellite Location    0.0000
Excel Learning Center-San Antonio South                   0.0000
Length: 7535, dtype: float64

In [75]:
#Cumulative sum method
college_ugds_cumsum = college_ugds_.cumsum(axis = 1)
college_ugds_cumsum.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9686,0.9741,0.976,0.9784,0.9803,0.9803,0.9862,1.0
University of Alabama at Birmingham,0.5922,0.8522,0.8805,0.9323,0.9345,0.9352,0.972,0.9899,0.9999
Amridge University,0.299,0.7182,0.7251,0.7285,0.7285,0.7285,0.7285,0.7285,1.0
University of Alabama in Huntsville,0.6988,0.8243,0.8625,0.9001,0.9144,0.9146,0.9318,0.965,1.0
Alabama State University,0.0158,0.9366,0.9487,0.9506,0.9516,0.9522,0.962,0.9863,1.0


# Determining college campus diversity

In [77]:
college_ugds_.isnull().sum(axis=1).sort_values(ascending=False).head()

INSTNM
Excel Learning Center-San Antonio South              9
Western State College of Law at Argosy University    9
Albany Law School                                    9
Albany Medical College                               9
A T Still University of Health Sciences              9
dtype: int64

In [79]:
college_ugds_dropped_na = college_ugds_.dropna(how="all")
# The dropna method in step 3 has the how parameter, which is
# defaulted to the string any but may also be changed to all. When set to any, it drops rows
# that contain one or more missing values. When set to all, it only drops rows where all
# values are missing.
# In this case, we conservatively drop rows that are missing all values. This is because it's
# possible that some missing values simply represent 0 percent. This did not happen to be the
# case here, as there were no missing values after the dropna was performed. If there were
# still missing values, we could have run the fillna(0) method to fill all the remaining
# values with 0.
college_ugds_dropped_na.isnull().sum(axis = 1).sort_values(ascending = False)

INSTNM
Alabama A & M University                          0
California State University-Monterey Bay          0
Lorain County Joint Vocational School District    0
Pike County Joint Vocational School District      0
South Texas College                               0
                                                 ..
CUNY Hunter College                               0
CUNY Hostos Community College                     0
CUNY Graduate School and University Center        0
CUNY City College                                 0
Coastal Pines Technical College                   0
Length: 6874, dtype: int64

In [82]:
diversity_metric = college_ugds_dropped_na.ge(.15).sum(axis = 1)
diversity_metric.value_counts()
diversity_metric.sort_values(ascending=False)

INSTNM
Central Texas Beauty College-Temple                               5
Regency Beauty Institute-Austin                                   5
Westwood College-O'Hare Airport                                   4
Regency Beauty Institute-Pasadena                                 4
Soma Institute-The National School of Clinical Massage Therapy    4
                                                                 ..
Professional Business College                                     0
Education and Technology Institute                                0
Taft University System                                            0
Prince Institute-Rocky Mountains                                  0
Spanish-American Institute                                        0
Length: 6874, dtype: int64

In [84]:
college_ugds_dropped_na.loc[['Central Texas Beauty College-Temple','Regency Beauty Institute-Austin']]

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Central Texas Beauty College-Temple,0.1616,0.2323,0.2626,0.0202,0.0,0.0,0.1717,0.0,0.1515
Regency Beauty Institute-Austin,0.1867,0.2133,0.16,0.0,0.0,0.0,0.1733,0.0,0.2667


In [102]:
top_diverse_college = pd.read_csv('data/college_diversity.csv')
top_diverse_college
# top_college = ['Andrews University',
#                'Stanford University','University of Houston','University of Nevada--Las Vegas',
#                'University of San Francisco','San Francisco State University','University of Illinois',
# #                'New Jersey Institute of Technology',"Texas Woman's University"]

# us = ['Rutgers University-Newark',
# 'Andrews University',
# 'Stanford University',
# 'University of Houston',
# 'University of Nevada-Las Vegas']
               


Unnamed: 0,School,Diversity Index
0,"Rutgers University--Newark Newark, NJ",0.76
1,"Andrews University Berrien Springs, MI",0.74
2,"Stanford University Stanford, CA",0.74
3,"University of Houston Houston, TX",0.74
4,"University of Nevada--Las Vegas Las Vegas, NV",0.74
5,"University of San Francisco San Francisco, CA",0.74
6,"San Francisco State University San Francisco, CA",0.73
7,"University of Illinois--Chicago Chicago, IL",0.73
8,"New Jersey Institute of Technology Newark, NJ",0.72
9,"Texas Woman's University Denton, TX",0.72


In [101]:
diversity_metric.loc[us]

INSTNM
Rutgers University-Newark         4
Andrews University                3
Stanford University               3
University of Houston             3
University of Nevada-Las Vegas    3
dtype: int64