# Day 7 - April 28, 2020

### Selecting multiple dataframe columns

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Raw string literal: every backslash in the Windows path is kept as-is instead of
# relying on unrecognized escape sequences (\L, \P, \d, \m) being passed through.
movies = pd.read_csv(r"D:\Learning Data Science\Python - Pandas\Pandas cookbook\Pandas-Cookbook-master\data\movie.csv")

In [3]:
movies.head(3).T

Unnamed: 0,0,1,2
color,Color,Color,Color
director_name,James Cameron,Gore Verbinski,Sam Mendes
num_critic_for_reviews,723,302,602
duration,178,169,148
director_facebook_likes,0,563,0
actor_3_facebook_likes,855,1000,161
actor_2_name,Joel David Moore,Orlando Bloom,Rory Kinnear
actor_1_facebook_likes,1000,40000,11000
gross,7.60506e+08,3.09404e+08,2.00074e+08
genres,Action|Adventure|Fantasy|Sci-Fi,Action|Adventure|Fantasy,Action|Adventure|Thriller


In [4]:
movie_actor_director = movies[["actor_1_name", "actor_2_name", "actor_3_name", "director_name"]]
movie_actor_director.head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,director_name
0,CCH Pounder,Joel David Moore,Wes Studi,James Cameron
1,Johnny Depp,Orlando Bloom,Jack Davenport,Gore Verbinski
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Sam Mendes
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Christopher Nolan
4,Doug Walker,Rob Walker,,Doug Walker


In [5]:
# when selecting a single column of a df, if we pass in a list with a single item, we will get back a DataFrame... 

type(movies[["director_name"]])

pandas.core.frame.DataFrame

In [6]:
# If we pass in just a string with the column name, we will get a Series back.

type(movies["director_name"])

pandas.core.series.Series

In [7]:
# using .loc to pull out a column by name can also return either a DataFrame or a Series

type(movies.loc[:,["director_name"]])

pandas.core.frame.DataFrame

In [8]:
type(movies.loc[:,"director_name"])

pandas.core.series.Series

In [9]:
# to avoid readability issues, save column names to a list variable first

cols = ["actor_1_name", "actor_2_name", "actor_3_name", "director_name"]
movie_actor_director = movies[cols]
movie_actor_director.head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,director_name
0,CCH Pounder,Joel David Moore,Wes Studi,James Cameron
1,Johnny Depp,Orlando Bloom,Jack Davenport,Gore Verbinski
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Sam Mendes
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Christopher Nolan
4,Doug Walker,Rob Walker,,Doug Walker


In [10]:
# KeyError is raised whenever a multiple-column selection is attempted without the use of a list

# movies["actor_1_name", "actor_2_name", "actor_3_name", "director_name"]

In [11]:
# shortening the column names

def shorten(col):
    """Return the column label as a string with verbose fragments abbreviated.

    Every occurrence of "facebook_likes" becomes "fb" and every occurrence of
    "_for_reviews" is removed. Labels containing neither fragment come back
    unchanged, converted to ``str``.
    """
    label = str(col)
    # Substitutions are applied one after another, in this order.
    for old, new in (("facebook_likes", "fb"), ("_for_reviews", "")):
        label = label.replace(old, new)
    return label

movies = movies.rename(columns=shorten)
movies.head().T

Unnamed: 0,0,1,2,3,4
color,Color,Color,Color,Color,
director_name,James Cameron,Gore Verbinski,Sam Mendes,Christopher Nolan,Doug Walker
num_critic,723,302,602,813,
duration,178,169,148,164,
director_fb,0,563,0,22000,131
actor_3_fb,855,1000,161,23000,
actor_2_name,Joel David Moore,Orlando Bloom,Rory Kinnear,Christian Bale,Rob Walker
actor_1_fb,1000,40000,11000,27000,131
gross,7.60506e+08,3.09404e+08,2.00074e+08,4.48131e+08,
genres,Action|Adventure|Fantasy|Sci-Fi,Action|Adventure|Fantasy,Action|Adventure|Thriller,Action|Thriller,Documentary


In [12]:
# selecting columns by datatypes
movies.dtypes

color                    object
director_name            object
num_critic              float64
duration                float64
director_fb             float64
actor_3_fb              float64
actor_2_name             object
actor_1_fb              float64
gross                   float64
genres                   object
actor_1_name             object
movie_title              object
num_voted_users           int64
cast_total_fb             int64
actor_3_name             object
facenumber_in_poster    float64
plot_keywords            object
movie_imdb_link          object
num_user                float64
language                 object
country                  object
content_rating           object
budget                  float64
title_year              float64
actor_2_fb              float64
imdb_score              float64
aspect_ratio            float64
movie_fb                  int64
dtype: object

In [13]:
movies.dtypes.value_counts()

float64    13
object     12
int64       3
dtype: int64

In [14]:
# keep only the columns whose dtype is int64
movies.select_dtypes(include="int64").head()

Unnamed: 0,num_voted_users,cast_total_fb,movie_fb
0,886204,4834,33000
1,471220,48350,0
2,275868,11700,85000
3,1144337,106759,164000
4,8,143,0


In [15]:
# "number" is a generic selector: it picks up both the float64 and the int64 columns
movies.select_dtypes(include="number").head()

Unnamed: 0,num_critic,duration,director_fb,actor_3_fb,actor_1_fb,gross,num_voted_users,cast_total_fb,facenumber_in_poster,num_user,budget,title_year,actor_2_fb,imdb_score,aspect_ratio,movie_fb
0,723.0,178.0,0.0,855.0,1000.0,760505847.0,886204,4834,0.0,3054.0,237000000.0,2009.0,936.0,7.9,1.78,33000
1,302.0,169.0,563.0,1000.0,40000.0,309404152.0,471220,48350,0.0,1238.0,300000000.0,2007.0,5000.0,7.1,2.35,0
2,602.0,148.0,0.0,161.0,11000.0,200074175.0,275868,11700,1.0,994.0,245000000.0,2015.0,393.0,6.8,2.35,85000
3,813.0,164.0,22000.0,23000.0,27000.0,448130642.0,1144337,106759,0.0,2701.0,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,,131.0,,131.0,,8,143,0.0,,,,12.0,7.1,,0


In [16]:
# a list of dtypes selects the union: here the int64 and the object columns
movies.select_dtypes(include=["int64","object"]).head()

Unnamed: 0,color,director_name,actor_2_name,genres,actor_1_name,movie_title,num_voted_users,cast_total_fb,actor_3_name,plot_keywords,movie_imdb_link,language,country,content_rating,movie_fb
0,Color,James Cameron,Joel David Moore,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,886204,4834,Wes Studi,avatar|future|marine|native|paraplegic,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,English,USA,PG-13,33000
1,Color,Gore Verbinski,Orlando Bloom,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,471220,48350,Jack Davenport,goddess|marriage ceremony|marriage proposal|pi...,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,English,USA,PG-13,0
2,Color,Sam Mendes,Rory Kinnear,Action|Adventure|Thriller,Christoph Waltz,Spectre,275868,11700,Stephanie Sigman,bomb|espionage|sequel|spy|terrorist,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,English,UK,PG-13,85000
3,Color,Christopher Nolan,Christian Bale,Action|Thriller,Tom Hardy,The Dark Knight Rises,1144337,106759,Joseph Gordon-Levitt,deception|imprisonment|lawlessness|police offi...,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,English,USA,PG-13,164000
4,,Doug Walker,Rob Walker,Documentary,Doug Walker,Star Wars: Episode VII - The Force Awakens,8,143,,,http://www.imdb.com/title/tt5289954/?ref_=fn_t...,,,,0


In [17]:
# exclude= is the inverse filter: drop the float columns and keep everything else
movies.select_dtypes(exclude="float").head()

Unnamed: 0,color,director_name,actor_2_name,genres,actor_1_name,movie_title,num_voted_users,cast_total_fb,actor_3_name,plot_keywords,movie_imdb_link,language,country,content_rating,movie_fb
0,Color,James Cameron,Joel David Moore,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,886204,4834,Wes Studi,avatar|future|marine|native|paraplegic,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,English,USA,PG-13,33000
1,Color,Gore Verbinski,Orlando Bloom,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,471220,48350,Jack Davenport,goddess|marriage ceremony|marriage proposal|pi...,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,English,USA,PG-13,0
2,Color,Sam Mendes,Rory Kinnear,Action|Adventure|Thriller,Christoph Waltz,Spectre,275868,11700,Stephanie Sigman,bomb|espionage|sequel|spy|terrorist,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,English,UK,PG-13,85000
3,Color,Christopher Nolan,Christian Bale,Action|Thriller,Tom Hardy,The Dark Knight Rises,1144337,106759,Joseph Gordon-Levitt,deception|imprisonment|lawlessness|police offi...,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,English,USA,PG-13,164000
4,,Doug Walker,Rob Walker,Documentary,Doug Walker,Star Wars: Episode VII - The Force Awakens,8,143,,,http://www.imdb.com/title/tt5289954/?ref_=fn_t...,,,,0


In [18]:
# .filter method searches (only) column names or index labels (and not data) based on the parameter used

movies.filter(like="fb").head()

Unnamed: 0,director_fb,actor_3_fb,actor_1_fb,cast_total_fb,actor_2_fb,movie_fb
0,0.0,855.0,1000.0,4834,936.0,33000
1,563.0,1000.0,40000.0,48350,5000.0,0
2,0.0,161.0,11000.0,11700,393.0,85000
3,22000.0,23000.0,27000.0,106759,23000.0,164000
4,131.0,,131.0,143,12.0,0


In [19]:
cols = ["actor_1_name", "actor_2_name", "actor_3_name", "director_name"]

movies.filter(items=cols).head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,director_name
0,CCH Pounder,Joel David Moore,Wes Studi,James Cameron
1,Johnny Depp,Orlando Bloom,Jack Davenport,Gore Verbinski
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Sam Mendes
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Christopher Nolan
4,Doug Walker,Rob Walker,,Doug Walker


In [20]:
# .filter method allows columns to be searched with regular expressions using the regex parameter

movies.filter(regex=r"\d").head() #searches columns for a digit in their name

Unnamed: 0,actor_3_fb,actor_2_name,actor_1_fb,actor_1_name,actor_3_name,actor_2_fb
0,855.0,Joel David Moore,1000.0,CCH Pounder,Wes Studi,936.0
1,1000.0,Orlando Bloom,40000.0,Johnny Depp,Jack Davenport,5000.0
2,161.0,Rory Kinnear,11000.0,Christoph Waltz,Stephanie Sigman,393.0
3,23000.0,Christian Bale,27000.0,Tom Hardy,Joseph Gordon-Levitt,23000.0
4,,Rob Walker,131.0,Doug Walker,,12.0


# Day 8 - April 29, 2020

### Ordering column names

In [21]:
"""A guideline to order columns:
- Classify each column as either categorical or continuous
- Group common columns within the categorical and continuous columns
- Place the most important groups of columns first, with categorical columns before
  continuous ones"""

movies.head(3).T

Unnamed: 0,0,1,2
color,Color,Color,Color
director_name,James Cameron,Gore Verbinski,Sam Mendes
num_critic,723,302,602
duration,178,169,148
director_fb,0,563,0
actor_3_fb,855,1000,161
actor_2_name,Joel David Moore,Orlando Bloom,Rory Kinnear
actor_1_fb,1000,40000,11000
gross,7.60506e+08,3.09404e+08,2.00074e+08
genres,Action|Adventure|Fantasy|Sci-Fi,Action|Adventure|Fantasy,Action|Adventure|Thriller


In [22]:
movies.columns

Index(['color', 'director_name', 'num_critic', 'duration', 'director_fb',
       'actor_3_fb', 'actor_2_name', 'actor_1_fb', 'gross', 'genres',
       'actor_1_name', 'movie_title', 'num_voted_users', 'cast_total_fb',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user', 'language', 'country', 'content_rating',
       'budget', 'title_year', 'actor_2_fb', 'imdb_score', 'aspect_ratio',
       'movie_fb'],
      dtype='object')

In [23]:
# Categorical column groups.
cat_core = [
    "movie_title",
    "title_year",
    "content_rating",
    "genres",
]
cat_people = [
    "director_name",
    "actor_1_name",
    "actor_2_name",
    "actor_3_name",
]
cat_other = [
    "color",
    "country",
    "language",
    "plot_keywords",
    "movie_imdb_link",
]

# Continuous column groups.
cont_fb = [
    "director_fb",
    "actor_1_fb",
    "actor_2_fb",
    "actor_3_fb",
    "cast_total_fb",
    "movie_fb",
]
cont_finance = ["budget", "gross"]
cont_num_reviews = ["num_voted_users", "num_user", "num_critic"]
cont_other = [
    "imdb_score",
    "duration",
    "aspect_ratio",
    "facenumber_in_poster",
]

# Final ordering: categorical groups first, then the continuous ones.
new_col_order = [
    *cat_core,
    *cat_people,
    *cat_other,
    *cont_fb,
    *cont_finance,
    *cont_num_reviews,
    *cont_other,
]
new_col_order

['movie_title',
 'title_year',
 'content_rating',
 'genres',
 'director_name',
 'actor_1_name',
 'actor_2_name',
 'actor_3_name',
 'color',
 'country',
 'language',
 'plot_keywords',
 'movie_imdb_link',
 'director_fb',
 'actor_1_fb',
 'actor_2_fb',
 'actor_3_fb',
 'cast_total_fb',
 'movie_fb',
 'budget',
 'gross',
 'num_voted_users',
 'num_user',
 'num_critic',
 'imdb_score',
 'duration',
 'aspect_ratio',
 'facenumber_in_poster']

In [24]:
set(movies.columns) == set(new_col_order)

True

In [25]:
movies_reordered = movies[new_col_order]
movies_reordered.head()

Unnamed: 0,movie_title,title_year,content_rating,genres,director_name,actor_1_name,actor_2_name,actor_3_name,color,country,...,movie_fb,budget,gross,num_voted_users,num_user,num_critic,imdb_score,duration,aspect_ratio,facenumber_in_poster
0,Avatar,2009.0,PG-13,Action|Adventure|Fantasy|Sci-Fi,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Color,USA,...,33000,237000000.0,760505847.0,886204,3054.0,723.0,7.9,178.0,1.78,0.0
1,Pirates of the Caribbean: At World's End,2007.0,PG-13,Action|Adventure|Fantasy,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Color,USA,...,0,300000000.0,309404152.0,471220,1238.0,302.0,7.1,169.0,2.35,0.0
2,Spectre,2015.0,PG-13,Action|Adventure|Thriller,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Color,UK,...,85000,245000000.0,200074175.0,275868,994.0,602.0,6.8,148.0,2.35,1.0
3,The Dark Knight Rises,2012.0,PG-13,Action|Thriller,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Color,USA,...,164000,250000000.0,448130642.0,1144337,2701.0,813.0,8.5,164.0,2.35,0.0
4,Star Wars: Episode VII - The Force Awakens,,,Documentary,Doug Walker,Doug Walker,Rob Walker,,,,...,0,,,8,,,7.1,,,0.0


### Summarizing a DataFrame

In [26]:
movies.shape

(4916, 28)

In [27]:
movies.size

137648

In [28]:
movies.ndim

2

In [29]:
len(movies)

4916

In [30]:
movies.count()

color                   4897
director_name           4814
num_critic              4867
duration                4901
director_fb             4814
actor_3_fb              4893
actor_2_name            4903
actor_1_fb              4909
gross                   4054
genres                  4916
actor_1_name            4909
movie_title             4916
num_voted_users         4916
cast_total_fb           4916
actor_3_name            4893
facenumber_in_poster    4903
plot_keywords           4764
movie_imdb_link         4916
num_user                4895
language                4904
country                 4911
content_rating          4616
budget                  4432
title_year              4810
actor_2_fb              4903
imdb_score              4916
aspect_ratio            4590
movie_fb                4916
dtype: int64

In [31]:
movies.min()

num_critic                 1.00
duration                   7.00
director_fb                0.00
actor_3_fb                 0.00
actor_1_fb                 0.00
gross                    162.00
num_voted_users            5.00
cast_total_fb              0.00
facenumber_in_poster       0.00
num_user                   1.00
budget                   218.00
title_year              1916.00
actor_2_fb                 0.00
imdb_score                 1.60
aspect_ratio               1.18
movie_fb                   0.00
dtype: float64

In [32]:
# with skipna=False, a column that contains any missing value reports NaN as its minimum
movies.min(skipna=False)

num_critic              NaN
duration                NaN
director_fb             NaN
actor_3_fb              NaN
actor_1_fb              NaN
gross                   NaN
num_voted_users         5.0
cast_total_fb           0.0
facenumber_in_poster    NaN
num_user                NaN
budget                  NaN
title_year              NaN
actor_2_fb              NaN
imdb_score              1.6
aspect_ratio            NaN
movie_fb                0.0
dtype: float64

In [33]:
movies.max()

num_critic              8.130000e+02
duration                5.110000e+02
director_fb             2.300000e+04
actor_3_fb              2.300000e+04
actor_1_fb              6.400000e+05
gross                   7.605058e+08
num_voted_users         1.689764e+06
cast_total_fb           6.567300e+05
facenumber_in_poster    4.300000e+01
num_user                5.060000e+03
budget                  4.200000e+09
title_year              2.016000e+03
actor_2_fb              1.370000e+05
imdb_score              9.500000e+00
aspect_ratio            1.600000e+01
movie_fb                3.490000e+05
dtype: float64

In [34]:
movies.mean()

num_critic              1.379889e+02
duration                1.070908e+02
director_fb             6.910145e+02
actor_3_fb              6.312763e+02
actor_1_fb              6.494488e+03
gross                   4.764451e+07
num_voted_users         8.264492e+04
cast_total_fb           9.579816e+03
facenumber_in_poster    1.377320e+00
num_user                2.676688e+02
budget                  3.654749e+07
title_year              2.002448e+03
actor_2_fb              1.621924e+03
imdb_score              6.437429e+00
aspect_ratio            2.222349e+00
movie_fb                7.348294e+03
dtype: float64

In [35]:
movies.median()

num_critic                   108.00
duration                     103.00
director_fb                   48.00
actor_3_fb                   366.00
actor_1_fb                   982.00
gross                   25043962.00
num_voted_users            33132.50
cast_total_fb               3049.00
facenumber_in_poster           1.00
num_user                     153.00
budget                  19850000.00
title_year                  2005.00
actor_2_fb                   593.00
imdb_score                     6.60
aspect_ratio                   2.35
movie_fb                     159.00
dtype: float64

In [36]:
movies.std()

num_critic              1.202394e+02
duration                2.528602e+01
director_fb             2.832954e+03
actor_3_fb              1.625875e+03
actor_1_fb              1.510699e+04
gross                   6.737255e+07
num_voted_users         1.383222e+05
cast_total_fb           1.816432e+04
facenumber_in_poster    2.023826e+00
num_user                3.729348e+02
budget                  1.002427e+08
title_year              1.245398e+01
actor_2_fb              4.011300e+03
imdb_score              1.127802e+00
aspect_ratio            1.402940e+00
movie_fb                1.920602e+04
dtype: float64

In [37]:
movies.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
num_critic,4867.0,137.9889,120.2394,1.0,49.0,108.0,191.0,813.0
duration,4901.0,107.0908,25.28602,7.0,93.0,103.0,118.0,511.0
director_fb,4814.0,691.0145,2832.954,0.0,7.0,48.0,189.75,23000.0
actor_3_fb,4893.0,631.2763,1625.875,0.0,132.0,366.0,633.0,23000.0
actor_1_fb,4909.0,6494.488,15106.99,0.0,607.0,982.0,11000.0,640000.0
gross,4054.0,47644510.0,67372550.0,162.0,5019656.25,25043962.0,61108412.75,760505800.0
num_voted_users,4916.0,82644.92,138322.2,5.0,8361.75,33132.5,93772.75,1689764.0
cast_total_fb,4916.0,9579.816,18164.32,0.0,1394.75,3049.0,13616.75,656730.0
facenumber_in_poster,4903.0,1.37732,2.023826,0.0,0.0,1.0,2.0,43.0
num_user,4895.0,267.6688,372.9348,1.0,64.0,153.0,320.5,5060.0


In [38]:
# custom percentiles replace the default 25%/75% quartiles; the 50% (median) row is still reported
movies.describe(percentiles=[0.2,0.4,0.6,0.8]).T

Unnamed: 0,count,mean,std,min,20%,40%,50%,60%,80%,max
num_critic,4867.0,137.9889,120.2394,1.0,39.0,82.0,108.0,137.0,219.0,813.0
duration,4901.0,107.0908,25.28602,7.0,91.0,99.0,103.0,108.0,121.0,511.0
director_fb,4814.0,691.0145,2832.954,0.0,3.0,25.0,48.0,83.8,272.0,23000.0
actor_3_fb,4893.0,631.2763,1625.875,0.0,94.4,260.0,366.0,466.0,697.0,23000.0
actor_1_fb,4909.0,6494.488,15106.99,0.0,510.0,854.0,982.0,1000.0,13000.0,640000.0
gross,4054.0,47644510.0,67372550.0,162.0,2924634.2,15337338.6,25043962.0,36000000.0,74945462.8,760505800.0
num_voted_users,4916.0,82644.92,138322.2,5.0,5526.0,20885.0,33132.5,52244.0,120202.0,1689764.0
cast_total_fb,4916.0,9579.816,18164.32,0.0,1120.0,2348.0,3049.0,4302.0,16125.0,656730.0
facenumber_in_poster,4903.0,1.37732,2.023826,0.0,0.0,0.0,1.0,1.0,2.0,43.0
num_user,4895.0,267.6688,372.9348,1.0,47.0,113.0,153.0,205.0,384.0,5060.0


### Chaining DataFrame methods

In [39]:
movies.head().T

Unnamed: 0,0,1,2,3,4
color,Color,Color,Color,Color,
director_name,James Cameron,Gore Verbinski,Sam Mendes,Christopher Nolan,Doug Walker
num_critic,723,302,602,813,
duration,178,169,148,164,
director_fb,0,563,0,22000,131
actor_3_fb,855,1000,161,23000,
actor_2_name,Joel David Moore,Orlando Bloom,Rory Kinnear,Christian Bale,Rob Walker
actor_1_fb,1000,40000,11000,27000,131
gross,7.60506e+08,3.09404e+08,2.00074e+08,4.48131e+08,
genres,Action|Adventure|Fantasy|Sci-Fi,Action|Adventure|Fantasy,Action|Adventure|Thriller,Action|Thriller,Documentary


In [40]:
# shortening the column names
def shorten(col):
    """Return the column label as a string with verbose fragments abbreviated.

    Every occurrence of "facebook_likes" becomes "fb" and every occurrence of
    "_for_reviews" is removed. Labels containing neither fragment come back
    unchanged, converted to ``str``.

    NOTE(review): behaves the same as the ``shorten`` defined earlier in this
    notebook; consider keeping only one definition.
    """
    label = str(col)
    # Substitutions are applied one after another, in this order.
    for old, new in (("facebook_likes", "fb"), ("_for_reviews", "")):
        label = label.replace(old, new)
    return label

movies = movies.rename(columns=shorten)
movies.head().T

Unnamed: 0,0,1,2,3,4
color,Color,Color,Color,Color,
director_name,James Cameron,Gore Verbinski,Sam Mendes,Christopher Nolan,Doug Walker
num_critic,723,302,602,813,
duration,178,169,148,164,
director_fb,0,563,0,22000,131
actor_3_fb,855,1000,161,23000,
actor_2_name,Joel David Moore,Orlando Bloom,Rory Kinnear,Christian Bale,Rob Walker
actor_1_fb,1000,40000,11000,27000,131
gross,7.60506e+08,3.09404e+08,2.00074e+08,4.48131e+08,
genres,Action|Adventure|Fantasy|Sci-Fi,Action|Adventure|Fantasy,Action|Adventure|Thriller,Action|Thriller,Documentary


In [41]:
# counting the missing values in the df

movies.isnull().head()

Unnamed: 0,color,director_name,num_critic,duration,director_fb,actor_3_fb,actor_2_name,actor_1_fb,gross,genres,...,num_user,language,country,content_rating,budget,title_year,actor_2_fb,imdb_score,aspect_ratio,movie_fb
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,True,False,True,True,False,True,False,False,True,False,...,True,True,True,True,True,True,False,False,True,False


In [42]:
movies.isnull().dtypes

color                   bool
director_name           bool
num_critic              bool
duration                bool
director_fb             bool
actor_3_fb              bool
actor_2_name            bool
actor_1_fb              bool
gross                   bool
genres                  bool
actor_1_name            bool
movie_title             bool
num_voted_users         bool
cast_total_fb           bool
actor_3_name            bool
facenumber_in_poster    bool
plot_keywords           bool
movie_imdb_link         bool
num_user                bool
language                bool
country                 bool
content_rating          bool
budget                  bool
title_year              bool
actor_2_fb              bool
imdb_score              bool
aspect_ratio            bool
movie_fb                bool
dtype: object

In [43]:
movies.isnull().dtypes.value_counts()

bool    28
dtype: int64

In [44]:
movies.isnull().sum()

color                    19
director_name           102
num_critic               49
duration                 15
director_fb             102
actor_3_fb               23
actor_2_name             13
actor_1_fb                7
gross                   862
genres                    0
actor_1_name              7
movie_title               0
num_voted_users           0
cast_total_fb             0
actor_3_name             23
facenumber_in_poster     13
plot_keywords           152
movie_imdb_link           0
num_user                 21
language                 12
country                   5
content_rating          300
budget                  484
title_year              106
actor_2_fb               13
imdb_score                0
aspect_ratio            326
movie_fb                  0
dtype: int64

In [45]:
# the first .sum() counts missing values per column; the second adds those counts into one grand total
movies.isnull().sum().sum()

2654

In [46]:
movies.isnull().any()

color                    True
director_name            True
num_critic               True
duration                 True
director_fb              True
actor_3_fb               True
actor_2_name             True
actor_1_fb               True
gross                    True
genres                  False
actor_1_name             True
movie_title             False
num_voted_users         False
cast_total_fb           False
actor_3_name             True
facenumber_in_poster     True
plot_keywords            True
movie_imdb_link         False
num_user                 True
language                 True
country                  True
content_rating           True
budget                   True
title_year               True
actor_2_fb               True
imdb_score              False
aspect_ratio             True
movie_fb                False
dtype: bool

In [47]:
# collapses to a single boolean: True if at least one value anywhere in the DataFrame is missing
movies.isnull().any().any()

True

In [48]:
# aggregation methods (.min, .max, and .sum) do not return anything for object columns

# each column is listed once; selecting "color" a second time added nothing to the demonstration
movies[["color", "movie_title"]].max()

Series([], dtype: float64)

In [49]:
# To force pandas to return something for each column, fill in the missing values with an empty string

movies.select_dtypes(["object"]).fillna("").max()

color                                                          Color
director_name                                          Étienne Faure
actor_2_name                                           Zubaida Sahar
genres                                                       Western
actor_1_name                                           Óscar Jaenada
movie_title                                                 Æon Flux
actor_3_name                                           Óscar Jaenada
plot_keywords                                    zombie|zombie spoof
movie_imdb_link    http://www.imdb.com/title/tt5574490/?ref_=fn_t...
language                                                        Zulu
country                                                 West Germany
content_rating                                                     X
dtype: object

# Day 9 - April 30, 2020

### DataFrame Operations

In [56]:
# Raw string literal: every backslash in the Windows path is kept as-is instead of
# relying on unrecognized escape sequences (\L, \P, \d, \c) being passed through.
colleges = pd.read_csv(r"D:\Learning Data Science\Python - Pandas\Pandas cookbook\Pandas-Cookbook-master\data\college.csv")

In [58]:
colleges.head(3).T

Unnamed: 0,0,1,2
INSTNM,Alabama A & M University,University of Alabama at Birmingham,Amridge University
CITY,Normal,Birmingham,Montgomery
STABBR,AL,AL,AL
HBCU,1,0,0
MENONLY,0,0,0
WOMENONLY,0,0,0
RELAFFIL,0,0,1
SATVRMID,424,570,
SATMTMID,420,565,
DISTANCEONLY,0,0,1


In [59]:
# colleges + 5

TypeError: can only concatenate str (not "int") to str

In [60]:
# Re-read the file with the INSTNM column as the row index.
# Raw string literal: every backslash in the Windows path is kept as-is instead of
# relying on unrecognized escape sequences (\L, \P, \d, \c) being passed through.
colleges = pd.read_csv(
    r"D:\Learning Data Science\Python - Pandas\Pandas cookbook\Pandas-Cookbook-master\data\college.csv",
    index_col="INSTNM",
)

In [61]:
colleges_ugds = colleges.filter(like = "UGDS_")
colleges_ugds.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


In [63]:
colleges_ugds.dtypes

UGDS_WHITE    float64
UGDS_BLACK    float64
UGDS_HISP     float64
UGDS_ASIAN    float64
UGDS_AIAN     float64
UGDS_NHPI     float64
UGDS_2MOR     float64
UGDS_NRA      float64
UGDS_UNKN     float64
dtype: object

In [64]:
name = "Northwest-Shoals Community College"
colleges_ugds.loc[name]

UGDS_WHITE    0.7912
UGDS_BLACK    0.1250
UGDS_HISP     0.0339
UGDS_ASIAN    0.0036
UGDS_AIAN     0.0088
UGDS_NHPI     0.0006
UGDS_2MOR     0.0012
UGDS_NRA      0.0033
UGDS_UNKN     0.0324
Name: Northwest-Shoals Community College, dtype: float64

In [65]:
colleges_ugds.loc[name].round(2)

UGDS_WHITE    0.79
UGDS_BLACK    0.12
UGDS_HISP     0.03
UGDS_ASIAN    0.00
UGDS_AIAN     0.01
UGDS_NHPI     0.00
UGDS_2MOR     0.00
UGDS_NRA      0.00
UGDS_UNKN     0.03
Name: Northwest-Shoals Community College, dtype: float64

In [67]:
(colleges_ugds.loc[name] + 0.001).round(2)

UGDS_WHITE    0.79
UGDS_BLACK    0.13
UGDS_HISP     0.03
UGDS_ASIAN    0.00
UGDS_AIAN     0.01
UGDS_NHPI     0.00
UGDS_2MOR     0.00
UGDS_NRA      0.00
UGDS_UNKN     0.03
Name: Northwest-Shoals Community College, dtype: float64

In [68]:
colleges_ugds + 0.00501

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.03831,0.94031,0.01051,0.00691,0.00741,0.00691,0.00501,0.01091,0.01881
University of Alabama at Birmingham,0.59721,0.26501,0.03331,0.05681,0.00721,0.00571,0.04181,0.02291,0.01501
Amridge University,0.30401,0.42421,0.01191,0.00841,0.00501,0.00501,0.00501,0.00501,0.27651
University of Alabama in Huntsville,0.70381,0.13051,0.04321,0.04261,0.01931,0.00521,0.02221,0.03821,0.04001
Alabama State University,0.02081,0.92581,0.01711,0.00691,0.00601,0.00561,0.01481,0.02931,0.01871
...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,,,,,,,,,
Rasmussen College - Overland Park,,,,,,,,,
National Personal Training Institute of Cleveland,,,,,,,,,
Bay Area Medical Academy - San Jose Satellite Location,,,,,,,,,


In [69]:
# floor division by 0.01 turns each shifted fraction into a whole-number percentage (still a float)
(colleges_ugds + 0.00501) // 0.01

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,3.0,94.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
University of Alabama at Birmingham,59.0,26.0,3.0,5.0,0.0,0.0,4.0,2.0,1.0
Amridge University,30.0,42.0,1.0,0.0,0.0,0.0,0.0,0.0,27.0
University of Alabama in Huntsville,70.0,13.0,4.0,4.0,1.0,0.0,2.0,3.0,4.0
Alabama State University,2.0,92.0,1.0,0.0,0.0,0.0,1.0,2.0,1.0
...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,,,,,,,,,
Rasmussen College - Overland Park,,,,,,,,,
National Personal Training Institute of Cleveland,,,,,,,,,
Bay Area Medical Academy - San Jose Satellite Location,,,,,,,,,


In [76]:
# Round to two decimals using arithmetic operators only: shift by 0.00501,
# floor-divide by 0.01 to get whole percents, then divide by 100 to get back to a fraction.
colleges_ugds_op_round = ((colleges_ugds + 0.00501) // 0.01 / 100)

In [77]:
# Same two-decimal rounding, this time with the .round method.
# NOTE(review): the 0.00001 offset presumably nudges values sitting exactly on a
# .xx5 boundary upward so the result agrees with the operator version — confirm.
colleges_ugds_round = (colleges_ugds + 0.00001).round(2)
colleges_ugds_round

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.03,0.94,0.01,0.00,0.00,0.0,0.00,0.01,0.01
University of Alabama at Birmingham,0.59,0.26,0.03,0.05,0.00,0.0,0.04,0.02,0.01
Amridge University,0.30,0.42,0.01,0.00,0.00,0.0,0.00,0.00,0.27
University of Alabama in Huntsville,0.70,0.13,0.04,0.04,0.01,0.0,0.02,0.03,0.04
Alabama State University,0.02,0.92,0.01,0.00,0.00,0.0,0.01,0.02,0.01
...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,,,,,,,,,
Rasmussen College - Overland Park,,,,,,,,,
National Personal Training Institute of Cleveland,,,,,,,,,
Bay Area Medical Academy - San Jose Satellite Location,,,,,,,,,


In [80]:
# .equals compares the two frames as a whole and treats missing values in the same
# position as equal, so it returns True even though both frames contain all-NaN rows
colleges_ugds_op_round.equals(colleges_ugds_round)

True

In [85]:
colleges_ugds_op_round == colleges_ugds_round

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,True,True,True,True,True,True,True,True,True
University of Alabama at Birmingham,True,True,True,True,True,True,True,True,True
Amridge University,True,True,True,True,True,True,True,True,True
University of Alabama in Huntsville,True,True,True,True,True,True,True,True,True
Alabama State University,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,False,False,False,False,False,False,False,False,False
Rasmussen College - Overland Park,False,False,False,False,False,False,False,False,False
National Personal Training Institute of Cleveland,False,False,False,False,False,False,False,False,False
Bay Area Medical Academy - San Jose Satellite Location,False,False,False,False,False,False,False,False,False


### Comparing missing values

In [86]:
# NaN does not compare equal to anything, not even to itself
np.nan == np.nan

False

In [88]:
# Unlike NaN, None compares equal to itself
None == None

True

In [90]:
# Ordering comparisons against NaN evaluate to False ...
np.nan > 5

False

In [91]:
# ... in both directions
np.nan < 5

False

In [92]:
# Only "not equal" evaluates to True when NaN is involved
np.nan != 5

True

In [84]:
# Same element-wise comparison as earlier in the notebook: the all-NaN rows are False because NaN != NaN
colleges_ugds_op_round == colleges_ugds_round

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,True,True,True,True,True,True,True,True,True
University of Alabama at Birmingham,True,True,True,True,True,True,True,True,True
Amridge University,True,True,True,True,True,True,True,True,True
University of Alabama in Huntsville,True,True,True,True,True,True,True,True,True
Alabama State University,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,False,False,False,False,False,False,False,False,False
Rasmussen College - Overland Park,False,False,False,False,False,False,False,False,False
National Personal Training Institute of Cleveland,False,False,False,False,False,False,False,False,False
Bay Area Medical Academy - San Jose Satellite Location,False,False,False,False,False,False,False,False,False


In [93]:
# Keep only the columns whose label contains "UGDS_"
colleges_ugds = colleges.filter(like="UGDS_")

In [97]:
# Display the selected columns (pandas truncates the long frame to its first and last rows)
colleges_ugds

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0000,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.2600,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.0100
Amridge University,0.2990,0.4192,0.0069,0.0034,0.0000,0.0000,0.0000,0.0000,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.0350
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.0010,0.0006,0.0098,0.0243,0.0137
...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,,,,,,,,,
Rasmussen College - Overland Park,,,,,,,,,
National Personal Training Institute of Cleveland,,,,,,,,,
Bay Area Medical Academy - San Jose Satellite Location,,,,,,,,,


In [99]:
# Element-wise comparison with a scalar: True wherever a value equals 2
colleges_ugds == 2

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,False,False,False,False,False,False,False,False,False
University of Alabama at Birmingham,False,False,False,False,False,False,False,False,False
Amridge University,False,False,False,False,False,False,False,False,False
University of Alabama in Huntsville,False,False,False,False,False,False,False,False,False
Alabama State University,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,False,False,False,False,False,False,False,False,False
Rasmussen College - Overland Park,False,False,False,False,False,False,False,False,False
National Personal Training Institute of Cleveland,False,False,False,False,False,False,False,False,False
Bay Area Medical Academy - San Jose Satellite Location,False,False,False,False,False,False,False,False,False


In [107]:
colleges_ugds.eq(0.0019) # same as colleges_ugds == .0019

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,False,False,False,True,False,True,False,False,False
University of Alabama at Birmingham,False,False,False,False,False,False,False,False,False
Amridge University,False,False,False,False,False,False,False,False,False
University of Alabama in Huntsville,False,False,False,False,False,False,False,False,False
Alabama State University,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,False,False,False,False,False,False,False,False,False
Rasmussen College - Overland Park,False,False,False,False,False,False,False,False,False
National Personal Training Institute of Cleveland,False,False,False,False,False,False,False,False,False
Bay Area Medical Academy - San Jose Satellite Location,False,False,False,False,False,False,False,False,False


In [101]:
# Compare the frame with itself element by element; positions that hold NaN come out False
colleges_ugds_selfcompare = colleges_ugds.eq(colleges_ugds)
colleges_ugds_selfcompare

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,True,True,True,True,True,True,True,True,True
University of Alabama at Birmingham,True,True,True,True,True,True,True,True,True
Amridge University,True,True,True,True,True,True,True,True,True
University of Alabama in Huntsville,True,True,True,True,True,True,True,True,True
Alabama State University,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,False,False,False,False,False,False,False,False,False
Rasmussen College - Overland Park,False,False,False,False,False,False,False,False,False
National Personal Training Institute of Cleveland,False,False,False,False,False,False,False,False,False
Bay Area Medical Academy - San Jose Satellite Location,False,False,False,False,False,False,False,False,False


In [102]:
# all() reports, per column, whether every value is True; the False entries from the NaN rows make each column False
colleges_ugds_selfcompare.all()

UGDS_WHITE    False
UGDS_BLACK    False
UGDS_HISP     False
UGDS_ASIAN    False
UGDS_AIAN     False
UGDS_NHPI     False
UGDS_2MOR     False
UGDS_NRA      False
UGDS_UNKN     False
dtype: bool

In [103]:
# Counting missing values with == np.nan does not work: the comparison never matches, so every count is 0
(colleges_ugds == np.nan).sum()

UGDS_WHITE    0
UGDS_BLACK    0
UGDS_HISP     0
UGDS_ASIAN    0
UGDS_AIAN     0
UGDS_NHPI     0
UGDS_2MOR     0
UGDS_NRA      0
UGDS_UNKN     0
dtype: int64

In [104]:
# isna() is the way to detect missing values; summing it gives the number of missing values in each column
colleges_ugds.isna().sum()

UGDS_WHITE    661
UGDS_BLACK    661
UGDS_HISP     661
UGDS_ASIAN    661
UGDS_AIAN     661
UGDS_NHPI     661
UGDS_2MOR     661
UGDS_NRA      661
UGDS_UNKN     661
dtype: int64

In [105]:
# The correct way to compare two entire DataFrames with one another is not with the
# equality operator (==) but with the .equals method. This method treats NaNs that
# are in the same location as equal (note that the .eq method is the equivalent of ==).

colleges_ugds.equals(colleges_ugds)

True

In [110]:
# The assert_frame_equal function in the pandas.testing sub-package raises an AssertionError if two DataFrames are not equal.
# It returns None if the two DataFrames are equal, which is why the expression below evaluates to True.
# It is intended for use in unit tests.

from pandas.testing import assert_frame_equal
assert_frame_equal(colleges_ugds, colleges_ugds) is None

True

### Transposing the direction of a DataFrame operation

In [112]:
# Select the "UGDS_" columns again as the starting point for this section
colleges_ugds = colleges.filter(like="UGDS_")

In [113]:
# Preview the first five rows
colleges_ugds.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


In [115]:
# The axis parameter controls the direction in which an operation is applied. It can be 'index' (0) or 'columns' (1).

colleges_ugds.count() # count() defaults to axis=0, so this gives the number of non-missing values in each column

UGDS_WHITE    6874
UGDS_BLACK    6874
UGDS_HISP     6874
UGDS_ASIAN    6874
UGDS_AIAN     6874
UGDS_NHPI     6874
UGDS_2MOR     6874
UGDS_NRA      6874
UGDS_UNKN     6874
dtype: int64

In [116]:
# With axis="columns" the count runs across each row: the number of non-missing values per college
colleges_ugds.count(axis="columns")

INSTNM
Alabama A & M University                                  9
University of Alabama at Birmingham                       9
Amridge University                                        9
University of Alabama in Huntsville                       9
Alabama State University                                  9
                                                         ..
SAE Institute of Technology  San Francisco                0
Rasmussen College - Overland Park                         0
National Personal Training Institute of Cleveland         0
Bay Area Medical Academy - San Jose Satellite Location    0
Excel Learning Center-San Antonio South                   0
Length: 7535, dtype: int64

In [119]:
# Row-wise total of the shares for each college; rows whose values are all missing show a sum of 0 rather than NaN
colleges_ugds.sum(axis = "columns")

INSTNM
Alabama A & M University                                  1.0000
University of Alabama at Birmingham                       0.9999
Amridge University                                        1.0000
University of Alabama in Huntsville                       1.0000
Alabama State University                                  1.0000
                                                           ...  
SAE Institute of Technology  San Francisco                0.0000
Rasmussen College - Overland Park                         0.0000
National Personal Training Institute of Cleveland         0.0000
Bay Area Medical Academy - San Jose Satellite Location    0.0000
Excel Learning Center-San Antonio South                   0.0000
Length: 7535, dtype: float64

In [121]:
# With axis="index" the median is taken down the rows, giving one value per column
colleges_ugds.median(axis="index")

UGDS_WHITE    0.55570
UGDS_BLACK    0.10005
UGDS_HISP     0.07140
UGDS_ASIAN    0.01290
UGDS_AIAN     0.00260
UGDS_NHPI     0.00000
UGDS_2MOR     0.01750
UGDS_NRA      0.00000
UGDS_UNKN     0.01430
dtype: float64

##### The key to remembering this: a Series has only one axis, the index (axis 0), while a DataFrame has both an index (axis 0) and columns (axis 1).

In [122]:
# Running total across each row, accumulating from the first column to the last
colleges_ugds.cumsum(axis = "columns")

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9686,0.9741,0.9760,0.9784,0.9803,0.9803,0.9862,1.0000
University of Alabama at Birmingham,0.5922,0.8522,0.8805,0.9323,0.9345,0.9352,0.9720,0.9899,0.9999
Amridge University,0.2990,0.7182,0.7251,0.7285,0.7285,0.7285,0.7285,0.7285,1.0000
University of Alabama in Huntsville,0.6988,0.8243,0.8625,0.9001,0.9144,0.9146,0.9318,0.9650,1.0000
Alabama State University,0.0158,0.9366,0.9487,0.9506,0.9516,0.9522,0.9620,0.9863,1.0000
...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,,,,,,,,,
Rasmussen College - Overland Park,,,,,,,,,
National Personal Training Institute of Cleveland,,,,,,,,,
Bay Area Medical Academy - San Jose Satellite Location,,,,,,,,,


In [123]:
# Running total down each column, accumulating from the first row to the last
colleges_ugds.cumsum(axis = "index")

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0000,0.0059,0.0138
University of Alabama at Birmingham,0.6255,1.1953,0.0338,0.0537,0.0046,0.0026,0.0368,0.0238,0.0238
Amridge University,0.9245,1.6145,0.0407,0.0571,0.0046,0.0026,0.0368,0.0238,0.2953
University of Alabama in Huntsville,1.6233,1.7400,0.0789,0.0947,0.0189,0.0028,0.0540,0.0570,0.3303
Alabama State University,1.6391,2.6608,0.0910,0.0966,0.0199,0.0034,0.0638,0.0813,0.3440
...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,,,,,,,,,
Rasmussen College - Overland Park,,,,,,,,,
National Personal Training Institute of Cleveland,,,,,,,,,
Bay Area Medical Academy - San Jose Satellite Location,,,,,,,,,


## Determining college campus diversity

In [128]:
ls "D:\Learning Data Science\Python - Pandas\Pandas cookbook\Pandas-Cookbook-master\data"

 Volume in drive D is Data
 Volume Serial Number is 4E50-31E2

 Directory of D:\Learning Data Science\Python - Pandas\Pandas cookbook\Pandas-Cookbook-master\data

12/11/2019  09:49 AM    <DIR>          .
12/11/2019  09:49 AM    <DIR>          ..
12/11/2019  09:49 AM            75,869 aapl_stock.csv
12/11/2019  09:49 AM            87,672 amzn_stock.csv
12/11/2019  09:49 AM    <DIR>          backup
12/11/2019  09:49 AM             1,468 baseball14.csv
12/11/2019  09:49 AM             1,402 baseball15.csv
12/11/2019  09:49 AM             1,481 baseball16.csv
12/11/2019  09:49 AM           884,736 chinook.db
12/11/2019  09:49 AM         1,252,058 college.csv
12/11/2019  09:49 AM             1,028 college_data_dictionary.csv
12/11/2019  09:49 AM               514 college_diversity.csv
12/11/2019  09:49 AM        32,165,563 crime.h5
12/11/2019  09:49 AM             1,360 denver_neigh_pop.csv
12/11/2019  09:49 AM            60,303 denver_neighborhood_pop.csv
12/11/2019  09:49 AM              

In [129]:
# Preview the table of schools and their diversity index. The Windows path is written as a raw string (r"...")
# so that its backslashes are taken literally instead of being parsed as (invalid) escape sequences.
pd.read_csv(r"D:\Learning Data Science\Python - Pandas\Pandas cookbook\Pandas-Cookbook-master\data\college_diversity.csv")

Unnamed: 0,School,Diversity Index
0,"Rutgers University--Newark Newark, NJ",0.76
1,"Andrews University Berrien Springs, MI",0.74
2,"Stanford University Stanford, CA",0.74
3,"University of Houston Houston, TX",0.74
4,"University of Nevada--Las Vegas Las Vegas, NV",0.74
5,"University of San Francisco San Francisco, CA",0.74
6,"San Francisco State University San Francisco, CA",0.73
7,"University of Illinois--Chicago Chicago, IL",0.73
8,"New Jersey Institute of Technology Newark, NJ",0.72
9,"Texas Woman's University Denton, TX",0.72


In [130]:
# Of the 9 race categories in the college dataset, each category that makes up 15% or more
# of a school's student population counts toward that school's diversity

In [135]:
# Read the college data into "college", indexed by institution name (INSTNM), then keep only the
# columns whose label contains "UGDS_". The Windows path is a raw string (r"...") so that its
# backslashes are taken literally instead of being parsed as (invalid) escape sequences.

college = pd.read_csv(
    r"D:\Learning Data Science\Python - Pandas\Pandas cookbook\Pandas-Cookbook-master\data\college.csv",
    index_col="INSTNM",
)
college_ugds = college.filter(like="UGDS_")
college_ugds

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0000,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.2600,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.0100
Amridge University,0.2990,0.4192,0.0069,0.0034,0.0000,0.0000,0.0000,0.0000,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.0350
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.0010,0.0006,0.0098,0.0243,0.0137
...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,,,,,,,,,
Rasmussen College - Overland Park,,,,,,,,,
National Personal Training Institute of Cleveland,,,,,,,,,
Bay Area Medical Academy - San Jose Satellite Location,,,,,,,,,


In [140]:
# Count the missing values in each college's row, then list the colleges from most to fewest missing values

(
    college_ugds
    .isna()
    .sum(axis=1)
    .sort_values(ascending=False)
)

INSTNM
Excel Learning Center-San Antonio South         9
Philadelphia College of Osteopathic Medicine    9
Assemblies of God Theological Seminary          9
Episcopal Divinity School                       9
Phillips Graduate Institute                     9
                                               ..
Carroll Community College                       0
University of Phoenix-New Mexico                0
Galen College of Nursing-Tampa Bay              0
Galen College of Nursing-San Antonio            0
Alabama A & M University                        0
Length: 7535, dtype: int64

In [142]:
# Preview the frame without the rows in which every value is missing (the result is not assigned here)
college_ugds.dropna(how = "all")

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0000,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.2600,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.0100
Amridge University,0.2990,0.4192,0.0069,0.0034,0.0000,0.0000,0.0000,0.0000,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.0350
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.0010,0.0006,0.0098,0.0243,0.0137
...,...,...,...,...,...,...,...,...,...
Hollywood Institute of Beauty Careers-West Palm Beach,0.2182,0.4182,0.2364,0.0182,0.0000,0.0000,0.0000,0.0182,0.0909
Hollywood Institute of Beauty Careers-Casselberry,0.1200,0.3333,0.4400,0.0000,0.0000,0.0000,0.0400,0.0000,0.0667
Coachella Valley Beauty College-Beaumont,0.3284,0.1045,0.4925,0.0149,0.0299,0.0149,0.0149,0.0000,0.0000
Dewey University-Mayaguez,0.0000,0.0000,1.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000


In [146]:
# Drop the colleges whose values are all missing, then check how many missing values remain in each column
college_ugds = college_ugds.dropna(axis="index", how="all")
college_ugds.isna().sum(axis="index")

UGDS_WHITE    0
UGDS_BLACK    0
UGDS_HISP     0
UGDS_ASIAN    0
UGDS_AIAN     0
UGDS_NHPI     0
UGDS_2MOR     0
UGDS_NRA      0
UGDS_UNKN     0
dtype: int64

In [147]:
# (rows, columns) of the frame after the all-missing colleges were dropped
college_ugds.shape

(6874, 9)

In [148]:
# Flag every share that is greater than or equal to 15%
college_ugds.ge(0.15)

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,False,True,False,False,False,False,False,False,False
University of Alabama at Birmingham,True,True,False,False,False,False,False,False,False
Amridge University,True,True,False,False,False,False,False,False,True
University of Alabama in Huntsville,True,False,False,False,False,False,False,False,False
Alabama State University,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
Hollywood Institute of Beauty Careers-West Palm Beach,True,True,True,False,False,False,False,False,False
Hollywood Institute of Beauty Careers-Casselberry,False,True,True,False,False,False,False,False,False
Coachella Valley Beauty College-Beaumont,True,False,True,False,False,False,False,False,False
Dewey University-Mayaguez,False,False,True,False,False,False,False,False,False


In [153]:
# Count, for each college, how many race categories reach the 15% threshold.
# Note: despite the "matrix" in its name, the result is a Series with one count per college.
diversity_matrix = (college_ugds >= 0.15).sum(axis=1)
diversity_matrix

INSTNM
Alabama A & M University                                 1
University of Alabama at Birmingham                      2
Amridge University                                       3
University of Alabama in Huntsville                      1
Alabama State University                                 1
                                                        ..
Hollywood Institute of Beauty Careers-West Palm Beach    3
Hollywood Institute of Beauty Careers-Casselberry        2
Coachella Valley Beauty College-Beaumont                 2
Dewey University-Mayaguez                                1
Coastal Pines Technical College                          2
Length: 6874, dtype: int64

In [154]:
# Distribution of the counts: how many colleges have 0, 1, 2, ... categories at or above 15%
diversity_matrix.value_counts()

1    3042
2    2884
3     876
4      63
0       7
5       2
dtype: int64

In [156]:
# The five colleges with the highest number of categories at or above 15%
diversity_matrix.sort_values(ascending=False).head()

INSTNM
Regency Beauty Institute-Austin          5
Central Texas Beauty College-Temple      5
Sullivan and Cogliano Training Center    4
Ambria College of Nursing                4
Berkeley College-New York                4
dtype: int64

In [204]:
# Inspect the shares of the two colleges that reached a count of 5
college_ugds.loc[["Regency Beauty Institute-Austin","Central Texas Beauty College-Temple"]]

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Regency Beauty Institute-Austin,0.1867,0.2133,0.16,0.0,0.0,0.0,0.1733,0.0,0.2667
Central Texas Beauty College-Temple,0.1616,0.2323,0.2626,0.0202,0.0,0.0,0.1717,0.0,0.1515


In [161]:
# The first five schools listed in college_diversity.csv, spelled the way they appear in the college index
us_news_top = [
    "Rutgers University-Newark",
    "Andrews University",
    "Stanford University",
    "University of Houston",
    "University of Nevada-Las Vegas",
]
# Look up how many categories reach 15% at each of these schools
diversity_matrix.loc[us_news_top]

INSTNM
Rutgers University-Newark         4
Andrews University                3
Stanford University               3
University of Houston             3
University of Nevada-Las Vegas    3
dtype: int64

In [160]:
# Underlying shares for the same five schools
college_ugds.loc[us_news_top]

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Rutgers University-Newark,0.256,0.1887,0.2554,0.2146,0.0012,0.0041,0.0285,0.0304,0.0212
Andrews University,0.2732,0.2063,0.1332,0.1357,0.0018,0.0037,0.0282,0.1983,0.0196
Stanford University,0.3752,0.0591,0.1607,0.1979,0.0114,0.0038,0.1067,0.0819,0.0031
University of Houston,0.2718,0.1083,0.3133,0.2167,0.001,0.0027,0.0333,0.0451,0.0077
University of Nevada-Las Vegas,0.3663,0.0763,0.2517,0.1549,0.0029,0.0144,0.0874,0.036,0.0101


In [188]:
# Find the least diverse schools: take each school's largest single share,
# order the schools from the highest such share downward, and show the first 277

(
    college_ugds
    .max(axis=1)
    .sort_values(ascending=False)
    .head(277)
)

INSTNM
Dewey University-Manati                                  1.0000
Yeshiva and Kollel Harbotzas Torah                       1.0000
Mr Leon's School of Hair Design-Lewiston                 1.0000
Dewey University-Bayamon                                 1.0000
Shepherds Theological Seminary                           1.0000
                                                          ...  
Kehilath Yakov Rabbinical Seminary                       1.0000
Antilles School of Technical Careers                     1.0000
Beth Hatalmud Rabbinical College                         1.0000
Inter American University of Puerto Rico-Barranquitas    0.9991
Palau Community College                                  0.9983
Length: 277, dtype: float64

In [201]:
# Flag every share that is strictly greater than 1%
college_ugds > 0.01

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,True,True,False,False,False,False,False,False,True
University of Alabama at Birmingham,True,True,True,True,False,False,True,True,False
Amridge University,True,True,False,False,False,False,False,False,True
University of Alabama in Huntsville,True,True,True,True,True,False,True,True,True
Alabama State University,True,True,True,False,False,False,False,True,True
...,...,...,...,...,...,...,...,...,...
Hollywood Institute of Beauty Careers-West Palm Beach,True,True,True,True,False,False,False,True,True
Hollywood Institute of Beauty Careers-Casselberry,True,True,True,False,False,False,True,False,True
Coachella Valley Beauty College-Beaumont,True,True,True,True,True,True,True,False,False
Dewey University-Mayaguez,False,False,True,False,False,False,False,False,False


In [199]:
# Is there at least one college in which every one of the nine categories exceeds 1%?
college_ugds.gt(0.01).all(axis=1).any()

True

In [194]:
# Number of categories above 1% at each college, listed from the highest count to the lowest
college_ugds.gt(0.01).sum(axis=1).sort_values(ascending=False)

INSTNM
South Puget Sound Community College    9
Northwest College of Art & Design      9
Salt Lake Community College            9
DeVry University-Nevada                9
Access Careers                         9
                                      ..
Professional Business College          0
Education and Technology Institute     0
Taft University System                 0
Prince Institute-Rocky Mountains       0
Spanish-American Institute             0
Length: 6874, dtype: int64

In [206]:
# Inspect one of the colleges where all nine categories exceed 1%
college_ugds.loc[["South Puget Sound Community College"]]

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
South Puget Sound Community College,0.6351,0.0322,0.085,0.0509,0.0125,0.0102,0.0926,0.0112,0.0706


### End of Chapter 2