# Chapter 2: Essential DataFrame Operations

In [None]:
import pandas as pd
import numpy as np
# Use fully qualified option names. The short forms 'max_columns' and
# 'max_rows' are matched as patterns, and they become ambiguous (OptionError)
# on pandas versions that also register 'styler.render.max_columns' and
# 'styler.render.max_rows'.
pd.set_option('display.max_columns', 4, 'display.max_rows', 10,
              'display.max_colwidth', 12)

## Introduction

## Selecting Multiple DataFrame Columns

### How to do it\...

In [None]:
# Load the movie dataset, then pass a list of column names to the indexing
# operator to select several columns at once; show the first rows.
movies = pd.read_csv('data/movie.csv')
movie_actor_director = movies[['actor_1_name', 'actor_2_name',
    'actor_3_name', 'director_name']]
movie_actor_director.head()

In [None]:
# A one-element list of column names still selects a DataFrame.
type(movies[['director_name']])

In [None]:
# A bare string label selects a single column as a Series.
type(movies['director_name'])

In [None]:
# With .loc, a list of column labels likewise yields a DataFrame.
type(movies.loc[:, ['director_name']])

In [None]:
# With .loc, a scalar column label yields a Series.
type(movies.loc[:, 'director_name'])

### How it works\...

### There\'s more\...

In [None]:
# Name the list of columns first so the selection itself stays on one
# short line.
cols = ['actor_1_name', 'actor_2_name',
        'actor_3_name', 'director_name']
movie_actor_director = movies[cols]

In [None]:
# Without the inner list brackets the four names form a single tuple key.
# NOTE(review): this is expected to raise KeyError unless a column carries
# that tuple as its label; it looks like a deliberate demonstration - confirm.
movies['actor_1_name', 'actor_2_name',
      'actor_3_name', 'director_name']

## Selecting Columns with Methods

### How to do it\...

In [None]:
# Reload the movie dataset from disk.
movies = pd.read_csv('data/movie.csv')
def shorten(col):
    """Return *col* with 'facebook_likes' abbreviated to 'fb' and every
    '_for_reviews' substring removed."""
    abbreviated = col.replace('facebook_likes', 'fb')
    trimmed = abbreviated.replace('_for_reviews', '')
    return trimmed
# rename() applies shorten to every column label; then tally how many
# columns have each dtype.
movies = movies.rename(columns=shorten)
movies.dtypes.value_counts()

In [None]:
# Keep only the integer-typed columns and show the first rows.
movies.select_dtypes(include='int').head()

In [None]:
# 'number' selects every numeric column (integers and floats alike).
movies.select_dtypes(include='number').head()

In [None]:
# A list selects several dtypes at once: integer and object columns.
movies.select_dtypes(include=['int', 'object']).head()

In [None]:
# exclude= inverts the selection: every column except the float ones.
movies.select_dtypes(exclude='float').head()

In [None]:
# like= keeps the columns whose name contains the substring 'fb'.
movies.filter(like='fb').head()

In [None]:
# items= keeps only the columns whose names appear in the given list.
cols = ['actor_1_name', 'actor_2_name',
        'actor_3_name', 'director_name']
movies.filter(items=cols).head()

In [None]:
# regex= keeps the columns whose name matches the pattern; r'\d' matches
# any name containing a digit.
movies.filter(regex=r'\d').head()

### How it works\...

### There\'s more\...

### See also

## Ordering Column Names

### How to do it\...

In [None]:
# Reload the movie dataset from disk.
movies = pd.read_csv('data/movie.csv')
def shorten(col):
    """Shorten a column label: 'facebook_likes' becomes 'fb' and any
    '_for_reviews' text is dropped."""
    # Apply the substitutions one after another, in this fixed order.
    for old, new in (('facebook_likes', 'fb'), ('_for_reviews', '')):
        col = col.replace(old, new)
    return col
# Apply shorten to every column label.
movies = movies.rename(columns=shorten)

In [None]:
# Show the column labels in their current order.
movies.columns

In [None]:
# Column names grouped by kind: the cat_* lists hold categorical columns,
# the cont_* lists hold continuous ones.
cat_core = [
    'movie_title',
    'title_year',
    'content_rating',
    'genres',
]
cat_people = [
    'director_name',
    'actor_1_name',
    'actor_2_name',
    'actor_3_name',
]
cat_other = [
    'color',
    'country',
    'language',
    'plot_keywords',
    'movie_imdb_link',
]
cont_fb = [
    'director_fb',
    'actor_1_fb',
    'actor_2_fb',
    'actor_3_fb',
    'cast_total_fb',
    'movie_fb',
]
cont_finance = [
    'budget',
    'gross',
]
cont_num_reviews = [
    'num_voted_users',
    'num_user',
    'num_critic',
]
cont_other = [
    'imdb_score',
    'duration',
    'aspect_ratio',
    'facenumber_in_poster',
]

In [None]:
# Splice the groups into one flat list, categorical groups first, then
# check that it names exactly the same set of columns as the DataFrame.
new_col_order = [
    *cat_core, *cat_people, *cat_other,
    *cont_fb, *cont_finance, *cont_num_reviews, *cont_other,
]
set(movies.columns) == set(new_col_order)

In [None]:
# Indexing with the full list returns the columns in the new order.
movies[new_col_order].head()

### How it works\...

### There\'s more\...

### See also

## Summarizing a DataFrame

### How to do it\...

In [None]:
# Reload the dataset; .shape is the (rows, columns) tuple.
movies = pd.read_csv('data/movie.csv')
movies.shape

In [None]:
# Total number of elements (rows * columns).
movies.size

In [None]:
# Number of dimensions (2 for a DataFrame).
movies.ndim

In [None]:
# len() of a DataFrame is its number of rows.
len(movies)

In [None]:
# Number of non-missing values in each column.
movies.count()

In [None]:
# Column-wise minimum of the numeric columns. numeric_only=True is passed
# explicitly because pandas 2.0 no longer drops non-numeric columns silently;
# without it, object columns that mix strings and NaN raise TypeError.
movies.min(numeric_only=True)

In [None]:
# Summary statistics from describe(), transposed so that each described
# column becomes a row.
movies.describe().T

In [None]:
# Same summary, requesting the 1st, 30th and 99th percentiles.
movies.describe(percentiles=[.01, .3, .99]).T

### How it works\...

### There\'s more\...

In [None]:
# With skipna=False a column containing any missing value reports NaN as
# its minimum. numeric_only=True is passed explicitly because pandas 2.0 no
# longer drops non-numeric columns silently, and object columns that mix
# strings and NaN would otherwise raise TypeError.
movies.min(skipna=False, numeric_only=True)

## Chaining DataFrame Methods

### How to do it\...

In [None]:
# Reload the movie dataset from disk.
movies = pd.read_csv('data/movie.csv')
def shorten(col):
    """Abbreviate 'facebook_likes' to 'fb' in *col*, then strip out any
    '_for_reviews' text."""
    with_fb = col.replace('facebook_likes', 'fb')
    return with_fb.replace('_for_reviews', '')
# Apply shorten to every column label, then build a same-shaped boolean
# frame that is True wherever a value is missing.
movies = movies.rename(columns=shorten)
movies.isnull().head()

In [None]:
# Summing the boolean frame counts the missing values per column (True
# counts as 1); head() shows the first few columns.
(movies
   .isnull()
   .sum()
   .head()
)

In [None]:
# Summing the per-column counts gives the total number of missing values.
movies.isnull().sum().sum()

In [None]:
# True if at least one value anywhere in the frame is missing.
movies.isnull().any().any()

### How it works\...

In [None]:
# Tally the dtypes of the frame returned by isnull().
# DataFrame.get_dtype_counts() was removed in pandas 1.0; dtypes.value_counts()
# is the supported way to get the same tally.
movies.isnull().dtypes.value_counts()

### There\'s more\...

In [None]:
# Take the column-wise max of the selected columns.
# NOTE(review): 'color' is listed twice; confirm whether a different third
# column was intended. If these columns hold strings mixed with NaN, the
# outcome depends on the pandas version (empty result or TypeError) - verify.
movies[['color', 'movie_title', 'color']].max()

In [None]:
# Temporarily set the displayed column width to 20 characters, replace the
# missing values of the object columns with '' and take each column's max.
with pd.option_context('max_colwidth', 20):
    movies.select_dtypes(['object']).fillna('').max()

In [None]:
# Same computation with each chained method on its own line: select the
# object columns, fill missing values with '', then take each column's max.
with pd.option_context('max_colwidth', 20):
    (movies
        .select_dtypes(['object'])
        .fillna('')
        .max()
    )

### See also

## DataFrame Operations

In [None]:
# Load the college dataset and add 5 to every value.
# NOTE(review): if the frame contains string columns this addition is
# expected to raise TypeError; it looks like a deliberate demonstration -
# confirm.
colleges = pd.read_csv('data/college.csv')
colleges + 5

In [None]:
# Reload with INSTNM as the index and keep only the columns whose name
# contains 'UGDS_'.
colleges = pd.read_csv('data/college.csv', index_col='INSTNM')
college_ugds = colleges.filter(like='UGDS_')
college_ugds.head()

In [None]:
# Select one institution's row by its index label.
name = 'Northwest-Shoals Community College'
college_ugds.loc[name]

In [None]:
# Round that row's values to two decimal places.
college_ugds.loc[name].round(2)

In [None]:
# Add a small constant before rounding to two decimal places.
# NOTE(review): presumably shown to compare against plain round(2) for
# values sitting on a rounding boundary - confirm against the recipe text.
(college_ugds.loc[name] + .0001).round(2)

In [None]:
# Add a scalar to every element of the DataFrame.
college_ugds + .00501

In [None]:
# Floor-divide by .01 to get the whole number of hundredths in each
# shifted value.
(college_ugds + .00501) // .01

In [None]:
# Divide the whole hundredths by 100 to return to the original scale,
# i.e. values rounded to two decimals using arithmetic operators only.
college_ugds_op_round = (college_ugds + .00501) // .01 / 100
college_ugds_op_round.head()

In [None]:
# Round with the built-in round() method after adding a small offset.
college_ugds_round = (college_ugds + .00001).round(2)
college_ugds_round

In [None]:
# equals() returns a single bool: True only when both frames have the same
# shape and elements (NaNs in the same position count as equal).
college_ugds_op_round.equals(college_ugds_round)

### How it works\...

In [None]:
# Binary floating point cannot represent these decimals exactly, so the
# sum is not guaranteed to be exactly 0.05.
.045 + .005

### There\'s more\...

In [None]:
# The method equivalents of +, // and / reproduce the operator-based
# rounding as a chain; equals() confirms the two results match.
college2 = (college_ugds
    .add(.00501) 
    .floordiv(.01) 
    .div(100)
)
college2.equals(college_ugds_op_round)

### See also

## Comparing Missing Values

In [None]:
# NaN never compares equal to anything, itself included: False.
np.nan == np.nan

In [None]:
# By contrast, None does compare equal to itself: True.
None == None

In [None]:
# Ordering comparisons involving NaN are always False.
np.nan > 5

In [None]:
# False as well, whichever side NaN is on.
5 > np.nan

In [None]:
# Inequality is the one comparison with NaN that evaluates to True.
np.nan != 5

### Getting ready

In [None]:
# Load the college dataset indexed by INSTNM and keep only the columns
# whose name contains 'UGDS_'.
college = pd.read_csv('data/college.csv', index_col='INSTNM')
college_ugds = college.filter(like='UGDS_')

In [None]:
# Comparing with a scalar returns a boolean DataFrame of the same shape.
college_ugds == .0019

In [None]:
# Compare the frame with itself element by element.
college_self_compare = college_ugds == college_ugds
college_self_compare.head()

In [None]:
# Per column: True only if every element equalled itself. A column holding
# NaN yields False, because NaN != NaN.
college_self_compare.all()

In [None]:
# == np.nan is never True, so this counts zero matches in every column;
# it cannot be used to find missing values.
(college_ugds == np.nan).sum()

In [None]:
# isnull() is the reliable test: count the missing values per column.
college_ugds.isnull().sum()

In [None]:
# equals() treats NaNs in the same location as equal and returns one bool.
college_ugds.equals(college_ugds)

### How it works\...

### There\'s more\...

In [None]:
# Method form of the element-wise equality comparison.
college_ugds.eq(.0019)    # same as college_ugds == .0019

In [None]:
# assert_frame_equal raises AssertionError when the frames differ and
# returns None when they match, so this expression is True on success.
from pandas.testing import assert_frame_equal
assert_frame_equal(college_ugds, college_ugds) is None

## Transposing the direction of a DataFrame operation

### How to do it\...

In [None]:
# Load the college dataset indexed by INSTNM and keep only the columns
# whose name contains 'UGDS_'.
college = pd.read_csv('data/college.csv', index_col='INSTNM')
college_ugds = college.filter(like='UGDS_')
college_ugds.head()

In [None]:
# Default axis: count the non-missing values in each column.
college_ugds.count()

In [None]:
# axis='columns' moves across the columns: non-missing values per row.
college_ugds.count(axis='columns').head()

In [None]:
# Sum across the columns to get one total per row.
college_ugds.sum(axis='columns').head()

In [None]:
# axis='index' moves down the rows: one median per column.
college_ugds.median(axis='index')

### How it works\...

### There\'s more\...

In [None]:
# axis=1 accumulates left to right: a running total across the columns of
# each row.
college_ugds_cumsum = college_ugds.cumsum(axis=1)
college_ugds_cumsum.head()

### See also

## Determining college campus diversity

In [None]:
# Load the college diversity table, using its 'School' column as the index.
pd.read_csv('data/college_diversity.csv', index_col='School')

### How to do it\...

In [None]:
# Load the college dataset indexed by INSTNM and keep only the columns
# whose name contains 'UGDS_'.
college = pd.read_csv('data/college.csv', index_col='INSTNM')
college_ugds = college.filter(like='UGDS_')

In [None]:
# Count the missing values in each row and list the rows with the most
# missing values first.
(college_ugds.isnull()
   .sum(axis='columns')
   .sort_values(ascending=False)
   .head()
)

In [None]:
# how='all' drops only the rows in which every value is missing; then
# recount the remaining missing values per column.
college_ugds = college_ugds.dropna(how='all')
college_ugds.isnull().sum()

In [None]:
# Element-wise >= comparison: True where a value is at least .15.
college_ugds.ge(.15)

In [None]:
# For each row, count how many columns are at least .15.
diversity_metric = college_ugds.ge(.15).sum(axis='columns')
diversity_metric.head()

In [None]:
# Frequency of each distinct value of the metric.
diversity_metric.value_counts()

In [None]:
# Rows with the highest metric first.
diversity_metric.sort_values(ascending=False).head()

In [None]:
# Look up the rows of two specific institutions by index label.
college_ugds.loc[['Regency Beauty Institute-Austin',
                   'Central Texas Beauty College-Temple']]

In [None]:
# Look up the diversity metric for a hand-picked list of schools.
us_news_top = [
    'Rutgers University-Newark',
    'Andrews University',
    'Stanford University',
    'University of Houston',
    'University of Nevada-Las Vegas',
]
diversity_metric.loc[us_news_top]

### How it works\...

### There\'s more\...

In [None]:
# Largest value in each row, sorted descending; show the top ten rows.
(college_ugds
   .max(axis=1)
   .sort_values(ascending=False)
   .head(10)
)

In [None]:
# True if at least one row has every column strictly greater than .01.
(college_ugds > .01).all(axis=1).any()

### See also