<a href="https://colab.research.google.com/github/Saifullah785/python-data-science-handbook-notes/blob/main/03_09_Pivot_Tables.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Pivot Tables**

### **Motivating Pivot Tables**

In [15]:
# Import necessary libraries: numpy for numerical operations, pandas for data manipulation, and seaborn for data visualization.
import numpy as np
import pandas as pd
import seaborn as sns

# Pull seaborn's 'titanic' example dataset into a DataFrame named `titanic`;
# the cells below all operate on it.
titanic = sns.load_dataset(name='titanic')

In [16]:
# Quick look at the data: show only the first five rows of `titanic`.
titanic.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


### **Pivot Tables by Hand**

In [17]:
# Survival rate per sex: split the rows by 'sex', then average the 0/1
# 'survived' flag inside each group. The double brackets select a one-column
# DataFrame, so the result is displayed as a table rather than a Series.
titanic.groupby(by='sex')[['survived']].agg('mean')

Unnamed: 0_level_0,survived
sex,Unnamed: 1_level_1
female,0.742038
male,0.188908


In [18]:
# Survival rate for every (sex, class) pair, built "by hand":
#   1. group by both keys,
#   2. take the mean of 'survived' in each group,
#   3. unstack() the inner 'class' level into columns to get a sex-by-class table.
# 'class' is a categorical column in this dataset, so `observed` is passed
# explicitly: observed=False keeps the current behaviour and avoids the pandas
# FutureWarning about the default value of `observed` changing.
titanic.groupby(['sex', 'class'], observed=False)['survived'].aggregate('mean').unstack()

  titanic.groupby(['sex', 'class'])['survived'].aggregate('mean').unstack()


class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


### **Pivot Table Syntax**

In [19]:
# The same sex-by-class survival table, expressed directly as a pivot table:
# 'survived' is aggregated with the mean, 'sex' supplies the rows and 'class'
# the columns.
# observed=False is spelled out because 'class' is categorical; it keeps the
# current behaviour and avoids the pandas FutureWarning about the changing
# default of `observed`.
titanic.pivot_table('survived', index='sex', columns='class',
                    aggfunc='mean', observed=False)

  titanic.pivot_table('survived', index='sex', columns='class', aggfunc='mean')


class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


### **Multilevel Pivot Tables**

In [20]:
# Bin 'age' into two half-open intervals, (0, 18] and (18, 80]. Missing ages
# (and any value outside those bins) become NaN and therefore drop out of the
# pivot table below.
age = pd.cut(titanic['age'], [0, 18, 80])
# Survival rate with a two-level row index ('sex', then the age bin) and
# 'class' as the columns. Keyword arguments make the roles explicit, and
# observed=False keeps the current behaviour for the categorical keys while
# avoiding the pandas FutureWarning about the changing `observed` default.
titanic.pivot_table('survived', index=['sex', age], columns='class',
                    aggfunc='mean', observed=False)

  titanic.pivot_table('survived', ['sex', age], 'class')


Unnamed: 0_level_0,class,First,Second,Third
sex,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,"(0, 18]",0.909091,1.0,0.511628
female,"(18, 80]",0.972973,0.9,0.423729
male,"(0, 18]",0.8,0.6,0.215686
male,"(18, 80]",0.375,0.071429,0.133663


In [21]:
# Split 'fare' into two quantile-based bins (below / above the median fare).
fare = pd.qcut(titanic['fare'], 2)
# Survival rate with a two-level row index ('sex', age bin) and a two-level
# column index (fare bin, 'class'). `age` is the binned Series created in the
# previous cell. Combinations with no passengers show up as NaN.
# observed=False keeps the current behaviour for the categorical keys and
# avoids the pandas FutureWarning about the changing `observed` default.
titanic.pivot_table('survived', index=['sex', age], columns=[fare, 'class'],
                    aggfunc='mean', observed=False)

  titanic.pivot_table('survived', ['sex', age], [fare, 'class'])


Unnamed: 0_level_0,fare,"(-0.001, 14.454]","(-0.001, 14.454]","(-0.001, 14.454]","(14.454, 512.329]","(14.454, 512.329]","(14.454, 512.329]"
Unnamed: 0_level_1,class,First,Second,Third,First,Second,Third
sex,age,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
female,"(0, 18]",,1.0,0.714286,0.909091,1.0,0.318182
female,"(18, 80]",,0.88,0.444444,0.972973,0.914286,0.391304
male,"(0, 18]",,0.0,0.26087,0.8,0.818182,0.178571
male,"(18, 80]",0.0,0.098039,0.125,0.391304,0.030303,0.192308


### **Additional Pivot Table Options**

```python
# call signature as of Pandas 1.3.5
DataFrame.pivot_table(
    data,
    values=None, index=None,
    columns=None,
    aggfunc='mean',
    fill_value=None,
    margins=False,
    dropna=True,
    margins_name='All',
    observed=False,
    sort=True)
```

In [22]:
# Pivot on 'sex' (rows) and 'class' (columns) with a different aggregation per
# value column: total survivors ('survived' summed) and average 'fare'.
# `values` is omitted because the keys of the aggfunc dict select the columns.
# The aggregations are named by string ('sum', not the builtin `sum`): passing
# the builtin is deprecated in recent pandas and emits a FutureWarning.
# observed=False keeps the current behaviour for the categorical 'class' key
# and avoids the FutureWarning about the changing `observed` default.
titanic.pivot_table(index='sex', columns='class',
                    aggfunc={'survived': 'sum', 'fare': 'mean'},
                    observed=False)

  titanic.pivot_table(index='sex', columns='class',
  titanic.pivot_table(index='sex', columns='class',


Unnamed: 0_level_0,fare,fare,fare,survived,survived,survived
class,First,Second,Third,First,Second,Third
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
female,106.125798,21.970121,16.11881,91,70,72
male,67.226127,19.741782,12.661633,45,17,47


In [23]:
# Survival rate by 'sex' and 'class', plus margins.
# margins=True appends an 'All' row and an 'All' column. These are not sums:
# each margin applies the same aggregation (the mean) to the whole row or
# column group, so they give the overall survival rate per class, per sex,
# and for all passengers (bottom-right cell).
# aggfunc and observed are written out to match the other pivot tables in this
# notebook; both equal the current defaults, so the result is unchanged.
titanic.pivot_table('survived', index='sex', columns='class',
                    aggfunc='mean', margins=True, observed=False)

class,First,Second,Third,All
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.968085,0.921053,0.5,0.742038
male,0.368852,0.157407,0.135447,0.188908
All,0.62963,0.472826,0.242363,0.383838


### **Example: Birthrate Data**

In [24]:
# Download the births CSV from GitHub and parse it into the `births` DataFrame
# (requires network access).
births = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/jakevdp/data-CDCbirths/master/births.csv'
)

In [25]:
# Quick look at the data: show only the first five rows of `births`.
births.head(n=5)

Unnamed: 0,year,month,day,gender,births
0,1969,1,1.0,F,4046
1,1969,1,1.0,M,4440
2,1969,1,2.0,F,4454
3,1969,1,2.0,M,4548
4,1969,1,3.0,F,4548
