# Table Basics

31.10.2025 Sanna Määttä

### Introduction
- This notebook contains examples and tasks showing how to use pandas to create tables.


In [1]:
# Import the libraries used in this notebook
import pandas as pd
import numpy as np
import matplotlib as mp
import matplotlib.pyplot as plt
import seaborn as sns

## Read data to DataFrame

In [2]:
# Load seaborn's 'taxis' example dataset into a DataFrame
df = sns.load_dataset('taxis')
# Preview the first rows to see which columns and value types are available
df.head()

Unnamed: 0,pickup,dropoff,passengers,distance,fare,tip,tolls,total,color,payment,pickup_zone,dropoff_zone,pickup_borough,dropoff_borough
0,2019-03-23 20:21:09,2019-03-23 20:27:24,1,1.6,7.0,2.15,0.0,12.95,yellow,credit card,Lenox Hill West,UN/Turtle Bay South,Manhattan,Manhattan
1,2019-03-04 16:11:55,2019-03-04 16:19:00,1,0.79,5.0,0.0,0.0,9.3,yellow,cash,Upper West Side South,Upper West Side South,Manhattan,Manhattan
2,2019-03-27 17:53:01,2019-03-27 18:00:25,1,1.37,7.5,2.36,0.0,14.16,yellow,credit card,Alphabet City,West Village,Manhattan,Manhattan
3,2019-03-10 01:23:59,2019-03-10 01:49:51,1,7.7,27.0,6.15,0.0,36.95,yellow,credit card,Hudson Sq,Yorkville West,Manhattan,Manhattan
4,2019-03-30 13:27:42,2019-03-30 13:37:14,3,2.16,9.0,1.1,0.0,13.4,yellow,credit card,Midtown East,Yorkville West,Manhattan,Manhattan


In [3]:
# Summary statistics (count, mean, min, quartiles, max, std) for the datetime and numeric columns
df.describe()

Unnamed: 0,pickup,dropoff,passengers,distance,fare,tip,tolls,total
count,6433,6433,6433.0,6433.0,6433.0,6433.0,6433.0,6433.0
mean,2019-03-16 08:31:28.514223616,2019-03-16 08:45:49.491217408,1.539251,3.024617,13.091073,1.97922,0.325273,18.517794
min,2019-02-28 23:29:03,2019-02-28 23:32:35,0.0,0.0,1.0,0.0,0.0,1.3
25%,2019-03-08 15:50:34,2019-03-08 16:12:51,1.0,0.98,6.5,0.0,0.0,10.8
50%,2019-03-15 21:46:58,2019-03-15 22:06:44,1.0,1.64,9.5,1.7,0.0,14.16
75%,2019-03-23 17:41:38,2019-03-23 17:51:56,2.0,3.21,15.0,2.8,0.0,20.3
max,2019-03-31 23:43:45,2019-04-01 00:13:58,6.0,36.7,150.0,33.2,24.02,174.82
std,,,1.203768,3.827867,11.551804,2.44856,1.415267,13.81557


In [4]:
# Column dtypes and non-null counts; columns with fewer non-null values than rows contain missing data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6433 entries, 0 to 6432
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   pickup           6433 non-null   datetime64[ns]
 1   dropoff          6433 non-null   datetime64[ns]
 2   passengers       6433 non-null   int64         
 3   distance         6433 non-null   float64       
 4   fare             6433 non-null   float64       
 5   tip              6433 non-null   float64       
 6   tolls            6433 non-null   float64       
 7   total            6433 non-null   float64       
 8   color            6433 non-null   object        
 9   payment          6389 non-null   object        
 10  pickup_zone      6407 non-null   object        
 11  dropoff_zone     6388 non-null   object        
 12  pickup_borough   6407 non-null   object        
 13  dropoff_borough  6388 non-null   object        
dtypes: datetime64[ns](2), float64(5), int64(

In [5]:
# Drop every row that has at least one missing value, then confirm
# that all columns now report the same non-null count
df = df.dropna(axis=0, how='any')
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6341 entries, 0 to 6432
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   pickup           6341 non-null   datetime64[ns]
 1   dropoff          6341 non-null   datetime64[ns]
 2   passengers       6341 non-null   int64         
 3   distance         6341 non-null   float64       
 4   fare             6341 non-null   float64       
 5   tip              6341 non-null   float64       
 6   tolls            6341 non-null   float64       
 7   total            6341 non-null   float64       
 8   color            6341 non-null   object        
 9   payment          6341 non-null   object        
 10  pickup_zone      6341 non-null   object        
 11  dropoff_zone     6341 non-null   object        
 12  pickup_borough   6341 non-null   object        
 13  dropoff_borough  6341 non-null   object        
dtypes: datetime64[ns](2), float64(5), int64(1), o

### Create tables using data

In [6]:
# List the column names that can be used as table variables
df.columns

Index(['pickup', 'dropoff', 'passengers', 'distance', 'fare', 'tip', 'tolls',
       'total', 'color', 'payment', 'pickup_zone', 'dropoff_zone',
       'pickup_borough', 'dropoff_borough'],
      dtype='object')

## Cross-tabulation with Pandas
- https://pandas.pydata.org/docs/reference/api/pandas.crosstab.html

In [None]:
# Default signature of pandas.crosstab (for reference):
# pandas.crosstab(index, columns, values=None, rownames=None, colnames=None,
#                aggfunc=None, margins=False, margins_name='All',
#                dropna=True, normalize=False)

In [7]:
# Cross-tabulation that keeps the original variable names as row/column titles;
# margins=True adds the row and column totals under the label "Total"
pd.crosstab(
    index=df['pickup_borough'],
    columns=df['payment'],
    margins=True,
    margins_name="Total",
)

payment,cash,credit card,Total
pickup_borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bronx,25,74,99
Brooklyn,118,260,378
Manhattan,1394,3832,5226
Queens,258,380,638
Total,1795,4546,6341


In [8]:
# Same table, but with reader-friendly titles for the row and column variables
pd.crosstab(
    index=df['pickup_borough'],
    columns=df['payment'],
    rownames=['Pickup place'],
    colnames=['Payment method'],
    margins=True,
    margins_name="Total",
)

Payment method,cash,credit card,Total
Pickup place,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bronx,25,74,99
Brooklyn,118,260,378
Manhattan,1394,3832,5226
Queens,258,380,638
Total,1795,4546,6341


In [9]:
# Show fractions instead of counts.
# normalize='index' scales each row so that it sums to 1.0;
# normalize='columns' would scale each column instead.
pd.crosstab(
    index=df['pickup_borough'],
    columns=df['payment'],
    rownames=['Pickup place'],
    colnames=['Payment method'],
    normalize='index',
    margins=True,
    margins_name="Total",
)


Payment method,cash,credit card
Pickup place,Unnamed: 1_level_1,Unnamed: 2_level_1
Bronx,0.252525,0.747475
Brooklyn,0.312169,0.687831
Manhattan,0.266743,0.733257
Queens,0.404389,0.595611
Total,0.283078,0.716922


In [10]:
# Express the row-wise fractions as percentages by scaling the result by 100
pd.crosstab(
    index=df['pickup_borough'],
    columns=df['payment'],
    rownames=['Pickup place'],
    colnames=['Payment method'],
    normalize='index',
    margins=True,
    margins_name="Total",
).mul(100)

Payment method,cash,credit card
Pickup place,Unnamed: 1_level_1,Unnamed: 2_level_1
Bronx,25.252525,74.747475
Brooklyn,31.216931,68.783069
Manhattan,26.674321,73.325679
Queens,40.438871,59.561129
Total,28.307838,71.692162


In [12]:
# Check the variable (column) names before choosing more variables for the next table
df.columns

Index(['pickup', 'dropoff', 'passengers', 'distance', 'fare', 'tip', 'tolls',
       'total', 'color', 'payment', 'pickup_zone', 'dropoff_zone',
       'pickup_borough', 'dropoff_borough'],
      dtype='object')

In [13]:
# Three-variable cross-tabulation: a list of two Series as the index
# produces nested row levels (pickup place, then payment method)
pd.crosstab(
    index=[df['pickup_borough'], df['payment']],
    columns=df['dropoff_borough'],
    rownames=['Pickup place', 'Payment method'],
    colnames=['Dropoff place'],
    margins=True,
    margins_name="Total",
)

Unnamed: 0_level_0,Dropoff place,Bronx,Brooklyn,Manhattan,Queens,Staten Island,Total
Pickup place,Payment method,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Bronx,cash,21,0,4,0,0,25
Bronx,credit card,45,4,21,4,0,74
Brooklyn,cash,0,104,11,3,0,118
Brooklyn,credit card,5,176,56,23,0,260
Manhattan,cash,27,35,1298,34,0,1394
Manhattan,credit card,27,116,3559,128,2,3832
Queens,cash,4,14,47,193,0,258
Queens,credit card,7,48,176,149,0,380
Total,,136,497,5172,534,2,6341


## Pivot table

### Cross-tabulation using Pivot table
https://pandas.pydata.org/docs/reference/api/pandas.pivot_table.html

In [None]:
# Default signature of pandas.pivot_table (for reference):
# pandas.pivot_table(data, values=None, index=None, columns=None,
#                   aggfunc='mean', fill_value=None, margins=False,
#                   dropna=True, margins_name='All', observed=<no_default>, sort=True)


In [14]:
# Pivot table with the default aggregation (aggfunc='mean').
# The "values" parameter selects the column that is aggregated;
# "index" selects the grouping variable shown on the rows.
pd.pivot_table(
    data=df,
    values='passengers',
    index=['pickup_borough'],
)

Unnamed: 0_level_0,passengers
pickup_borough,Unnamed: 1_level_1
Bronx,1.191919
Brooklyn,1.314815
Manhattan,1.569269
Queens,1.528213


In [15]:
# aggfunc="sum" adds up the 'passengers' values, i.e. the total number of
# passengers per pickup borough (this is not the number of trips/rows)
pd.pivot_table(
    data=df,
    values='passengers',
    index=['pickup_borough'],
    aggfunc="sum",
)

Unnamed: 0_level_0,passengers
pickup_borough,Unnamed: 1_level_1
Bronx,118
Brooklyn,497
Manhattan,8201
Queens,975


In [16]:
# One variable on the rows and another on the columns;
# combinations that never occur in the data show up as NaN
pd.pivot_table(
    data=df,
    values='passengers',
    index=['pickup_borough'],
    columns=['dropoff_borough'],
    aggfunc="sum",
)

dropoff_borough,Bronx,Brooklyn,Manhattan,Queens,Staten Island
pickup_borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bronx,84.0,4.0,26.0,4.0,
Brooklyn,7.0,354.0,101.0,35.0,
Manhattan,70.0,223.0,7634.0,272.0,2.0
Queens,15.0,106.0,348.0,506.0,


In [17]:
# Same rows-by-columns table, with missing combinations (NaN) replaced by 0
pd.pivot_table(
    data=df,
    values='passengers',
    index=['pickup_borough'],
    columns=['dropoff_borough'],
    fill_value=0,
    aggfunc="sum",
)

dropoff_borough,Bronx,Brooklyn,Manhattan,Queens,Staten Island
pickup_borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bronx,84,4,26,4,0
Brooklyn,7,354,101,35,0
Manhattan,70,223,7634,272,2
Queens,15,106,348,506,0


In [18]:
# Present several aggregation functions in one result:
# a list passed to aggfunc yields one column group per function
df.pivot_table(
    values='passengers',
    index=['pickup_borough', 'dropoff_borough'],
    aggfunc=['min', 'median', 'max'],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,min,median,max
Unnamed: 0_level_1,Unnamed: 1_level_1,passengers,passengers,passengers
pickup_borough,dropoff_borough,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Bronx,Bronx,1,1.0,5
Bronx,Brooklyn,1,1.0,1
Bronx,Manhattan,0,1.0,2
Bronx,Queens,1,1.0,1
Brooklyn,Bronx,1,1.0,3
Brooklyn,Brooklyn,0,1.0,6
Brooklyn,Manhattan,1,1.0,6
Brooklyn,Queens,1,1.0,6
Manhattan,Bronx,1,1.0,5
Manhattan,Brooklyn,0,1.0,6


In [19]:
# Display all values with the same precision (no decimals)
# by styling the table with .style.format(precision=0)
(
    df.pivot_table(
        values='passengers',
        index=['pickup_borough', 'dropoff_borough'],
        aggfunc=['min', 'median', 'max'],
    )
    .style.format(precision=0)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,min,median,max
Unnamed: 0_level_1,Unnamed: 1_level_1,passengers,passengers,passengers
pickup_borough,dropoff_borough,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Bronx,Bronx,1,1,5
Bronx,Brooklyn,1,1,1
Bronx,Manhattan,0,1,2
Bronx,Queens,1,1,1
Brooklyn,Bronx,1,1,3
Brooklyn,Brooklyn,0,1,6
Brooklyn,Manhattan,1,1,6
Brooklyn,Queens,1,1,6
Manhattan,Bronx,1,1,5
Manhattan,Brooklyn,0,1,6


In [20]:
# Highlight the largest (light green) and smallest (yellow) value of each column;
# axis=0 makes the comparison run down every column separately
(
    df.pivot_table(
        values='passengers',
        index=['pickup_borough', 'dropoff_borough'],
        aggfunc=['min', 'median', 'max'],
    )
    .style.format(precision=0)
    .highlight_max(color='lightgreen', axis=0)
    .highlight_min(color='yellow', axis=0)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,min,median,max
Unnamed: 0_level_1,Unnamed: 1_level_1,passengers,passengers,passengers
pickup_borough,dropoff_borough,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Bronx,Bronx,1,1,5
Bronx,Brooklyn,1,1,1
Bronx,Manhattan,0,1,2
Bronx,Queens,1,1,1
Brooklyn,Bronx,1,1,3
Brooklyn,Brooklyn,0,1,6
Brooklyn,Manhattan,1,1,6
Brooklyn,Queens,1,1,6
Manhattan,Bronx,1,1,5
Manhattan,Brooklyn,0,1,6


## Frequency and Percentage distributions in table
- use crosstab() to calculate values
- https://pandas.pydata.org/docs/reference/api/pandas.crosstab.html

In [21]:
# One-variable frequency table: using the constant label 'Freq' as the
# column argument makes crosstab count the rows of each pickup borough
pd.crosstab(index=df['pickup_borough'], columns='Freq')

col_0,Freq
pickup_borough,Unnamed: 1_level_1
Bronx,99
Brooklyn,378
Manhattan,5226
Queens,638


In [22]:
# Rename the row title and blank out the column title
pd.crosstab(
    index=df['pickup_borough'],
    columns='Freq',
    rownames=['Pickup place'],
    colnames=[''],
)

Unnamed: 0_level_0,Freq
Pickup place,Unnamed: 1_level_1
Bronx,99
Brooklyn,378
Manhattan,5226
Queens,638


## Data formatting
- https://pandas.pydata.org/docs/reference/api/pandas.io.formats.style.Styler.format.html

In [23]:
# Keep the frequency table in its own DataFrame so it can be extended and formatted
df1 = pd.crosstab(
    index=df['pickup_borough'],
    columns='Freq',
    rownames=['Pickup place'],
    colnames=[''],
)
df1

Unnamed: 0_level_0,Freq
Pickup place,Unnamed: 1_level_1
Bronx,99
Brooklyn,378
Manhattan,5226
Queens,638


In [24]:
# Total number of observations across all categories
n = df1['Freq'].sum()
# Add a new column holding each category's share of the total, in percent
df1['%'] = df1['Freq'].div(n).mul(100)
df1

Unnamed: 0_level_0,Freq,%
Pickup place,Unnamed: 1_level_1,Unnamed: 2_level_1
Bronx,99,1.561268
Brooklyn,378,5.961205
Manhattan,5226,82.416023
Queens,638,10.061504


In [32]:
# Display formats keyed by column name: counts as whole numbers,
# percentages with one decimal and a trailing percent sign.
# The keys must match the column names of df1 ('Freq' and '%'); a key that
# matches no column is ignored. The dict is not called `format`, so the
# built-in format() function is not shadowed.
table_format = {'Freq': '{:.0f}', '%': '{:.1f} %'}
# Present the table using the formats (styling changes only the display, not the data)
df1.style.format(table_format)

Unnamed: 0_level_0,Freq,%
Pickup place,Unnamed: 1_level_1,Unnamed: 2_level_1
Bronx,99,1.6 %
Brooklyn,378,6.0 %
Manhattan,5226,82.4 %
Queens,638,10.1 %


## Tasks

Load data of 'penguins' using Seaborn.
Then make 3 different tables using data where
- table contains frequencies and % of one variable
- table contains cross-tabulation for two variables using crosstab
- table contains a cross-tabulation for two variables using pivot_table

Pay attention to which variables are good choices (categorical or numerical, with or without missing values, etc.).

In [34]:
# Load seaborn's 'penguins' example dataset for the tasks.
# NOTE: this rebinds the name `df`, so the taxis data loaded earlier is replaced;
# re-run the taxis loading cell before re-running any of the cells above.
df = sns.load_dataset('penguins')
# Preview the first 20 rows; some rows contain missing values
df.head(20)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male
6,Adelie,Torgersen,38.9,17.8,181.0,3625.0,Female
7,Adelie,Torgersen,39.2,19.6,195.0,4675.0,Male
8,Adelie,Torgersen,34.1,18.1,193.0,3475.0,
9,Adelie,Torgersen,42.0,20.2,190.0,4250.0,


In [35]:
# Column dtypes and non-null counts; columns with fewer non-null values than rows contain missing data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [36]:
# Drop every row that has at least one missing value, then confirm
# that all columns now report the same non-null count
df = df.dropna(axis=0, how='any')
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 333 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            333 non-null    object 
 1   island             333 non-null    object 
 2   bill_length_mm     333 non-null    float64
 3   bill_depth_mm      333 non-null    float64
 4   flipper_length_mm  333 non-null    float64
 5   body_mass_g        333 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 20.8+ KB


In [47]:
# Task 1: frequencies and percentages of one categorical variable (species)
df2 = pd.crosstab(df['species'], 'Freq',
                  rownames=['Type of Species'], colnames=[''])

# Add each species' share of all observations as a percentage column
df2['%'] = df2['Freq'] / df2['Freq'].sum() * 100

# Display formats keyed by column name. The keys must match the columns of
# df2 ('Freq' and '%'); a key that matches no column is ignored. The dict is
# not called `format`, so the built-in format() function is not shadowed.
table_format = {'Freq': '{:.0f}', '%': '{:.1f} %'}

# Only the last expression of a cell is rendered, so the styled table is the single output
df2.style.format(table_format)

Unnamed: 0_level_0,Freq,%
Type of Species,Unnamed: 1_level_1,Unnamed: 2_level_1
Adelie,146,43.8 %
Chinstrap,68,20.4 %
Gentoo,119,35.7 %


In [61]:
# Task 2: cross-tabulation of two categorical variables (species by island),
# with row and column totals labelled "Total"
pd.crosstab(
    index=df['species'],
    columns=df['island'],
    rownames=['Types of Species'],
    colnames=['Per Island'],
    margins=True,
    margins_name="Total",
)


Per Island,Biscoe,Dream,Torgersen,Total
Types of Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Adelie,44,55,47,146
Chinstrap,0,68,0,68
Gentoo,119,0,0,119
Total,163,123,47,333


In [63]:
# Task 3: pivot table of a numerical variable (body mass in grams),
# summarised by min, median and max for each species/island combination
df.pivot_table(
    values='body_mass_g',
    index=['species', 'island'],
    aggfunc=['min', 'median', 'max'],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,min,median,max
Unnamed: 0_level_1,Unnamed: 1_level_1,body_mass_g,body_mass_g,body_mass_g
species,island,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Adelie,Biscoe,2850.0,3750.0,4775.0
Adelie,Dream,2900.0,3600.0,4650.0
Adelie,Torgersen,2900.0,3700.0,4700.0
Chinstrap,Dream,2700.0,3700.0,4800.0
Gentoo,Biscoe,3950.0,5050.0,6300.0
