In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

- Pandas is a newer package built on top of NumPy
- Designed to make working with structured data (like tables or spreadsheets) easy and intuitive

# Intro

## The Pandas Series Object

- Series is an analog of a one-dimensional array with flexible indices
- The essential difference is the presence of the index: while the Numpy Array has an implicitly defined integer index used to access the values, the Pandas Series has an explicitly defined index associated with the values

In [4]:
data = pd.Series([0.25, 0.5, 0.75, 1.0],
                 index=['a', 'b', 'c', 'd'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [5]:
# data.index
# data['b']
data.values

array([0.25, 0.5 , 0.75, 1.  ])

### Series as specialized dictionary

- Pandas Series a bit like a specialization of a Python dictionary
- Series is a structure which maps typed keys to a set of typed values
- Can be made even more clear by constructing a Series object directly from a Python dictionary


In [6]:
population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
population = pd.Series(population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [7]:
# Slicing
population['California':'New York']

California    38332521
Texas         26448193
New York      19651127
dtype: int64

## Pandas dataframe object

- DataFrame is an analog of a two-dimensional array with both flexible row indices and flexible column names
- DataFrame as a specialization of a dictionary. Where a dictionary maps a key to a value, a DataFrame maps a column name to a Series of column data
- A DataFrame is a collection of Series objects, and a single-column DataFrame can be constructed from a single Series

<br>![image.png](attachment:image.png)

In [8]:
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
             'Florida': 170312, 'Illinois': 149995}
area = pd.Series(area_dict)

states = pd.DataFrame({'population': population,
                       'area': area})
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [9]:
# index attribute that gives access to the index labels
states.index

# columns attribute, which is an Index object holding the column labels
# states.columns

# values attribute
# states.values

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [10]:
# DataFrame as specialized dictionary
states['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [11]:
# Constructing DataFrame objects
# From a single Series object
pd.DataFrame(population, columns=['population'])

Unnamed: 0,population
California,38332521
Texas,26448193
New York,19651127
Florida,19552860
Illinois,12882135


In [12]:
# From dictionary of Series objects
data = pd.DataFrame({'population': population,
                     'area': area})
data

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [13]:
# From a two-dimensional NumPy array
pd.DataFrame(np.random.rand(3, 2),
             columns=['foo', 'bar'],
             index=['a', 'b', 'c'])

Unnamed: 0,foo,bar
a,0.409543,0.88999
b,0.610392,0.277766
c,0.735973,0.027019


In [14]:
# From a NumPy structured array
A = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')])
pd.DataFrame(A)

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


## Pandas Index object

- Immutable array & Ordered set
- Facilitate operations such as joins across datasets
- Used by Python's built-in set data structure, so that unions, intersections, differences, and other combinations can be computed in a familiar way

In [15]:
# The Pandas Index Object (immutable array)
ind = pd.Index([2, 3, 5, 7, 11])
ind

Index([2, 3, 5, 7, 11], dtype='int64')

In [16]:
# Index as ordered set (intersection, union, difference, symmetric difference)
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])

indA & indB  # intersection
indA | indB  # union
indA ^ indB  # symmetric difference

Index([3, 0, 0, 0, 2], dtype='int64')

# Data Indexing and Selection

## Data selection in series

- A Series object acts in many ways like a one-dimensional NumPy array

In [17]:
data = pd.Series([0.25, 0.5, 0.75, 1.0],
                 index=['a', 'b', 'c', 'd'])

# Series as dictionary
# data['b']

# using python expressions
# 'a' in data

# data.keys()

# Series objects can be modified with a dictionary-like syntax
data['e'] = 1.25

## Series as one-dimensional array

In [18]:
# slicing by explicit index
data['a':'c']

# slicing by implicit integer index
# data[0:2]

# masking
# data[(data > 0.3) & (data < 0.8)]

# fancy indexing
# data[['a', 'e']]

a    0.25
b    0.50
c    0.75
dtype: float64

## Indexers: loc, iloc and ix (deprecated)

- Slicing and indexing conventions can be a source of confusion:
  - ![image.png](attachment:image.png)
- Not functional methods, but attributes that expose a particular slicing interface to the data in the Series

In [19]:
data = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])

# loc attribute allows indexing and slicing that always references the explicit index
# data.loc[3: 5]

# iloc attribute allows indexing and slicing that always references the implicit Python-style index
# data.iloc[1]


## Data selection in DataFrame

- Acts in many ways like a two-dimensional or structured array
- In other ways like a dictionary of Series structures

### DataFrame as a dictionary

In [20]:
area = pd.Series({'California': 423967, 'Texas': 695662,
                  'New York': 141297, 'Florida': 170312,
                  'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
                 'New York': 19651127, 'Florida': 19552860,
                 'Illinois': 12882135})
data = pd.DataFrame({'area':area, 'pop':pop})
data

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [21]:
# Accessing columns
# data['area']

# equivalent to
data.area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [22]:
data['density'] = data['pop'] / data['area']
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


### Additional indexing conventions

In [23]:
# data['Florida':'Illinois']

# data[1:3]

data[data.density > 100]

Unnamed: 0,area,pop,density
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121


# Operating on Data in Pandas

## Ufuncs: Index Preservation

- Pandas is designed to work with NumPy, any NumPy ufunc will work on Pandas Series and DataFrame objects

In [24]:
# Series
rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0, 10, 4))

# DataFrame
df = pd.DataFrame(rng.randint(0, 10, (3, 4)),
                  columns=['A', 'B', 'C', 'D'])

# element-wise exponentiation of values in the Series to the power of e (Euler's number)
print(f'With series values: \n{ser},\nthere are exponentiation of each \n{np.exp(ser)}')  


With series values: 
0    6
1    3
2    7
3    4
dtype: int32,
there are exponentiation of each 
0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64


## Ufuncs: Index Alignment

For binary operations on two Series or DataFrame objects, Pandas will align indices in the process of performing the operation

### Index Alignment in Series

In [25]:
area = pd.Series({'Alaska': 1723337, 'Texas': 695662,
                  'California': 423967}, name='area')
population = pd.Series({'California': 38332521, 'Texas': 26448193,
                        'New York': 19651127}, name='population')

In [26]:
# The resulting array contains the union of indices of the two input arrays, which could be determined using standard Python set arithmetic on these indices
# Which one or the other does not have an entry is marked with NaN, or "Not a Number", which is how Pandas marks missing data
population / area

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [27]:
A = pd.Series([2, 4, 6], index=[0, 1, 2])
B = pd.Series([1, 3, 5], index=[1, 2, 3])

# The result is still has NaN values
# A + B

# The fill value can be modified using the add method
A.add(B, fill_value=0) # equivalent to A + B, but with fill_value=0, which fills in missing entries with the value 0

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

### Index alignment in DataFrame

![image.png](attachment:image.png)

In [28]:
rng = np.random.RandomState(42)  # Using seed 42 for reproducibility

A = pd.DataFrame(rng.randint(0, 20, (2, 2)),
                 columns=list('AB'))

B = pd.DataFrame(rng.randint(0, 10, (3, 3)),
                 columns=list('BAC'))

# A + B  # The indices are aligned correctly irrespective of their order in the two objects, and indices in the result are sorted

fill = A.stack().mean()
A.add(B, fill_value=fill)  # fill with 0 if there is no value

Unnamed: 0,A,B,C
0,10.0,26.0,18.25
1,16.0,19.0,18.25
2,16.25,19.25,15.25


### Ufuncs: Operations Between DataFrame and Series

- Operations between a DataFrame and a Series are similar to operations between a two-dimensional and one-dimensional NumPy array

In [29]:
A = rng.randint(10, size=(3, 4))
A

array([[7, 7, 2, 5],
       [4, 1, 7, 5],
       [1, 4, 0, 9]], dtype=int32)

In [30]:
A - A[0]  # subtracts the first row from all rows

array([[ 0,  0,  0,  0],
       [-3, -6,  5,  0],
       [-6, -3, -2,  4]], dtype=int32)

In [31]:

df = pd.DataFrame(A, columns=list('QRST'))
df - df.iloc[0]  # subtracts the first

# df.subtract(df['R'], axis=0)  # subtracts the values in column 'R' from all columns

halfrow = df.iloc[0, ::2]

print(f'Dataframe: \n{df}')
print(f'First row: \n{df.iloc[0]}')
print(f'Half row: \n{halfrow}')

Dataframe: 
   Q  R  S  T
0  7  7  2  5
1  4  1  7  5
2  1  4  0  9
First row: 
Q    7
R    7
S    2
T    5
Name: 0, dtype: int32
Half row: 
Q    7
S    2
Name: 0, dtype: int32


In [32]:
df - halfrow  # subtracts the half row from all rows

Unnamed: 0,Q,R,S,T
0,0.0,,0.0,
1,-3.0,,5.0,
2,-6.0,,-2.0,


# Handling Missing Data

- Real-world data is rarely clean and homogeneous

## Trade-Offs in Missing Data Conventions

- Masking approach, the mask might be an entirely separate **Boolean array**, or it may involve appropriation of one bit in the data representation to locally indicate the null status of a value
- Sentinel approach, the sentinel value could be some data-specific convention, such as indicating a missing **integer value with -9999 or some rare bit pattern**, or it could be a more global convention, such as indicating **a missing floating-point value with NaN (Not a Number)**, a special value which is part of the IEEE floating-point specification
- None of these approaches is without trade-offs:
  - Use of a separate mask array requires allocation of an additional Boolean array, which adds overhead in both storage and computation
  - A sentinel value reduces the range of valid values that can be represented, and may require extra (often non-optimized) logic in CPU and GPU arithmetic

## Mising Data in Pandas

### Pythonic mising data

- 1st sentinel value used by Pandas is **None**
- **None** can not be used in any arbitrary Numpy/Pandas array
- Only in arrays with data type "Object"
- Performing aggregations like **sum()**, **min()** across an array with **None** type

In [33]:
vals1 = np.array([1, None, 3, 4])
vals1

array([1, None, 3, 4], dtype=object)

### Missing numerical data

- NaN is different (a special floating-point value recognized by all systems that use the standard IEEE floating-point representation)
- NaN virus-it infects any other object it touches (1 + np.nan = nan, vals2.min() = nan, ...)
- NaN is specifically a floating-point value; there is no equivalent NaN value for integers, strings, or other types

In [34]:
vals2 = np.array([1, np.nan, 3, 4]) 
vals2.dtype

dtype('float64')

In [35]:
# TO ignore these missing values
print(f'return the value for valse2.nansum() = {np.nansum(vals2)}')

return the value for valse2.nansum() = 8.0


### NaN and None in Pandas

- If we set a value in an integer array to np.nan, it will automatically be upcast to a floating-point type
- Casting the integer array to floating point, Pandas automatically converts the None to a NaN value
- In Pandas, string data is always stored with an object dtype

![image.png](attachment:image.png)

## Operating on Null Values

- Pandas treats None and NaN as essentially interchangeable for indicating missing or null values
-  To facilitate this convention, there are several useful methods for detecting, removing, and replacing null values in Pandas data structures

![image.png](attachment:image.png)

### Detecting null values

In [36]:
# isnull() functions
data = pd.Series([1, np.nan, 'hello', None])
print(f'Check for null values: \n{data.isnull()}')

Check for null values: 
0    False
1     True
2    False
3     True
dtype: bool


In [37]:
# notnull() functions
print(f'Check for not null values: \n{data.notnull()}')

Check for not null values: 
0     True
1    False
2     True
3    False
dtype: bool


### Droping null values

In [38]:
# dropna() function
print(f'Drop null values: \n{data.dropna()}')

Drop null values: 
0        1
2    hello
dtype: object


In [39]:
# For dataframes
df = pd.DataFrame([[1,      np.nan, 2],
                   [2,      3,      5],
                   [np.nan, 4,      6]])

print(f'Dataframe: \n{df}')

print(f'Drop null values: \n{df.dropna()}') # By default, dropna() will drop all rows in which any null value is present

print(f'Drop null values: \n{df.dropna(axis=1)}') # drop columns with null values

Dataframe: 
     0    1  2
0  1.0  NaN  2
1  2.0  3.0  5
2  NaN  4.0  6
Drop null values: 
     0    1  2
1  2.0  3.0  5
Drop null values: 
   2
0  2
1  5
2  6


In [40]:
# add new column 3 with all null values
df[3] = np.nan
print(f'Dataframe: \n{df}')

# using thresh parameter to drop rows or columns with a certain number of non-null values
print(f'Drop null values: \n{df.dropna(axis=0, thresh=3)}') # drop rows with at least 2 non-null values

Dataframe: 
     0    1  2   3
0  1.0  NaN  2 NaN
1  2.0  3.0  5 NaN
2  NaN  4.0  6 NaN
Drop null values: 
     0    1  2   3
1  2.0  3.0  5 NaN


### Filling null values

In [41]:
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
print(f'Data: \n{data}')

# fillna() function
print(f'Fill null values: \n{data.fillna(0)}')

# forward-fill
print(f'Forward-fill: \n{data.fillna(method="ffill")}')

# back-fill
print(f'Back-fill: \n{data.fillna(method="bfill")}')

Data: 
a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64
Fill null values: 
a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64
Forward-fill: 
a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64
Back-fill: 
a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64


  print(f'Forward-fill: \n{data.fillna(method="ffill")}')
  print(f'Back-fill: \n{data.fillna(method="bfill")}')


# Hierarchical Indexing (Multi-Indexing)

- Also known as multi-level indexing
- Allows you to have multiple index levels on sing axis
- Enables you to work with higher-dimensional data in a 1D (Series) or 2D (DataFrame) format

## Multipy Indexed Series

### The best way

In [42]:
index = [('California', 2000), ('California', 2010),
         ('New York', 2000), ('New York', 2010),
         ('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956,
               18976457, 19378102,
               20851820, 25145561]

# MultiIndex object from a simple list of arrays giving the index values within each level
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [43]:
pop = pd.Series(populations, index=index)
# reindex the data to create a MultiIndex Series from the data
pop = pop.reindex(index)
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [44]:
pop[:, 2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

### Multiindex as extra dimension

In [45]:
# unstack() method will quickly convert a multiply indexed Series into a conventionally indexed DataFrame
pop_df = pop.unstack()
pop_df

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [46]:
# stack() method provides the opposite operation
pop_df.stack()

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [47]:
# MultiIndex for columns in a DataFrame
pop_df = pd.DataFrame({'total': pop,
                       'under18': [9267089, 9284094,
                                   4687374, 4318033,
                                   5906301, 6879014]})
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


In [48]:
# fraction of people under 18 by year
f_u18 = pop_df['under18'] / pop_df['total']
f_u18.unstack()

Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


## Methods of MultiIndex Creation

**The most straighforward way to contruct a multiply indexed Series or DataFrame**

In [49]:
df = pd.DataFrame(np.random.rand(4, 2),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=['data1', 'data2'])
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.445309,0.840972
a,2,0.695865,0.741594
b,1,0.179521,0.842363
b,2,0.55363,0.906866


In [50]:
# if you pass a dictionary with appropriate tuples as keys, Pandas will automatically recognize this and use a MultiIndex by default

data = {('California', 2000): 33871648,
        ('California', 2010): 37253956,
        ('Texas', 2000): 20851820,
        ('Texas', 2010): 25145561,
        ('New York', 2000): 18976457,
        ('New York', 2010): 19378102}
pd.Series(data)

California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
New York    2000    18976457
            2010    19378102
dtype: int64

## Explicit MultiIndex constructors

In [51]:
# From array
a = pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]])

# From tuples
b = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])

# From product, it provides all possible pairs of the two sequences
c = pd.MultiIndex.from_product([['a', 'b'], [1, 2]])

# From Levels
d = pd.MultiIndex(levels=[['a', 'b'], [1, 2]],
              codes=[[0, 0, 1, 1], [0, 1, 0, 1]])

print(f'From arrays: \n{a}')
print(f'From tuples: \n{b}')
print(f'From product: \n{c}')
print(f'From levels: \n{d}')

From arrays: 
MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )
From tuples: 
MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )
From product: 
MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )
From levels: 
MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )


## MultiIndex level names

In [52]:
pop.index.names = ['state', 'year']
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

## MultiIndex for columns

In [53]:
# hierarchical indices and columns
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
                                   names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
                                     names=['subject', 'type'])

# mock some data
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] *= 10
data += 37

# create the DataFrame
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,25.0,36.5,20.0,36.5,25.0,36.7
2013,2,48.0,37.5,39.0,37.6,26.0,38.1
2014,1,19.0,36.8,37.0,38.1,40.0,37.2
2014,2,30.0,35.7,17.0,37.1,41.0,35.0


In [54]:
health_data['Guido'] # Accessing the data

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,20.0,36.5
2013,2,39.0,37.6
2014,1,37.0,38.1
2014,2,17.0,37.1


## Indexing and slicing a MultiIndex

### Multiply indexed Series

In [55]:
pop

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [56]:
# Access single element
print(f'Access single element: \n{pop["California", 2000]}')

# Partial indexing
print(f'\nPartial indexing: \n{pop["California"]}')

# Partial slicing 
print(f'\nPartial slicing: \n{pop["California":"New York"]}')

# Passing an empty slice in the first index
print(f'\nEmpty slice: \n{pop[:, 2000]}')

# Boolean mask
print(f'\nBoolean mask: \n{pop[pop > 22000000]}')

# Fancy indexing
print(f'\nFancy indexing: \n{pop[["California", "Texas"]]}')

Access single element: 
33871648

Partial indexing: 
year
2000    33871648
2010    37253956
dtype: int64

Partial slicing: 
state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
dtype: int64

Empty slice: 
state
California    33871648
New York      18976457
Texas         20851820
dtype: int64

Boolean mask: 
state       year
California  2000    33871648
            2010    37253956
Texas       2010    25145561
dtype: int64

Fancy indexing: 
state       year
California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
dtype: int64


### Multiply Indexed DataFrames

In [57]:
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,25.0,36.5,20.0,36.5,25.0,36.7
2013,2,48.0,37.5,39.0,37.6,26.0,38.1
2014,1,19.0,36.8,37.0,38.1,40.0,37.2
2014,2,30.0,35.7,17.0,37.1,41.0,35.0


In [58]:
print(f'Accessing the data: \n{health_data["Guido", "HR"]}')

# Slicing the data
print(f'\nSlicing the data: \n{health_data.iloc[:2, :2]}') # Getting the first two rows and columns

# Slicing the data
print(f'\nSlicing the data: \n{health_data.loc[:, ("Bob", "HR")]}') # Getting the Bob's HR data

# Slicing the data
# idx is a slice object that allows for a more intuitive way to slice the data using the loc accessor
idx = pd.IndexSlice
# idx[:, 1] is a slice that allows us to take all values in the first index
# idx[:, "HR"] is a slice that allows us to take all values in the second index
print(f'\nSlicing the data: \n{health_data.loc[idx[:, 1], idx[:, "HR"]]}') # Getting the HR data for the first visit

Accessing the data: 
year  visit
2013  1        20.0
      2        39.0
2014  1        37.0
      2        17.0
Name: (Guido, HR), dtype: float64

Slicing the data: 
subject      Bob      
type          HR  Temp
year visit            
2013 1      25.0  36.5
     2      48.0  37.5

Slicing the data: 
year  visit
2013  1        25.0
      2        48.0
2014  1        19.0
      2        30.0
Name: (Bob, HR), dtype: float64

Slicing the data: 
subject      Bob Guido   Sue
type          HR    HR    HR
year visit                  
2013 1      25.0  20.0  25.0
2014 1      19.0  37.0  40.0


## Rearranging Multi-Indices

### Sorted and unsorted indices

- Many of the MultiIndex slicing operations will fail if the index is not sorted

In [59]:
# from_product() method helps to generate every possible combination of the given arrays
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])
print(f'From product: \n{index}')
data = pd.Series(np.random.rand(6), index=index)
data.index.names = ['char', 'int']
print(f'\nData: \n{data}')

From product: 
MultiIndex([('a', 1),
            ('a', 2),
            ('c', 1),
            ('c', 2),
            ('b', 1),
            ('b', 2)],
           )

Data: 
char  int
a     1      0.246124
      2      0.589941
c     1      0.063660
      2      0.428820
b     1      0.580567
      2      0.850822
dtype: float64


In [61]:
# Error due to the fact that the data is not sorted
# print(f'Slicing the unsorted data: \n{data["a":"b"]}')

# Sorting the data
data = data.sort_index()
print(f'\nSorted data: \n{data}')

# Now the slicing works
print(f'\nSlicing the sorted data: \n{data['a':'b']}')


Sorted data: 
char  int
a     1      0.246124
      2      0.589941
b     1      0.580567
      2      0.850822
c     1      0.063660
      2      0.428820
dtype: float64

Slicing the sorted data: 
char  int
a     1      0.246124
      2      0.589941
b     1      0.580567
      2      0.850822
dtype: float64


### Stacking and UnStacking indices

- To reshape DataFrames and Series that have multiple objects
- Allow to move levels of the index between rows and columns
- Effectively transposing sections of the data
- Stacking (.stack())
  - Transforms a DataFrame with a multi-level column index into a Series with a multi-level row index
  - It "pivots" the innermost level of the column index to become the innermost level of the row index
- Unstacking (.unstack())
  - Transforms a Series with a multi-level row index into a DataFrame with a multi-level column index
  - It "pivots" the innermost level of the row index to become the innermost level of the column index

In [64]:
# pop.unstack() method will quickly convert a multiply indexed Series into a conventionally indexed DataFrame
# level=0 indicates the first level of the index
print(f'Unstacking the data: \n{pop.unstack(level=0)}')

# level=1 indicates the second level of the index
print(f'\nUnstacking the data: \n{pop.unstack(level=1)}')

Unstacking the data: 
state  California  New York     Texas
year                                 
2000     33871648  18976457  20851820
2010     37253956  19378102  25145561

Unstacking the data: 
year            2000      2010
state                         
California  33871648  37253956
New York    18976457  19378102
Texas       20851820  25145561


### Index setting and resetting

In [65]:
# Print the data
print(f'Original data: \n{pop}')

# stack() method provides the opposite operation
pop_flat = pop.reset_index(name='population')
pop_flat

Original data: 
state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64


Unnamed: 0,state,year,population
0,California,2000,33871648
1,California,2010,37253956
2,New York,2000,18976457
3,New York,2010,19378102
4,Texas,2000,20851820
5,Texas,2010,25145561


In [66]:
pop_flat.set_index(['state', 'year'])

Unnamed: 0_level_0,Unnamed: 1_level_0,population
state,year,Unnamed: 2_level_1
California,2000,33871648
California,2010,37253956
New York,2000,18976457
New York,2010,19378102
Texas,2000,20851820
Texas,2010,25145561


## Data Aggregations on Multi-Indices

Data aggregation methods (mean(), sum(), and max()), these can be passed a level parameter that controls which subset of the data the aggregate is computed on

In [67]:
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,25.0,36.5,20.0,36.5,25.0,36.7
2013,2,48.0,37.5,39.0,37.6,26.0,38.1
2014,1,19.0,36.8,37.0,38.1,40.0,37.2
2014,2,30.0,35.7,17.0,37.1,41.0,35.0


# Concat

In [73]:
def make_df(cols, ind):
    """Quickly make a DataFrame"""
    data = {c: [str(c) + str(i) for i in ind]
            for c in cols}
    return pd.DataFrame(data, ind)

# example DataFrame
make_df('ABC', range(3))

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


### Concatenation

- Similar with concatenation of **Numpy**
- One important difference between np.concatenate and pd.concat is that Pandas concatenation preserves indices 

In [77]:
# Concat of Numpy
x = [1, 2, 3]
y = [4, 5, 6]
z = [7, 8, 9]
print(f'Concatenation of Numpy: \n{np.concatenate([x, y, z])}')

# Concat of Pandas
ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])
ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])
print(f'\nConcatenation of Pandas: \n{pd.concat([ser1, ser2])}')

Concatenation of Numpy: 
[1 2 3 4 5 6 7 8 9]

Concatenation of Pandas: 
1    A
2    B
3    C
4    D
5    E
6    F
dtype: object


In [81]:
# Pandas preserves indices, even if the result will have duplicate indices
df1 = make_df('AB', [1, 2])
df2 = make_df('AB', [3, 4])
df1.index = df2.index # make duplicate indices!

print(f'Concatenation of DataFrames: \n{pd.concat([df1, df2])}')

Concatenation of DataFrames: 
    A   B
3  A1  B1
4  A2  B2
3  A3  B3
4  A4  B4


In [86]:
# Catching the duplicate indices as errors
try:
    pd.concat([df1, df2], verify_integrity=True) # verify_integrity=True will catch the error
except ValueError as e:
    print("ValueError:", e)

# Ignoring the index
print(f'\nConcatenation of DataFrames: \n{pd.concat([df1, df2], ignore_index=True)}') # ignore_index=True will create a new integer index

# Adding MultiIndex keys
print(f'\nConcatenation of DataFrames: \n{pd.concat([df1, df2], keys=['x', 'y'])}') # keys option will add a hierarchical index

ValueError: Indexes have overlapping values: Index([3, 4], dtype='int64')

Concatenation of DataFrames: 
    A   B
0  A1  B1
1  A2  B2
2  A3  B3
3  A4  B4

Concatenation of DataFrames: 
      A   B
x 3  A1  B1
  4  A2  B2
y 3  A3  B3
  4  A4  B4


### Concatenation with joins

In [91]:
df5 = make_df('ABC', [1, 2])
df6 = make_df('BCD', [3, 4])

print(f'Concatenation of DataFrames: \n{pd.concat([df5, df6])}') # By default, entries for which no data is available are filled with NaN

# join outer
print(f'\nJoin outer: \n{pd.concat([df5, df6], join="outer")}') # join="outer" takes the union of the columns

# join inner
print(f'\nJoin inner: \n{pd.concat([df5, df6], join="inner")}') # join="inner" takes the intersection of the columns

Concatenation of DataFrames: 
     A   B   C    D
1   A1  B1  C1  NaN
2   A2  B2  C2  NaN
3  NaN  B3  C3   D3
4  NaN  B4  C4   D4

Join outer: 
     A   B   C    D
1   A1  B1  C1  NaN
2   A2  B2  C2  NaN
3  NaN  B3  C3   D3
4  NaN  B4  C4   D4

Join inner: 
    B   C
1  B1  C1
2  B2  C2
3  B3  C3
4  B4  C4


# Merge and Join

## Categories of Joins

- The pd.merge() function implements a number of types of joins
  - one-to-one
  - many-to-one
  - many-to-many

### One-to-one Joins

In [95]:
df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})
df2 = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
                    'hire_date': [2004, 2008, 2012, 2014]})

display(df1, df2)

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR


Unnamed: 0,employee,hire_date
0,Lisa,2004
1,Bob,2008
2,Jake,2012
3,Sue,2014


In [None]:
# One-to-one joins
df3 = pd.merge(df1, df2) # merge() function recognizes that each DataFrame has an "employee" column, and automatically joins using this column as a key
df3

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


### Many-to-one Joins

In [97]:
# Many-to-one joins
# Many-to-one joins are joins in which one of the two key columns contains duplicate entries
df4 = pd.DataFrame({'group': ['Accounting', 'Engineering', 'HR'],
                    'supervisor': ['Carly', 'Guido', 'Steve']})

display(df3, df4, pd.merge(df3, df4)) # merge() function automatically aligns the data using the common column names

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


Unnamed: 0,group,supervisor
0,Accounting,Carly
1,Engineering,Guido
2,HR,Steve


Unnamed: 0,employee,group,hire_date,supervisor
0,Bob,Accounting,2008,Carly
1,Jake,Engineering,2012,Guido
2,Lisa,Engineering,2004,Guido
3,Sue,HR,2014,Steve


### Many-to-many Joins

In [98]:
# Many-to-many joins
# Many-to-many joins are a bit confusing conceptually, but are nevertheless well defined
# If the key column in both the left and right array contains duplicates, then the result is a many-to-many merge
df5 = pd.DataFrame({'group': ['Accounting', 'Accounting',
                              'Engineering', 'Engineering', 'HR', 'HR'],
                    'skills': ['math', 'spreadsheets', 'coding', 'linux',
                               'spreadsheets', 'organization']})

display(df1, df5, pd.merge(df1, df5)) # merge() function automatically aligns the data using the common column names

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR


Unnamed: 0,group,skills
0,Accounting,math
1,Accounting,spreadsheets
2,Engineering,coding
3,Engineering,linux
4,HR,spreadsheets
5,HR,organization


Unnamed: 0,employee,group,skills
0,Bob,Accounting,math
1,Bob,Accounting,spreadsheets
2,Jake,Engineering,coding
3,Jake,Engineering,linux
4,Lisa,Engineering,coding
5,Lisa,Engineering,linux
6,Sue,HR,spreadsheets
7,Sue,HR,organization


## Specification of the Merge Key

### The 'on' Keyword

In [None]:
# Specification of the Merge Key
# The on keyword specifies the name of the key column
# If the two input DataFrames have different column names, you can specify them separately using the left_on and right_on keywords
display(df1, df2, pd.merge(df1, df2, on='employee'))

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR


Unnamed: 0,employee,hire_date
0,Lisa,2004
1,Bob,2008
2,Jake,2012
3,Sue,2014


Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


In [None]:
# left_on and right_on keywords
# If you wish to merge by different names, you can use the left_on and right_on keywords to specify the two column names
# The result has a redundant column that we can drop if desired—for example, by using the drop() method of DataFrames
df3 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'salary': [70000, 80000, 120000, 90000]})

display(df1, df3, pd.merge(df1, df3, left_on="employee", right_on="name").drop('name', axis=1)) # drop the redundant column

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR


Unnamed: 0,name,salary
0,Bob,70000
1,Jake,80000
2,Lisa,120000
3,Sue,90000


Unnamed: 0,employee,group,salary
0,Bob,Accounting,70000
1,Jake,Engineering,80000
2,Lisa,Engineering,120000
3,Sue,HR,90000


### The left_index and right_index Keywords

**Sometimes rather than merge on a column, you can merge on index instead**

In [None]:
# The left_index and right_index keywords
# Sometimes, rather than merging on a column, you would instead like to merge on an index
# For this, you can use the left_index and right_index keywords
df1a = df1.set_index('employee') # set_index() method can be used to set the index
df2a = df2.set_index('employee')

display(df1a, df2a, pd.merge(df1a, df2a, left_index=True, right_index=True))

Unnamed: 0_level_0,group
employee,Unnamed: 1_level_1
Bob,Accounting
Jake,Engineering
Lisa,Engineering
Sue,HR


Unnamed: 0_level_0,hire_date
employee,Unnamed: 1_level_1
Lisa,2004
Bob,2008
Jake,2012
Sue,2014


Unnamed: 0_level_0,group,hire_date
employee,Unnamed: 1_level_1,Unnamed: 2_level_1
Bob,Accounting,2008
Jake,Engineering,2012
Lisa,Engineering,2004
Sue,HR,2014


In [None]:
# join and merge are similar operations but have some differences
# join() method performs a merge that defaults to joining on indices
display(df1a, df2a, df1a.join(df2a)) # join() method performs a merge that defaults to joining on indices

Unnamed: 0_level_0,group
employee,Unnamed: 1_level_1
Bob,Accounting
Jake,Engineering
Lisa,Engineering
Sue,HR


Unnamed: 0_level_0,hire_date
employee,Unnamed: 1_level_1
Lisa,2004
Bob,2008
Jake,2012
Sue,2014


Unnamed: 0_level_0,group,hire_date
employee,Unnamed: 1_level_1,Unnamed: 2_level_1
Bob,Accounting,2008
Jake,Engineering,2012
Lisa,Engineering,2004
Sue,HR,2014


## Specifying Set Arithmetic for Joins

In [103]:
df6 = pd.DataFrame({'name': ['Peter', 'Paul', 'Mary'],
                    'food': ['fish', 'beans', 'bread']},
                   columns=['name', 'food'])
df7 = pd.DataFrame({'name': ['Mary', 'Joseph'],
                    'drink': ['wine', 'beer']},
                   columns=['name', 'drink'])

display(df6, df7, pd.merge(df6, df7)) # merge() function automatically aligns the data using the common column names

Unnamed: 0,name,food
0,Peter,fish
1,Paul,beans
2,Mary,bread


Unnamed: 0,name,drink
0,Mary,wine
1,Joseph,beer


Unnamed: 0,name,food,drink
0,Mary,bread,wine


In [None]:
# outer and inner joins using merge() function
display(df6, df7, pd.merge(df6, df7, how='outer')) # how='outer' takes the union of the input columns

display(pd.merge(df6, df7, how='inner')) # how='inner' takes the intersection of the input columns

Unnamed: 0,name,food
0,Peter,fish
1,Paul,beans
2,Mary,bread


Unnamed: 0,name,drink
0,Mary,wine
1,Joseph,beer


Unnamed: 0,name,food,drink
0,Joseph,,beer
1,Mary,bread,wine
2,Paul,beans,
3,Peter,fish,


Unnamed: 0,name,food
0,Peter,fish
1,Paul,beans
2,Mary,bread


Unnamed: 0,name,drink
0,Mary,wine
1,Joseph,beer


Unnamed: 0,name,food,drink
0,Mary,bread,wine
