# Data Manipulation with Pandas
- provides an efficient implementation of a Dataframe
- A dataframe is a multidimensional array with attached row and column labels
- often with heterogeneous types and/or missing data

In [None]:
import pandas
pandas.__version__

In [None]:
# we generally import pandas as pd
import pandas as pd
import numpy as np

## Pandas Objects
Can be thought of as enhanced versions of NumPy structured arrays, where rows and columns are identified with labels rather than simple integer indices.

Let's examine the fundamental Pandas data structures:
- Series
- DataFrame
- Index

### The Pandas Series Object
- A 1-d array of indexed data.
- Wraps both a sequence of values and sequence of indices
- Can be accessed with the values and index attributes.

In [None]:
data = pd.Series([0.25, 0.5, 0.75, 1.0])
data

In [None]:
# values are a numpy array
data.values

In [None]:
# index is an array-like object of type pd.Index
data.index

In [None]:
# Data can be accessed with square-bracket notation
data[1]

In [None]:
data[1:3]

In [None]:
# A Series can also be given an explicitly defined index.
data = pd.Series([0.25, 0.5, 0.75, 1.0],
                index=['a','b','c','d'])
data

In [None]:
# Which can be accessed as
data['b']

In [None]:
# We can use non-contiguous or non-sequential indices:
data = pd.Series([0.25,0.5,0.75,1.0],
                index=[2,5,3,7])
data

In [None]:
data[5]

#### Series as a specialized dictionary
-  a structure which maps typed keys to a set of typed values

In [None]:
population_dict = {'California': 23534,
                  'Texas': 346457,
                  'New York': 785643,
                  'Florida': 876543,
                  'Illinois': 1234576}
population = pd.Series(population_dict)
population

In [None]:
# The index is built from the dictionary keys
# (insertion order in pandas >= 0.23; older versions sorted the keys)
population['California']

In [None]:
# Supports array-style slicing
population['California':'Illinois']

In [None]:
#### Constructing Series Objects

In [None]:
# From a list or numpy array
pd.Series([2,4,6])

In [None]:
# from a scalar to fill the specified index
pd.Series(5, index=[100,200,300])

In [None]:
# From a dictionary
pd.Series({2:'a', 1:'b', 3:'c'})

In [None]:
# From a dictionary, but only the specified keys
pd.Series({2:'a',1:'b',3:'c'}, index=[3,2])

## Pandas DataFrame Object
A DataFrame can be thought of either as a generalization of a NumPy array, or as a specialization of a Python dictionary.

### DataFrame as a generalized NumPy array
- an analog of a 2d array with both flexible row indices and flexible column names.

In [None]:
# construct a new Series
area_dict = {'California': 23453, 'Texas': 3456, 'New York': 234503409, 'Florida': 23453634, 'Illinois': 2345342}
area = pd.Series(area_dict)
area

In [None]:
# From before..
population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
population = pd.Series(population_dict)

# We can construct a single 2d object containing population info
states = pd.DataFrame({'population': population, 'area': area})
states

In [None]:
# DataFrame has an index attribute to access index labels
states.index

In [None]:
# It also has a column attribute to access the column labels
states.columns

### DataFrame as specialized dictionary
- maps column names to a Series of column data.

In [None]:
states['area']

### Constructing DataFrame objects

In [None]:
# DataFrame is a collection of Series objects. 
pd.DataFrame(population, columns=['population'])

In [None]:
# Lists of dictionaries can be made into a DataFrame.
data = [{'a': i, 'b':2 * i}
       for i in range(3)]
pd.DataFrame(data)

In [None]:
# Pandas will fill missing values with NaN
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])

In [None]:
# Can be constructed from a dictionary of Series objects
pd.DataFrame({'population': population, 'area': area})

In [None]:
# Can be constructed from a 2d array of data.
pd.DataFrame(np.random.rand(3, 2),
            columns=['foo', 'bar'],
            index=['a', 'b', 'c'])

In [None]:
# Can be constructed from a structured array
A = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')])
A

In [None]:
pd.DataFrame(A)

### The Pandas Index Object
- an immutable array, or ordered set

In [None]:
ind = pd.Index([2, 3, 5, 7, 11])
ind

In [None]:
# Can retrieve values or slices
ind[1]

In [None]:
ind[::2]

In [None]:
# Has other attributes familiar from NumPy
print(ind.size, ind.shape, ind.ndim, ind.dtype)

In [None]:
# An Index is immutable: item assignment on it raises a TypeError
ind[1] = 0

#### Index as an ordered set

In [None]:
indA = pd.Index([1,3,5,7,9])
indB = pd.Index([2,3,5,7,11])

In [None]:
# intersection
# (the & operator on an Index is element-wise in pandas >= 2.0, so use the
# explicit set method)
indA.intersection(indB)

In [None]:
# union
# (the | operator on an Index is element-wise in pandas >= 2.0, so use the
# explicit set method)
indA.union(indB)

In [None]:
# symmetric difference
# (the ^ operator on an Index is element-wise in pandas >= 2.0, so use the
# explicit set method)
indA.symmetric_difference(indB)

## Data indexing and selection
- Many of the same means of accessing and modifying values in Pandas as NumPy

In [None]:
### Data selection in series
#### Series as a dictionary

In [None]:
import pandas as pd
data = pd.Series([0.25, 0.5, 0.75, 1.0],
                index=['a', 'b', 'c', 'd'])
data

In [None]:
data['b']

In [None]:
# We can use dictionary-like python expressions and methods to examine keys/indices and values
'a' in data

In [None]:
data.keys()

In [None]:
list(data.items())

In [None]:
# modify values
data['e'] = 1.25
data

#### Series as a 1d array

In [None]:
# slicing by explicit index
data['a':'c']

In [None]:
# slicing by implicit integer index
data[0:2]

In [None]:
# masking
data[(data > 0.3) & (data < 0.8)]

In [None]:
# fancy indexing
data[['a', 'e']]

#### Indexers: loc and iloc (the hybrid ix indexer was removed in pandas 1.0)

In [None]:
data = pd.Series(['a', 'b', 'c'], index=[1,3,5])
data

In [None]:
# explicit index when indexing
data[1]

In [None]:
# implicit index when slicing
data[1:3]

In [None]:
# indexing and slicing with loc always uses explicit index
data.loc[1]

In [None]:
data.loc[1:3]

In [None]:
# iloc allows indexing and slicing with implicit index
data.iloc[1]

In [None]:
data.iloc[1:3]

### Data Selection in DataFrame
#### DataFrame as a dictionary
- lets consider the dataframe as a dictionary of related Series objects

In [None]:
# Two Series that carry the same state labels; the DataFrame constructor
# aligns them on that shared index, one column per Series.
area = pd.Series({'California': 23454523, 'Texas': 23445, 'New York': 23890490,
                  'Florida': 23890450, 'Illinois': 2348035890})
pop = pd.Series({'California': 254523, 'Texas': 234453, 'New York': 238904,
                 'Florida': 290450, 'Illinois': 2348035})
data = pd.DataFrame({'area': area, 'pop': pop})
data

In [None]:
# Individual series make up the columns can be accessed via dictionary style indexing of the column name
data['area']

In [None]:
# We can also use attributes
data.area

In [None]:
# it's the same thing 
data.area is data['area']

In [None]:
# We can add data to dataframes
data['density'] = data['pop'] / data['area']
data

In [None]:
#### DataFrame as a 2d array
data.values

In [None]:
# We can transpose the data
data.T

In [None]:
# Get a row
data.values[0]

In [None]:
# get a column
data['area']

In [None]:
# We can use the iloc indexer to access data, maintaining the DataFrame index and column labels
data.iloc[:3, :2]

In [None]:
# The loc indexer can be used with explicit index and column names
data.loc[:'Illinois', :'pop']

In [None]:
# The hybrid .ix indexer was removed in pandas 1.0.
# The same selection (first three rows by position, columns up to and
# including 'pop' by label) is written by chaining iloc and loc.
data.iloc[:3].loc[:, :'pop']

In [None]:
# We can use loc for masking and fancy indexing
data.loc[data.density > 10, ['pop', 'density']]

In [None]:
# We can modify data with them too
data.iloc[0, 2] = 90
data

#### Additional indexing conventions
- indexing refers to columns
- slicing refers to rows

In [None]:
data['Florida':'Illinois']

In [None]:
# We can also refer to them as numbers
data[1:3]

In [None]:
# Direct masking operations are row-wise
data[data.density > 10]

## Operation on Data in Pandas
- can perform numpy element-wise operations
- pandas keeps context when performing operations

### Ufuncs: index preservation

In [None]:
import pandas as pd
import numpy as np

# define a series
rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0, 10, 4))
ser

In [None]:
df = pd.DataFrame(rng.randint(0, 10, (3, 4)),
                 columns=['A', 'B', 'C', 'D'])
df

In [None]:
# applying a NumPy ufunc on either of these object will result in another pandas object with the indices preserved
np.exp(ser)

In [None]:
# something more complicated
np.sin(df * np.pi / 4)

### UFuncs: Index Alignment
- for binary operations on 2 series or dataframe object, pandas will align indices. 
- this is useful when working with incomplete data

#### Index alignment in Series
- Lets combine 2 different data sources and find only the top 3 us states by area, and the top 3 states by population

In [None]:
area = pd.Series({'Alaska': 123234, 'Texas': 234523,
                 'California': 457623}, name='area')
population = pd.Series({'Texas': 234523, 'California': 457623,
                'New York': 24523}, name='population')

In [None]:
# calculate population density
population / area

In [None]:
# The resulting array contains the union of the indices of the two inputs.
# It can be computed explicitly with Index.union (the | operator on an Index
# is no longer a set operation in pandas >= 2.0)
area.index.union(population.index)

In [None]:
# NaN is the default
A = pd.Series([2,4,6], index=[0,1,2])
B = pd.Series([1,3,5], index=[1,2,3])
A + B

In [None]:
# We can fill NaN with another number
A.add(B, fill_value=0)

#### Index Alignment in DataFrame
- similar alignment happens for dataframe on both row and column

In [None]:
A = pd.DataFrame(rng.randint(0, 20, (2,2)), columns=list('AB'))
A

In [None]:
B = pd.DataFrame(rng.randint(0, 10, (3, 3)), columns=list('BAC'))
B

In [None]:
A + B

In [None]:
# We can also fill values
# lets use the mean value of A, computed by stacking rows of A
fill = A.stack().mean()
A.add(B, fill_value=fill)

### Ufuncs: operations between dataframe and series
- column alignment is maintained in these cases as well

In [None]:
# find the difference of a 2d array and one of its rows
A = rng.randint(10, size=(3,4))
A

In [None]:
A - A[0]

In [None]:
# with a dataframe
df = pd.DataFrame(A, columns=list('QRST'))
df - df.iloc[0]

In [None]:
# now column-wise
df.subtract(df['R'], axis=0)

In [None]:
# Automatic index alignment takes place
halfrow = df.iloc[0, ::2]
halfrow

In [None]:
df - halfrow

## Handling Missing Data
- often, data will be missing from sources
- Lets learn how to represent this
- We will refer to missing data as:
    - null
    - NaN
    - NA

### Trade-offs in missing data conventions
- we can use a mask that globally indicates missing values
    - may use a bool array
    - or a bit in the data representation to indicate null status
    - additional overhead in both storage and computation
- or we can choose a sentinel value
    - may indicate using -9999 or some rare bit pattern
    - or NaN
    - require extra logic in CPU and GPU arithmetic
    - NaN may not be available for the data-type
   
### Missing Data in Pandas
- because of reliance on numpy, there is no built in notion of NA values for non-floating point data types
- Following R's strategy is unwieldy because of the many datatypes
- Pandas uses sentinels for missing data:
    - NaN
    - None
- tends to be a good compromise

#### None: Pythonic missing data
- a python singleton object that is often used for missing data
- can only be used in arrays with datatype `object`

In [None]:
import numpy as np
import pandas as pd

vals1 = np.array([1, None, 3, 4])
vals1

# object is the best common type
# operations will be performed at python level

In [None]:
# Operations are slower with this type
for dtype in ['object', 'int']:
    print('dtype =', dtype)
    %timeit np.arange(1E6, dtype=dtype).sum()
    print()

In [None]:
# aggregations of python objects will result in an error
vals1.sum()

#### NaN: Missing numerical data
- a special floating-point value

In [None]:
vals2 = np.array([1, np.nan, 3, 4])
vals2.dtype

# numpy choses floating point for this array

In [None]:
# NaN operations will result in NaN
1 + np.nan

In [None]:
0 * np.nan

In [None]:
# aggregations wont result in errors but aren't very useful
vals2.sum(), vals2.min(), vals2.max()

In [None]:
# There are special aggregations that ignore NaN
np.nansum(vals2), np.nanmin(vals2), np.nanmax(vals2)

#### NaN and None in pandas
- both have their place
- pandas will convert where appropriate

In [None]:
pd.Series([1, np.nan, 2, None])

In [None]:
# Pandas will type-cast when NA values are present
x = pd.Series(range(2), dtype=int)
x

In [None]:
x[0] = None
x

### Operating on Null values
- there are many methods for detecting, removing, and replacing null values
- isnull(): generate a boolean mask indicating missing values
- notnull(): opposite of isnull()
- dropna(): return a filtered version of the data
- fillna(): return a copy of the data with missing values filled or imputed

#### Detecting null values
- lets get a mask for all null values or nonnull values

In [None]:
data = pd.Series([1, np.nan, 'hello', None])
data.isnull()

In [None]:
data[data.notnull()]

#### Dropping null values

In [None]:
data.dropna()

In [None]:
df = pd.DataFrame([[1, np.nan, 2],
                  [2, 3, 5],
                  [np.nan, 4, 6]])
df

In [None]:
# Drop all rows in which any null value is present
df.dropna()

In [None]:
# drop columns containing a null value
df.dropna(axis="columns")

In [None]:
df[3] = np.nan
df

In [None]:
# drop columns where all values are null values
df.dropna(axis='columns', how='all')

In [None]:
# keep rows where a min number of non-null values are
df.dropna(axis='rows', thresh=3)

#### Filling null values
- sometimes you may want to fill a null value with something that makes sense for the dataset

In [None]:
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
data

In [None]:
# fill null with 0
data.fillna(0)

In [None]:
# forward-fill to propagate the previous value forward
# (fillna(method='ffill') is deprecated; ffill() is the direct equivalent)
data.ffill()

In [None]:
# back-fill to propagate the next values backward
# (fillna(method='bfill') is deprecated; bfill() is the direct equivalent)
data.bfill()

In [None]:
df

In [None]:
# forward-fill a dataframe by specifying the axis along which to fill
# (fillna(method='ffill') is deprecated; ffill() is the direct equivalent)
df.ffill(axis=1)

# null values remain if there is no previous value

## Hierarchical Indexing
- When we need to index at higher dimensions, we can use hierarchical indexing.
- This incorporates multiple index levels within a single index
- We will explore MultiIndex objects

In [1]:
import pandas as pd
import numpy as np

### Multiple Indexed Series
- Lets consider representing 2d data in a 1d series.

#### Pandas MultiIndex

In [5]:
index = [('California', 2000), ('California', 2010), ('New York', 2000), ('New York', 2010), ('Texas', 2000), ('Texas', 2010)]

index = pd.MultiIndex.from_tuples(index)
index

# Contains multiple levels of indexing and multiple labels per datapoint

MultiIndex(levels=[['California', 'New York', 'Texas'], [2000, 2010]],
           labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])

In [8]:
populations = [23423454, 345345,
               2342341, 678768,
               643278, 34565677]
# `index` is already the MultiIndex built from the (state, year) tuples, so the
# Series is created on it directly; a follow-up reindex onto the same index
# would be a no-op.
pop = pd.Series(populations, index=index)
pop

# The first 2 columns show labels and the last shows data
# Blank entries in the first column mean the label is the same as in the row above

California  2000    23423454
            2010      345345
New York    2000     2342341
            2010      678768
Texas       2000      643278
            2010    34565677
dtype: int64

In [12]:
# access all data which the second index is 2010
pop[:, 2010]

California      345345
New York        678768
Texas         34565677
dtype: int64

#### MultiIndex as extra dimension
- We can use the unstack method to quickly convert a multi indexed Series into a DataFrame

In [15]:
pop_df = pop.unstack()
pop_df

Unnamed: 0,2000,2010
California,23423454,345345
New York,2342341,678768
Texas,643278,34565677


In [17]:
# To convert from a dataframe to a multi-indexed Series
pop_df.stack()

California  2000    23423454
            2010      345345
New York    2000     2342341
            2010      678768
Texas       2000      643278
            2010    34565677
dtype: int64

In [20]:
# add another column of data
pop_df = pd.DataFrame({'total': pop,
                      'under18': [2346345,234245234,
                                 36341435,23452453,
                                 234234235,34563453]})
pop_df

Unnamed: 0,Unnamed: 1,total,under18
California,2000,23423454,2346345
California,2010,345345,234245234
New York,2000,2342341,36341435
New York,2010,678768,23452453
Texas,2000,643278,234234235
Texas,2010,34565677,34563453


In [21]:
# Compute the fraction of people under 18 by year
f_u18 = pop_df['under18'] / pop_df['total']
f_u18.unstack()

Unnamed: 0,2000,2010
California,0.100171,678.293399
New York,15.515006,34.551501
Texas,364.125984,0.999936


### Methods of MultiIndex Creation

In [23]:
df = pd.DataFrame(np.random.rand(4,2),
                 index=[['a','a','b','b'], [1,2,1,2]],
                 columns=['data1','data2'])
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.094223,0.723727
a,2,0.241645,0.3862
b,1,0.147553,0.948731
b,2,0.455878,0.421788


In [24]:
data = {('California', 2000): 33871648,
        ('California', 2010): 37253956,
        ('Texas', 2000): 20851820,
        ('Texas', 2010): 25145561,
        ('New York', 2000): 18976457,
        ('New York', 2010): 19378102}
pd.Series(data)

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

#### Explicit MultiIndex Constructors

In [25]:
# From arrays
pd.MultiIndex.from_arrays([['a','a','b','b'], [1,2,1,2]])

MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [26]:
# From a list of tuples
pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])

MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [27]:
# From a cartesian product of single indices
pd.MultiIndex.from_product([['a','b'], [1,2]])

MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [28]:
# From internal encodings: `levels` holds the unique values of each level and
# `codes` holds, per level, the position in `levels` of every entry.
# (`codes` was called `labels` before pandas 0.24; the old keyword was removed
# in pandas 1.0.)
pd.MultiIndex(levels=[['a', 'b'], [1, 2]],
              codes=[[0, 0, 1, 1], [0, 1, 0, 1]])

MultiIndex(levels=[['a', 'b'], [1, 2]],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])

### MultiIndex level names
We can push a names argument to give labels

In [29]:
pop.index.names = ['states', 'year']
pop

states      year
California  2000    23423454
            2010      345345
New York    2000     2342341
            2010      678768
Texas       2000      643278
            2010    34565677
dtype: int64

### MultiIndex for columns
- Columns can have multiple levels as well

In [30]:
# hierarchical indices and columns
index = pd.MultiIndex.from_product([[2013,2014], [1,2]],
                                  names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob','Guido','Sue'], ['HR','Temp']],
                                    names=['subject', 'type'])

# mock some data
data = np.round(np.random.randn(4,6), 1)
data[:, ::2] *= 10
data += 37

# create the DataFrame 
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,23.0,37.7,22.0,36.5,33.0,38.7
2013,2,39.0,38.2,16.0,37.3,48.0,36.9
2014,1,38.0,36.1,44.0,36.6,39.0,37.8
2014,2,44.0,37.0,31.0,36.9,33.0,35.0


In [31]:
health_data['Guido']

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,22.0,36.5
2013,2,16.0,37.3
2014,1,44.0,36.6
2014,2,31.0,36.9


### Indexing and slicing a multiindex
#### Multiply indexed series
- Consider the Series of state populations

In [32]:
pop

states      year
California  2000    23423454
            2010      345345
New York    2000     2342341
            2010      678768
Texas       2000      643278
            2010    34565677
dtype: int64

In [33]:
# Access single elements by indexing with multiple terms
pop['California', 2000]

23423454

In [34]:
# Access just 1 level with partial indexing
pop['California']

year
2000    23423454
2010      345345
dtype: int64

In [35]:
# Use partial slicing on a sorted MultiIndex
pop.loc['California': 'New York']

states      year
California  2000    23423454
            2010      345345
New York    2000     2342341
            2010      678768
dtype: int64

In [36]:
# Use partial index on lower levels
pop[:, 2000]

states
California    23423454
New York       2342341
Texas           643278
dtype: int64

In [38]:
# selection with boolean mask
pop[pop > 2300000]

states      year
California  2000    23423454
New York    2000     2342341
Texas       2010    34565677
dtype: int64

In [39]:
# Selection based on fancy indexing
pop[['California', 'Texas']]

states      year
California  2000    23423454
            2010      345345
Texas       2000      643278
            2010    34565677
dtype: int64

#### Multiply Indexed DataFrames

In [40]:
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,23.0,37.7,22.0,36.5,33.0,38.7
2013,2,39.0,38.2,16.0,37.3,48.0,36.9
2014,1,38.0,36.1,44.0,36.6,39.0,37.8
2014,2,44.0,37.0,31.0,36.9,33.0,35.0


In [41]:
# Get Guido's heart rate data
health_data['Guido','HR']

year  visit
2013  1        22.0
      2        16.0
2014  1        44.0
      2        31.0
Name: (Guido, HR), dtype: float64

In [47]:
# We can do the same with the iloc and loc indexers
health_data.iloc[:2, :2]

Unnamed: 0_level_0,subject,Bob,Bob
Unnamed: 0_level_1,type,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,23.0,37.7
2013,2,39.0,38.2


In [48]:
# loc and iloc can be passed a tuple of multiple indices
health_data.loc[:, ('Bob', 'HR')]

year  visit
2013  1        23.0
      2        39.0
2014  1        38.0
      2        44.0
Name: (Bob, HR), dtype: float64

In [49]:
# Get a slice with IndexSlice
idx = pd.IndexSlice
health_data.loc[idx[:, 1], idx[:, 'HR']]

Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,23.0,22.0,33.0
2014,1,38.0,44.0,39.0


### Rearranging Multi-indices
- sometimes we need to rearrange the data for computation

#### Sorted and unsorted indices
- many multiindex slicing operations will fail if the index is not sorted

In [51]:
# create a dataset whose index is not lexicographically sorted
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1,2]])
data = pd.Series(np.random.rand(6), index=index)
data.index.names = ['char', 'int']
data

char  int
a     1      0.014237
      2      0.903485
c     1      0.279286
      2      0.004960
b     1      0.011698
      2      0.323127
dtype: float64

In [52]:
# if we try to take a partial slice with this it will produce an error
try:
    data['a': 'b']
except KeyError as e:
    print(type(e))
    print(e)

<class 'pandas.errors.UnsortedIndexError'>
'Key length (1) was greater than MultiIndex lexsort depth (0)'


In [54]:
# sort the index 
data = data.sort_index()
data

char  int
a     1      0.014237
      2      0.903485
b     1      0.011698
      2      0.323127
c     1      0.279286
      2      0.004960
dtype: float64

In [55]:
data['a':'b']

char  int
a     1      0.014237
      2      0.903485
b     1      0.011698
      2      0.323127
dtype: float64

#### Stacking and unstacking indices
- convert a dataset between a stacked multi-index and a simple two-dimensional representation

In [57]:
pop.unstack(level=0)

states,California,New York,Texas
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,23423454,2342341,643278
2010,345345,678768,34565677


In [59]:
pop.unstack(level=1)

year,2000,2010
states,Unnamed: 1_level_1,Unnamed: 2_level_1
California,23423454,345345
New York,2342341,678768
Texas,643278,34565677


#### Index setting and resetting
- we can turn index levels into columns

In [61]:
pop_flat = pop.reset_index(name='population')
pop_flat

Unnamed: 0,states,year,population
0,California,2000,23423454
1,California,2010,345345
2,New York,2000,2342341
3,New York,2010,678768
4,Texas,2000,643278
5,Texas,2010,34565677


In [63]:
# it can be useful to convert flat data into a multiindex
pop_flat.set_index(['states', 'year'])

Unnamed: 0_level_0,Unnamed: 1_level_0,population
states,year,Unnamed: 2_level_1
California,2000,23423454
California,2010,345345
New York,2000,2342341
New York,2010,678768
Texas,2000,643278
Texas,2010,34565677


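### Data Aggregations on Multi-Indices
- aggregations such as mean(), sum(), and max() can be computed per index level; older pandas accepted a `level` argument on these methods, while pandas 2.0 and later require grouping on the level with `groupby(level=...)` first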

In [65]:
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,23.0,37.7,22.0,36.5,33.0,38.7
2013,2,39.0,38.2,16.0,37.3,48.0,36.9
2014,1,38.0,36.1,44.0,36.6,39.0,37.8
2014,2,44.0,37.0,31.0,36.9,33.0,35.0


In [67]:
# average out the measurements of the visits within each year
# (the level= argument of mean() was removed in pandas 2.0; grouping on the
# index level gives the same result)
data_mean = health_data.groupby(level='year').mean()
data_mean

subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,31.0,37.95,19.0,36.9,40.5,37.8
2014,41.0,36.55,37.5,36.75,36.0,36.4


In [68]:
# Take the mean among the columns that share the same 'type' level
# (mean(axis=1, level=...) was removed in pandas 2.0; transpose so the column
# levels become index levels, group on 'type', then transpose back)
data_mean.T.groupby(level='type').mean().T

type,HR,Temp
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2013,30.166667,37.55
2014,38.166667,36.566667


## Combining Datasets: Concat and Append
- It's important to know how to combine data sources and handle overlaps
- lets look at how to combine Series and DataFrames

In [2]:
import pandas as pd
import numpy as np

# Define a dataframe
def make_df(cols, ind):
    """Build a small demo DataFrame whose cells spell out their own position.

    Every column label in ``cols`` becomes a column, every entry of ``ind``
    becomes a row label, and each cell holds the column label followed by the
    row label as a string (for example ``'A0'``).
    """
    columns = {}
    for col in cols:
        prefix = str(col)
        columns[col] = [prefix + str(row) for row in ind]
    return pd.DataFrame(columns, index=ind)

# example DataFrame
make_df('ABC', range(3))

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [4]:
# Define a function to render multiple dataframes side by side
class display(object):
    """Display HTML representation of multiple objects.

    Each positional argument is a string of Python source, for example
    ``'df1'`` or ``'pd.concat([df1, df2])'``.  The string is shown as a
    caption and the value it evaluates to is rendered next to it.

    Expressions are evaluated against this module's global namespace only, so
    a variable of yours can never be shadowed by a name that happens to be
    local to this class.

    NOTE(security): the expressions are passed to ``eval``.  Only pass
    trusted, hand-written strings; never forward external input.
    """
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}</div>"""

    def __init__(self, *args):
        # Source strings to evaluate lazily, each time a repr is requested.
        self.args = args

    @staticmethod
    def _evaluate(expr):
        """Evaluate *expr* with the module globals as its only namespace."""
        return eval(expr, globals())

    def _repr_html_(self):
        """Return one floating HTML block per expression (caption + value)."""
        return '\n'.join(
            self.template.format(expr, self._evaluate(expr)._repr_html_())
            for expr in self.args)

    def __repr__(self):
        """Return the plain-text fallback: each expression above its repr."""
        return '\n\n'.join(expr + '\n' + repr(self._evaluate(expr))
                           for expr in self.args)

### Recall: Concatenation of NumPy Arrays
- Concat of Series and DataFrame objects similar to NumPy arrays.

In [5]:
x = [1, 2, 3]
y = [4, 5, 6]
z = [7, 8, 9]

np.concatenate([x, y, z])

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [9]:
# We can specify the axis along which the result will be concat
x = [[1, 2], 
    [3, 4]]
np.concatenate([x, x], axis=1)

array([[1, 2, 1, 2],
       [3, 4, 3, 4]])

### Simple Concatenation with pd.concat
- pd.concat() is similar to np.concatenate()

In [10]:
# concat 2 series
ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])
ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])
pd.concat([ser1, ser2])

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [12]:
# Also works on Dataframes
# works along axis=0 by default
df1 = make_df('AB', [1, 2])
df2 = make_df('AB', [3, 4])
display('df1', 'df2', 'pd.concat([df1, df2])')

Unnamed: 0,A,B
1,A1,B1
2,A2,B2

Unnamed: 0,A,B
3,A3,B3
4,A4,B4

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4


In [16]:
# concat along col
df3 = make_df('AB', [0, 1])
df4 = make_df('CD', [0, 1])
display('df3', 'df4', "pd.concat([df3, df4], axis=1)")

Unnamed: 0,A,B
0,A0,B0
1,A1,B1

Unnamed: 0,C,D
0,C0,D0
1,C1,D1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1


### Duplicate Indices
- Pandas preserves indices

In [17]:
x = make_df('AB', [0, 1])
y = make_df('AB', [2, 3])
y.index = x.index # make duplicate indices!
display('x', 'y', 'pd.concat([x, y])')

Unnamed: 0,A,B
0,A0,B0
1,A1,B1

Unnamed: 0,A,B
0,A2,B2
1,A3,B3

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
0,A2,B2
1,A3,B3


In [18]:
# Catch repeats as an error
try:
    pd.concat([x, y], verify_integrity=True)
except ValueError as e:
    print('ValueError:', e)

ValueError: Indexes have overlapping values: [0, 1]


In [19]:
# Ignore the index
display('x', 'y', 'pd.concat([x, y], ignore_index=True)')

Unnamed: 0,A,B
0,A0,B0
1,A1,B1

Unnamed: 0,A,B
0,A2,B2
1,A3,B3

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3


In [20]:
# Adding Multiindex keys
display('x', 'y', "pd.concat([x, y], keys=['x', 'y'])")

Unnamed: 0,A,B
0,A0,B0
1,A1,B1

Unnamed: 0,A,B
0,A2,B2
1,A3,B3

Unnamed: 0,Unnamed: 1,A,B
x,0,A0,B0
x,1,A1,B1
y,0,A2,B2
y,1,A3,B3


### Concatenation with joins
- we may need to join data that does not have the same column names

In [28]:
df5 = make_df('ABC', [1, 2])
df6 = make_df('BCD', [3, 4])

# outer join by default
display('df5', 'df6', 'pd.concat([df5, df6])')

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2

Unnamed: 0,B,C,D
3,B3,C3,D3
4,B4,C4,D4

Unnamed: 0,A,B,C,D
1,A1,B1,C1,
2,A2,B2,C2,
3,,B3,C3,D3
4,,B4,C4,D4


In [30]:
# use an inner join
display('df5', 'df6',
       "pd.concat([df5, df6], join='inner')")

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2

Unnamed: 0,B,C,D
3,B3,C3,D3
4,B4,C4,D4

Unnamed: 0,B,C
1,B1,C1
2,B2,C2
3,B3,C3
4,B4,C4


In [32]:
# specify which columns to keep
# (the join_axes argument of pd.concat was removed in pandas 1.0; reindexing
# the concatenated result on the wanted columns is the replacement)
display('df5', 'df6',
       "pd.concat([df5, df6]).reindex(columns=df5.columns)")

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2

Unnamed: 0,B,C,D
3,B3,C3,D3
4,B4,C4,D4

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2
3,,B3,C3
4,,B4,C4


## Append()
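- a shorthand for `pd.concat` called on a DataFrame; note that `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0, so prefer `pd.concat([df1, df2])`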

In [34]:
# DataFrame.append was removed in pandas 2.0; pd.concat produces the same result
display('df1', 'df2', 'pd.concat([df1, df2])')

Unnamed: 0,A,B
1,A1,B1
2,A2,B2

Unnamed: 0,A,B
3,A3,B3
4,A4,B4

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4
