# Data Manipulation with Pandas
- provides an efficient implementation of a Dataframe
- A dataframe is a multidimensional array with attached row and column labels
- often with heterogeneous types and/or missing data

In [1]:
import pandas
pandas.__version__

'0.20.1'

In [2]:
# we generally import pandas as pd
import pandas as pd
import numpy as np

## Pandas Objects
Can be thought of as enhanced versions of NumPy structured arrays, where rows and columns are identified with labels rather than simple integer indices.

Let's examine the fundamental Pandas data structures:
- Series
- DataFrame
- Index

### The Pandas Series Object
- A 1-d array of indexed data.
- Wraps both a sequence of values and sequence of indices
- Can be accessed with the values and index attributes.

In [4]:
data = pd.Series([0.25, 0.5, 0.75, 1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [6]:
# values are a numpy array
data.values

array([ 0.25,  0.5 ,  0.75,  1.  ])

In [8]:
# index is an array-like object of type pd.Index
data.index

RangeIndex(start=0, stop=4, step=1)

In [9]:
# Data can be accessed with square-bracket notation
data[1]

0.5

In [10]:
data[1:3]

1    0.50
2    0.75
dtype: float64

In [12]:
# A Series can have an explicitly defined index.
data = pd.Series([0.25, 0.5, 0.75, 1.0],
                index=['a','b','c','d'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [13]:
# Which can be accessed as
data['b']

0.5

In [14]:
# We can use non-contiguous or non-sequential indices:
data = pd.Series([0.25,0.5,0.75,1.0],
                index=[2,5,3,7])
data

2    0.25
5    0.50
3    0.75
7    1.00
dtype: float64

In [15]:
data[5]

0.5

#### Series as a specialized dictionary
-  a structure which maps typed keys to a set of typed values

In [16]:
population_dict = {'California': 23534,
                  'Texas': 346457,
                  'New York': 785643,
                  'Florida': 876543,
                  'Illinois': 1234576}
population = pd.Series(population_dict)
population

California      23534
Florida        876543
Illinois      1234576
New York       785643
Texas          346457
dtype: int64

In [17]:
# By default (in this pandas version), the index is drawn from the sorted keys; newer pandas releases keep the dict's insertion order
population['California']

23534

In [18]:
# Supports array-style slicing
population['California':'Illinois']

California      23534
Florida        876543
Illinois      1234576
dtype: int64

In [19]:
#### Constructing Series Objects

In [20]:
# From a list or numpy array
pd.Series([2,4,6])

0    2
1    4
2    6
dtype: int64

In [21]:
# from a scalar to fill the specified index
pd.Series(5, index=[100,200,300])

100    5
200    5
300    5
dtype: int64

In [22]:
# From a dictionary
pd.Series({2:'a', 1:'b', 3:'c'})

1    b
2    a
3    c
dtype: object

In [23]:
# From a dictionary, but only the specified keys
pd.Series({2:'a',1:'b',3:'c'}, index=[3,2])

3    c
2    a
dtype: object

## Pandas DataFrame Object
A DataFrame can be thought of either as a generalization of a NumPy array, or as a specialization of a Python dictionary.

### DataFrame as a generalized NumPy array
- an analog of a 2d array with both flexible row indices and flexible column names.

In [4]:
# Construct a new Series holding an area value for each state
area_dict = {
    'California': 23453,
    'Texas': 3456,
    'New York': 234503409,
    'Florida': 23453634,
    'Illinois': 2345342,
}
area = pd.Series(area_dict)
area

California        23453
Florida        23453634
Illinois        2345342
New York      234503409
Texas              3456
dtype: int64

In [7]:
# Redefine the population Series (note: the values differ from the earlier example)
population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
population = pd.Series(population_dict)

# We can construct a single 2d object containing population info
states = pd.DataFrame({'population': population, 'area': area})
states

Unnamed: 0,area,population
California,23453,38332521
Florida,23453634,19552860
Illinois,2345342,12882135
New York,234503409,19651127
Texas,3456,26448193


In [8]:
# DataFrame has an index attribute to access index labels
states.index

Index(['California', 'Florida', 'Illinois', 'New York', 'Texas'], dtype='object')

In [9]:
# It also has a columns attribute to access the column labels
states.columns

Index(['area', 'population'], dtype='object')

### DataFrame as specialized dictionary
- maps column names to a Series of column data.

In [10]:
states['area']

California        23453
Florida        23453634
Illinois        2345342
New York      234503409
Texas              3456
Name: area, dtype: int64

### Constructing DataFrame objects

In [11]:
# A DataFrame is a collection of Series objects; a single-column DataFrame can be built from a single Series
pd.DataFrame(population, columns=['population'])

Unnamed: 0,population
California,38332521
Florida,19552860
Illinois,12882135
New York,19651127
Texas,26448193


In [12]:
# A list of dictionaries can be turned into a DataFrame:
# each dict becomes a row and its keys become the column names.
data = [{'a': i, 'b': 2 * i} for i in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [13]:
# Pandas will fill missing values with NaN
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [14]:
# Can be constructed from a dictionary of Series objects
pd.DataFrame({'population': population, 'area': area})

Unnamed: 0,area,population
California,23453,38332521
Florida,23453634,19552860
Illinois,2345342,12882135
New York,234503409,19651127
Texas,3456,26448193


In [15]:
# Can be constructed from a 2d array of data.
pd.DataFrame(np.random.rand(3, 2),
            columns=['foo', 'bar'],
            index=['a', 'b', 'c'])

Unnamed: 0,foo,bar
a,0.660799,0.968329
b,0.386832,0.341184
c,0.200671,0.206062


In [16]:
# Can be constructed from a structured array
A = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')])
A

array([(0,  0.), (0,  0.), (0,  0.)], 
      dtype=[('A', '<i8'), ('B', '<f8')])

In [18]:
pd.DataFrame(A)

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


### The Pandas Index Object
- an immutable array, or ordered set

In [19]:
ind = pd.Index([2, 3, 5, 7, 11])
ind

Int64Index([2, 3, 5, 7, 11], dtype='int64')

In [20]:
# Can retrieve values or slices
ind[1]

3

In [21]:
ind[::2]

Int64Index([2, 5, 11], dtype='int64')

In [22]:
# Has other attributes familiar from NumPy
print(ind.size, ind.shape, ind.ndim, ind.dtype)

5 (5,) 1 int64


In [23]:
# Can't be modified by normal means
int[1] = 0

TypeError: 'type' object does not support item assignment

#### Index as an ordered set

In [25]:
indA = pd.Index([1,3,5,7,9])
indB = pd.Index([2,3,5,7,11])

In [26]:
# intersection
indA & indB

Int64Index([3, 5, 7], dtype='int64')

In [28]:
# union
indA | indB

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [29]:
# symmetric difference
indA ^ indB

Int64Index([1, 2, 9, 11], dtype='int64')

## Data indexing and selection
- Pandas provides many of the same means of accessing and modifying values as NumPy

In [31]:
### Data selection in series
#### Series as a dictionary

In [33]:
import pandas as pd
data = pd.Series([0.25, 0.5, 0.75, 1.0],
                index=['a', 'b', 'c', 'd'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [34]:
data['b']

0.5

In [35]:
# We can use dictionary-like python expressions and methods to examine keys/indices and values
'a' in data

True

In [36]:
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [37]:
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [38]:
# modify values
data['e'] = 1.25
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

#### Series as a 1d array

In [39]:
# slicing by explicit index
data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [40]:
# slicing by implicit integer index
data[0:2]

a    0.25
b    0.50
dtype: float64

In [41]:
# masking
data[(data > 0.3) & (data < 0.8)]

b    0.50
c    0.75
dtype: float64

In [42]:
# fancy indexing
data[['a', 'e']]

a    0.25
e    1.25
dtype: float64

#### Indexers: loc, iloc, ix

In [43]:
data = pd.Series(['a', 'b', 'c'], index=[1,3,5])
data

1    a
3    b
5    c
dtype: object

In [44]:
# explicit index when indexing
data[1]

'a'

In [45]:
# implicit index when slicing
data[1:3]

3    b
5    c
dtype: object

In [46]:
# indexing and slicing with loc always uses explicit index
data.loc[1]

'a'

In [47]:
data.loc[1:3]

1    a
3    b
dtype: object

In [48]:
# iloc allows indexing and slicing with implicit index
data.iloc[1]

'b'

In [49]:
data.iloc[1:3]

3    b
5    c
dtype: object

### Data Selection in DataFrame
#### DataFrame as a dictionary
- let's consider the DataFrame as a dictionary of related Series objects

In [3]:
# Build two Series keyed by state, then combine them into a DataFrame
area = pd.Series({
    'Califonia': 23454523,
    'Texas': 23445,
    'New York': 23890490,
    'Florida': 23890450,
    'Illinois': 2348035890,
})
pop = pd.Series({
    'Califonia': 254523,
    'Texas': 234453,
    'New York': 238904,
    'Florida': 290450,
    'Illinois': 2348035,
})
data = pd.DataFrame({'area': area, 'pop': pop})
data

Unnamed: 0,area,pop
Califonia,23454523,254523
Florida,23890450,290450
Illinois,2348035890,2348035
New York,23890490,238904
Texas,23445,234453


In [4]:
# The individual Series that make up the columns can be accessed via dictionary-style indexing of the column name
data['area']

Califonia      23454523
Florida        23890450
Illinois     2348035890
New York       23890490
Texas             23445
Name: area, dtype: int64

In [5]:
# We can also use attributes
data.area

Califonia      23454523
Florida        23890450
Illinois     2348035890
New York       23890490
Texas             23445
Name: area, dtype: int64

In [6]:
# it's the same thing 
data.area is data['area']

True

In [7]:
# We can add data to dataframes
data['density'] = data['pop'] / data['area']
data

Unnamed: 0,area,pop,density
Califonia,23454523,254523,0.010852
Florida,23890450,290450,0.012158
Illinois,2348035890,2348035,0.001
New York,23890490,238904,0.01
Texas,23445,234453,10.000128


In [8]:
#### DataFrame as a 2d array
data.values

array([[  2.34545230e+07,   2.54523000e+05,   1.08517662e-02],
       [  2.38904500e+07,   2.90450000e+05,   1.21575776e-02],
       [  2.34803589e+09,   2.34803500e+06,   9.99999621e-04],
       [  2.38904900e+07,   2.38904000e+05,   9.99996233e-03],
       [  2.34450000e+04,   2.34453000e+05,   1.00001280e+01]])

In [9]:
# We can transpose the data
data.T

Unnamed: 0,Califonia,Florida,Illinois,New York,Texas
area,23454520.0,23890450.0,2348036000.0,23890490.0,23445.0
pop,254523.0,290450.0,2348035.0,238904.0,234453.0
density,0.01085177,0.01215758,0.0009999996,0.01,10.000128


In [10]:
# Get a row
data.values[0]

array([  2.34545230e+07,   2.54523000e+05,   1.08517662e-02])

In [11]:
# get a column
data['area']

Califonia      23454523
Florida        23890450
Illinois     2348035890
New York       23890490
Texas             23445
Name: area, dtype: int64

In [15]:
# We can use the iloc indexer to access data, maintaining the DataFrame index and column labels
data.iloc[:3, :2]

Unnamed: 0,area,pop
Califonia,23454523,254523
Florida,23890450,290450
Illinois,2348035890,2348035


In [16]:
# The loc indexer can be used with explicit index and column names
data.loc[:'Illinois', :'pop']

Unnamed: 0,area,pop
Califonia,23454523,254523
Florida,23890450,290450
Illinois,2348035890,2348035


In [17]:
# The ix indexer is a hybrid of loc and iloc (deprecated, as the warning below shows; prefer loc or iloc)
data.ix[:3, :'pop']

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  


Unnamed: 0,area,pop
Califonia,23454523,254523
Florida,23890450,290450
Illinois,2348035890,2348035


In [30]:
# We can use loc for masking and fancy indexing
data.loc[data.density > 10, ['pop', 'density']]

Unnamed: 0,pop,density
Texas,234453,10.000128


In [31]:
# We can modify data with them too
data.iloc[0, 2] = 90
data

Unnamed: 0,area,pop,density
Califonia,23454523,254523,90.0
Florida,23890450,290450,0.012158
Illinois,2348035890,2348035,0.001
New York,23890490,238904,0.01
Texas,23445,234453,10.000128


#### Additional indexing conventions
- indexing refers to columns
- slicing refers to rows

In [35]:
data['Florida':'Illinois']

Unnamed: 0,area,pop,density
Florida,23890450,290450,0.012158
Illinois,2348035890,2348035,0.001


In [36]:
# We can also refer to them as numbers
data[1:3]

Unnamed: 0,area,pop,density
Florida,23890450,290450,0.012158
Illinois,2348035890,2348035,0.001


In [38]:
# Direct masking operations are row-wise
data[data.density > 10]

Unnamed: 0,area,pop,density
Califonia,23454523,254523,90.0
Texas,23445,234453,10.000128


## Operating on Data in Pandas
- can perform numpy element-wise operations
- pandas keeps context when performing operations

### Ufuncs: index preservation

In [39]:
import pandas as pd
import numpy as np

# define a series
rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0, 10, 4))
ser

0    6
1    3
2    7
3    4
dtype: int64

In [40]:
df = pd.DataFrame(rng.randint(0, 10, (3, 4)),
                 columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
0,6,9,2,6
1,7,4,3,7
2,7,2,5,4


In [41]:
# applying a NumPy ufunc to either of these objects will result in another pandas object with the indices preserved
np.exp(ser)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [42]:
# something more complicated
np.sin(df * np.pi / 4)

Unnamed: 0,A,B,C,D
0,-1.0,0.7071068,1.0,-1.0
1,-0.707107,1.224647e-16,0.707107,-0.7071068
2,-0.707107,1.0,-0.707107,1.224647e-16


### UFuncs: Index Alignment
- for binary operations on two Series or DataFrame objects, pandas will align indices.
- this is useful when working with incomplete data

#### Index alignment in Series
- Let's combine two different data sources: the top three US states by area and the top three US states by population

In [45]:
# Two Series whose state labels only partially overlap
area = pd.Series(
    {'Alaska': 123234, 'Texas': 234523, 'California': 457623},
    name='area',
)
population = pd.Series(
    {'Texas': 234523, 'California': 457623, 'New York': 24523},
    name='population',
)

In [46]:
# calculate population density
population / area

Alaska        NaN
California    1.0
New York      NaN
Texas         1.0
dtype: float64

In [47]:
# The resulting array contains the union of indices.
# Could be found with
area.index | population.index

Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')

In [48]:
# NaN is the default
A = pd.Series([2,4,6], index=[0,1,2])
B = pd.Series([1,3,5], index=[1,2,3])
A + B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [49]:
# We can fill NaN with another number
A.add(B, fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

#### Index Alignment in DataFrame
- similar alignment happens for dataframe on both row and column

In [55]:
A = pd.DataFrame(rng.randint(0, 20, (2,2)), columns=list('AB'))
A

Unnamed: 0,A,B
0,11,5
1,1,0


In [56]:
B = pd.DataFrame(rng.randint(0, 10, (3, 3)), columns=list('BAC'))
B

Unnamed: 0,B,A,C
0,9,5,8
1,0,9,2
2,6,3,8


In [57]:
A + B

Unnamed: 0,A,B,C
0,16.0,14.0,
1,10.0,0.0,
2,,,


In [58]:
# We can also fill values
# let's use the mean value of A, computed by stacking the rows of A
fill = A.stack().mean()
A.add(B, fill_value=fill)

Unnamed: 0,A,B,C
0,16.0,14.0,12.25
1,10.0,0.0,6.25
2,7.25,10.25,12.25


### Ufuncs: operations between dataframe and series
- column alignment is maintained in these cases as well

In [61]:
# find the difference of a 2d array and one of its rows
A = rng.randint(10, size=(3,4))
A

array([[3, 1, 5, 5],
       [9, 3, 5, 1],
       [9, 1, 9, 3]])

In [62]:
A - A[0]

array([[ 0,  0,  0,  0],
       [ 6,  2,  0, -4],
       [ 6,  0,  4, -2]])

In [63]:
# with a dataframe
df = pd.DataFrame(A, columns=list('QRST'))
df - df.iloc[0]

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,6,2,0,-4
2,6,0,4,-2


In [67]:
# now column-wise
df.subtract(df['R'], axis=0)

Unnamed: 0,Q,R,S,T
0,2,0,4,4
1,6,0,2,-2
2,8,0,8,2


In [68]:
# Automatic index alignment takes place
halfrow = df.iloc[0, ::2]
halfrow

Q    3
S    5
Name: 0, dtype: int64

In [69]:
df - halfrow

Unnamed: 0,Q,R,S,T
0,0.0,,0.0,
1,6.0,,0.0,
2,6.0,,4.0,
