# Data Manipulation with Pandas
- provides an efficient implementation of a Dataframe
- A dataframe is a multidimensional array with attached row and column labels
- often with heterogeneous types and/or missing data

In [1]:
import pandas
pandas.__version__

'0.20.1'

In [2]:
# we generally import pandas as pd
import pandas as pd
import numpy as np

## Pandas Objects
Can be thought of as enhanced bersions of NumPy structed arrays where rows and columns are identified with labels rather than simple integer indices.

Lets examine the fundamental Pandas data structures:
- Series
- DataFrame
- Index

### The Pandas Series Object
- A 1-d array of indexed data.
- Wraps both a sequence of values and sequence of indices
- Can be accessed with the values and index attributes.

In [4]:
data = pd.Series([0.25, 0.5, 0.75, 1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [6]:
# values are a numpy array
data.values

array([ 0.25,  0.5 ,  0.75,  1.  ])

In [8]:
# index is an array-like object of type pd.Index
data.index

RangeIndex(start=0, stop=4, step=1)

In [9]:
# Data can be access with square-bracket notation
data[1]

0.5

In [10]:
data[1:3]

1    0.50
2    0.75
dtype: float64

In [12]:
# Series' index has explicitly defined indexs.
data = pd.Series([0.25, 0.5, 0.75, 1.0],
                index=['a','b','c','d'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [13]:
# Which can be accessed as
data['b']

0.5

In [14]:
# We can use non-contiguous or non-sequential indices:
data = pd.Series([0.25,0.5,0.75,1.0],
                index=[2,5,3,7])
data

2    0.25
5    0.50
3    0.75
7    1.00
dtype: float64

In [15]:
data[5]

0.5

#### Series as a specialized dictionary
-  a structure which maps typed keys to a set of typed values

In [16]:
population_dict = {'California': 23534,
                  'Texas': 346457,
                  'New York': 785643,
                  'Florida': 876543,
                  'Illinois': 1234576}
population = pd.Series(population_dict)
population

California      23534
Florida        876543
Illinois      1234576
New York       785643
Texas          346457
dtype: int64

In [17]:
# By default, the index is drawn from the sorted keys 
population['California']

23534

In [18]:
# Supports array-style slicing
population['California':'Illinois']

California      23534
Florida        876543
Illinois      1234576
dtype: int64

In [19]:
#### Constructing Series Objects

In [20]:
# From a list or numpy array
pd.Series([2,4,6])

0    2
1    4
2    6
dtype: int64

In [21]:
# from a scalar to fill the specified index
pd.Series(5, index=[100,200,300])

100    5
200    5
300    5
dtype: int64

In [22]:
# From a dictionary
pd.Series({2:'a', 1:'b', 3:'c'})

1    b
2    a
3    c
dtype: object

In [23]:
# From a dictionary, but only the specified keys
pd.Series({2:'a',1:'b',3:'c'}, index=[3,2])

3    c
2    a
dtype: object

## Pandas DataFrame Object
A DataFrame can be thought of either as a generalization of a NumPy array, or as a specialization of a Python dictionary.

### DataFrame as a generalized NumPy array
- an analog of a 2d array with both flexible row indices and flixible column names.

In [4]:
# contruct a new Series
area_dict = {'California': 23453, 'Texas': 3456, 'New York': 234503409, 'Florida': 23453634, 'Illinois': 2345342}
area = pd.Series(area_dict)
area

California        23453
Florida        23453634
Illinois        2345342
New York      234503409
Texas              3456
dtype: int64

In [7]:
# From before..
population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
population = pd.Series(population_dict)

# We can construct a single 2d object containing population info
states = pd.DataFrame({'population': population, 'area': area})
states

Unnamed: 0,area,population
California,23453,38332521
Florida,23453634,19552860
Illinois,2345342,12882135
New York,234503409,19651127
Texas,3456,26448193


In [8]:
# DataFrame has an index attribute to access index labels
states.index

Index(['California', 'Florida', 'Illinois', 'New York', 'Texas'], dtype='object')

In [9]:
# It also has a column attribute to access the column labels
states.columns

Index(['area', 'population'], dtype='object')

### DataFrame as specialized dictionary
- maps column names to a Series of column data.

In [10]:
states['area']

California        23453
Florida        23453634
Illinois        2345342
New York      234503409
Texas              3456
Name: area, dtype: int64

### Constructing DataFrame objects

In [11]:
# DataFrame is a collection of Series objects. 
pd.DataFrame(population, columns=['population'])

Unnamed: 0,population
California,38332521
Florida,19552860
Illinois,12882135
New York,19651127
Texas,26448193


In [12]:
# Lists of dictionaries can be made into a DataFrame.
data = [{'a': i, 'b':2 * i}
       for i in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [13]:
# PD will will missing values with NaN
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [14]:
# Can be constructed from a dictionary of Series objects
pd.DataFrame({'population': population, 'area': area})

Unnamed: 0,area,population
California,23453,38332521
Florida,23453634,19552860
Illinois,2345342,12882135
New York,234503409,19651127
Texas,3456,26448193


In [15]:
# Can be constructed from a 2d array of data.
pd.DataFrame(np.random.rand(3, 2),
            columns=['foo', 'bar'],
            index=['a', 'b', 'c'])

Unnamed: 0,foo,bar
a,0.660799,0.968329
b,0.386832,0.341184
c,0.200671,0.206062


In [16]:
# Can be constructed from a structed array
A = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')])
A

array([(0,  0.), (0,  0.), (0,  0.)], 
      dtype=[('A', '<i8'), ('B', '<f8')])

In [18]:
pd.DataFrame(A)

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


### The Pandas Index Object
- an immutable array, or ordered set

In [19]:
ind = pd.Index([2, 3, 5, 7, 11])
ind

Int64Index([2, 3, 5, 7, 11], dtype='int64')

In [20]:
# Can retrieve values or slices
ind[1]

3

In [21]:
ind[::2]

Int64Index([2, 5, 11], dtype='int64')

In [22]:
# Has other attributes familiar from NumPy
print(ind.size, ind.shape, ind.ndim, ind.dtype)

5 (5,) 1 int64


In [23]:
# Can't be modified by normal means
int[1] = 0

TypeError: 'type' object does not support item assignment

#### Index as an ordered set

In [25]:
indA = pd.Index([1,3,5,7,9])
indB = pd.Index([2,3,5,7,11])

In [26]:
# intersection
indA & indB

Int64Index([3, 5, 7], dtype='int64')

In [28]:
# union
indA | indB

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [29]:
# symmetric difference
indA ^ indB

Int64Index([1, 2, 9, 11], dtype='int64')

## Data indexing and selection
- Many of the same means of accessing and modifying values in Pandas as NumPy

In [31]:
### Data selection in series
#### Series as a dictionary

In [33]:
import pandas as pd
data = pd.Series([0.25, 0.5, 0.75, 1.0],
                index=['a', 'b', 'c', 'd'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [34]:
data['b']

0.5

In [35]:
# We can use dictionary-like python expressions and methods to examine keys/indices and values
'a' in data

True

In [36]:
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [37]:
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [38]:
# modify values
data['e'] = 1.25
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

#### Series as a 1d array

In [39]:
# slicing by explicit index
data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [40]:
# slicing by implicit integer index
data[0:2]

a    0.25
b    0.50
dtype: float64

In [41]:
# masking
data[(data > 0.3) & (data < 0.8)]

b    0.50
c    0.75
dtype: float64

In [42]:
# fancy indexing
data[['a', 'e']]

a    0.25
e    1.25
dtype: float64

#### Indexers: loc, iloc, ix

In [43]:
data = pd.Series(['a', 'b', 'c'], index=[1,3,5])
data

1    a
3    b
5    c
dtype: object

In [44]:
# explicit index when indexing
data[1]

'a'

In [45]:
# implicit index when slicing
data[1:3]

3    b
5    c
dtype: object

In [46]:
# indexing and slicing with loc always uses explicit index
data.loc[1]

'a'

In [47]:
data.loc[1:3]

1    a
3    b
dtype: object

In [48]:
# iloc allows indexing and slicing with implicit index
data.iloc[1]

'b'

In [49]:
data.iloc[1:3]

3    b
5    c
dtype: object