In [1]:
import numpy as np
import pandas as pd

### Data Indexing and Selection
* Recall methods and tools to access, set, and modify values in np arrays.

In [4]:
arr = np.random.randint(0, 100, size = (5, 5, 5))
arr

array([[[47, 60, 98,  6, 65],
        [94, 48, 54, 70, 21],
        [44, 48, 10,  5, 93],
        [88, 76, 37, 28, 28],
        [ 5, 50, 95, 91, 34]],

       [[14, 93, 50, 86, 63],
        [51, 42, 57, 65, 27],
        [44, 81, 86, 79, 29],
        [73, 86, 28, 53, 98],
        [37, 42, 82, 56, 61]],

       [[89,  0, 72, 31, 22],
        [81, 41, 37, 99, 64],
        [50, 57, 62, 49, 73],
        [36, 90,  3, 85, 81],
        [41, 61, 70,  1, 60]],

       [[ 3, 19, 65, 12,  0],
        [32, 16, 16, 49,  1],
        [ 0, 33, 68, 63, 94],
        [75, 96, 59, 65, 57],
        [ 4, 12, 51, 93,  8]],

       [[20, 95, 93, 96,  9],
        [98, 51, 44, 99, 30],
        [19, 22, 17, 85, 51],
        [81, 89, 32, 71, 56],
        [86, 49, 40, 17, 18]]])

In [8]:
# Indexing
arr[0]

array([[47, 60, 98,  6, 65],
       [94, 48, 54, 70, 21],
       [44, 48, 10,  5, 93],
       [88, 76, 37, 28, 28],
       [ 5, 50, 95, 91, 34]])

In [9]:
# slicing
arr[:, :, 0]

array([[47, 94, 44, 88,  5],
       [14, 51, 44, 73, 37],
       [89, 81, 50, 36, 41],
       [ 3, 32,  0, 75,  4],
       [20, 98, 19, 81, 86]])

In [10]:
# masking
arr[arr < 50]

array([47,  6, 48, 21, 44, 48, 10,  5, 37, 28, 28,  5, 34, 14, 42, 27, 44,
       29, 28, 37, 42,  0, 31, 22, 41, 37, 49, 36,  3, 41,  1,  3, 19, 12,
        0, 32, 16, 16, 49,  1,  0, 33,  4, 12,  8, 20,  9, 44, 30, 19, 22,
       17, 32, 49, 40, 17, 18])

In [11]:
# fancy indexing
arr[0, [1, 2]]

array([[94, 48, 54, 70, 21],
       [44, 48, 10,  5, 93]])

In [12]:
arr[:, [1, 3], 2]

array([[54, 37],
       [57, 28],
       [37,  3],
       [16, 59],
       [44, 32]])

#### Data Selection in Series
* A **Series object** acts in many ways:
    * like a 1D np array.
    * like a standard Python dictionary.
    
    
* Series as dictionary
    * Like a dictionary, the Series object provides a mapping from a collection of keys to a collection of values.
    * dictionary-like Python expressions and methods

In [13]:
# provides a mapping from a collection of keys to a collection of values
data = pd.Series([0.25, 0.5, 0.75, 1.0], 
                 index=['a', 'b', 'c', 'd'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [15]:
np.arange?

In [18]:
data = pd.Series(np.arange(0.25, 1.01, 0.25), 
                 index=list('abcd'))
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [19]:
# dictionary-like Python expressions and methods
# Check is 'a' in keys
'a' in data

True

In [23]:
# List all the keys
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [25]:
# List all the values
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [37]:
# List the counts of values
data.value_counts()

1.00    1
0.75    1
0.50    1
0.25    1
dtype: int64

In [41]:
# List all the key:value pairs (or items)
data.items

<bound method Series.iteritems of a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64>

In [43]:
# covert all items in to a list.
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [26]:
list?

In [27]:
# modification
data['e'] = 1.25 # here is appending
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

* Series as 1D array
    * slices, masking, and fancy indexing

In [28]:
# slicing by explicit index
data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [30]:
# slicing by implicit integer index
data[0:3]

a    0.25
b    0.50
c    0.75
dtype: float64

<font color = red size = 2>  
* **When you are slicing with an explicit index (i.e., data['a':'c']), <mark> the final index is included </mark> in the slice**
* **While when you’re slicing with an implicit index (i.e., data[0:2]), <mark> the final index is excluded </mark> from the slice.**
 
<font color = black> 
* Indexers: loc, iloc, and ix:
    * Slicing and indexing conventions can be a source of confusion
    * If the **Series** has an explicit integer index:
        * An <mark>indexing</mark> operation such as data[1] will use the explicit indices.
        * While a <mark>slicing</mark> operation like data[1: 3] will use the implicit Python-style index.

In [33]:
data = pd.Series(['a', 'b', 'c'], index = [1, 3, 5])
data

1    a
3    b
5    c
dtype: object

In [34]:
# explicit index when indexing
data[1]

'a'

In [35]:
# implicit index with slicing
data[1:3]

3    b
5    c
dtype: object

* special *indexer* attributes that explicitly expose certain indexing schemes.
    * Not functional methods.
    * Attributes that expose a particular slicing interface to the data in the Series.
* **loc** attribute: index and slice that always references the explicit index
* **iloc** attribute: index and slice that always references the <mark>*implicit Python-style*</mark> index
* **ix** attribute: standard-[]-based indexing, will be discussed in DataFrame

<font color = red size =2>
***One guiding principle of Python code is that “explicit is better than implicit.” <br>The explicit nature of loc and iloc make them very useful in maintaining clean and readable code; <br>especially in the case of integer indexes, I recommend using these both to make code easier to read and understand, and to prevent subtle bugs due to the mixed indexing/slicing convention.*** 

In [51]:
# loc attribute: index and slice that always references the explicit index
data.loc[1]

'a'

In [52]:
# loc attribute: index and slice that always references the explicit index
data.loc[1:3]

1    a
3    b
dtype: object

In [53]:
# iloc attribute: index and slice that always references the implicit Python-style index
data.iloc[1]

'b'

In [54]:
# iloc attribute: index and slice that always references the implicit Python-style index
data.iloc[1:3]

3    b
5    c
dtype: object

#### Data Selection in DataFrame
* A **DataFrame** acts in many ways like:
    * a 2D or structured np array.
    * a dictionary of Series structures sharing the same index.

* DataFrame as a dictionary: a dictionary of related Series objects.
    * The individual **Series** that make up the columns of the DF can be accessed via dictionalry-style indexing of the column name. 
    * <font color = blue> Eqivalently, we can use attribute-style access with column names <mark>that are strings</mark></font>
        * This attribute-style column access actually accesses the exact same object as the dictionary-style access
        * Useful shorthand, but doesn't work for all cases.
            * Columns names are not strings.
            * Column names conflict with methods of the DataFrame
            * Avoid assignment via attribute. 
    * The dictionary-style syntax can also be used to modify the object. 

In [36]:
# Dictionary to Series
area = pd.Series({'California': 423967, 'Texas': 695662,
                  'New York': 141297, 'Florida': 170312,
                  'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
                 'New York': 19651127, 'Florida': 19552860,
                 'Illinois': 12882135})
# Dictionary to DataFrame. The values are Series.
data = pd.DataFrame({'area':area, 'pop':pop})
data

Unnamed: 0,area,pop
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [42]:
# dictionalry-style indexing of the column name
data['area']

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [45]:
# attribute-style access with column names that are strings
data.area

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [44]:
# This attribute-style column access actually accesses the exact same object 
# as the dictionary-style access:
data.area is data['area']

True

In [59]:
# Useful shorthand, but doesn't work for all cases.
# Columns names are not strings.
# Column names conflict with methods of the DataFrame
data['pop']

California    38332521
Florida       19552860
Illinois      12882135
New York      19651127
Texas         26448193
Name: pop, dtype: int64

In [62]:
data.pop is data['pop']

False

In [64]:
data

Unnamed: 0,area,pop
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [46]:
data.pop?

In [49]:
# dictionary-style syntax can also be used to modify the object, e.g. to add a new column
data['density'] = data['pop'] / data['area']
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763
New York,141297,19651127,139.076746
Texas,695662,26448193,38.01874


* DataFrame as 2D array
    

In [56]:
# underlying data
data.values

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
       [1.70312000e+05, 1.95528600e+07, 1.14806121e+02],
       [1.49995000e+05, 1.28821350e+07, 8.58837628e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02],
       [6.95662000e+05, 2.64481930e+07, 3.80187404e+01]])

In [61]:
# Transpose
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763
New York,141297,19651127,139.076746
Texas,695662,26448193,38.01874


In [63]:
# passing a single index to an array accesses a row
data.values[1]

array([1.70312000e+05, 1.95528600e+07, 1.14806121e+02])

In [70]:
# passing a single "index" to a DataFrame accesses a column
data['area']

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

* <font color = red> For array-style indexing, Pandas uses the **loc**, **iloc**, **ix** </font>

* **iloc** indexer:
    * index the underlying array as if it is a simple np array.
    * using the implicit Python-style index.
    * the DF index and column labels are maintained

In [72]:
data.iloc[:3, :2]

Unnamed: 0,area,pop
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135


In [73]:
data.loc[:'Illinois', :'pop']

Unnamed: 0,area,pop
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135


In [74]:
# the ix indexer allows a hybrid of these two approaches
data.ix[:3, :'pop']

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


Unnamed: 0,area,pop
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135


<font color = red size = 3> 
* iloc: integer based
* loc: label based
* ix: mix (depreciated)

In [64]:
data.loc?

In [79]:
# loc indexer to combine masking and fancy indexing
data.loc[data.density > 100, ['pop', 'density']]

Unnamed: 0,pop,density
Florida,19552860,114.806121
New York,19651127,139.076746


In [80]:
data.density > 100

California    False
Florida        True
Illinois      False
New York       True
Texas         False
Name: density, dtype: bool

In [65]:
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763
New York,141297,19651127,139.076746
Texas,695662,26448193,38.01874


In [66]:
data.iloc[0, 2] = 900000
data

Unnamed: 0,area,pop,density
California,423967,38332521,900000.0
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763
New York,141297,19651127,139.076746
Texas,695662,26448193,38.01874


* Additional indexing conventions
    * While indexing refers to columns, slicing refers to rows
    * direct masking operations are also interpreted row-wise rather than column-wise

In [83]:
# While indexing refers to columns, slicing refers to rows
data['Florida':'Illinois']

Unnamed: 0,area,pop,density
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [87]:
data[1:3]  is data['Florida':'Illinois']

False

In [88]:
# direct masking operations are also interpreted row-wise rather than column-wise
data[data.density > 100]

Unnamed: 0,area,pop,density
Florida,170312,19552860,114.806121
New York,141297,19651127,139.076746
