In [2]:
# Let's check our version of Pandas
import pandas
pandas.__version__

u'0.23.4'

In [3]:
# We import NumPy under the alias np, so we will import Pandas under the alias pd
import pandas as pd

In [4]:
# At a basic level, Pandas objects can be thought of as enhanced versions of NumPy structured arrays
# The rows and columns are identified with labels rather than simple integer indices
# Let's introduce the 3 fundamental Pandas data structures: Series, DataFrame, and Index
import numpy as np
import pandas as pd
# A Pandas Series is a 1D array of indexed data
# It can be created from a list or an array as follows:
data = pd.Series([0.25, 0.5, 0.75, 1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [5]:
# As we can see, the Series wraps both a sequence of values and a sequence of indices
# We can access them through the values and index attributes
# The values are simply a familiar NumPy array
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [6]:
# Index is an array-like object of type pd.Index:
data.index

RangeIndex(start=0, stop=4, step=1)

In [7]:
# Like NumPy arrays, data can be accessed by the associated index via the Python square-bracket notation
data[1]

0.5

In [8]:
data[1:3]

1    0.50
2    0.75
dtype: float64

In [9]:
# As we will see, the Pandas Series is much more flexible than the 1D NumPy array that it emulates
# Series and NumPy arrays are very similar
# NumPy array has an implicitly defined integer index used to access values
# Pandas Series has an explicitly defined index associated with the values
# Explicit index definition gives the series object additional capabilities
# Ex. Index doesn't have to be an integer, can consist of values of any type
# Ex. We can use strings as an index:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [10]:
# And item access works as expected:
data['b']

0.5

In [11]:
# We can even use noncontiguous/nonsequential indices:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=[2, 5, 3, 7])
data

2    0.25
5    0.50
3    0.75
7    1.00
dtype: float64

In [12]:
# The lookup is by index label: 5 is one of the labels given above (paired with 0.5), not a position
data[5]

0.5

In [13]:
# In this way, we can think of a Pandas Series as a specialization of a Python dictionary
# A dictionary is a structure that maps arbitrary keys to a set of arbitrary values
# A Series is a structure that maps typed keys to a set of typed values
# Just as the type-specific compiled code behind a NumPy array makes it more efficient than a Python list sometimes,
# the type information of a Pandas Series makes it more efficient than Python dictionaries sometimes
# Let's make this Series-as-dictionary analogy more clear by creating a Series object directly from a Python dictionary:
population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
population = pd.Series(population_dict)
population

California    38332521
Florida       19552860
Illinois      12882135
New York      19651127
Texas         26448193
dtype: int64

In [14]:
# In this run the index is drawn from the sorted dictionary keys (see the output above)
# NOTE(review): this ordering is version-dependent - newer pandas on Python 3.6+ is documented to
# keep the dict's insertion order instead; confirm before relying on sorted order
# From here, typical dictionary-style item access can be performed:
population['California']

38332521

In [15]:
# Unlike a dictionary, a Series also supports array-style operations such as slicing
# Note that slicing by label includes the end label ('Illinois' is part of the result)
population['California':'Illinois']

California    38332521
Florida       19552860
Illinois      12882135
dtype: int64

In [16]:
# Already seen few ways of constructing a Pandas Series from scratch
# All of them are some version of pd.Series(data, index=index)
# Ex. data can be a list or NumPy array, in which case the index defaults to integer sequence
pd.Series([2, 4, 6])

0    2
1    4
2    6
dtype: int64

In [17]:
# data can be a scalar which is repeated to fill the specified index:
pd.Series(5, index=[100, 200, 300])

100    5
200    5
300    5
dtype: int64

In [18]:
# data can be dictionary which defaults to the sorted dictionary keys:
pd.Series({2:'a', 1:'b', 3:'c'})

1    b
2    a
3    c
dtype: object

In [19]:
# In each case, index can be explicitly set if different result is preferred:
pd.Series({2:'a', 1:'b', 3:'c'}, index=[3, 2])

3    c
2    a
dtype: object

In [20]:
# Above, the Series is populated only with the explicitly identified keys

In [21]:
# Next fundamental structure in Pandas is the DataFrame
# Like Series, DataFrame can be thought of as a generalization of NumPy array
# Or a specialization of a Python dictionary
# Lets take a look at each perspective

In [22]:
# If a Series is an analog of a 1D array with flexible indices,
# a DataFrame is an analog of a 2D array with both flexible row indices and flexible column names
# Just like a 2D array is an ordered sequence of aligned 1D columns,
# a DataFrame can be thought of as a sequence of aligned Series objects
# To demonstrate, let's construct a new Series:
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297, 'Florida': 170312, 'Illinois': 149995}
area = pd.Series(area_dict)
area

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
dtype: int64

In [23]:
# Now that we have area of the 5 states and the population series from before,
# We can use a dictionary to construct a single 2D object containing this information:
states = pd.DataFrame({'population': population,
                       'area': area})
states

Unnamed: 0,area,population
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [24]:
# Like Series, DataFrame has an index attribute that gives access to the index labels
states.index

Index([u'California', u'Florida', u'Illinois', u'New York', u'Texas'], dtype='object')

In [25]:
# Additionally, DataFrame has columns attribute which is an Index object holding the column labels:
states.columns

Index([u'area', u'population'], dtype='object')

In [26]:
# Thus, DataFrame can be thought of as a generalization of a 2D NumPy array
# Both rows and columns have a generalized index for accessing the data

In [27]:
# Let's cover the specialized dictionary perspective
# Where a dictionary maps a key to a value, a DataFrame maps a column name to a Series of column data
# Ex. Asking for the 'area' key returns the Series object containing the areas we saw earlier:
states['area']

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [28]:
# There is a potential point of confusion here:
# In a 2D NumPy array, data[0] will return the first row
# For DataFrame, data[col0] will return the first column
# Because of this, probably better to think of DataFrames as generalized dictionaries rather than generalized arrays

In [29]:
# Pandas DataFrame can be constructed in many ways
# DataFrame is a collection of Series objects, and single-column DataFrame can be constructed from a single Series:
pd.DataFrame(population, columns=['population'])

Unnamed: 0,population
California,38332521
Florida,19552860
Illinois,12882135
New York,19651127
Texas,26448193


In [30]:
# Any list of dictionaries can be made into a DataFrame
# We'll use a simple list comprehension to create some data:
data = [{'a': i, 'b': 2 * i}
        for i in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [31]:
# Even if some keys in the dictionary are missing, Pandas will fill them with NaN (not a number) values:
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [32]:
# As seen before, DataFrame can be constructed from a dictionary of Series objects as well:
pd.DataFrame({'population': population,
              'area': area})

Unnamed: 0,area,population
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [33]:
# Given a 2D array of data, we can create a DataFrame with any specified column and index names
# The values come from np.random.rand and no seed is set in the cells shown, so they change on every run
pd.DataFrame(np.random.rand(3, 2),
             columns=['foo', 'bar'],
             index=['a', 'b', 'c'])

Unnamed: 0,foo,bar
a,0.686682,0.686516
b,0.453445,0.139162
c,0.852382,0.634795


In [34]:
# We covered structured arrays before
# Pandas DataFrame operates much like a structured array and can be created directly from one:
A = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')])
A

array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])

In [35]:
pd.DataFrame(A)

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


In [36]:
# We have now seen both Series and DataFrame objects contain explicit index that lets you reference and modify data
# Index object is interesting in itself and can be thought of as an immutable array or an ordered set
# Both these views have interesting consequences in the operations available on Index objects
# Ex. Let's make an Index from a list of integers:
ind = pd.Index([2, 3, 5, 7, 11])
ind

Int64Index([2, 3, 5, 7, 11], dtype='int64')

In [37]:
# Index object in many ways operates like an array
# Ex. we can use standard Python indexing notation to retrieve values or slices:
ind[1]

3

In [38]:
ind[::2]

Int64Index([2, 5, 11], dtype='int64')

In [39]:
# Index also has many attributes familiar from NumPy arrays:
print(ind.size, ind.shape, ind.ndim, ind.dtype)

(5, (5L,), 1, dtype('int64'))


In [40]:
# One difference between Index objects and NumPy arrays is that indices are immutable
# They can't be modified via the normal means: item assignment raises a TypeError
# The error is caught and printed so the demonstration does not abort a full "Run All"
try:
    ind[1] = 0
except TypeError as err:
    print("TypeError: {}".format(err))

TypeError: Index does not support mutable operations

In [None]:
# Immutability makes it safer to share indices between multiple DataFrames and arrays

In [41]:
# Pandas objects are designed to facilitate operations such as joins across datasets
# This depends on many aspects of set arithmetic
# Index objects support the familiar set operations: unions, intersections, differences,
# and other combinations
# The explicit set methods are used here rather than the &, | and ^ operators: the operator
# forms were deprecated as set operations and act element-wise in pandas 2.0+
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])

indA.intersection(indB) # Intersection

Int64Index([3, 5, 7], dtype='int64')

In [42]:
# Use the explicit set method: on pandas 2.0+ the | operator acts element-wise on an Index
indA.union(indB) # Union

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [43]:
# Use the explicit set method: on pandas 2.0+ the ^ operator acts element-wise on an Index
indA.symmetric_difference(indB) # Symmetric Difference

Int64Index([1, 2, 9, 11], dtype='int64')