# Introduction to Pandas

In [1]:
import pandas
pandas.__version__

'0.24.2'

# Introducing Pandas Objects

In [2]:
import numpy as np
import pandas as pd

### Series

A **Series** is a single vector of data (like a NumPy array) with an *index* that labels each element in the vector.

In [3]:
# A Series built from a plain list gets a default integer index (0..n-1).
counts = pd.Series(data=[632, 1638, 569, 115])
counts

0     632
1    1638
2     569
3     115
dtype: int64

In [None]:
counts.values

In [None]:
counts.index

In [None]:
# Same four counts, now labelled by phylum name instead of a default integer index.
bacteria = pd.Series(
    data=[632, 1638, 569, 115],
    index=["Firmicutes", "Proteobacteria", "Actinobacteria", "Bacteroidetes"],
)

bacteria

In [None]:
bacteria['Actinobacteria']

In [None]:
# Positional access: the index holds string labels, so use `.iloc` to ask for
# "the first element" explicitly instead of relying on the integer fallback of `[]`.
bacteria.iloc[0]

In [None]:
bacteria.name = 'counts'
bacteria.index.name = 'phylum'
bacteria

In [None]:
np.log(bacteria)

In [None]:
bacteria[bacteria>1000]

In [None]:
# Phylum -> count mapping; the same dict is printed and then turned into a Series.
bacteria_dict = {
    "Firmicutes": 632,
    "Proteobacteria": 1500,
    "Actinobacteria": 569,
    "Bacteroidetes": 115,
}
print(bacteria_dict)
pd.Series(data=bacteria_dict)

### ``Series`` as generalized NumPy array

In [14]:
# Column-oriented input: each key is a column name, each list holds that column's values.
data = dict(
    Province=["FL", "FL", "NH", "NH", "ZH"],
    Year=[2013, 2014, 2013, 2014, 2014],
    Literacy=[0.2, 0.1, 0.5, 0.3, 0.5],
)
print(data)
# Rebind `data` from the raw dict to the DataFrame built from it.
data = pd.DataFrame(data)
data

{'Province': ['FL', 'FL', 'NH', 'NH', 'ZH'], 'Year': [2013, 2014, 2013, 2014, 2014], 'Literacy': [0.2, 0.1, 0.5, 0.3, 0.5]}


Unnamed: 0,Province,Year,Literacy
0,FL,2013,0.2
1,FL,2014,0.1
2,NH,2013,0.5
3,NH,2014,0.3
4,ZH,2014,0.5


To change the order of the columns:

In [15]:
df = pd.DataFrame(data, columns=["Year", "Province" ,"Literacy"])
df

Unnamed: 0,Year,Province,Literacy
0,2013,FL,0.2
1,2014,FL,0.1
2,2013,NH,0.5
3,2014,NH,0.3
4,2014,ZH,0.5


An `index` can be passed (as with `Series`), and passing column names that do not exist in the data will result in columns of missing values.

Assigning values to new columns is easy:

In [16]:
df['nonsense'] = df.Year / df.Literacy
df

Unnamed: 0,Year,Province,Literacy,nonsense
0,2013,FL,0.2,10065.0
1,2014,FL,0.1,20140.0
2,2013,NH,0.5,4026.0
3,2014,NH,0.3,6713.333333
4,2014,ZH,0.5,4028.0


In [17]:
df['Serie_aligned'] = pd.Series(range(5), index=[0,1,2, 3, 4])
df

Unnamed: 0,Year,Province,Literacy,nonsense,Serie_aligned
0,2013,FL,0.2,10065.0,0
1,2014,FL,0.1,20140.0,1
2,2013,NH,0.5,4026.0,2
3,2014,NH,0.3,6713.333333,3
4,2014,ZH,0.5,4028.0,4


Passing a dict whose values are dicts is also possible:

In [18]:
df.to_dict()

{'Year': {0: 2013, 1: 2014, 2: 2013, 3: 2014, 4: 2014},
 'Province': {0: 'FL', 1: 'FL', 2: 'NH', 3: 'NH', 4: 'ZH'},
 'Literacy': {0: 0.2, 1: 0.1, 2: 0.5, 3: 0.3, 4: 0.5},
 'nonsense': {0: 10065.0,
  1: 20140.0,
  2: 4026.0,
  3: 6713.333333333334,
  4: 4028.0},
 'Serie_aligned': {0: 0, 1: 1, 2: 2, 3: 3, 4: 4}}

In [19]:
pd.DataFrame(df.to_dict())

Unnamed: 0,Year,Province,Literacy,nonsense,Serie_aligned
0,2013,FL,0.2,10065.0,0
1,2014,FL,0.1,20140.0,1
2,2013,NH,0.5,4026.0,2
3,2014,NH,0.3,6713.333333,3
4,2014,ZH,0.5,4028.0,4


### DataFrame as specialized dictionary

#### From a list of dicts

Any list of dictionaries can be made into a ``DataFrame``.
We'll use a simple list comprehension to create some data:

In [None]:
# Row-oriented input: one dict per row, with `b` always ten times `a`.
data = [dict(a=n, b=10 * n) for n in range(6)]
print(data)
pd.DataFrame(data)

Even if some keys in the dictionary are missing, Pandas will fill them in with ``NaN`` (i.e., "not a number") values:

In [None]:
pd.DataFrame([{'aa': 1, 'bb': 2}, {'bb': 3, 'cc': 6}])

#### From a two-dimensional NumPy array

In [None]:
# Draw a genuine 3x2 array of random integers in [2, 12). Without `size`,
# `randint` returns a single scalar and the frame would just repeat that one value.
random_ints = np.random.randint(2, 12, size=(3, 2))
# Rows are labelled by `index`, columns by `columns`; both must match the array shape.
pd.DataFrame(random_ints,
             columns=['foo', 'bar'],
             index=['a', 'b', 'c'])



## The Pandas Index Object

In [None]:
ind = pd.Index([20, 34, 57, 7, 1, 8])
ind

In [None]:
ind[1]

In [None]:
ind[::]

``Index`` objects also have many of the attributes familiar from NumPy arrays:

In [None]:
print(ind.size, ind.shape, ind.ndim, ind.dtype)

One difference between ``Index`` objects and NumPy arrays is that indices are immutable — that is, they cannot be modified via normal means:

In [None]:
# Deliberate failure demo: Index objects are immutable, so this item assignment
# is expected to raise instead of modifying `ind`.
ind[1] = 0

This immutability makes it safer to share indices between multiple ``DataFrame``s and arrays, without the potential for side effects from inadvertent index modification.

# Operating on Data in Pandas

## Ufuncs: Index Preservation

In [4]:
# Seeded generator so the random draws below are reproducible.
rng = np.random.RandomState(seed=15)
# Four random integers in [0, 10), wrapped in a Series with a default index.
ser = pd.Series(rng.randint(low=0, high=10, size=4))
ser

0    8
1    5
2    5
3    7
dtype: int64

In [5]:
dfr = pd.DataFrame(rng.randint(0, 10, (5, 4)),
                  columns=['A', 'B', 'C', 'D'])
dfr

Unnamed: 0,A,B,C,D
0,0,7,5,6
1,1,7,0,4
2,9,7,5,3
3,6,8,2,1
4,1,0,5,2


In [6]:
np.exp(ser)

0    2980.957987
1     148.413159
2     148.413159
3    1096.633158
dtype: float64

In [7]:
np.sin(dfr * np.pi / 4)

Unnamed: 0,A,B,C,D
0,0.0,-0.7071068,-0.707107,-1.0
1,0.707107,-0.7071068,0.0,1.224647e-16
2,0.707107,-0.7071068,-0.707107,0.7071068
3,-1.0,-2.449294e-16,1.0,0.7071068
4,0.707107,0.0,-0.707107,1.0


## Universal Functions: Index Alignment

### Index alignment in Series

In [8]:
# Two Series keyed by state name; their indices only partially overlap
# ('Alaska' appears only in `area`, 'New York' only in `population`).
area = pd.Series(
    {
        "Alaska": 1723337,
        "Texas": 695662,
        "California": 423967,
    },
    name="area",
)
population = pd.Series(
    {
        "California": 38332521,
        "Texas": 26448193,
        "New York": 19651127,
    },
    name="population",
)
print(area)
population


Alaska        1723337
Texas          695662
California     423967
Name: area, dtype: int64


California    38332521
Texas         26448193
New York      19651127
Name: population, dtype: int64

In [9]:
population / area

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [10]:
# Set union of the two indices. `Index.union` is the explicit spelling; using the
# `|` operator for set operations on an Index is deprecated in newer pandas.
area.index.union(population.index)

Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')

In [11]:
# Two Series whose integer indices overlap only on labels 1 and 2.
A = pd.Series([2, 4, 6], index=[0, 1, 2])
B = pd.Series([1, 3, 5], index=[1, 2, 3])
print(A)
print(B)
# Addition aligns on index labels; a label present in only one operand yields NaN.
A + B

0    2
1    4
2    6
dtype: int64
1    1
2    3
3    5
dtype: int64


0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [12]:
A.add(B, fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

## Ufuncs: Operations Between DataFrame and Series

# Data wrangling

## Merge operations

In [20]:
df

Unnamed: 0,Year,Province,Literacy,nonsense,Serie_aligned
0,2013,FL,0.2,10065.0,0
1,2014,FL,0.1,20140.0,1
2,2013,NH,0.5,4026.0,2
3,2014,NH,0.3,6713.333333,3
4,2014,ZH,0.5,4028.0,4


In [21]:
# Lookup table: one row per province. Population values are kept as strings.
df2 = pd.DataFrame(
    dict(
        Province=["FL", "NH", "ZH"],
        Population=["100000", "200000", "300000"],
    )
)
df2

Unnamed: 0,Province,Population
0,FL,100000
1,NH,200000
2,ZH,300000


Let's say we want a dataset with year, literacy, province and population. We can create it from `df` and `df2`.

In [22]:
df.merge(df2)  # no `on=` given: merge joins on the column names both frames share (here only `Province`)

Unnamed: 0,Year,Province,Literacy,nonsense,Serie_aligned,Population
0,2013,FL,0.2,10065.0,0,100000
1,2014,FL,0.1,20140.0,1,100000
2,2013,NH,0.5,4026.0,2,200000
3,2014,NH,0.3,6713.333333,3,200000
4,2014,ZH,0.5,4028.0,4,300000


If the column names are different, you need to specify them explicitly:

In [23]:
# `df3` names its key column `province` (lower case), so it shares no column name with `df`.
df3 = pd.DataFrame({"province": ["FL", "NH"], "Population": ["100000", "200000"]})
df3  # NOTE: not displayed -- a notebook cell only renders its last expression
# Name the key column on each side explicitly; both key columns are kept in the result.
df.merge(df3, right_on='province', left_on='Province')

Unnamed: 0,Year,Province,Literacy,nonsense,Serie_aligned,province,Population
0,2013,FL,0.2,10065.0,0,FL,100000
1,2014,FL,0.1,20140.0,1,FL,100000
2,2013,NH,0.5,4026.0,2,NH,200000
3,2014,NH,0.3,6713.333333,3,NH,200000


In [None]:
df4 = pd.DataFrame({"Province": ["FL", "NH", "UT"], "Population": ["100000", "200000", "50000"]})
df.merge(df4, how='outer')

In [25]:
df5 = pd.DataFrame({"Province": ["FL", "NH", "FL"], "Population": ["100000", "200000", "50000"]})
print(df)
df.merge(df5, how='outer')

   Year Province  Literacy      nonsense  Serie_aligned
0  2013       FL       0.2  10065.000000              0
1  2014       FL       0.1  20140.000000              1
2  2013       NH       0.5   4026.000000              2
3  2014       NH       0.3   6713.333333              3
4  2014       ZH       0.5   4028.000000              4


Unnamed: 0,Year,Province,Literacy,nonsense,Serie_aligned,Population
0,2013,FL,0.2,10065.0,0,100000.0
1,2013,FL,0.2,10065.0,0,50000.0
2,2014,FL,0.1,20140.0,1,100000.0
3,2014,FL,0.1,20140.0,1,50000.0
4,2013,NH,0.5,4026.0,2,200000.0
5,2014,NH,0.3,6713.333333,3,200000.0
6,2014,ZH,0.5,4028.0,4,


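This was a many-to-many merge: `FL` appears twice in both `df` and `df5`, so every `FL` row of `df` is paired with every `FL` row of `df5`. The behavior is logical once you think about it, but it can still come as a surprise if you are not expecting it!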

### Combining data with overlap
Sometimes some data is missing, and it can be "patched" with another dataset. Let's take a look.

In [26]:
# `serie_a` has gaps (NaN) at labels f, d and a; `serie_b` is a complete
# 0.0..5.0 ramp over the same labels and serves as the source of patch values.
serie_a = pd.Series(
    [np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
    index=list("fedcba"),
)
serie_b = pd.Series(
    np.arange(len(serie_a), dtype=np.float64),
    index=list("fedcba"),
)

In [27]:
serie_a

f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64

In [28]:
serie_b

f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    5.0
dtype: float64

Let's say we want to fill the missing values of `serie_a` with the corresponding values from `serie_b`. The NumPy way to do that is:

In [29]:
pd.Series(np.where(pd.isnull(serie_a), serie_b, serie_a), index=serie_a.index)

f    0.0
e    2.5
d    2.0
c    3.5
b    4.5
a    5.0
dtype: float64

That's a bit verbose for something so simple. What about this:

In [30]:
serie_a.combine_first(serie_b)

f    0.0
e    2.5
d    2.0
c    3.5
b    4.5
a    5.0
dtype: float64