# Pandas Pandas Pandas

In [255]:
import numpy as np
import pandas as pd

# Series

Series are like numpy arrays, but most usefully they can use labels in addition to indices. Isn't it weird that the plural of index is indices? Like, what happens to that *e* to turn it into an *i*? Also, they can hold any kind of data, though it's got to be all the same kind of data.

### Make a Series from a list.

In [256]:
# A plain Python list supplies the values; with no index given, pandas
# labels the rows with a default 0-based integer index.
this_list = [10, 20, 30]
pd.Series(data=this_list)

0    10
1    20
2    30
dtype: int64

### Make a Series from an array.

In [257]:
# A NumPy array is accepted as data in the same way as a list.
this_array = np.array([2, 4, 6, 8])
pd.Series(data=this_array)

0    2
1    4
2    6
3    8
dtype: int64

### Add a named index, a.k.a. labels.

In [258]:
# Explicitly declare data and labels.
my_list = [1.0, 1.1, 1.2, 1.3]
my_labels = 'a b c d'.split()  # -> ['a', 'b', 'c', 'd']
pd.Series(data=my_list, index=my_labels)

a    1.0
b    1.1
c    1.2
d    1.3
dtype: float64

In [259]:
# Declare data first, labels second (positional form of data=..., index=...).
pd.Series(my_list, my_labels)

a    1.0
b    1.1
c    1.2
d    1.3
dtype: float64

### A dict can pass data and labels simultaneously.

In [260]:
# The dict keys become the index labels and the dict values become the data.
my_dict = {'a': 10, 'b': 20, 'c': 30, 'd': 40}
pd.Series(my_dict)

a    10
b    20
c    30
d    40
dtype: int64

### Use any kind of data (all the same type).

In [261]:
# A Series of strings (shown with dtype object in the recorded output).
pd.Series(data=['a', 'b', 'c', 'd'])

0    a
1    b
2    c
3    d
dtype: object

In [262]:
# A Series of functions - dubiously useful.
# Arbitrary Python objects are held with dtype object.
pd.Series([len, sum, max, min])

0    <built-in function len>
1    <built-in function sum>
2    <built-in function max>
3    <built-in function min>
dtype: object

# Make use of indices.

Index labels in pandas make it easy to look up data.

### Create a Series to use in examples.

In [263]:
# Populations keyed by country; the dict keys become the Series labels.
# NOTE(review): the 'America' figure looks like it is missing a digit (it is
# roughly a tenth of the US population) — confirm against the data source.
pops_dict = {
    'China': 1409517397,
    'India': 1339180127,
    'America': 32445943,
    'Indonesia': 261115456,
    'Brazil': 207652865
}

# Sort the labels explicitly. Older pandas sorted dict keys on construction,
# while current pandas keeps insertion order; sort_index() makes the
# positional and slice examples that follow return the same rows on either.
pops = pd.Series(pops_dict).sort_index()
pops

America        32445943
Brazil        207652865
China        1409517397
India        1339180127
Indonesia     261115456
dtype: int64

### Make selections in a variety of ways.

In [264]:
# Select by integer position. With a label index, a bare integer key
# (pops[0]) is deprecated positional access; .iloc states the intent.
pops.iloc[0]

32445943

In [265]:
# Look up a value by its label.
pops['Brazil']

207652865

In [266]:
# Attribute-style lookup; only usable when the label is a valid Python
# identifier that does not clash with an existing Series attribute.
pops.India

1339180127

In [267]:
# Label-based slice: start at 'America' and take every second row.
pops['America'::2]

America        32445943
China        1409517397
Indonesia     261115456
dtype: int64

In [268]:
# Boolean mask: keep only the entries greater than one billion.
pops[pops > 1000000000]

China    1409517397
India    1339180127
dtype: int64

In [269]:
# Arithmetic is applied element-wise; the labels are kept.
pops * 2

America        64891886
Brazil        415305730
China        2819034794
India        2678360254
Indonesia     522230912
dtype: int64

# Dataframes

Dataframes are where pandas really shines. They are directly adapted from the R programming language. As most often encountered, they are like a bunch of Series objects set up as columns and lashed together with horizontal and vertical indices.

### Set things up.

In [270]:
# numpy (np) and pandas (pd) are already imported in the notebook's first
# cell, so they are not imported again here.

# Seed NumPy's global generator so the random values are reproducible.
np.random.seed(101)

# 5 rows (A-E) by 4 columns (W-Z) of standard-normal draws.
df = pd.DataFrame(np.random.randn(5, 4),
                  index='A B C D E'.split(),
                  columns='W X Y Z'.split())
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


### Get information in a variety of ways.

In [271]:
# Single brackets with one column name return that column as a Series.
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [272]:
# A list of column names returns a DataFrame, with the columns in the
# order requested.
df[['Z', 'W', 'X']]

Unnamed: 0,Z,W,X
A,0.503826,2.70685,0.628133
B,0.605965,0.651118,-0.319318
C,-0.589001,-2.018168,0.740122
D,0.955057,0.188695,-0.758872
E,0.683509,0.190794,1.978757


In [273]:
# Attribute (dot) access to a column. It is not recommended: it only works
# for column names that are valid Python identifiers and that do not collide
# with an existing DataFrame attribute or method.
df.Y

A    0.907969
B   -0.848077
C    0.528813
D   -0.933237
E    2.605967
Name: Y, dtype: float64

### Get some info about the DataFrame.

In [274]:
# (number of rows, number of columns)
df.shape

(5, 4)

In [275]:
# A single column pulled out of a DataFrame is a Series.
type(df['Z'])

pandas.core.series.Series

### Add a new column (by using existing columns).

In [276]:
# Assigning to a new column name adds the column to df in place.
# The product is computed element-wise, row by row.
df['new'] = df['W'] * df['Z']
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,1.363781
B,0.651118,-0.319318,-0.848077,0.605965,0.394555
C,-2.018168,0.740122,0.528813,-0.589001,1.188702
D,0.188695,-0.758872,-0.933237,0.955057,0.180215
E,0.190794,1.978757,2.605967,0.683509,0.13041


### Remove a column.

By default pandas will try to drop a row (`axis=0`). If I want to drop a column I must specify `axis=1`. This default behavior makes sense, because a sample (row) is more likely to be dropped than a column, which would remove a single observation from every subject.

Also by default, the `df.drop()` function returns a new DataFrame, leaving the original unchanged, in the name of being careful.

In [277]:
# This returns a new DataFrame without the 'new' column;
# df itself still has it.
df.drop('new', axis=1)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [278]:
# In place dropping must be declared explicitly.
# With inplace=True, df itself is modified and the call returns None.
# Re-running this cell raises a KeyError, because 'new' is already gone.
df.drop('new', axis=1, inplace=True)
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [279]:
# Dropping rows, finally. Rows are dropped by label (axis=0 is the default);
# without inplace=True this returns a new DataFrame and leaves df unchanged.
df.drop(['B', 'C', 'D', 'E'])

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826


### Select some rows.

`df.loc[]` locates a row based on its label. Putting in an integer position will not work.

In [280]:
# A single row label returns that row as a Series indexed by column name.
df.loc['A']

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

`df.iloc[]` locates a row based on its integer position.

In [281]:
# Positional slice: the first two rows (the end is exclusive, as with lists).
df.iloc[:2]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965


### Select a subset of rows and columns.

In [282]:
# Row label, then column label: returns a single value.
df.loc['A', 'W']

2.7068498393999381

In [283]:
# Lists of row labels and column labels return the matching sub-DataFrame.
df.loc[['A', 'C'], ['X', 'Z']]

Unnamed: 0,X,Z
A,0.628133,0.503826
C,0.740122,-0.589001


### Selections can be made conditionally.

In [284]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [285]:
# Element-wise comparison: a DataFrame of booleans with the same shape as df.
df > 0

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [286]:
# Masking with a boolean DataFrame keeps the shape and shows NaN wherever
# the condition is False.
df[df>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [287]:
# Return a subset of df with only the rows that have a value greater than 0 in the W column.
# df['W']>0 is a boolean Series; indexing with it filters whole rows.
df[df['W']>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [288]:
# Select the Y column for the rows where W is greater than 0.
# A single .loc[rows, column] call does the row filter and the column pick in
# one step, instead of chaining two [] lookups.
df.loc[df['W'] > 0, 'Y']

A    0.907969
B   -0.848077
D   -0.933237
E    2.605967
Name: Y, dtype: float64

In [289]:
# Select the X and Y columns for the rows where W is greater than 0.
# As with a single column, .loc[rows, columns] replaces the chained lookup.
df.loc[df['W'] > 0, ['X', 'Y']]

Unnamed: 0,X,Y
A,0.628133,0.907969
B,-0.319318,-0.848077
D,-0.758872,-0.933237
E,1.978757,2.605967


### Use multiple conditionals.

In [290]:
# Combine element-wise conditions with & and wrap each condition in its own
# parentheses. Cells that fail either condition show as NaN.
df[(df>0) & (df<0.5)]

Unnamed: 0,W,X,Y,Z
A,,,,
B,,,,
C,,,,
D,0.188695,,,
E,0.190794,,,


In [291]:
# Row filter on two columns at once: keep rows where W > 0 and Y > 1.
# Each condition is parenthesised and combined with & (element-wise AND).
# Only a cell's last expression is displayed, so the filter is written once;
# the one-line form df[(df['W']>0) & (df['Y']>1)] is equivalent.
df[
    (df['W'] > 0) &
    (df['Y'] > 1)
]

Unnamed: 0,W,X,Y,Z
E,0.190794,1.978757,2.605967,0.683509


# Setting and Resetting Indices

In [292]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


Reset the index to the default numerical, zero-based style, but keep the old index as a column.

In [293]:
# Returns a new DataFrame; without inplace=True, df keeps its own index.
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


### Create a new index.

In [294]:
# One label per existing row, in row order.
new_index = 'OH PA WV KY CA'.split()
# Set the labels as a named index directly, without first adding a temporary
# 'States' column and without inplace=True. The previous index is replaced.
df = df.set_index(pd.Index(new_index, name='States'))
df

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
OH,2.70685,0.628133,0.907969,0.503826
PA,0.651118,-0.319318,-0.848077,0.605965
WV,-2.018168,0.740122,0.528813,-0.589001
KY,0.188695,-0.758872,-0.933237,0.955057
CA,0.190794,1.978757,2.605967,0.683509


# Multi-index and index Hierarchy

DataFrames can have multiple indices, and that's cool. You could most simply have two indices, with each row having two different labels to call it, which might be useful. More interesting are hierarchical indices, where a higher level index is used to label groups of rows and a lower level index individual rows.

In [295]:
# Outer level: the group label for each row.
# Inner level: the row number within its group.
outer_index = ['G1'] * 3 + ['G2'] * 3
inner_index = [1, 2, 3] * 2
# Build the two-level index from the two parallel label lists.
hier_index = pd.MultiIndex.from_arrays([outer_index, inner_index])
hier_index

MultiIndex(levels=[['G1', 'G2'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])

In [296]:
# NOTE: this rebinds `df` to a new 6x2 frame, replacing the earlier one.
# The values continue the random stream seeded with np.random.seed(101) in an
# earlier cell, so the exact numbers depend on running the notebook in order.
df = pd.DataFrame(np.random.randn(6,2),
                  index=hier_index,
                  columns=['A', 'B'])
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


### Make selections.

In [297]:
# Use df[] to get columns, as usual.
# The resulting Series keeps the full two-level row index.
df['A']

G1  1    0.302665
    2   -1.706086
    3   -0.134841
G2  1    0.166905
    2    0.807706
    3    0.638787
Name: A, dtype: float64

In [298]:
# Use df.loc[] to get rows.
# An outer-level label returns every row in that group, indexed by the
# inner level only.
df.loc['G1']

Unnamed: 0,A,B
1,0.302665,1.693723
2,-1.706086,-1.159119
3,-0.134841,0.390528


In [299]:
# Same selection for the second group.
df.loc['G2']

Unnamed: 0,A,B
1,0.166905,0.184502
2,0.807706,0.07296
3,0.638787,0.329646


In [300]:
# Chained selection: first the G1 group, then the row labeled 1 within it.
df.loc['G1'].loc[1]

A    0.302665
B    1.693723
Name: 1, dtype: float64

### Make a cross-section.

`DataFrame.xs()` is only capable of returning values, not setting them. So... I guess it's a safer way to get a cross-section than using bracket notation.

In [301]:
# These produce the same output.
# For a single outer-level label, .loc and .xs select the same rows.
print(df.loc['G1'])
print(df.xs('G1'))

          A         B
1  0.302665  1.693723
2 -1.706086 -1.159119
3 -0.134841  0.390528
          A         B
1  0.302665  1.693723
2 -1.706086 -1.159119
3 -0.134841  0.390528


In [302]:
# These also produce the same output: both select the row whose outer label
# is 'G1' and whose inner label is 1.
# xs takes a multi-level key as a tuple; passing a list is deprecated and is
# rejected by newer pandas versions.
print(df.xs(('G1', 1)))
print(df.loc['G1', 1])

A    0.302665
B    1.693723
Name: (G1, 1), dtype: float64
A    0.302665
B    1.693723
Name: (G1, 1), dtype: float64


This next example shows something that perhaps only `DataFrame.xs()` is useful for. I am able to return a cross-section of my DataFrame that includes each row labeled 1, from each group (G1 and G2).

In [323]:
# Naming the indices will make the next step easier
# (this renames the levels on df itself).
df.index.names = ['Group', 'Num']
# Select on the inner level: every row whose 'Num' label is 1, from each
# group. The matched level is dropped from the result's index.
df.xs(1, level='Num')

Unnamed: 0_level_0,A,B
Group,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,0.302665,1.693723
G2,0.166905,0.184502


# Hey

Don't forget to watch the video all the way through again.