# Lecture 1: Series

In [4]:
import numpy as np
import pandas as pd

In [5]:
# Shared inputs for the Series examples below: index labels, a plain list,
# a NumPy array and a dict, all describing the same three values.
labels = list('abc')
my_data = [10, 20, 30]
arr = np.asarray(my_data)
d = dict(zip(labels, my_data))

In [6]:
# My first Pandas series: the values come from `data`, the row labels from `index`
pd.Series(data=my_data, index=labels)

a    10
b    20
c    30
dtype: int64

In [7]:
# Can also be created from the other objects.
# A cell only displays its LAST expression, so keep both results in variables,
# check that they hold the same values, and then display one of them.
series_from_array = pd.Series(arr, labels)
series_from_dict = pd.Series(d)
assert (series_from_array == series_from_dict).all()
series_from_dict

a    10
b    20
c    30
dtype: int64

In [8]:
# A Series is not limited to numbers: any Python object can be an element.
# Here every element is a built-in function, so the dtype is `object`.
pd.Series(
    data=[
        sum,
        print,
        len,
    ]
)

0      <built-in function sum>
1    <built-in function print>
2      <built-in function len>
dtype: object

In [9]:
# Positional form is Series(data, index); the keywords spell that out
ser1 = pd.Series(
    data=[1, 2, 3, 4],
    index=['USA', 'Germany', 'USSR', 'Japan'],
)

In [10]:
# Same shape as ser1, but with 'Italy' where ser1 has 'USSR'
ser2 = pd.Series(
    data=[1, 2, 5, 4],
    index=['USA', 'Germany', 'Italy', 'Japan'],
)

In [11]:
ser2

USA        1
Germany    2
Italy      5
Japan      4
dtype: int64

In [12]:
# Values can be looked up by index label, much like keys in a dict
ser1['USA']

1

In [13]:
# Series can also be added; pandas aligns them on their index labels first.
# Labels present in only one Series (Italy, USSR) come out as NaN,
# and the result is float64 even though both inputs are int64
ser1 + ser2

Germany    4.0
Italy      NaN
Japan      8.0
USA        2.0
USSR       NaN
dtype: float64

# Lecture 2: Dataframes
A DataFrame is a collection of pandas Series that share the same index; each Series is one column and is identified by its column name.

In [14]:
from numpy.random import randn

In [15]:
# Fix NumPy's global random seed so the random values below are reproducible
np.random.seed(101)

## Creating Dataframes

In [16]:
# 5x4 frame of random draws: rows are labelled A-E, columns W-Z
df = pd.DataFrame(
    data=randn(5, 4),
    index=list('ABCDE'),
    columns=list('WXYZ'),
)

In [17]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [18]:
# Selecting one column by name gives back a Series rather than a DataFrame
w_column = df['W']
print(w_column)
print(type(w_column))

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64
<class 'pandas.core.series.Series'>


In [19]:
# Passing a list of column names selects several columns at once;
# the result is a DataFrame containing just those columns
df[['W','Z']]

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
C,-2.018168,-0.589001
D,0.188695,0.955057
E,0.190794,0.683509


## Manipulating Dataframes (Adding, removing etc)

In [20]:
# To add a new column, assign to a column name that does not exist yet.
# Here it is the element-wise sum of columns W and Y
df['new'] = df['W'] + df['Y']

In [21]:
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [22]:
# And to remove it:
df.drop('new', axis=1)
# The axis matters: axis=1 means columns, while the default axis=0 means rows

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [23]:
# BUT, note that drop() returned a new frame; df itself still has the 'new' column
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [24]:
# To properly delete it, we need the inplace argument, which modifies df itself
df.drop('new', axis=1, inplace=True)

In [25]:
# Pandas does this on purpose so that we don't accidentally lose data.
# Now we can see that the new column is removed
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [26]:
# drop() can also be used to remove rows.
# Note that axis defaults to 0 (rows), so it does not need to be passed here
df.drop('A')

Unnamed: 0,W,X,Y,Z
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [27]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


## Working with rows

In [28]:
# Note that selecting a row goes through the .loc indexer, using the row's label
# Also note that the returned row is itself a Series
df.loc['C']

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [29]:
# The same row can be selected by integer position with .iloc
# (position 2 is the third row, 'C')
df.iloc[2]

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

## Selecting subsets of Rows and Columns

In [30]:
# This returns a single value: df.loc[row, column]
df.loc['B', 'Y']

-0.84807698340363147

In [31]:
# For a larger subset, pass lists of row and column labels
df.loc[['A', 'B'], ['W', 'Y']]

Unnamed: 0,W,Y
A,2.70685,0.907969
B,0.651118,-0.848077


# Lecture 3

## Dataframe selection
### Single Conditionals

In [32]:
# Comparisons work element-wise, much like in NumPy, and return a DataFrame of booleans
df > 0

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [33]:
# A boolean DataFrame can be used as a mask on the frame itself:
# values where the mask is False come back as NaN
df[df>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [34]:
# It is more common to apply the condition to a single column.
# This returns a boolean Series with one entry per row
df['W']>0

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [35]:
# That boolean Series can then be used to filter the whole DataFrame.
# Notice that only the rows where it is True are returned (row C is left out)
df[df['W']>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [36]:
# Selections can be stacked to pull further information out of the DataFrame:
# first keep the rows where W > 0, then take columns X and Y from that result
df[df['W']>0][['X','Y']]

Unnamed: 0,X,Y
A,0.628133,0.907969
B,-0.319318,-0.848077
D,-0.758872,-0.933237
E,1.978757,2.605967


In [37]:
# The same selection as above, written one step at a time
mycols = ['X', 'Y']          # columns to keep at the end
boolseries = df['W'] > 0     # True for the rows where W is greater than 0
result = df[boolseries]      # only the rows that pass the filter
result[mycols]

Unnamed: 0,X,Y
A,0.628133,0.907969
B,-0.319318,-0.848077
D,-0.758872,-0.933237
E,1.978757,2.605967


### Multiple Conditionals

When using multiple conditions, Python's normal `and` operator will not work: it can only be applied to single boolean values, not to a Series of booleans. Use the `&` operator instead (and `|` for "or"), which compares the two Series element by element.

In [38]:
# The format is df[(cond1) OPERATOR (cond2)]
# Python's `and` needs a single True/False from each side, which a Series cannot
# provide, so this raises ValueError. Catch it and print the message so the
# demonstration does not stop a "Run All" of the notebook.
try:
    df[(df['W']>0) and (df['Y']>1)]
except ValueError as err:
    print('{}: {}'.format(type(err).__name__, err))

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [40]:
# & combines the two boolean Series element by element; each condition is
# wrapped in its own parentheses because & binds more tightly than >
df[(df['W']>0) & (df['Y']>1)]

Unnamed: 0,W,X,Y,Z
E,0.190794,1.978757,2.605967,0.683509


## Indexing

In [41]:
# To reset the index back to 0, 1, 2...
# Notice that the old index is now a regular column named 'index'.
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


In [42]:
# Notice that the original df is not modified
# REMEMBER that by default pandas returns a new object instead of modifying
# the dataframe; inplace=True is needed to change it
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [43]:
# We can also use a column as the index; first build the values for that column
# NB: splitting a string on whitespace is a quick way of creating a list of strings
newcol = 'CO NY MA MI MO'.split()

In [44]:
# Attach the list as a new 'States' column (one value per row, in row order)
df['States'] = newcol

In [45]:
# Note that unlike reset_index, set_index does not keep the old index as a column
# But that is OK: we didn't use inplace=True, so our original dataframe still exists
df.set_index('States')

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CO,2.70685,0.628133,0.907969,0.503826
NY,0.651118,-0.319318,-0.848077,0.605965
MA,-2.018168,0.740122,0.528813,-0.589001
MI,0.188695,-0.758872,-0.933237,0.955057
MO,0.190794,1.978757,2.605967,0.683509


# Lecture 4
## Multi Indexing & Hierarchy

In [46]:
# Index Levels
# Outer level: the group name, repeated once per inner label
outside = ['G1'] * 3 + ['G2'] * 3
# Inner level: 1..3 within each group
inside = [1, 2, 3] * 2
# Pair the two levels up row by row and build the two-level index from the pairs
hier_index = pd.MultiIndex.from_tuples(list(zip(outside, inside)))

In [49]:
# Six rows of random values on the two-level index built above.
# NOTE: this rebinds `df`, replacing the single-index frame from the earlier lectures.
df = pd.DataFrame(
    np.random.randn(6, 2),
    index=hier_index,
    columns=list('AB'),
)
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,1.025984,-0.156598
G1,2,-0.031579,0.649826
G1,3,2.154846,-0.610259
G2,1,-0.755325,-0.346419
G2,2,0.147027,-0.479448
G2,3,0.558769,1.02481


In [56]:
# Index in two steps: 'G1' selects the outer group, then 1 selects the inner label.
# The result is that single row, returned as a Series
df.loc['G1'].loc[1]

A    1.025984
B   -0.156598
Name: 1, dtype: float64

In [58]:
# To name the index levels (outer level first, then inner):
df.index.names = ['Groups', 'Num']
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,1.025984,-0.156598
G1,2,-0.031579,0.649826
G1,3,2.154846,-0.610259
G2,1,-0.755325,-0.346419
G2,2,0.147027,-0.479448
G2,3,0.558769,1.02481


In [60]:
# It's possible to take a cross-section with the .xs method,
# for example to get the rows where Num is 1 from both G1 and G2
df.xs(1,level='Num')

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,1.025984,-0.156598
G2,-0.755325,-0.346419
