## CS 210: Data Management for Data Science
### Lecture 11

# <font color='brown'>Pandas</font>

In [1]:
import numpy as np
import pandas as pd

### Dropping Elements

In [2]:
ser = pd.Series(np.arange(4), index=list("abcd"))

In [3]:
ser.drop("c")

a    0
b    1
d    3
dtype: int64

In [4]:
ser

a    0
b    1
c    2
d    3
dtype: int64

In [5]:
ser.drop(["a", "d"])

b    1
c    2
dtype: int64

In [6]:
# Toy table: one row per city, one column per month, filled with 0..8.
df = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    columns=["Jan", "Feb", "Mar"],
    index=["NYC", "Boston", "Seattle"],
)

# drop() hands back a new frame without the listed row labels; df is not modified.
df.drop(["NYC", "Seattle"], axis="index")


Unnamed: 0,Jan,Feb,Mar
Boston,3,4,5


In [7]:
df.drop(columns = ["Feb"])

Unnamed: 0,Jan,Mar
NYC,0,2
Boston,3,5
Seattle,6,8


In [8]:
df.drop("Feb", axis="columns")

Unnamed: 0,Jan,Mar
NYC,0,2
Boston,3,5
Seattle,6,8


#### Accessing Elements of a Series

In [9]:
ser1 = pd.Series([1, 2, 3])

ser1[[0, 1, 2]]

0    1
1    2
2    3
dtype: int64

In [10]:
ser1 = pd.Series([1, 2, 3], index = [2, 0, 1])
ser2 = pd.Series([1, 2, 3], index = ["a", "b", "c"])

In [11]:
ser1

2    1
0    2
1    3
dtype: int64

In [12]:
ser1[[0, 1, 2]]

0    2
1    3
2    1
dtype: int64

In [13]:
# ser2 has only string labels, so plain [] falls back to treating the integers as positions.
# NOTE: this positional fallback is deprecated since pandas 2.1; ser2.iloc[[0, 1, 2]] is the explicit form.
ser2[[0, 1, 2]]

a    1
b    2
c    3
dtype: int64

In [14]:
ser1 = pd.Series([1, 2, 3], index = [2, 0, 1])
ser2 = pd.Series([1, 2, 3], index = ["a", "b", "c"])

In [15]:
ser1.loc[[0, 1, 2]]

0    2
1    3
2    1
dtype: int64

In [16]:
ser2.loc[['a','b']]

a    1
b    2
dtype: int64

In [17]:
ser2.loc[[0, 1, 2]]

KeyError: "None of [Index([0, 1, 2], dtype='int64')] are in the [index]"

In [18]:
ser1 = pd.Series([1, 2, 3], index = [2, 0, 1])
ser2 = pd.Series([1, 2, 3], index = ["a", "b", "c"])

In [19]:
# ser1's index holds integers, so plain [] looks -1 up as a LABEL; no such label exists -> KeyError.
ser1[-1]

KeyError: -1

In [20]:
# ser2 has no integer labels, so plain [] falls back to position and -1 is the last element.
# NOTE: this positional fallback is deprecated since pandas 2.1; ser2.iloc[-1] is the explicit form.
ser2[-1]

3

In [21]:
ser1 = pd.Series([1, 2, 3], index = [2, 0, 1])
ser2 = pd.Series([1, 2, 3], index = ["a", "b", "c"])

In [22]:
ser1.iloc[-1]

3

In [23]:
ser2.iloc[-1]

3

In [24]:
ser = pd.Series([1, 2, 3], index = ["a", "b", "c"])

# Three ways to assign through a slice:
# plain [] with an integer slice is positional and excludes the end -> only "a" is set.
ser[0:1] = 100
# .iloc is always positional, end excluded -> again only "a".
ser.iloc[0:1] = 100   
# .loc slices by label and includes BOTH endpoints -> "a" and "b" are set.
ser.loc["a":"b"] = 100

In [25]:
ser

a    100
b    100
c      3
dtype: int64

#### Accessing Elements of a DataFrame

Accessing columns of a DataFrame


In [26]:
df["Feb"]

NYC        1
Boston     4
Seattle    7
Name: Feb, dtype: int64

In [27]:
df[["Feb", "Mar"]]

Unnamed: 0,Feb,Mar
NYC,1,2
Boston,4,5
Seattle,7,8


Accessing rows of a DataFrame. 

In [28]:
df.loc[['NYC']]

Unnamed: 0,Jan,Feb,Mar
NYC,0,1,2


In [29]:
df.iloc[1]

Jan    3
Feb    4
Mar    5
Name: Boston, dtype: int64

In [30]:
df.iloc[[1,0]]

Unnamed: 0,Jan,Feb,Mar
Boston,3,4,5
NYC,0,1,2


Rows can NOT be selected like this — plain `[]` with a string looks up a column label, not a row label:

In [31]:
df['NYC']

KeyError: 'NYC'

and NOT like this — an integer inside `[]` is also treated as a column label, not a row position:

In [None]:
df[1]

In [None]:
df[:2]

In [None]:
df.iloc[:2]

In [None]:
df.iloc[[0,2]]

In [32]:
df.loc[['NYC','Seattle']]

Unnamed: 0,Jan,Feb,Mar
NYC,0,1,2
Seattle,6,7,8


Fancy pandas indexing

In [33]:
df.loc[['NYC','Seattle'],['Feb','Mar']]

Unnamed: 0,Feb,Mar
NYC,1,2
Seattle,7,8


In [34]:
df.iloc[[0, 2],[1, 2]]

Unnamed: 0,Feb,Mar
NYC,1,2
Seattle,7,8


Filter rows by column value

In [35]:
df[df["Feb"] < 5]

Unnamed: 0,Jan,Feb,Mar
NYC,0,1,2
Boston,3,4,5


Less useful: indexing with a boolean DataFrame keeps the shape and replaces every value that doesn't satisfy the condition with NaN.

In [36]:
df[df < 5]

Unnamed: 0,Jan,Feb,Mar
NYC,0.0,1.0,2.0
Boston,3.0,4.0,
Seattle,,,


In [37]:
df = pd.DataFrame(np.arange(9).reshape((3, 3)),
                  index=["a", "c", "d"],
                  columns=["Ohio", "Texas", "California"])

In [38]:
aser = df.loc['a']
aser

Ohio          0
Texas         1
California    2
Name: a, dtype: int64

In [39]:
type(aser)

pandas.core.series.Series

In [40]:
adf = df.loc[['a']]
adf

Unnamed: 0,Ohio,Texas,California
a,0,1,2


In [41]:
type(adf)

pandas.core.frame.DataFrame

#### Exercise 1

Suppose we have the following `DataFrame`:

In [42]:
df = pd.DataFrame(np.arange(90, 99).reshape((3, 3)),
                  index=["Alice", "Bob", "Carol"],
                  columns=["m1", "m2", "fin"])

How do we get the midterm scores for Bob? 

In [43]:
# Pick Bob's row and the two midterm columns by label in a single .loc call.
# Naming the columns avoids chained indexing and does not rely on the midterms
# happening to be the first two columns. A scalar row label with a list of
# columns returns a Series named "Bob".
df.loc["Bob", ["m1", "m2"]]

m1    93
m2    94
Name: Bob, dtype: int64

In [44]:
df.loc[["Bob"],["m1","m2"]]

Unnamed: 0,m1,m2
Bob,93,94


### Updating values in a DataFrame

In [45]:
df = pd.DataFrame(np.arange(9).reshape((3, 3)),
                  index=["a", "c", "d"],
                  columns=["Ohio", "Texas", "California"])

In [46]:
df

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [47]:
df.loc[df["Texas"] > 5] = 100

In [48]:
df.loc[df["Texas"] > 5]

Unnamed: 0,Ohio,Texas,California
d,100,100,100


In [49]:
df

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,100,100,100


In [50]:
df = pd.DataFrame(np.arange(9).reshape((3, 3)),
                  index=["a", "c", "d"],
                  columns=["Ohio", "Texas", "California"])

Filter rows by column value, then update another column

In [51]:
df.loc[df["Texas"] > 3, "California"] = 11

In [52]:
df

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,11
d,6,7,11


Adding Series

In [53]:
# Two Series whose labels only partly overlap: "b" and "c" appear in both.
ser1 = pd.Series([1, 2, 3], index=list("abc"))
ser2 = pd.Series([1, 2, 3], index=list("bcd"))

# Addition aligns on labels; a label present in only one operand yields NaN.
ser1.add(ser2)

a    NaN
b    3.0
c    5.0
d    NaN
dtype: float64

In [54]:
df1 = pd.DataFrame(np.arange(9).reshape((3, 3)),
                  index=["a", "b", "c"],
                  columns=["Ohio", "Texas", "California"])
df2 = pd.DataFrame(np.arange(9).reshape((3, 3)),
                  index=["b", "c", "d"],
                  columns=["Iowa", "Texas", "Nevada"])

df1 + df2

Unnamed: 0,California,Iowa,Nevada,Ohio,Texas
a,,,,,
b,,,,,5.0
c,,,,,11.0
d,,,,,


In [55]:
df1 = pd.DataFrame(np.arange(9).reshape((3, 3)),
                  columns=["a", "b", "c"])
df2 = pd.DataFrame(np.arange(12).reshape((3, 4)),
                  columns=["a", "b", "c", "d"])

df1 + df2

Unnamed: 0,a,b,c,d
0,0,2,4,
1,7,9,11,
2,14,16,18,


In [56]:
# df1 has columns a-c; df2 has the same rows plus an extra column "d".
df1 = pd.DataFrame(np.arange(9).reshape(3, 3), columns=list("abc"))
df2 = pd.DataFrame(np.arange(12).reshape(3, 4), columns=list("abcd"))

# fill_value=0 stands in for a cell missing on one side, so column "d"
# keeps df2's values instead of turning into NaN.
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d
0,0,2,4,3.0
1,7,9,11,7.0
2,14,16,18,11.0


In [57]:
df1 = pd.DataFrame(np.arange(9).reshape((3, 3)),
                  columns=["a", "b", "c"])
df2 = pd.DataFrame(np.arange(12).reshape((3, 4)),
                  columns=["a", "b", "c", "d"])

df1.reindex(columns=df2.columns, fill_value=0) 


Unnamed: 0,a,b,c,d
0,0,1,2,0
1,3,4,5,0
2,6,7,8,0


In [58]:
# 4x3 frame of 0..11, one row per state.
df = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=["Utah", "Ohio", "Texas", "Oregon"],
    columns=["a", "b", "c"],
)
ser1 = df.iloc[0]       # first row, selected by position (the "Utah" row)
ser2 = df.loc["Ohio"]   # a row, selected by label
ser3 = df["b"]          # a column

In [59]:
df

Unnamed: 0,a,b,c
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [60]:
df - ser1

Unnamed: 0,a,b,c
Utah,0,0,0
Ohio,3,3,3
Texas,6,6,6
Oregon,9,9,9


In [61]:
df.sub(ser1)

Unnamed: 0,a,b,c
Utah,0,0,0
Ohio,3,3,3
Texas,6,6,6
Oregon,9,9,9


In [62]:
df

Unnamed: 0,a,b,c
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [63]:
df.sub(ser2)

Unnamed: 0,a,b,c
Utah,-3,-3,-3
Ohio,0,0,0
Texas,3,3,3
Oregon,6,6,6


In [64]:
df.sub(ser3, axis="index")

Unnamed: 0,a,b,c
Utah,-1,0,1
Ohio,-1,0,1
Texas,-1,0,1
Oregon,-1,0,1


In [65]:
df = pd.DataFrame(np.arange(-5, 7).reshape((4, 3)),
                  columns=list("abc"),
                  index=["Utah", "Ohio", "Texas", "Oregon"])

np.abs(df)

Unnamed: 0,a,b,c
Utah,5,4,3
Ohio,2,1,0
Texas,1,2,3
Oregon,4,5,6


In [66]:
df

Unnamed: 0,a,b,c
Utah,-5,-4,-3
Ohio,-2,-1,0
Texas,1,2,3
Oregon,4,5,6


In [67]:
df.apply(lambda x: x.max() - x.min())

a    9
b    9
c    9
dtype: int64

In [68]:
df.apply(lambda x: x.max() - x.min(), axis="columns")

Utah      2
Ohio      2
Texas     2
Oregon    2
dtype: int64

In [69]:
def min_max(x):
    """Return the smallest and largest value of ``x`` as a two-entry Series.

    ``x`` is anything exposing ``.min()`` and ``.max()`` (here: one row or one
    column handed over by ``DataFrame.apply``). The result is labelled
    ``"min"`` and ``"max"``, in that order.
    """
    lowest, highest = x.min(), x.max()
    return pd.Series([lowest, highest], index=["min", "max"])

df = pd.DataFrame(np.arange(-5, 7).reshape((4, 3)),
                  columns=list("abc"),
                  index=["Utah", "Ohio", "Texas", "Oregon"])

df.apply(min_max)

Unnamed: 0,a,b,c
min,-5,-4,-3
max,4,5,6


In [70]:
df.apply(min_max, axis=1)

Unnamed: 0,min,max
Utah,-5,-3
Ohio,-2,0
Texas,1,3
Oregon,4,6


In [71]:
# Toy frame with the values -5..6, one row per state.
df = pd.DataFrame(np.arange(-5, 7).reshape((4, 3)),
                  columns=list("abc"),
                  index=["Utah", "Ohio", "Texas", "Oregon"])

# Apply a function to every element. DataFrame.applymap is deprecated since
# pandas 2.1 in favour of DataFrame.map, so use map when this pandas version
# provides it and fall back to applymap on older versions.
elementwise = df.map if hasattr(df, "map") else df.applymap
scaled = elementwise(lambda x: 2 * x + 1)
scaled

Unnamed: 0,a,b,c
Utah,-9,-7,-5
Ohio,-3,-1,1
Texas,3,5,7
Oregon,9,11,13


In [72]:
df.sort_index()

Unnamed: 0,a,b,c
Ohio,-2,-1,0
Oregon,4,5,6
Texas,1,2,3
Utah,-5,-4,-3


In [73]:
df.sort_index(axis="columns", ascending=False)

Unnamed: 0,c,b,a
Utah,-3,-4,-5
Ohio,0,-1,-2
Texas,3,2,1
Oregon,6,5,4


In [74]:
df.sort_values("b")

Unnamed: 0,a,b,c
Utah,-5,-4,-3
Ohio,-2,-1,0
Texas,1,2,3
Oregon,4,5,6


Other functions

In [75]:
df

Unnamed: 0,a,b,c
Utah,-5,-4,-3
Ohio,-2,-1,0
Texas,1,2,3
Oregon,4,5,6


In [76]:
df.sum()

a   -2
b    2
c    6
dtype: int64

In [77]:
df.sum(axis=1)

Utah     -12
Ohio      -3
Texas      6
Oregon    15
dtype: int64

In [78]:
df.product()

a    40
b    40
c     0
dtype: int64

In [79]:
df.mean(axis="rows")

a   -0.5
b    0.5
c    1.5
dtype: float64

In [80]:
df.describe()

Unnamed: 0,a,b,c
count,4.0,4.0,4.0
mean,-0.5,0.5,1.5
std,3.872983,3.872983,3.872983
min,-5.0,-4.0,-3.0
25%,-2.75,-1.75,-0.75
50%,-0.5,0.5,1.5
75%,1.75,2.75,3.75
max,4.0,5.0,6.0


#### Exercise 2

Suppose we have a `DataFrame` with an unknown number of columns and rows. 

How can we slice the `DataFrame` to include all but the last row and column? 

In [81]:
# Random shape so a solution cannot hard-code sizes: randint(11) is 0..10 -> 2..12 rows,
# randint(9) is 0..8 -> 2..10 columns. No seed is set in the cells shown, so the shape
# can differ from run to run.
rows = np.random.randint(11)+2
columns = np.random.randint(9)+2
# Filled row by row with 0..rows*columns-1; index and columns get the default integer labels.
df = pd.DataFrame(np.arange(rows*columns).reshape(rows,columns))

In [82]:
# df[:-1] slices ROWS by position (all but the last); the list then selects COLUMNS by label.
# This works only because the columns carry the default integer labels 0..n-1.
df[:-1][list(range(len(df.columns)-1))]

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,5,6,7,8
2,10,11,12,13
3,15,16,17,18
4,20,21,22,23


In [83]:
df.iloc[:-1, :-1]

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,5,6,7,8
2,10,11,12,13
3,15,16,17,18
4,20,21,22,23


In [84]:
df.shape

(6, 5)

In [85]:
# Drop the last row and the last column by looking up their actual labels, so
# this also works when the index/columns are not the default 0..n-1 integers.
df.drop(index=df.index[-1], columns=df.columns[-1])

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,5,6,7,8
2,10,11,12,13
3,15,16,17,18
4,20,21,22,23


In [86]:
# Uses the row/column counts saved when df was generated; rows-1 and columns-1 are the
# last labels only because df has the default 0..n-1 index and columns.
df.drop(index=[rows-1], columns=[columns-1])

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,5,6,7,8
2,10,11,12,13
3,15,16,17,18
4,20,21,22,23


#### Exercise 3

Suppose you have the following quiz grades in a DataFrame. 
Take the mean for each student, 
dropping the lowest score (per my grading policy). 

In [87]:
# Quiz scores: one row per student, one column per quiz.
grades = np.array([
    [100, 95, 40, 85],
    [65, 95, 100, 80],
    [90, 90, 80, 0],
])
grades_df = pd.DataFrame(
    grades,
    columns=["Q" + str(quiz) for quiz in range(1, 5)],
    index=["Arnab", "Beth", "Carlos"],
)

In [88]:
grades_df

Unnamed: 0,Q1,Q2,Q3,Q4
Arnab,100,95,40,85
Beth,65,95,100,80
Carlos,90,90,80,0


In [89]:
# Per-student mean with the single lowest quiz removed:
# (row total - row minimum) / (number of quizzes - 1).
grades_df.sum(axis=1).sub(grades_df.min(axis=1)).div(grades_df.shape[1] - 1)

Arnab     93.333333
Beth      91.666667
Carlos    86.666667
dtype: float64