# CHAPTER 12 Advanced pandas

- Creating dummy variables for modeling: `pd.get_dummies()`
- Techniques for Method Chaining
    - `df2 = df.assign(col_name=2)` leaves `df` unchanged; `df2` is a copy with a new column named `col_name` whose value is 2 in every row.
- Group transforms and time-based resampling: `groupby(...).transform()`, `resample()`

In [1]:
import pandas as pd
import numpy as np

In [3]:
# `dim` holds the distinct labels; `values` holds integer positions into
# `dim` (the pattern 0, 1, 0, 0 repeated twice).
dim = pd.Series(['apple', 'orange'])
values = pd.Series([0, 1, 0, 0] * 2)

In [4]:
# Display the eight integer codes.
values

0    0
1    1
2    0
3    0
4    0
5    1
6    0
7    0
dtype: int64

In [5]:
# Display the two distinct labels.
dim

0     apple
1    orange
dtype: object

In [6]:
# Pick entries of `dim` by position, one per element of `values`. The result
# keeps dim's own index labels (0 and 1), so those labels repeat in the output.
dim.take(values)

0     apple
1    orange
0     apple
0     apple
0     apple
1    orange
0     apple
0     apple
dtype: object

In [11]:
# Demo frame: keys a-d cycled twice next to eight standard-normal draws.
# No random seed is set, so the `data` column differs on every run.
cat_s = pd.DataFrame(
    dict(
        key=['a', 'b', 'c', 'd'] * 2,
        data=np.random.randn(8),
    )
)
cat_s

Unnamed: 0,key,data
0,a,-1.287705
1,b,-0.398744
2,c,-2.36595
3,d,0.231206
4,a,0.164882
5,b,2.282652
6,c,-1.504751
7,d,-0.931613


In [12]:
# One-hot encode the string column `key` into key_a..key_d; the numeric
# `data` column passes through unchanged.
# The dtype is pinned because pandas 2.0 switched the default dummy dtype to
# bool, which would render True/False instead of 0/1 indicators. 'uint8' is
# the pre-2.0 default, so older pandas versions behave exactly as before.
pd.get_dummies(cat_s, dtype='uint8')

Unnamed: 0,data,key_a,key_b,key_c,key_d
0,-1.287705,1,0,0,0
1,-0.398744,0,1,0,0
2,-2.36595,0,0,1,0
3,0.231206,0,0,0,1
4,0.164882,1,0,0,0
5,2.282652,0,1,0,0
6,-1.504751,0,0,1,0
7,-0.931613,0,0,0,1


In [13]:
# Twelve rows: keys a, b, c cycled four times next to the integers 0..11.
df = pd.DataFrame(dict(key=list('abc') * 4, value=np.arange(12)))
df

Unnamed: 0,key,value
0,a,0
1,b,1
2,c,2
3,a,3
4,b,4
5,c,5
6,a,6
7,b,7
8,c,8
9,a,9


In [20]:
# Group the `value` column by `key`. Bracket selection is used instead of
# attribute access; both yield the same grouped Series.
g = df.groupby('key')['value']

In [22]:
# Broadcast each key's mean back onto its own rows, so the result is aligned
# with the original row index. The built-in 'mean' alias replaces a per-group
# Python lambda and produces the same float64 values.
g.transform('mean')

0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64

In [23]:
# Number of periods passed to pd.date_range below (one timestamp per minute).
N = 15

In [24]:
# N consecutive timestamps, one minute apart, starting at midnight on 2017-05-20.
times = pd.date_range(start='2017 5 20 00:00', periods=N, freq='1min')
times

DatetimeIndex(['2017-05-20 00:00:00', '2017-05-20 00:01:00',
               '2017-05-20 00:02:00', '2017-05-20 00:03:00',
               '2017-05-20 00:04:00', '2017-05-20 00:05:00',
               '2017-05-20 00:06:00', '2017-05-20 00:07:00',
               '2017-05-20 00:08:00', '2017-05-20 00:09:00',
               '2017-05-20 00:10:00', '2017-05-20 00:11:00',
               '2017-05-20 00:12:00', '2017-05-20 00:13:00',
               '2017-05-20 00:14:00'],
              dtype='datetime64[ns]', freq='T')

In [25]:
# One row per timestamp with a running integer counter as its value.
# Note: this rebinds `df`, replacing the key/value frame built earlier.
df = pd.DataFrame(dict(time=times, value=np.arange(N)))
df

Unnamed: 0,time,value
0,2017-05-20 00:00:00,0
1,2017-05-20 00:01:00,1
2,2017-05-20 00:02:00,2
3,2017-05-20 00:03:00,3
4,2017-05-20 00:04:00,4
5,2017-05-20 00:05:00,5
6,2017-05-20 00:06:00,6
7,2017-05-20 00:07:00,7
8,2017-05-20 00:08:00,8
9,2017-05-20 00:09:00,9


In [27]:
# Count the rows that fall into each 5-minute bin of `time`.
(
    df
    .set_index('time')
    .resample('5min')
    .count()
)

Unnamed: 0_level_0,value
time,Unnamed: 1_level_1
2017-05-20 00:00:00,5
2017-05-20 00:05:00,5
2017-05-20 00:10:00,5


In [28]:
# Three rows (keys a, b, c) for every timestamp: each timestamp is repeated
# three times while the key triple is tiled N times; `value` counts 0..3N-1.
df2 = pd.DataFrame(
    dict(
        time=times.repeat(3),
        key=np.tile(['a', 'b', 'c'], N),
        value=np.arange(N * 3),
    )
)
df2

Unnamed: 0,time,key,value
0,2017-05-20 00:00:00,a,0
1,2017-05-20 00:00:00,b,1
2,2017-05-20 00:00:00,c,2
3,2017-05-20 00:01:00,a,3
4,2017-05-20 00:01:00,b,4
5,2017-05-20 00:01:00,c,5
6,2017-05-20 00:02:00,a,6
7,2017-05-20 00:02:00,b,7
8,2017-05-20 00:02:00,c,8
9,2017-05-20 00:03:00,a,9


In [34]:
# Sum `value` for every (key, time) pair. `time` is first moved into the
# index and then referenced by name as a grouping level.
# NOTE(review): despite the name, nothing is binned here - df2 has exactly one
# row per (key, timestamp), so every pair forms its own group. If 5-minute
# bins per key were intended, group by pd.Grouper(freq='5min') in place of
# 'time' - confirm.
resampled = (
    df2
    .set_index('time')
    .groupby(['key', 'time'])
    .sum()
)
resampled

Unnamed: 0_level_0,Unnamed: 1_level_0,value
key,time,Unnamed: 2_level_1
a,2017-05-20 00:00:00,0
a,2017-05-20 00:01:00,3
a,2017-05-20 00:02:00,6
a,2017-05-20 00:03:00,9
a,2017-05-20 00:04:00,12
a,2017-05-20 00:05:00,15
a,2017-05-20 00:06:00,18
a,2017-05-20 00:07:00,21
a,2017-05-20 00:08:00,24
a,2017-05-20 00:09:00,27


In [37]:
# Add a derived column by mutating `df` in place: half of `value`.
# (Contrast with `DataFrame.assign`, which returns a new frame instead.)
df['value2'] = df['value'].div(2)
df

Unnamed: 0,time,value,value2
0,2017-05-20 00:00:00,0,0.0
1,2017-05-20 00:01:00,1,0.5
2,2017-05-20 00:02:00,2,1.0
3,2017-05-20 00:03:00,3,1.5
4,2017-05-20 00:04:00,4,2.0
5,2017-05-20 00:05:00,5,2.5
6,2017-05-20 00:06:00,6,3.0
7,2017-05-20 00:07:00,7,3.5
8,2017-05-20 00:08:00,8,4.0
9,2017-05-20 00:09:00,9,4.5


In [41]:
# `assign` returns a copy of `df` with a constant column `value3` added;
# `df` itself is not modified.
# Note: this rebinds `df2`, which previously held the time/key/value frame.
df2 = df.assign(value3=10)
df2

Unnamed: 0,time,value,value2,value3
0,2017-05-20 00:00:00,0,0.0,10
1,2017-05-20 00:01:00,1,0.5,10
2,2017-05-20 00:02:00,2,1.0,10
3,2017-05-20 00:03:00,3,1.5,10
4,2017-05-20 00:04:00,4,2.0,10
5,2017-05-20 00:05:00,5,2.5,10
6,2017-05-20 00:06:00,6,3.0,10
7,2017-05-20 00:07:00,7,3.5,10
8,2017-05-20 00:08:00,8,4.0,10
9,2017-05-20 00:09:00,9,4.5,10


In [42]:
# Re-display `df2` (the same frame produced by the assign call above).
# NOTE(review): to show that `assign` left the source frame untouched, this
# cell presumably meant to display `df` rather than `df2` - confirm.
df2

Unnamed: 0,time,value,value2,value3
0,2017-05-20 00:00:00,0,0.0,10
1,2017-05-20 00:01:00,1,0.5,10
2,2017-05-20 00:02:00,2,1.0,10
3,2017-05-20 00:03:00,3,1.5,10
4,2017-05-20 00:04:00,4,2.0,10
5,2017-05-20 00:05:00,5,2.5,10
6,2017-05-20 00:06:00,6,3.0,10
7,2017-05-20 00:07:00,7,3.5,10
8,2017-05-20 00:08:00,8,4.0,10
9,2017-05-20 00:09:00,9,4.5,10
