In [1]:
# %load command.py

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Make IPython display the value of every top-level expression in a cell,
# not only the last one; later cells rely on this to show several results.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity='all'

%config InlineBackend.figure_format='svg'
# Raise the resolution matplotlib uses for figures.
plt.rcParams['figure.dpi']=120

# Show floats with a thousands separator and two decimals, and do not
# truncate the contents of wide columns when frames are displayed.
pd.options.display.float_format='{:,.2f}'.format
pd.set_option('display.max_colwidth', None)


In [2]:
# Small two-column frame reused by the transform() examples that follow.
df = pd.DataFrame(dict(A=[1, 2, 3], B=[10, 20, 30]))

df

Unnamed: 0,A,B
0,1,10
1,2,20
2,3,30


In [3]:
# transform() returns a new frame with the function applied; the result is
# not assigned back, so the second display shows df unchanged.
df.transform(lambda x:x+10)
df

Unnamed: 0,A,B
0,11,20
1,12,30
2,13,40


Unnamed: 0,A,B
0,1,10
1,2,20
2,3,30


In [4]:
def plus_10(x):
    """Return ``x`` increased by 10 (element-wise for pandas objects)."""
    increment = 10
    return x + increment

# A named function works the same way as a lambda; df itself is still not
# modified, as the second display shows.
df.transform(plus_10)
df

Unnamed: 0,A,B
0,11,20
1,12,30
2,13,40


Unnamed: 0,A,B
0,1,10
1,2,20
2,3,30


In [5]:
# The function can also be given by name as a string.
df.transform('sqrt')

Unnamed: 0,A,B
0,1.0,3.16
1,1.41,4.47
2,1.73,5.48


In [6]:
# A list of functions yields one output column per (input column, function) pair.
df.transform([np.sqrt, np.exp])

Unnamed: 0_level_0,A,A,B,B
Unnamed: 0_level_1,sqrt,exp,sqrt,exp
0,1.0,2.72,3.16,22026.47
1,1.41,7.39,4.47,485165195.41
2,1.73,20.09,5.48,10686474581524.46


In [7]:
# A dict maps each column name to the function applied to that column.
df.transform({'A':np.sqrt,
             'B':np.exp})

Unnamed: 0,A,B
0,1.0,22026.47
1,1.41,485165195.41
2,1.73,10686474581524.46


In [8]:
# combine groupby results
# Sample data: one row per restaurant with its city and sales figure.
df = pd.DataFrame(
    dict(
        restaurant_id=list(range(101, 108)),
        address=list('ABCDEFG'),
        city=['London'] * 3 + ['Oxford'] * 2 + ['Durham'] * 2,
        Sales=[10, 500, 48, 12, 21, 22, 14],
    )
)
df

Unnamed: 0,restaurant_id,address,city,Sales
0,101,A,London,10
1,102,B,London,500
2,103,C,London,48
3,104,D,Oxford,12
4,105,E,Oxford,21
5,106,F,Durham,22
6,107,G,Durham,14


In [9]:
# Total sales per city as a two-column frame (city, City Total Sales).
# Uses the groupby sum() reduction directly instead of apply() with Python's
# builtin sum: the totals are the same, and pandas does not have to translate
# a builtin callable.
city_sales=df.groupby('city')['Sales'].sum().rename('City Total Sales').reset_index()
city_sales

Unnamed: 0,city,City Total Sales
0,Durham,36
1,London,558
2,Oxford,33


In [10]:
# agg('sum') gives a Series indexed by city; without reset_index() the city
# names stay in the index, as the second output shows.
city_total_sales=df.groupby(['city'])['Sales'].agg('sum').rename('City Total Sales')
city_total_sales
city_total_sales.index

city
Durham     36
London    558
Oxford     33
Name: City Total Sales, dtype: int64

Index(['Durham', 'London', 'Oxford'], dtype='object', name='city')

In [11]:
# apply() variant of the same total; reset_index() turns the city index into
# a regular column, leaving a default RangeIndex.
# NOTE(review): passing the builtin `sum` here may trigger a warning on recent
# pandas releases - confirm against the pandas version in use.
city_total_sales=df.groupby(['city'])['Sales'].apply(sum).rename('City Total Sales').reset_index()
city_total_sales
city_total_sales.index

Unnamed: 0,city,City Total Sales
0,Durham,36
1,London,558
2,Oxford,33


RangeIndex(start=0, stop=3, step=1)

In [12]:
# Direct sum() variant producing the same city / total frame; this is the
# version of city_total_sales that the merge below consumes.
city_total_sales=df.groupby(['city'])['Sales'].sum().rename('City Total Sales').reset_index()
city_total_sales
city_total_sales.index

Unnamed: 0,city,City Total Sales
0,Durham,36
1,London,558
2,Oxford,33


RangeIndex(start=0, stop=3, step=1)

In [13]:
# merge
# Attach each city's total to every restaurant row. The join key is named
# explicitly so the merge does not silently depend on whichever column names
# the two frames happen to share, and validate= fails fast if the totals
# frame ever contains more than one row per city.
df_new=pd.merge(df, city_total_sales, how='left', on='city', validate='many_to_one')
df_new

Unnamed: 0,restaurant_id,address,city,Sales,City Total Sales
0,101,A,London,10,558
1,102,B,London,500,558
2,103,C,London,48,558
3,104,D,Oxford,12,33
4,105,E,Oxford,21,33
5,106,F,Durham,22,36
6,107,G,Durham,14,36


In [14]:
# Each restaurant's share of its city's total sales, as a fraction.
df_new['pct']=df_new['Sales']/df_new['City Total Sales']
df_new
# Replace the numeric share with a formatted percentage string for display;
# after this line the column holds strings, not numbers.
df_new['pct']=df_new['pct'].apply(lambda x:f'{x:.2%}')
df_new

Unnamed: 0,restaurant_id,address,city,Sales,City Total Sales,pct
0,101,A,London,10,558,0.02
1,102,B,London,500,558,0.9
2,103,C,London,48,558,0.09
3,104,D,Oxford,12,33,0.36
4,105,E,Oxford,21,33,0.64
5,106,F,Durham,22,36,0.61
6,107,G,Durham,14,36,0.39


Unnamed: 0,restaurant_id,address,city,Sales,City Total Sales,pct
0,101,A,London,10,558,1.79%
1,102,B,London,500,558,89.61%
2,103,C,London,48,558,8.60%
3,104,D,Oxford,12,33,36.36%
4,105,E,Oxford,21,33,63.64%
5,106,F,Durham,22,36,61.11%
6,107,G,Durham,14,36,38.89%


In [15]:
# groupby and transform
# transform('sum') broadcasts each city's total back onto every row of that
# city, so it can be assigned as a column directly, without a merge.

df['city_total_sales']=df.groupby('city')['Sales'].transform('sum')
df

Unnamed: 0,restaurant_id,address,city,Sales,city_total_sales
0,101,A,London,10,558
1,102,B,London,500,558
2,103,C,London,48,558
3,104,D,Oxford,12,33
4,105,E,Oxford,21,33
5,106,F,Durham,22,36
6,107,G,Durham,14,36


In [16]:
# Each restaurant's share of its city's total sales, as a fraction.
df['pct']=df['Sales']/df['city_total_sales']
df
# Overwrite the fraction with a formatted percentage string for display;
# after this line the column holds strings, not numbers.
df['pct']=df['pct'].apply(lambda x:f'{x:.2%}')
df

Unnamed: 0,restaurant_id,address,city,Sales,city_total_sales,pct
0,101,A,London,10,558,0.02
1,102,B,London,500,558,0.9
2,103,C,London,48,558,0.09
3,104,D,Oxford,12,33,0.36
4,105,E,Oxford,21,33,0.64
5,106,F,Durham,22,36,0.61
6,107,G,Durham,14,36,0.39


Unnamed: 0,restaurant_id,address,city,Sales,city_total_sales,pct
0,101,A,London,10,558,1.79%
1,102,B,London,500,558,89.61%
2,103,C,London,48,558,8.60%
3,104,D,Oxford,12,33,36.36%
4,105,E,Oxford,21,33,63.64%
5,106,F,Durham,22,36,61.11%
6,107,G,Durham,14,36,38.89%


In [17]:
# Keep only the rows whose city has total sales above 40; the transform
# result lines up with df's rows, so it can be used as a boolean mask.
df[df.groupby('city')['Sales'].transform('sum')>40]

Unnamed: 0,restaurant_id,address,city,Sales,city_total_sales,pct
0,101,A,London,10,558,1.79%
1,102,B,London,500,558,89.61%
2,103,C,London,48,558,8.60%


In [18]:
# handling missing values at the group level

# Three groups (A, B, C), each containing one missing value.
df = pd.DataFrame(
    dict(
        name=list('AABBBCCC'),
        value=[1, np.nan, np.nan, 2, 8, 2, np.nan, 3],
    )
)

df

Unnamed: 0,name,value
0,A,1.0
1,A,
2,B,
3,B,2.0
4,B,8.0
5,C,2.0
6,C,
7,C,3.0


In [19]:
# Per-group means; missing values are skipped, as the output shows.
df.groupby('name')['value'].mean()

name
A   1.00
B   5.00
C   2.50
Name: value, dtype: float64

In [20]:
# Fill each missing value with the mean of its own group.
# The 'value' column is selected before transform() so that the result is a
# Series. Transforming the whole grouped frame returns a DataFrame, and
# assigning that to a single column only works while 'value' is the sole
# non-key column.
df['value']=df.groupby('name')['value'].transform(lambda x:x.fillna(x.mean()))
df

Unnamed: 0,name,value
0,A,1.0
1,A,1.0
2,B,5.0
3,B,2.0
4,B,8.0
5,C,2.0
6,C,2.5
7,C,3.0


### apply() vs transform()

In [21]:
# Fresh frame for comparing apply() and transform() side by side.
df = pd.DataFrame(dict(A=[1, 2, 3], B=[10, 20, 30]))


def plus_10(x):
    """Return ``x`` increased by 10."""
    increment = 10
    return x + increment


# With a named function, both methods are called the same way.
df.apply(plus_10)
df.transform(plus_10)

# The same holds for an equivalent lambda.
df.apply(lambda col: col + 10)
df.transform(lambda col: col + 10)

Unnamed: 0,A,B
0,11,20
1,12,30
2,13,40


Unnamed: 0,A,B
0,11,20
1,12,30
2,13,40


Unnamed: 0,A,B
0,11,20
1,12,30
2,13,40


Unnamed: 0,A,B
0,11,20
1,12,30
2,13,40


In [22]:
# On a single column (a Series), apply() adds 10 to every value.
df['B-apply']=df['B'].apply(plus_10)
df

Unnamed: 0,A,B,B-apply
0,1,10,20
1,2,20,30
2,3,30,40


In [23]:
# transform() on the same column gives identical values to the apply() column.
df['B_transform']=df['B'].transform(plus_10)
df

Unnamed: 0,A,B,B-apply,B_transform
0,1,10,20,20
1,2,20,30,30
2,3,30,40,40


1. transform() accepts a function, a function name given as a string, a list of functions, or a dict mapping column names to functions. apply(), by contrast, is documented as taking a single function.
2. transform() **cannot produce aggregated results**.
3. apply() can work with multiple Series at a time, whereas transform() can only work with a single Series at a time.

In [24]:
# 1. transform() accepts a function, a function name as a string, a list of
#    functions, or a dict of column -> function.
#    apply(), by contrast, is documented as taking a single function.

df = pd.DataFrame(dict(A=[1, 2, 3], B=[10, 20, 30]))

df.transform('sqrt')                      # function name as a string
df.transform([np.sqrt, np.exp])           # list of functions
df.transform(dict(A=np.sqrt, B=np.exp))   # dict: column -> function

Unnamed: 0,A,B
0,1.0,3.16
1,1.41,4.47
2,1.73,5.48


Unnamed: 0_level_0,A,A,B,B
Unnamed: 0_level_1,sqrt,exp,sqrt,exp
0,1.0,2.72,3.16,22026.47
1,1.41,7.39,4.47,485165195.41
2,1.73,20.09,5.48,10686474581524.46


Unnamed: 0,A,B
0,1.0,22026.47
1,1.41,485165195.41
2,1.73,10686474581524.46


In [25]:
# 2. transform() cannot produce aggregated results
# apply(), on the other hand, may reduce each column to a single value.
df.apply(lambda x:x.sum())

A     6
B    60
dtype: int64

In [26]:
# transform() rejects an aggregating function because the result does not
# have the same index as the input. The ValueError is the point of this cell,
# so it is caught and printed; an uncaught exception would stop a
# "Restart & Run All" at this cell.
try:
    df.transform(lambda x:x.sum())
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

ValueError: Function did not transform

In [27]:
# 3. apply() can work with several columns (Series) at once, whereas
#    transform() is only given a single Series at a time.

df

def subtract_two(x):
    """Return the difference ``x['B'] - x['A']`` for a row or frame ``x``."""
    minuend, subtrahend = x['B'], x['A']
    return minuend - subtrahend

# axis=1 passes each row to the function, so both columns are available to it.
df.apply(subtract_two, axis=1)

Unnamed: 0,A,B
0,1,10
1,2,20
2,3,30


0     9
1    18
2    27
dtype: int64

In [30]:
# transform() cannot combine two columns into one value per row: the result
# does not line up with the input. The ValueError is the demonstration, so it
# is caught and printed rather than left to halt a full notebook run.
try:
    df.transform(subtract_two, axis=1)
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

ValueError: Function did not transform

In [31]:
# The same row-wise difference written as a lambda.
df.apply(lambda x:x['B']-x['A'], axis=1)

0     9
1    18
2    27
dtype: int64

In [32]:
# The lambda form fails in transform() for the same reason as the named
# function. The ValueError is expected; it is caught and printed so the
# notebook keeps running past this cell.
try:
    df.transform(lambda x:x['B']-x['A'], axis=1)
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

ValueError: Function did not transform

### In conjunction with groupby()

In [33]:
# Nine rows cycling through the keys a, b, c; A counts 0..8 and B repeats 1, 2, 3.
df = pd.DataFrame(dict(key=list('abc') * 3, A=np.arange(9), B=[1, 2, 3] * 3))

df

Unnamed: 0,key,A,B
0,a,0,1
1,b,1,2
2,c,2,3
3,a,3,1
4,b,4,2
5,c,5,3
6,a,6,1
7,b,7,2
8,c,8,3


1. transform() returns a Series with the same length as the input.
2. apply() can work with multiple Series at a time, whereas transform() can only work with a single Series at a time.

In [34]:
# 1. transform() returns a Series that has the same length as the input

def group_sum(x):
    """Return the sum of ``x`` via its ``sum()`` method."""
    total = x.sum()
    return total

# apply() with an aggregating function returns one value per group,
# indexed by the group key.
groupped_apply=df.groupby('key')['A'].apply(group_sum)
groupped_apply

key
a     9
b    12
c    15
Name: A, dtype: int64

In [35]:
# transform() broadcasts each group's sum back to every row of that group,
# so the result has one value per input row.
groupped_transform=df.groupby('key')['A'].transform(group_sum)
groupped_transform

0     9
1    12
2    15
3     9
4    12
5    15
6     9
7    12
8    15
Name: A, dtype: int64

In [36]:
# 2. apply() can work with several columns (Series) at a time,
#    whereas transform() is only given a single Series at a time.

df
def subtract_two(x):
    """Return the difference ``x['B'] - x['A']`` for a row or frame ``x``."""
    minuend, subtrahend = x['B'], x['A']
    return minuend - subtrahend

# Each group is passed to the function as a frame, so both A and B are
# available. The value columns are selected explicitly so the function is not
# run over the grouping column; recent pandas releases deprecate applying over
# the grouping column. The function only reads A and B, so the result is the same.
df.groupby('key')[['A', 'B']].apply(subtract_two)

Unnamed: 0,key,A,B
0,a,0,1
1,b,1,2
2,c,2,3
3,a,3,1
4,b,4,2
5,c,5,3
6,a,6,1
7,b,7,2
8,c,8,3


key   
a    0    1
     3   -2
     6   -5
b    1    1
     4   -2
     7   -5
c    2    1
     5   -2
     8   -5
dtype: int64

In [37]:
# groupby().transform() hands the function one column at a time, so looking
# up 'B' fails. The KeyError is the demonstration; it is caught and printed
# so the notebook still runs from top to bottom.
try:
    df.groupby('key').transform(subtract_two)
except KeyError as err:
    print(f'{type(err).__name__}: {err}')

KeyError: 'B'