# apply() vs transform()

### <b>apply(func, axis=0)</b> :
    Call a function func along an axis of the DataFrame. It returns the result of applying func along the given axis.

### <b>transform(func, axis=0)</b> :
    Call a function func on self, producing a DataFrame with transformed values. It returns a DataFrame that has the same length as self.

In [1]:
import pandas as pd
import numpy as np

In [35]:
df=pd.DataFrame({'A':[1,2,3],'B':[10,20,30]})
df

Unnamed: 0,A,B
0,1,10
1,2,20
2,3,30


In [3]:
def plus_10(x):
    """Return ``x`` increased by 10.

    ``x`` may be any object that supports ``x + 10``, e.g. a number or a
    pandas Series/DataFrame column.
    """
    offset = 10
    shifted = x + offset
    return shifted

In [4]:
df.apply(plus_10)

Unnamed: 0,A,B
0,11,20
1,12,30
2,13,40


In [5]:
df.transform(plus_10)

Unnamed: 0,A,B
0,11,20
1,12,30
2,13,40


In [6]:
df.apply(lambda x: x+10) #we can use the lambda expression with apply() and transform()

Unnamed: 0,A,B
0,11,20
1,12,30
2,13,40


In [7]:
df.transform(lambda x : x+10)

Unnamed: 0,A,B
0,11,20
1,12,30
2,13,40


In [10]:
df['B_ap']=df['B'].apply(lambda x : x+10) # For a single column
df

Unnamed: 0,A,B,B_ap
0,1,10,20
1,2,20,30
2,3,30,40


In [11]:
df['B_tr']=df['B'].transform(lambda x: x+10)
df

Unnamed: 0,A,B,B_ap,B_tr
0,1,10,20,20
1,2,20,30,30
2,3,30,40,40


#  transform() :
Works with a function, a function name given as a string, a list of functions, or a dict.  

In [12]:
df.transform('sqrt')

Unnamed: 0,A,B,B_ap,B_tr
0,1.0,3.162278,4.472136,4.472136
1,1.414214,4.472136,5.477226,5.477226
2,1.732051,5.477226,6.324555,6.324555


In [13]:
df.apply('sqrt')

Unnamed: 0,A,B,B_ap,B_tr
0,1.0,3.162278,4.472136,4.472136
1,1.414214,4.472136,5.477226,5.477226
2,1.732051,5.477226,6.324555,6.324555


In [14]:
df.transform([np.sqrt, np.exp]) # func can also be a list of functions

Unnamed: 0_level_0,A,A,B,B,B_ap,B_ap,B_tr,B_tr
Unnamed: 0_level_1,sqrt,exp,sqrt,exp,sqrt,exp,sqrt,exp
0,1.0,2.718282,3.162278,22026.47,4.472136,485165200.0,4.472136,485165200.0
1,1.414214,7.389056,4.472136,485165200.0,5.477226,10686470000000.0,5.477226,10686470000000.0
2,1.732051,20.085537,5.477226,10686470000000.0,6.324555,2.353853e+17,6.324555,2.353853e+17


In [15]:
df.apply([np.sqrt, np.exp])

Unnamed: 0_level_0,A,A,B,B,B_ap,B_ap,B_tr,B_tr
Unnamed: 0_level_1,sqrt,exp,sqrt,exp,sqrt,exp,sqrt,exp
0,1.0,2.718282,3.162278,22026.47,4.472136,485165200.0,4.472136,485165200.0
1,1.414214,7.389056,4.472136,485165200.0,5.477226,10686470000000.0,5.477226,10686470000000.0
2,1.732051,20.085537,5.477226,10686470000000.0,6.324555,2.353853e+17,6.324555,2.353853e+17


In [29]:
df.apply([min, max, 'mean'])

Unnamed: 0,A,B,B_ap,B_tr
min,1.0,10.0,20.0,20.0
max,3.0,30.0,40.0,40.0
mean,2.0,20.0,30.0,30.0


In [16]:
df.transform({
    'A': np.sqrt,
    'B': np.exp,
})

Unnamed: 0,A,B
0,1.0,22026.47
1,1.414214,485165200.0
2,1.732051,10686470000000.0


In [17]:
df.apply({
    'A': np.sqrt,
    'B': np.exp,
})

Unnamed: 0,A,B
0,1.0,22026.47
1,1.414214,485165200.0
2,1.732051,10686470000000.0


# What are the differences ?

## transform() cannot produce aggregated results.

This is because the output of transform() has to be a DataFrame that has the same length as self.

In [33]:
df.apply(lambda x : x.sum()) # We can use apply() to produce aggregated results

A        6
B       60
B_ap    90
B_tr    90
dtype: int64

In [32]:
df.transform(lambda x : x.sum()) # We will get a ValueError when trying to do same with transform()

ValueError: Function did not transform

## apply() works with multiple Series at a time.

## transform() is only allowed to work with a single Series at a time.

In [44]:
df.apply(lambda x : x['B']-x['A'],axis=1)

0     9
1    18
2    27
dtype: int64

In [45]:
df.transform(lambda x : x['B']-x['A'], axis=1) # Raises a ValueError because transform() is only allowed to work with a single Series at a time

ValueError: Function did not transform

# Groupby with transform() and apply()

1) transform() returns a DataFrame that has the same length as the input

2) apply() works with multiple Series at a time. But, transform() is only allowed to work with a single Series at a time.

In [46]:
df2 = pd.DataFrame({
    'key': ['a','b','c'] * 4,
    'A': np.arange(12),
    'B': [1,2,3] * 4,
})
df2

Unnamed: 0,key,A,B
0,a,0,1
1,b,1,2
2,c,2,3
3,a,3,1
4,b,4,2
5,c,5,3
6,a,6,1
7,b,7,2
8,c,8,3
9,a,9,1


In [48]:
df2.groupby('key')['A'].apply(lambda x: x.sum())

key
a    18
b    22
c    26
Name: A, dtype: int64

In [49]:
df2.groupby('key')['A'].transform(lambda x: x.sum()) #it returns a Series that has the same length as the given DataFrame

0     18
1     22
2     26
3     18
4     22
5     26
6     18
7     22
8     26
9     18
10    22
11    26
Name: A, dtype: int64

In [50]:
df2.groupby('key').apply(lambda x : x['A']+x['B'])

key    
a    0      1
     3      4
     6      7
     9     10
b    1      3
     4      6
     7      9
     10    12
c    2      5
     5      8
     8     11
     11    14
dtype: int64

In [51]:
df2.groupby('key').transform(lambda x : x['A']+x['B']) # apply() works with multiple Series at a time; transform() works with a single Series at a time, so x['A'] raises a KeyError

KeyError: 'A'

In [52]:
df2['A'].apply(lambda x: 'high' if x> 5 else 'low')

0      low
1      low
2      low
3      low
4      low
5      low
6     high
7     high
8     high
9     high
10    high
11    high
Name: A, dtype: object

In [53]:
df2.apply(lambda x : len(str(x))) # for each column, returns the length of its string representation str(x) (not the number of values)

key    120
A      129
B      117
dtype: int64

In [72]:
df3 = pd.DataFrame({
  'restaurant_id': [101,102,103,104,105,106,107],
  'address': ['A','B','C','D', 'E', 'F', 'G'],
  'city': ['London','London','London','Oxford','Oxford', 'Durham', 'Durham'],
  'sales': [10,500,48,12,21,22,14]
})
df3

Unnamed: 0,restaurant_id,address,city,sales
0,101,A,London,10
1,102,B,London,500
2,103,C,London,48
3,104,D,Oxford,12
4,105,E,Oxford,21
5,106,F,Durham,22
6,107,G,Durham,14


In [73]:
city_sales = df3.groupby('city')['sales'].apply(sum).rename('city_total_sales').reset_index()
city_sales

Unnamed: 0,city,city_total_sales
0,Durham,36
1,London,558
2,Oxford,33


In [74]:
city_sales = df3.groupby('city')['sales'].sum().rename('city_total_sales').reset_index()
city_sales

Unnamed: 0,city,city_total_sales
0,Durham,36
1,London,558
2,Oxford,33


In [75]:
df_new = pd.merge(df3, city_sales, how='left')
df_new

Unnamed: 0,restaurant_id,address,city,sales,city_total_sales
0,101,A,London,10,558
1,102,B,London,500,558
2,103,C,London,48,558
3,104,D,Oxford,12,33
4,105,E,Oxford,21,33
5,106,F,Durham,22,36
6,107,G,Durham,14,36


In [88]:
# Pitfall: the grouped sum is indexed by city, not by df3's row index, so the
# assignment cannot align any row and the new column comes out empty (NaN).
df3['city_sales']=df3.groupby('city')['sales'].sum()
df3

Unnamed: 0,restaurant_id,address,city,sales,avg,city_sales
0,101,A,London,10,89.571429,
1,102,B,London,500,89.571429,
2,103,C,London,48,89.571429,
3,104,D,Oxford,12,89.571429,
4,105,E,Oxford,21,89.571429,
5,106,F,Durham,22,89.571429,
6,107,G,Durham,14,89.571429,


In [83]:
# If we want to create a column by grouping, we should use transform(): its
# result has the same length as df3, so there are no NaN values.
# The string alias 'sum' is used instead of the Python builtin `sum`, which
# recent pandas versions deprecate (FutureWarning) as a groupby transform argument.
df3['city_sales']=df3.groupby('city')['sales'].transform('sum')

In [84]:
df3

Unnamed: 0,restaurant_id,address,city,sales,avg,city_sales
0,101,A,London,10,89.571429,558
1,102,B,London,500,89.571429,558
2,103,C,London,48,89.571429,558
3,104,D,Oxford,12,89.571429,33
4,105,E,Oxford,21,89.571429,33
5,106,F,Durham,22,89.571429,36
6,107,G,Durham,14,89.571429,36
