In [1]:
### Pandas Apply Functions

## Sometimes, pre-built Pandas functions are not enough, and therefore, it's important to know how to apply your own functions to Pandas objects. 
## In this activity, we will learn how to apply built-in functions, as well as our own Python functions, to Pandas objects in an efficient way.

In [2]:
## To apply your own library's functions, or another library’s functions to Pandas objects, you should be aware of the methods below. 
## The appropriate method to use depends on whether your function expects to operate on an entire DataFrame or Series, or row- or column-wise.

# - tablewise function application: pipe()
# - row or column-wise function application: apply()

In [3]:
### Tablewise Function Application

## DataFrames and Series can be passed into functions without any problems.

In [4]:
import pandas as pd 
import numpy as np

In [5]:
def extract_city_name(df):
    """
    Add a ``city_name`` column holding the city part of ``city_and_code``.

    Example: 'Chicago, IL' -> 'Chicago'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``city_and_code``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in; the new column is
        assigned on it directly. Returning the frame is what lets this
        function be used as a step in ``DataFrame.pipe``.
    """
    # Split on "," and keep the first piece, i.e. the text before the comma.
    df['city_name'] = df['city_and_code'].str.split(",").str.get(0)
    return df

In [6]:
def add_country_name(df, country_name=None):
    """
    Add a ``city_and_country`` column: ``city_name`` followed by ``country_name``.

    The two values are concatenated with no separator, e.g.
    'Chicago' with country_name='US' -> 'ChicagoUS'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that already has a ``city_name`` column.
    country_name : str
        Text appended to every city name. The default ``None`` is only a
        placeholder and is not a usable suffix; callers should always pass
        a string.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in; the new column is
        assigned on it directly, and the frame is returned so the function
        can be used as a step in ``DataFrame.pipe``.
    """
    col = 'city_name'
    df['city_and_country'] = df[col] + country_name
    return df

In [7]:
 # One-row demo frame; 'city_and_code' is the column extract_city_name reads.
 df_p = pd.DataFrame({'city_and_code': ['Chicago, IL']})

In [8]:
# Nested calls read inside-out: extract_city_name runs first, and its result
# is then handed to add_country_name.
add_country_name(extract_city_name(df_p), country_name='US')

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,ChicagoUS


In [9]:
## Pandas encourages using pipe() for the problem above; this style is known as 'method chaining'. pipe() makes it easy to use your own or another library’s functions in method chains, alongside Pandas’ methods.
## Compare the first approach with the following:

In [10]:
# The same two steps as the nested call, written in the order they execute.
# Extra keyword arguments given to pipe (country_name="US") are forwarded to
# the piped function.
(df_p.pipe(extract_city_name)
         .pipe(add_country_name, country_name="US"))

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,ChicagoUS


In [11]:
### Row or Column-wise Function Application


## Arbitrary functions can be applied along the axes of a DataFrame using the apply() method, which, like the descriptive statistics methods, takes an optional axis argument.

In [12]:
# Seeded local generator: the demo frame, and every output derived from it,
# is identical on each fresh run instead of changing with every execution.
rng = np.random.default_rng(42)

# The three Series have different indexes; pandas aligns them on the union
# ['a', 'b', 'c', 'd'] and fills the labels a Series lacks with NaN.
df = pd.DataFrame({
        'one': pd.Series(rng.standard_normal(3), index=['a', 'b', 'c']),
        'two': pd.Series(rng.standard_normal(4), index=['a', 'b', 'c', 'd']),
        'three': pd.Series(rng.standard_normal(3), index=['b', 'c', 'd'])})


In [14]:
# pre-built NumPy function, applied to each column (apply's default, axis=0)
df.apply(np.mean)

one      0.021812
two     -0.175322
three   -0.738905
dtype: float64

In [15]:
# pre-built NumPy function, applied to each row (axis=1)
df.apply(np.mean, axis=1)

a    0.338741
b   -0.151517
c   -0.709838
d   -0.472990
dtype: float64

In [16]:
# own lambda function: the range (max - min) of each column
df.apply(lambda x: x.max() - x.min())

one      1.214512
two      1.274279
three    0.640069
dtype: float64

In [17]:
# pre-built NumPy function: running (cumulative) sum down each column
df.apply(np.cumsum)

Unnamed: 0,one,two,three
a,0.484569,0.192912,
b,0.795379,0.39546,-0.967909
c,0.065437,-0.676272,-1.295749
d,,-0.701287,-2.216715


In [19]:
# pre-built NumPy function: element-wise exponential of every value
df.apply(np.exp)

Unnamed: 0,one,two,three
a,1.623475,1.212776,
b,1.36453,1.224518,0.379876
c,0.481937,0.342415,0.720478
d,,0.975295,0.398134


In [20]:
## You can use apply() to apply your own function:

In [21]:
def own_function(x):
    """Return ``x`` multiplied by itself.

    Used with ``DataFrame.apply``, where ``x`` is a Series and the product
    is computed element by element.
    """
    squared = x * x
    return squared


In [22]:
# apply hands each column to own_function as a Series, so x*x squares
# every element of that column.
df.apply(own_function)

Unnamed: 0,one,two,three
a,0.234807,0.037215,
b,0.096603,0.041026,0.936849
c,0.532816,1.148609,0.107479
d,,0.000626,0.848178


In [23]:
## You may also pass additional arguments and keyword arguments to the apply() method. For instance, consider the following function you would like to apply:

In [24]:
def subtract_and_divide(x, sub, divide=1):
    """Shift ``x`` down by ``sub``, then scale the result by ``1 / divide``.

    Parameters
    ----------
    x : value to transform (a Series when used with ``DataFrame.apply``).
    sub : amount subtracted from ``x``.
    divide : divisor applied after the subtraction; defaults to 1.

    Returns
    -------
    ``(x - sub) / divide``
    """
    shifted = x - sub
    return shifted / divide

In [26]:
## You may then apply this function as follows:

In [27]:
# The values in args are passed positionally after the column itself:
# sub=5, divide=3, i.e. each column becomes (column - 5) / 3.
df.apply(subtract_and_divide, args=(5,3))

Unnamed: 0,one,two,three
a,-1.505144,-1.602363,
b,-1.563063,-1.599151,-1.989303
c,-1.909981,-2.023911,-1.775947
d,,-1.675005,-1.973655


In [28]:
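## args has to be an iterable (pandas documents it as a tuple). Therefore, even if you pass only one extra argument, you have to write it as a one-element tuple: args=(5,). The trailing comma is what makes it a tuple; (5) without the comma is just the integer 5.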

In [29]:
def subtract(x, sub):
    """Return ``x`` with ``sub`` subtracted from it."""
    difference = x - sub
    return difference

In [30]:
# (5,) is a one-element tuple, so subtract receives sub=5 for every column.
df.apply(subtract, args=(5,))

Unnamed: 0,one,two,three
a,-4.515431,-4.807088,
b,-4.68919,-4.797452,-5.967909
c,-5.729943,-6.071732,-5.32784
d,,-5.025015,-5.920966


In [31]:
# NaN never compares equal to anything, itself included, so this is False.
# To test for missing values use np.isnan() or pd.isna() rather than ==.
np.nan == np.nan

False