In [16]:
import pandas as pd
import numpy as np
import datetime as dt

In [21]:
# Load the employee records from the Parquet file into a DataFrame
employee_file = 'datasets/employee_list.parquet'
df = pd.read_parquet(employee_file)

In [22]:
# Preview the first five rows (5 is the default for head())
df.head(5)

Unnamed: 0,employee_number,name,company,country,dob,age,department,salary,has_parking_space
0,897028,Kenneth Jensen,Wilson and Sons,India,1983-07-03,38,Management,124790,False
1,463979,Sarah Anderson,"Hernandez, Cunningham and Clark",India,1980-08-09,41,Consulting,103122,True
2,388446,Tracie Rollins,"Hernandez, Cunningham and Clark",Cayman Islands,1987-07-29,34,Consulting,119072,False
3,267447,Seth Smith,Spears-Brown,Germany,1969-03-04,52,System Architect,115653,False
4,401300,Katherine Fields,"Hernandez, Cunningham and Clark",Venezuela,1980-01-26,42,Finance,119412,False


### Map
- A Series method
- `Series.map()` substitutes each value in a Series with another value, which may be derived from a function, a dict or another Series. Since DataFrame columns are Series, you can use `map()` to transform a column and assign the result back to the DataFrame.
- https://sparkbyexamples.com/pandas/pandas-map-function-explained/

##### Map with a dict
Replace the boolean values in column `has_parking_space` with 0 for False and 1 for True.

In [23]:
# Lookup table: each boolean is mapped to its integer equivalent (False -> 0, True -> 1)
bool_mapper = {flag: int(flag) for flag in (False, True)}
df['has_parking_space'].map(bool_mapper).head()

0    0
1    1
2    0
3    0
4    0
Name: has_parking_space, dtype: int64

Another example:
- To bring the salary data into `df_without_salary` we could merge the two DataFrames, but the same result can be achieved with `map()` and a dict that maps each employee number to its salary.

In [24]:
# Split the frame: one part holds only the salary lookup columns,
# the other holds everything except the salary
salary_data_df = df.loc[:, ['employee_number', 'salary']]
df_without_salary = df.drop(columns='salary')

# Build an employee_number -> salary lookup and use it to restore the column
salary_mapper = {
    emp_no: pay
    for emp_no, pay in zip(salary_data_df['employee_number'], salary_data_df['salary'])
}
df_without_salary['salary'] = df_without_salary['employee_number'].map(salary_mapper)
df_without_salary.head(2)

Unnamed: 0,employee_number,name,company,country,dob,age,department,has_parking_space,salary
0,897028,Kenneth Jensen,Wilson and Sons,India,1983-07-03,38,Management,False,124790
1,463979,Sarah Anderson,"Hernandez, Cunningham and Clark",India,1980-08-09,41,Consulting,True,103122


##### Map with lambda
Return age in number of days for each employee.

In [13]:
# Capture the reference moment once, so every row is measured against the same
# instant and datetime.today() is not re-evaluated for each element.
today = dt.datetime.today()
df.dob.map(lambda x: (today - x).days).head()

0    14093
1    15151
2    12606
3    19327
4    15347
Name: dob, dtype: int64

##### Map with lambda and a function
Return age in number of days for each employee.

In [14]:
def return_days(x):
    """Return the number of whole days between ``x`` and the current moment.

    ``x`` must be a value that can be subtracted from ``datetime.today()``.
    The result is the ``days`` attribute of that difference.
    """
    elapsed = dt.datetime.today() - x
    return elapsed.days

# Same calculation as before, but the lambda delegates to the named helper
df['dob'].map(lambda birth_date: return_days(birth_date)).head()

0    14093
1    15151
2    12606
3    19327
4    15347
Name: dob, dtype: int64

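##### Dealing with NaNs
Passing `na_action='ignore'` makes `map()` propagate missing values unchanged instead of passing them to the mapping function.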

In [19]:
# Work on an explicit copy: assigning into a slice of df is a chained assignment
# that may or may not write through to df depending on the pandas version, so the
# demonstration must not rely on df itself being modified.
ser = df.dob.head().copy()
ser.loc[2] = pd.NaT  # inject a missing date of birth

# na_action='ignore' leaves the missing entry untouched instead of calling the lambda on it
ser.map(lambda x: (dt.datetime.today() - x).days, na_action='ignore')

0    14093
1    15151
2      NaT
3    19327
4    15347
Name: dob, dtype: object

### Apply
- `apply()` calls a function on each value of a Series, or on each column (`axis=0`) or each row (`axis=1`) of a DataFrame

In [29]:
# Show the first five rows again as a reference for the apply() examples
df.head()

Unnamed: 0,employee_number,name,company,country,dob,age,department,salary,has_parking_space
0,897028,Kenneth Jensen,Wilson and Sons,India,1983-07-03,38,Management,124790,False
1,463979,Sarah Anderson,"Hernandez, Cunningham and Clark",India,1980-08-09,41,Consulting,103122,True
2,388446,Tracie Rollins,"Hernandez, Cunningham and Clark",Cayman Islands,1987-07-29,34,Consulting,119072,False
3,267447,Seth Smith,Spears-Brown,Germany,1969-03-04,52,System Architect,115653,False
4,401300,Katherine Fields,"Hernandez, Cunningham and Clark",Venezuela,1980-01-26,42,Finance,119412,False


##### Create a column containing the length of each employee name

In [31]:
# Verbose form: wrap len() in a lambda
df['name_length'] = df['name'].apply(lambda full_name: len(full_name))

# Preferred form: hand the built-in straight to apply()
df['name_length'] = df['name'].apply(len)

df[['name', 'name_length']].head()

Unnamed: 0,name,name_length
0,Kenneth Jensen,14
1,Sarah Anderson,14
2,Tracie Rollins,14
3,Seth Smith,10
4,Katherine Fields,16


##### Extract the last name of each employee and store the values in new column

In [34]:
def get_element(lst, position):
    """Return the item of ``lst`` found at index ``position``.

    Any object supporting ``lst[position]`` is accepted; indexing errors
    raised by that lookup are not caught here.
    """
    element = lst[position]
    return element

# Split every name on single spaces, then pick one token per row.
# Keyword arguments given to apply() are forwarded to get_element.
# NOTE(review): index 1 is the second token, which is the last name only for
# two-word names — confirm that every name in the data has exactly two parts.
name_tokens = df['name'].str.split(' ')
df['last_name'] = name_tokens.apply(get_element, position=1)
df.head()
    

Unnamed: 0,employee_number,name,company,country,dob,age,department,salary,has_parking_space,name_length,last_name
0,897028,Kenneth Jensen,Wilson and Sons,India,1983-07-03,38,Management,124790,False,14,Jensen
1,463979,Sarah Anderson,"Hernandez, Cunningham and Clark",India,1980-08-09,41,Consulting,103122,True,14,Anderson
2,388446,Tracie Rollins,"Hernandez, Cunningham and Clark",Cayman Islands,1987-07-29,34,Consulting,119072,False,14,Rollins
3,267447,Seth Smith,Spears-Brown,Germany,1969-03-04,52,System Architect,115653,False,10,Smith
4,401300,Katherine Fields,"Hernandez, Cunningham and Clark",Venezuela,1980-01-26,42,Finance,119412,False,16,Fields


Doing the same thing with a lambda function

In [35]:
# Inline variant: the lambda picks the second token of each split name
name_parts = df['name'].str.split(' ')
name_parts.apply(lambda parts: parts[1]).head()

0      Jensen
1    Anderson
2     Rollins
3       Smith
4      Fields
Name: name, dtype: object

### Apply as a dataframe method

In [89]:
# Build a 4x4 table of random floats in [0, 1) with labelled rows and columns
tbl = pd.DataFrame(
    np.random.rand(4, 4),
    index=list('abcd'),
    columns=[f'Col {i}' for i in range(1, 5)],
)

# Scale each column to the 0-99 range with apply(), then truncate to integers
scaled = tbl.apply(lambda column: column * 100)
tbl = scaled.astype('int')
tbl

Unnamed: 0,Col 1,Col 2,Col 3,Col 4
a,65,21,74,49
b,36,42,77,5
c,86,67,6,47
d,84,91,58,32


In [99]:
# Column totals: with axis='index' (same as axis=0) sum() receives one column at a time
tbl.apply(sum, axis='index')

Col 1    271
Col 2    221
Col 3    215
Col 4    133
dtype: int64

In [90]:
# Position of the largest value within each column (axis='index' is the same as axis=0)
tbl.apply(np.argmax, axis='index')

Col 1    2
Col 2    3
Col 3    1
Col 4    0
dtype: int64

In [91]:
# Position of the largest value within each row (axis='columns' is the same as axis=1)
tbl.apply(np.argmax, axis='columns')

a    2
b    2
c    0
d    1
dtype: int64

### Applymap
- A DataFrame method that applies a function to every single element of the DataFrame

In [92]:
# Add 100 to every individual cell of the table.
# Note: DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map.
tbl.applymap(lambda cell: cell + 100)

Unnamed: 0,Col 1,Col 2,Col 3,Col 4
a,165,121,174,149
b,136,142,177,105
c,186,167,106,147
d,184,191,158,132


In [94]:
# Convert every individual cell to a Python float
tbl.applymap(func=float)

Unnamed: 0,Col 1,Col 2,Col 3,Col 4
a,65.0,21.0,74.0,49.0
b,36.0,42.0,77.0,5.0
c,86.0,67.0,6.0,47.0
d,84.0,91.0,58.0,32.0


In [98]:
# For comparison with applymap(): apply() works on whole columns (axis 0 is the default)
tbl.apply(sum)

Col 1    271
Col 2    221
Col 3    215
Col 4    133
dtype: int64