In [33]:
import pandas as pd
import numpy as np
# Render floats as whole numbers with thousands separators (e.g. 108,412),
# which keeps the salary pivot tables below easy to read.
pd.set_option('display.float_format', '{:,.0f}'.format)

In [5]:
# Location of the employee dataset, relative to the notebook's working directory.
path = 'datasets/employee_list.parquet'

# Read the whole parquet file into a DataFrame and preview the first five rows.
df = pd.read_parquet(path=path)
df.head(5)

Unnamed: 0,employee_number,name,company,country,dob,age,department,salary,has_parking_space
0,897028,Kenneth Jensen,Wilson and Sons,India,1983-07-03,38,Management,124790,False
1,463979,Sarah Anderson,"Hernandez, Cunningham and Clark",India,1980-08-09,41,Consulting,103122,True
2,388446,Tracie Rollins,"Hernandez, Cunningham and Clark",Cayman Islands,1987-07-29,34,Consulting,119072,False
3,267447,Seth Smith,Spears-Brown,Germany,1969-03-04,52,System Architect,115653,False
4,401300,Katherine Fields,"Hernandez, Cunningham and Clark",Venezuela,1980-01-26,42,Finance,119412,False


## Pivot table

**Different aggfunc methods:**
- `count` - The number of group values
- `sum` / `np.sum` - The sum of group values
- `mean` / `np.mean` - The average of the group values
- `median` -  The middle of the group values
- `np.std`
- `max`
- `min`
- `sem` -  Standard error of the mean
- `quantile`
- `nunique`
- `mad` - Mean absolute deviation (deprecated in pandas 1.5 and removed in pandas 2.0)
- `size`
- `mode` / `pd.Series.mode` - The most common number in a group
- `var`
- `unique` - Unique group values

##### Mean salary within each department of each company

In [10]:
# One row per company, one column per department; each cell holds the
# average salary of that (company, department) group.
df.pivot_table(
    values='salary',
    index='company',
    columns='department',
    aggfunc='mean',
)

department,Consulting,Developer,Finance,Management,System Architect
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"Hernandez, Cunningham and Clark",108412,101506,104112,98756,108414
Spears-Brown,108118,102611,107777,120032,103507
Wilson and Sons,124710,97369,89296,115703,110295


##### Number of employees within each department of each company

In [11]:
# Count the non-null `name` entries in every (company, department) group,
# i.e. the number of employees in each department of each company.
df.pivot_table(
    values='name',
    index='company',
    columns='department',
    aggfunc='count',
)

department,Consulting,Developer,Finance,Management,System Architect
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"Hernandez, Cunningham and Clark",7,4,9,7,9
Spears-Brown,3,7,7,8,8
Wilson and Sons,6,7,3,7,8


##### Oldest age within each department of each company

In [16]:
# Highest `age` value found in every (company, department) group.
df.pivot_table(
    values='age',
    index='company',
    columns='department',
    aggfunc='max',
)

department,Consulting,Developer,Finance,Management,System Architect
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"Hernandez, Cunningham and Clark",50,48,53,49,53
Spears-Brown,53,50,53,51,53
Wilson and Sons,49,52,45,51,53


##### Group size (number of rows) within each department of each company

In [18]:
# NOTE: `size` returns the number of rows in each (company, department)
# group; it does not measure the length of the `name` strings.
df.pivot_table(
    values='name',
    index='company',
    columns='department',
    aggfunc='size',
)

department,Consulting,Developer,Finance,Management,System Architect
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"Hernandez, Cunningham and Clark",7,4,9,7,9
Spears-Brown,3,7,7,8,8
Wilson and Sons,6,7,3,7,8


#### Pivot table containing oldest employee and his/her age for each company and department

In [23]:
# Sort oldest-first (earliest date of birth at the top) so that the first row
# kept for every (company, department) pair is that group's oldest employee.
# `ascending` is an argument of `sort_values`; `drop_duplicates` does not
# accept it and raises a TypeError, so the keep-first rule is spelled out with
# `keep='first'` instead.
(
df
.sort_values('dob', ascending=True)
.drop_duplicates(
    subset=['company', 'department'],
    keep='first')
.pivot_table(
    index='company',
    columns='department',
    values=['name', 'age'],
    # Exactly one row per group is left at this point, so `first` simply
    # carries that row's name and age into the pivot table.
    aggfunc='first')
)

Unnamed: 0_level_0,age,age,age,age,age,name,name,name,name,name
department,Consulting,Developer,Finance,Management,System Architect,Consulting,Developer,Finance,Management,System Architect
company,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
"Hernandez, Cunningham and Clark",50,48,53,49,53,Kimberly Williams,Logan Garcia,Andrew Rivera,Zachary Gallagher,Susan Horn
Spears-Brown,53,50,53,51,53,Bruce Crawford,Justin Anderson,Linda Page,Sophia Wallace,Benjamin Snyder
Wilson and Sons,49,52,45,51,53,Robert Carey,Jonathan Brown,Andrew Strickland,Larry Robinson,Julie Craig


## Melt
- Turns the dataframe from wide (many columns) to tall (many rows)

In [36]:
# Mean salary within each department of each company.
# `reset_index` turns `company` from the index into an ordinary column.
piv = (
    df
    .pivot_table(
        values='salary',
        index='company',
        columns='department',
        aggfunc='mean')
    .reset_index()
)
piv

department,company,Consulting,Developer,Finance,Management,System Architect
0,"Hernandez, Cunningham and Clark",108412,101506,104112,98756,108414
1,Spears-Brown,108118,102611,107777,120032,103507
2,Wilson and Sons,124710,97369,89296,115703,110295


In [46]:
# Unpivot every department column into (department, value) rows while
# keeping `company` as the identifier, then order the rows by company.
(
    piv
    .melt(id_vars='company')
    .sort_values('company')
)

Unnamed: 0,company,department,value
0,"Hernandez, Cunningham and Clark",Consulting,108412
3,"Hernandez, Cunningham and Clark",Developer,101506
6,"Hernandez, Cunningham and Clark",Finance,104112
9,"Hernandez, Cunningham and Clark",Management,98756
12,"Hernandez, Cunningham and Clark",System Architect,108414
1,Spears-Brown,Consulting,108118
4,Spears-Brown,Developer,102611
7,Spears-Brown,Finance,107777
10,Spears-Brown,Management,120032
13,Spears-Brown,System Architect,103507


In [49]:
(
    piv
    .melt(
        id_vars='company',
        # Only include data for developers and system architects.
        value_vars=['Developer', 'System Architect'],
        # Name the melted value column explicitly instead of the default `value`.
        value_name='mean_salary',
    )
    .sort_values('company')
)

Unnamed: 0,company,department,mean_salary
0,"Hernandez, Cunningham and Clark",Developer,101506
3,"Hernandez, Cunningham and Clark",System Architect,108414
1,Spears-Brown,Developer,102611
4,Spears-Brown,System Architect,103507
2,Wilson and Sons,Developer,97369
5,Wilson and Sons,System Architect,110295
