In [6]:
import pandas as pd
import numpy as np

# Load the employee list and key it by employee number.
# `set_index` is chained rather than called with `inplace=True`, so `df` is
# bound exactly once and never refers to the un-indexed frame.
path = 'datasets/employee_list.parquet'
df = pd.read_parquet(path).set_index('employee_number')

In [7]:
# Preview the first five rows to check the columns and the new index.
df.head(n=5)

Unnamed: 0_level_0,name,company,country,dob,age,department,salary,has_parking_space
employee_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
897028,Kenneth Jensen,Wilson and Sons,India,1983-07-03,38,Management,124790,False
463979,Sarah Anderson,"Hernandez, Cunningham and Clark",India,1980-08-09,41,Consulting,103122,True
388446,Tracie Rollins,"Hernandez, Cunningham and Clark",Cayman Islands,1987-07-29,34,Consulting,119072,False
267447,Seth Smith,Spears-Brown,Germany,1969-03-04,52,System Architect,115653,False
401300,Katherine Fields,"Hernandez, Cunningham and Clark",Venezuela,1980-01-26,42,Finance,119412,False


#### Explore set operations on columns

In [8]:
# Two overlapping slices of the column Index to experiment with.
c1, c2 = df.columns[:4], df.columns[2:6]

print('c1: ', c1)
print('c2: ', c2)

# Index objects expose set algebra through named methods. Prefer these over
# the operator forms (e.g. `c1 | c2`), which recent pandas releases deprecate
# as set operations.
for op_name, op_result in (
    ('union', c1.union(c2)),
    ('intersection: ', c1.intersection(c2)),
    ('difference: ', c1.difference(c2)),
    ('symmetric difference: ', c1.symmetric_difference(c2)),
):
    print(op_name, op_result)


c1:  Index(['name', 'company', 'country', 'dob'], dtype='object')
c2:  Index(['country', 'dob', 'age', 'department'], dtype='object')
union Index(['age', 'company', 'country', 'department', 'dob', 'name'], dtype='object')
intersection:  Index(['country', 'dob'], dtype='object')
difference:  Index(['company', 'name'], dtype='object')
symmetric difference:  Index(['age', 'company', 'department', 'name'], dtype='object')


#### Cartesian product
When combining two Series or two DataFrames, it is important that:
1. both Series/DataFrames contain indices with only unique values

OR

2. the indices of both Series/DataFrames are identical (the same labels in the same order)
<br><br>


If neither condition holds, a Cartesian product is made, which means that
for every index label the two objects share, the new index contains all
combinations of the matching rows from both sides. A LOT of extra rows
can mistakenly be added this way.

When two Series are added together using the plus operator and one of the index labels does not appear in the other, the resulting value is always missing. pandas has the `.add` method, which provides a `fill_value` option to fill the missing value before the addition. Note that when the Series do not include duplicate index entries, there is no need to worry about a Cartesian product exploding the number of entries.

## Adding/combining series and DataFrames

#### Finding the difference between two indices

In [12]:
# Build a copy without the last five rows, then list the index labels that
# are present in `df` but missing from that truncated copy.
df_copy = df.iloc[:-5].copy()
df.index.difference(df_copy.index)

Int64Index([238955, 452246, 628844, 683924, 980867], dtype='int64', name='employee_number')

#### Employee dataset: find max salary for each department, and then add the corresponding value to each row

In [25]:
# Method 1
# Order rows so the highest salary comes first within each department, keep
# only that first row per department, and index the result by department.
max_department_salary = (
    df.loc[:, ['department', 'salary']]
    .sort_values(by=['department', 'salary'], ascending=[True, False])
    .drop_duplicates(subset='department', keep='first')
    .set_index('department')
)

# Re-key the employees by department so the per-department maximum lines up
# by index label when it is assigned as a new column.
(
    df
    .reset_index()
    .set_index('department')
    .assign(max_dept_salary=max_department_salary['salary'])
    .head()
)

Unnamed: 0_level_0,employee_number,name,company,country,dob,age,salary,has_parking_space,max_dept_salary
department,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Management,897028,Kenneth Jensen,Wilson and Sons,India,1983-07-03,38,124790,False,134840
Consulting,463979,Sarah Anderson,"Hernandez, Cunningham and Clark",India,1980-08-09,41,103122,True,133354
Consulting,388446,Tracie Rollins,"Hernandez, Cunningham and Clark",Cayman Islands,1987-07-29,34,119072,False,133354
System Architect,267447,Seth Smith,Spears-Brown,Germany,1969-03-04,52,115653,False,133943
Finance,401300,Katherine Fields,"Hernandez, Cunningham and Clark",Venezuela,1980-01-26,42,119412,False,130051


In [24]:
# Method 2
# transform('max') returns one value per original row (the maximum of that
# row's department), so it can be assigned as a column without re-indexing.
max_department_salary = df.groupby('department')['salary'].transform('max')
df.assign(max_dept_salary=max_department_salary).head()

Unnamed: 0_level_0,name,company,country,dob,age,department,salary,has_parking_space,max_dept_salary
employee_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
897028,Kenneth Jensen,Wilson and Sons,India,1983-07-03,38,Management,124790,False,134840
463979,Sarah Anderson,"Hernandez, Cunningham and Clark",India,1980-08-09,41,Consulting,103122,True,133354
388446,Tracie Rollins,"Hernandez, Cunningham and Clark",Cayman Islands,1987-07-29,34,Consulting,119072,False,133354
267447,Seth Smith,Spears-Brown,Germany,1969-03-04,52,System Architect,115653,False,133943
401300,Katherine Fields,"Hernandez, Cunningham and Clark",Venezuela,1980-01-26,42,Finance,119412,False,130051


In [31]:
# Method 3
# Aggregate to one maximum per department (indexed by department) ...
max_department_salary = df.groupby('department')['salary'].max()

# ... then left-merge it back: the `department` column on the left is matched
# against the index of the renamed Series on the right.
df.merge(
    max_department_salary.rename('max_department_salary'),
    left_on='department',
    right_index=True,
    how='left',
).head()

Unnamed: 0_level_0,name,company,country,dob,age,department,salary,has_parking_space,max_department_salary
employee_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
897028,Kenneth Jensen,Wilson and Sons,India,1983-07-03,38,Management,124790,False,134840
463979,Sarah Anderson,"Hernandez, Cunningham and Clark",India,1980-08-09,41,Consulting,103122,True,133354
388446,Tracie Rollins,"Hernandez, Cunningham and Clark",Cayman Islands,1987-07-29,34,Consulting,119072,False,133354
267447,Seth Smith,Spears-Brown,Germany,1969-03-04,52,System Architect,115653,False,133943
401300,Katherine Fields,"Hernandez, Cunningham and Clark",Venezuela,1980-01-26,42,Finance,119412,False,130051
