# XYZ's IT User Access Review Step 1: Consolidate the HR Data

###### Author : Sateesh Babu
###### Version: V3
###### Created: 2021-12-25
###### Updated: 2021-12-30

# Input

In [16]:
# Date when you want to generate the reports for give XYZ IT Active Directory snapshot
run_date = '20211230'

In [17]:
# Source data file path
# 1.HR active list; 2.HR depart list

source_filepath = r"C:\Users\SATEESHB\OneDrive - Metrolinx\documents\zMyWork\05_PowerBI_PythonCode\00_Py_UserAccess\Source_Data\HR"
source_hr_staff  = source_filepath + f"\HR_Status_Active_{run_date}.xlsx"
source_hr_depart  = source_filepath + f"\HR_Status_Departure_{run_date}.xlsx"

In [18]:
# results/outcome of this program

results_hr_path  = r"C:\Users\SATEESHB\OneDrive - Metrolinx\documents\zMyWork\05_PowerBI_PythonCode\00_Py_UserAccess\Results_Data\Step1_Results_HR"

## Code

### Packages

In [19]:
import numpy as np
import pandas as pd
#import pandasql as ps

from datetime import datetime
from datetime import date
from datetime import time
from dateutil import relativedelta
from datetime import timedelta


import warnings
warnings.filterwarnings('ignore')

### Functions

In [20]:
# Stats
def df_stats(df,columns):
    '''
    Stats of each column:
    1.Count
    2.Unique Values count
    3.Missing Values count 
    4.% of Missing Values wrt total instances of the dataset
    5.% of Highest column value count
    6.Data Type
    
    '''
    
    stats = []
    for col in columns:
        stats.append((col, df[col].count(), df[col].nunique(), df[col].isnull().sum(), df[col].isnull().sum() * 100 / df.shape[0], df[col].value_counts(normalize=True, dropna=False).values[0] * 100, df[col].dtype))
    
    stats_cols = ['feature', 'record_count', 'unique_values','missing_records', 'percent_missing_records','percent_highest_columnvalue', 'data_type']
    df_stats = pd.DataFrame(stats, columns=stats_cols)
    df_stats['cat_or_num'] = df_stats['data_type'].apply(lambda x: 'categorical' if x == 'object' else 'numerical')
    return(df_stats) 

### Get Data

In [21]:
df_hr_staff = pd.read_excel(source_hr_staff,skiprows=4)
df_hr_depart = pd.read_excel(source_hr_depart,skiprows=4)

In [22]:
print(df_hr_staff.shape)
print(df_hr_depart.shape)

(26, 13)
(21, 12)


In [23]:
df_hr_staff.head(3)

Unnamed: 0,Employee #,First Name,Last Name,Status,Position Name,Employee Assignment Type,Division,Department Name,Start Date,Person User Name,Email Address,Manager E-Mail Address,Record Type
0,1001,John,Emp1,Active,Project Manager,Primary/Home Assignment,Consulting,Salesforce,2011-08-08,JOHNEMP1,John.Emp1@xyz.com,Sat.Emp5@xyz.com,XYZ Employee
1,1002,Kin,Emp2,Active,Developer,Primary/Home Assignment,Consulting,Salesforce,2001-12-15,KINEMP2,Kin.Emp2@xyz.com,Sat.Emp5@xyz.com,XYZ Employee
2,1003,Jun,Emp3,Active,Data Architect,Primary/Home Assignment,Data & Analytics,AI/ML,2010-12-18,JUNEMP3,Jun.Emp3@xyz.com,Sat.Emp5@xyz.com,XYZ Employee


In [24]:
col_staff = df_hr_staff.columns
col_depart = df_hr_depart.columns
df_stats(df_hr_staff,col_staff)

Unnamed: 0,feature,record_count,unique_values,missing_records,percent_missing_records,percent_highest_columnvalue,data_type,cat_or_num
0,Employee #,26,26,0,0.0,3.846154,int64,numerical
1,First Name,26,26,0,0.0,3.846154,object,categorical
2,Last Name,26,26,0,0.0,3.846154,object,categorical
3,Status,26,2,0,0.0,76.923077,object,categorical
4,Position Name,26,26,0,0.0,3.846154,object,categorical
5,Employee Assignment Type,26,1,0,0.0,100.0,object,categorical
6,Division,26,5,0,0.0,57.692308,object,categorical
7,Department Name,26,6,0,0.0,30.769231,object,categorical
8,Start Date,26,26,0,0.0,3.846154,datetime64[ns],numerical
9,Person User Name,26,26,0,0.0,3.846154,object,categorical


In [25]:
df_stats(df_hr_depart,col_depart)

Unnamed: 0,feature,record_count,unique_values,missing_records,percent_missing_records,percent_highest_columnvalue,data_type,cat_or_num
0,Employee #,21,21,0,0.0,4.761905,int64,numerical
1,First Name,21,21,0,0.0,4.761905,object,categorical
2,Last Name,21,21,0,0.0,4.761905,object,categorical
3,Position,21,18,0,0.0,9.52381,object,categorical
4,Division,21,3,0,0.0,85.714286,object,categorical
5,Department Name,21,6,0,0.0,52.380952,object,categorical
6,Start Date,21,13,0,0.0,42.857143,datetime64[ns],numerical
7,Termination Date,21,11,0,0.0,47.619048,datetime64[ns],numerical
8,Person User Name,21,21,0,0.0,4.761905,object,categorical
9,Work Email,21,21,0,0.0,4.761905,object,categorical


## Data Prep

### Normalize Data Structures

In [26]:
## Rename the columns 

col1_staff = {
 'Employee #': 'Employee_No',
 'First Name': 'First_Name',
 'Last Name': 'Last_Name',
 'Position Name': 'Position',
 'Department Name': 'Department_Name',
 'Start Date': 'Start_Date',
 'Person User Name': 'Person_User_Name',
 'Email Address': 'Work_Email',
 'Manager E-Mail Address': 'Manage_Email',
 'Record Type': 'Record_Type'}

col2_staff = {
 'Employee #': 'Employee_No',
 'First Name': 'First_Name',
 'Last Name': 'Last_Name',
 'Department Name': 'Department_Name',
 'Start Date': 'Start_Date',
 'Termination Date':'Termination_Date',
 'Person User Name': 'Person_User_Name',
 'Work Email': 'Work_Email',
 'Manager E-Mail Address': 'Manage_Email',
 'Record Type': 'Record_Type'}

df_hr_staff=df_hr_staff.rename(columns=col1_staff)
df_hr_depart=df_hr_depart.rename(columns=col2_staff)

In [27]:
# find the difference in the data structures
col_staff = df_hr_staff.columns
col_depart = df_hr_depart.columns
diff1 = list(set(col_staff) - set(col_depart))
diff2= list(set(col_depart) - set(col_staff))
print(f"missing columns in Depart table:{diff1}")
print(f"missing columns in Stafflist table:{diff2}")

missing columns in Depart table:['Employee Assignment Type', 'Status']
missing columns in Stafflist table:['Termination_Date']


In [28]:
# Normalize the data strcutures 
df1 = pd.DataFrame(columns=diff1)
df2 = pd.DataFrame(columns=diff2)
df_hr_depart = pd.concat([df_hr_depart,df1],ignore_index=True,sort=False)
df_hr_depart["Status"]="Departed"
df_hr_staff =  pd.concat([df_hr_staff,df2],ignore_index=True,sort=False)
print(df_hr_depart.columns)
print(df_hr_staff.columns)
print(f'difference in columns   :{list(set(df_hr_staff.columns)-set(df_hr_depart.columns))}')
print(f'difference in columns   :{list(set(df_hr_depart.columns)-set(df_hr_staff.columns))}')

Index(['Employee_No', 'First_Name', 'Last_Name', 'Position', 'Division',
       'Department_Name', 'Start_Date', 'Termination_Date', 'Person_User_Name',
       'Work_Email', 'Record_Type', 'Manage_Email', 'Employee Assignment Type',
       'Status'],
      dtype='object')
Index(['Employee_No', 'First_Name', 'Last_Name', 'Status', 'Position',
       'Employee Assignment Type', 'Division', 'Department_Name', 'Start_Date',
       'Person_User_Name', 'Work_Email', 'Manage_Email', 'Record_Type',
       'Termination_Date'],
      dtype='object')
difference in columns   :[]
difference in columns   :[]


### Join the datasets

In [29]:
df_hr_staff['Employee_No'] = df_hr_staff['Employee_No'].astype(str)
df_hr_depart['Employee_No'] = df_hr_depart['Employee_No'].astype(str)
df_hr_staff['Employee_No'] = df_hr_staff['Employee_No'].str.strip()
df_hr_depart['Employee_No'] = df_hr_depart['Employee_No'].str.strip()

df_m = pd.merge(df_hr_staff,df_hr_depart,on='Employee_No', how='inner')

### Rehire flag

In [30]:
# Calculate Attribute : Rehire flag 

def rehireflag(t,h):
    t = t.date()
    h = h.date()
    if t>=h and h != "":
        r = "Rehired"
    else : 
        r = ""
    return r
       
df_m["Rehire_Flag"] = df_m[["Start_Date_x","Termination_Date_y"]].apply(lambda z :rehireflag(*z), axis=1)

In [31]:
lst = ["Employee_No","First_Name_x","Start_Date_x","Termination_Date_y","Rehire_Flag"]
df_m[lst].head(3)

Unnamed: 0,Employee_No,First_Name_x,Start_Date_x,Termination_Date_y,Rehire_Flag
0,1014.0,Sara,2015-01-04,2014-01-30,Rehired
1,1015.0,Pal,2019-10-13,2018-01-30,Rehired
2,1016.0,Kar,2008-04-20,2007-01-30,Rehired


In [32]:
df_m[lst].shape

(5, 5)

### Final Depart List

In [33]:
df_rehire = pd.DataFrame(df_m[["Employee_No","Rehire_Flag"]])
df_fdepart = pd.merge(df_hr_depart,df_rehire, on='Employee_No', how='left')

In [34]:
q = (df_fdepart['Rehire_Flag']!="Rehired")
#df_final_departlist= df_fdepart.drop(df_fdepart[['Employee Assignment Type','Rehire_Flag']],axis=1)
df_final_departlist= df_fdepart[q]
df_final_departlist.shape

(16, 15)

In [35]:
df_final_departlist.columns

Index(['Employee_No', 'First_Name', 'Last_Name', 'Position', 'Division',
       'Department_Name', 'Start_Date', 'Termination_Date', 'Person_User_Name',
       'Work_Email', 'Record_Type', 'Manage_Email', 'Employee Assignment Type',
       'Status', 'Rehire_Flag'],
      dtype='object')

### Final Staff list

In [36]:
df_fstafflist = pd.merge(df_hr_staff,df_rehire, on='Employee_No', how='left')
#q = (df_fstafflist['Rehire_Flag']=="Rehired")
df_fstafflist.shape

(26, 15)

In [37]:
df_fstafflist.columns

Index(['Employee_No', 'First_Name', 'Last_Name', 'Status', 'Position',
       'Employee Assignment Type', 'Division', 'Department_Name', 'Start_Date',
       'Person_User_Name', 'Work_Email', 'Manage_Email', 'Record_Type',
       'Termination_Date', 'Rehire_Flag'],
      dtype='object')

## Output

#### Final HR Refined dataset (includes active + depart)

In [38]:
df = pd.concat([df_fstafflist, df_final_departlist],sort=False)
df.shape

(42, 15)

In [39]:
with pd.ExcelWriter(results_hr_path+f"\df_final_hr_dataset_{run_date}.xlsx") as writer:
    df.to_excel(writer, sheet_name="df_final_hr_dataset")

## Analysis

In [40]:
# Classify by rehire flag
df[['Employee_No','Status','Rehire_Flag']].groupby('Status').count()

Unnamed: 0_level_0,Employee_No,Rehire_Flag
Status,Unnamed: 1_level_1,Unnamed: 2_level_1
Active,20,5
Departed,16,0
Leave,6,0
