# xyz UAR Step 3: Find the changes in AD Master Data 

###### Author : Sateesh Babu
###### Version: V3
###### Created: 2021-12-28
###### Updated: 2021-12-30

# Input

In [156]:
# Snapshot dates (YYYYMMDD) of the two extracts being compared: old first,
# new second. They are substituted into the input and output file names.
old_dt, new_dt = '20211229', '20211230'

In [157]:
# Which files do you want to compare?
# Leave exactly one filename/keyfield pair uncommented.
# `filename` is appended directly to the input folder path, so it keeps its
# leading backslash; raw strings are used because "\o" is not a valid escape
# sequence in a normal string literal.

## AD Department file
#filename = r'\output_ad_unique_departments'
#keyfield = 'Unique_AD_Departments'

## AD Groups file
#filename = r'\output_ad_unique_groups'
#keyfield = 'Unique_AD_Groups'

## AD Users file
filename = r'\output_ad_unique_users'
keyfield = 'Unique_AD_SamAccountName'

In [158]:
# Folders: the input folder holds the files to compare, the output folder
# receives the differences workbook. Both live under the same results root.
_results_root = r'C:\Users\SATEESHB\OneDrive - Metrolinx\documents\zMyWork\05_PowerBI_PythonCode\00_Py_UserAccess\Results_Data'
in_filepath = _results_root + r'\Step2_Results_AD_Master'
out_filepath = _results_root + r'\Step3_Results_Differences'

# Code

## Packages

In [159]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [160]:
# Define the diff function to show the changes in each field
def report_diff(x):
    """Summarise one old/new value pair as a single cell value.

    Parameters
    ----------
    x : sequence or pandas.Series
        Two values in positional order: the old value first, the new value
        second.

    Returns
    -------
    object
        The old value itself when the two values are equal or both missing;
        otherwise the string ``'<old> ---> <new>'``.
    """
    # Unpack by position. ``x[0]`` on a label-indexed Series is a label
    # lookup that only works through a deprecated positional fallback.
    old_value, new_value = tuple(x)[:2]

    old_missing, new_missing = pd.isna(old_value), pd.isna(new_value)
    if old_missing or new_missing:
        # NaN never compares equal to itself, so two missing cells must be
        # recognised explicitly or they would be reported as a change.
        unchanged = bool(old_missing and new_missing)
    else:
        unchanged = old_value == new_value

    return old_value if unchanged else '{} ---> {}'.format(old_value, new_value)

In [161]:
def df_stats(df, columns):
    """Build a per-column profile of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile.
    columns : iterable
        Column labels of ``df`` to include, in the order they should appear.

    Returns
    -------
    pandas.DataFrame
        One row per requested column with the fields ``attribute``,
        ``record_count`` (non-null values), ``unique_values``,
        ``missing_records``, ``percent_missing_records`` and ``data_type``.
        An empty ``columns`` gives an empty frame with those same fields.
    """
    stats_cols = ['attribute', 'record_count', 'unique_values','missing_records', 'percent_missing_records', 'data_type']
    n_rows = df.shape[0]

    stats = []
    for col in columns:
        series = df[col]
        n_missing = series.isnull().sum()
        stats.append((col, series.count(), series.nunique(), n_missing,
                      n_missing * 100 / n_rows, series.dtype))

    # Build the result once, after the loop. This also covers the case where
    # `columns` is empty and the loop body never runs.
    return pd.DataFrame(stats, columns=stats_cols)

## Get Data

In [162]:
# Load the two snapshots of the selected file ('NA' cells are read as
# missing values) and tag every row with the snapshot it came from.
old_path = f'{in_filepath}{filename}_{old_dt}.xlsx'
new_path = f'{in_filepath}{filename}_{new_dt}.xlsx'
old = pd.read_excel(old_path, na_values=['NA']).assign(version="old")
new = pd.read_excel(new_path, na_values=['NA']).assign(version="new")

## Prepare Data

### Drop Unwanted Columns

In [163]:
# Drop unwanted columns.
# errors='ignore' keeps this cell working when a file does not contain the
# column at all (presumably 'Unnamed: 0' is a saved index column — confirm).
drop_cols = ['Unnamed: 0']
old = old.drop(columns=drop_cols, errors='ignore')
new = new.drop(columns=drop_cols, errors='ignore')

### Check for New/Dropped Columns

In [164]:
# Compare the column sets of the two snapshots in both directions, so that
# columns that appeared and columns that disappeared are both reported.
old_cols = list(old.columns.values)
new_cols = list(new.columns.values)
print(f'New columns     : {set(new_cols) - set(old_cols)}')
print(f'Dropped columns : {set(old_cols) - set(new_cols)}')

set()


### Check for Unique values 

In [165]:
# Profile every column of each snapshot.
old_stats = df_stats(df=old, columns=old_cols)
new_stats = df_stats(df=new, columns=new_cols)

In [166]:
 # Old snapshot: the five attributes with the most distinct values.
 old_stats.sort_values(by='unique_values', ascending=False).iloc[:5]

Unnamed: 0,attribute,record_count,unique_values,missing_records,percent_missing_records,data_type
0,Unique_AD_SamAccountName,25,25,0,0.0,object
1,version,25,1,0,0.0,object


In [167]:
 # New snapshot: the five attributes with the most distinct values.
 new_stats.sort_values(by='unique_values', ascending=False).iloc[:5]

Unnamed: 0,attribute,record_count,unique_values,missing_records,percent_missing_records,data_type
0,Unique_AD_SamAccountName,49,49,0,0.0,object
1,version,49,1,0,0.0,object


### Drop Duplicate Records

In [168]:
# De-duplicate the old snapshot on the key field; when a key repeats,
# only its last occurrence is kept.
old = old.drop_duplicates(subset=keyfield, keep='last')

In [169]:
# De-duplicate the new snapshot on the key field; when a key repeats,
# only its last occurrence is kept.
new = new.drop_duplicates(subset=keyfield, keep='last')

### Dropped & New Records

In [170]:
# The key field identifies a record. Collecting each snapshot's keys into a
# set lets plain set differences tell us which keys disappeared and which
# keys are new.
old_values_all = set(old[keyfield])
new_values_all = set(new[keyfield])

dropped_values = old_values_all.difference(new_values_all)
added_values = new_values_all.difference(old_values_all)
print(f'Dropped Values :{dropped_values}')
print(f'New Values :{added_values}')

Dropped Values :set()
New Values :{'SUGAREMP39x', 'ABBYEMP36x', 'ARJUNCONTR1X', 'COFFEEEMP40x', 'MONEYCONTR1X', 'ARUNCONTR1X', 'ALLYCONTR1X', 'JANCONTR1X', 'ZHANEMP31x', 'TEAEMP41x', 'VINNYEMP28x', 'MARIEEMP34x', 'RICECONTR1X', 'FAYEMP27x', 'CASSYEMP38x', 'XONGEMP32x', 'ANTEMP35x', 'DANCONTR1X', 'JENNYEMP37x', 'TIFFYEMP29x', 'MACCONTR1X', 'SHARKEMP42x', 'LARRYEMP33x', 'GEOEMP30x'}


## Join

In [171]:
# Stack the old rows on top of the new rows. The original row labels are
# discarded and replaced by one fresh running index.
all_data = pd.concat([old, new], axis=0, ignore_index=True)

In [172]:
# Preview the first three combined rows.
all_data.iloc[:3]

Unnamed: 0,Unique_AD_SamAccountName,version
0,JOHNEMP1x,old
1,KINEMP2x,old
2,JUNEMP3x,old


## Changes

In [173]:
# Columns used to decide whether two rows count as the same record state.
# NOTE(review): with only the key field listed here, the keep='last'
# de-duplication that uses this list leaves a single row per key, so no key
# can ever show up as "changed". Add further attribute columns if field-level
# changes should be detected — confirm intent.
imp_cols = [keyfield]

In [174]:
# Rows that are identical on the important columns are collapsed to their
# last occurrence, so what remains are the rows that differ between the
# old and the new snapshot on those columns.
is_earlier_copy = all_data.duplicated(subset=imp_cols, keep='last')
changes = all_data[~is_earlier_copy]

In [175]:
# Keys that still occur more than once in `changes`, followed by every row
# that carries one of those keys.
is_repeat = changes[keyfield].duplicated()
dupe_accts = changes.loc[is_repeat, keyfield].tolist()
dupes = changes.loc[changes[keyfield].isin(dupe_accts)]
dupes.iloc[:3]

Unnamed: 0,Unique_AD_SamAccountName,version


In [176]:
# Split the repeated-key rows by the snapshot they came from.
change_new = dupes[dupes["version"].eq("new")]
change_old = dupes[dupes["version"].eq("old")]

In [177]:
# The snapshot tag has done its job; remove it from both halves.
change_new = change_new.drop(columns=['version'])
change_old = change_old.drop(columns=['version'])

In [178]:
# Copy the key field into an 'ID' column that will serve as the row index.
change_new = change_new.assign(ID=change_new[keyfield])
change_old = change_old.assign(ID=change_old[keyfield])

In [179]:
# Use 'ID' as the row index of both halves so they can be lined up by key.
change_new = change_new.set_index('ID')
change_old = change_old.set_index('ID')

In [180]:
# Place the old and the new rows side by side, aligned on the index. The
# 'old'/'new' keys become the outer level of the resulting column labels.
side_by_side = [change_old, change_new]
df_all_changes = pd.concat(side_by_side, keys=['old', 'new'], axis=1, join='outer')

In [181]:
# Swap the two column levels so the field name is the outer level, then
# order the fields as they appear in change_new.
fields_outer = df_all_changes.swaplevel(0, 1, axis=1)
df_all_changes = fields_outer[change_new.columns]

In [182]:
# Group the columns by their outer label (level=0 of the column MultiIndex)
# and, within each group, apply report_diff to every row.
# NOTE(review): groupby(..., axis=1) is deprecated in recent pandas releases —
# confirm against the installed version before upgrading.
df_changed = df_all_changes.groupby(level=0, axis=1).apply(lambda frame: frame.apply(report_diff, axis=1))
# Move the row index back into an ordinary column.
df_changed = df_changed.reset_index()
# Preview the first three rows of the change report.
df_changed.head(3)

Unnamed: 0_level_0,ID,Unique_AD_SamAccountName,Unique_AD_SamAccountName
Unnamed: 0_level_1,Unnamed: 1_level_1,old,new


In [183]:
# Show the change report for the key field only.
df_changed[keyfield]

Unnamed: 0,old,new


## Removed

In [184]:
# Rows whose key was found only in the old snapshot.
was_dropped = changes[keyfield].isin(dropped_values)
df_removed = changes.loc[was_dropped]
df_removed

Unnamed: 0,Unique_AD_SamAccountName,version


## Added

In [185]:
# Rows whose key was found only in the new snapshot.
was_added = changes[keyfield].isin(added_values)
df_added = changes.loc[was_added]
df_added.iloc[:2]

Unnamed: 0,Unique_AD_SamAccountName,version
50,XONGEMP32x,new
51,FAYEMP27x,new


# Output

In [186]:
# Save the results to Excel; the removed/added sheets only include the
# columns we care about.
output_columns = [keyfield]

# The "ouput_" spelling is kept on purpose so the generated file name stays
# exactly what it was.
out_file = out_filepath+"\\"+f"ouput_{keyfield}_Values_Differences_{old_dt}-to-{new_dt}.xlsx"

# The context manager saves and closes the workbook even if writing a sheet
# fails; ExcelWriter.save() no longer exists in pandas 2.x. sheet_name is
# passed by keyword because positional use is deprecated.
with pd.ExcelWriter(out_file) as writer:
    df_changed.to_excel(writer, sheet_name="Changed_Values")
    df_removed.to_excel(writer, sheet_name="Removed_Values", index=False, columns=output_columns)
    df_added.to_excel(writer, sheet_name="Added_Values", index=False, columns=output_columns)