## XYZ's IT User Access Review Step 2: Prepare AD Data 

###### Author : Sateesh Babu
###### Version: V3
###### Created: 2021-12-25
###### Updated: 2021-12-30

# Input

In [47]:
# Date when you want to generate the reports for give AD snapshot
run_date = '20211230'

In [48]:
# Source data file path

source_filepath = r"C:\Users\SATEESHB\OneDrive - Metrolinx\documents\zMyWork\05_PowerBI_PythonCode\00_Py_UserAccess\Source_Data\IT_AD"
source_ad_file  = source_filepath + f"\IT_AD_XYZ_Users_{run_date}.xlsx"

In [49]:
# results/outcome of this program

results_AD_path  = r"C:\Users\SATEESHB\OneDrive - Metrolinx\documents\zMyWork\05_PowerBI_PythonCode\00_Py_UserAccess\Results_Data\Step2_Results_AD_Master"

# Code

### Packages

In [50]:
import numpy as np
import pandas as pd
#import pandasql as ps

from datetime import datetime
from datetime import date
from datetime import time
from dateutil import relativedelta
from datetime import timedelta
import time

import warnings
warnings.filterwarnings('ignore')

### Functions

In [51]:
# Stats
def df_stats(df,columns):
    '''
    Stats of each column:
    1.Count
    2.Unique Values count
    3.Missing Values count 
    4.% of Missing Values wrt total instances of the dataset
    5.% of Highest column value count
    6.Data Type
    
    '''
    
    stats = []
    for col in columns:
        stats.append((col, df[col].count(), df[col].nunique(), df[col].isnull().sum(), df[col].isnull().sum() * 100 / df.shape[0], df[col].value_counts(normalize=True, dropna=False).values[0] * 100, df[col].dtype))
    
    stats_cols = ['feature', 'record_count', 'unique_values','missing_records', 'percent_missing_records','percent_highest_columnvalue', 'data_type']
    df_stats = pd.DataFrame(stats, columns=stats_cols)
    df_stats['cat_or_num'] = df_stats['data_type'].apply(lambda x: 'categorical' if (x == 'object'or x == 'bool') else ('date-Categorical'if x == "datetime64[ns]" else'numerical'))
    return(df_stats)

### Get Data

In [52]:
df_ad_dataset = pd.read_excel(source_ad_file)

In [53]:
print(f'Shape of AD data: {df_ad_dataset.shape}')

Shape of AD data: (50, 19)


In [54]:
cols = df_ad_dataset.columns
df_stats(df_ad_dataset,cols)

Unnamed: 0,feature,record_count,unique_values,missing_records,percent_missing_records,percent_highest_columnvalue,data_type,cat_or_num
0,Firstname,50,49,0,0.0,4.0,object,categorical
1,Lastname,50,43,0,0.0,16.0,object,categorical
2,OUpath,50,1,0,0.0,100.0,object,categorical
3,DisplayName,50,50,0,0.0,2.0,object,categorical
4,SamAccountName,50,49,0,0.0,4.0,object,categorical
5,Department,45,7,5,10.0,40.0,object,categorical
6,EmailAddress,42,42,8,16.0,16.0,object,categorical
7,Company,0,0,50,100.0,100.0,float64,numerical
8,Description,50,50,0,0.0,2.0,object,categorical
9,Enabled,50,2,0,0.0,78.0,bool,categorical


### Prepare Data

In [55]:
# get data without user groups
cols1 = list(set(cols) - set(['Group']))
df_ad1 = df_ad_dataset[cols1]

df_ad1.head(3)

Unnamed: 0,WhenChanged,OUpath,DisplayName,AdminCount,Department,AccountExpiry,SamAccountName,Firstname,Passwordset,WhenCreated,Lastname,Description,CreateTimeStamp,UserAccountControl,LastLogonDate,Company,EmailAddress,Enabled
0,2021-12-01 01:00:17,Test path,John Emp1,,XYZ,,JOHNEMP1x,John,2021-09-17 01:02:31,2021-05-18 03:20:58,Emp1,Request#123,2021-05-18 03:20:58,512,2021-06-16 01:00:09,,John.Emp12@xyz.com,True
1,2021-12-01 01:00:17,Test path,Kin Emp2,,xyz,,KINEMP2x,Kin,2021-09-17 01:02:31,2021-05-18 03:20:58,Emp2,Request#122,2021-05-18 03:20:58,514,2021-06-16 01:00:09,,Kin.Emp2@xyz.com,False
2,2021-09-20 13:25:07,Test path,Jun Emp3,,XYZ_Dept1,,JUNEMP3x,Jun,2021-09-17 12:31:40,2021-09-18 12:31:40,Emp3,Request#121,2021-09-18 12:31:40,512,2021-09-20 13:07:35,,Jun.Emp3@xyz.com,True


In [56]:
# get data with user groups
df_ad2 = df_ad_dataset[['SamAccountName','Group']]
df_ad2.head(3)

Unnamed: 0,SamAccountName,Group
0,JOHNEMP1x,grp1;grp2;grp3
1,KINEMP2x,grp1;grp2;grp3;grp4
2,JUNEMP3x,grp1;grp2;grp3;grp4;grp5;grp6;grp7;grp8;grp9;g...


In [57]:
# parse the user groups
start = time.time()

df = pd.DataFrame()
frames = []
row_iterator = df_ad2.iterrows()

for index, row in row_iterator:
    d = {}
    d["SamAccountName"] = row[0]
    d["Group"] = row[1].split(';')
    df1 = pd.DataFrame(d)
    frames.append(df1)
    df = pd.concat(frames)
    df = df.replace(r'^\s*$', np.nan, regex=True)
    
    # drop rows with null value in groups
    df_ad3 = df.dropna()

end = time.time()

print(f"Processing time in minutes {(end - start)/60}")
print(df_ad3.head(5))

Processing time in minutes 0.017924269040425617
  SamAccountName Group
0      JOHNEMP1x  grp1
1      JOHNEMP1x  grp2
2      JOHNEMP1x  grp3
0       KINEMP2x  grp1
1       KINEMP2x  grp2


In [58]:
df_ad3.describe()

Unnamed: 0,SamAccountName,Group
count,461,461
unique,49,21
top,XONGEMP32x,grp1
freq,18,47


# Ouput

In [59]:
# Save to excel for quick reference 

# 1.AD users and their provision
# ie. list of AD user assignments

d = df_ad1["SamAccountName"].count()
print(f"1.AD users count:{d}")
with pd.ExcelWriter(results_AD_path+f"\df_ad_users_master_{run_date}.xlsx") as writer:
    df_ad1.to_excel(writer, sheet_name="ad_user_master")
    

d = df_ad3["Group"].count()
print(f"1.AD user assignments count:{d}")
with pd.ExcelWriter(results_AD_path+f"\df_ad_user_group_assignments_{run_date}.xlsx") as writer:
    df_ad3.to_excel(writer, sheet_name="ad_user_groups")

    
# 2.Unique AD Groups list
df_unique_groups = pd.DataFrame(df_ad3["Group"].unique(), columns = ["Unique_AD_Groups"])
g = df_ad3["Group"].nunique()
print(f"2.AD Unique Groups:{g}")

with pd.ExcelWriter(results_AD_path+f"\output_ad_unique_groups_{run_date}.xlsx") as writer:
    df_unique_groups.to_excel(writer, sheet_name="ad_unique_groups")

      
# 3.Unique AD User list 
df_unique_SamAccountName = pd.DataFrame(df_ad3["SamAccountName"].unique(), columns = ["Unique_AD_SamAccountName"])
u = df_ad3["SamAccountName"].nunique()
print(f"3.AD Unique Users:{u}")
      
with pd.ExcelWriter(results_AD_path+f"\output_ad_unique_users_{run_date}.xlsx") as writer:
    df_unique_SamAccountName.to_excel(writer, sheet_name="ad_unique_users")
      

# 4.Unique AD Department list
df_unique_depart = pd.DataFrame(df_ad1["Department"].unique(),columns = ["Unique_AD_Departments"])
d = df_ad1["Department"].nunique()
print(f"4.AD Unique Departments:{d}")

with pd.ExcelWriter(results_AD_path+f"\output_ad_unique_departments_{run_date}.xlsx") as writer:
    df_unique_depart.to_excel(writer, sheet_name="ad_unique_departments")

1.AD users count:50
1.AD user assignments count:461
2.AD Unique Groups:21
3.AD Unique Users:49
4.AD Unique Departments:7


# Analysis

## Q1 Get top 5 groups with more users assigned.

In [60]:
grp_lst = ['Group']

#df_ad3.groupby(grp_lst,as_index=False).agg(agg_dict)

df = pd.DataFrame(df_ad3.groupby(grp_lst)['SamAccountName'].count().sort_values(ascending=False))
df.head(5)

Unnamed: 0_level_0,SamAccountName
Group,Unnamed: 1_level_1
grp1,47
grp2,45
grp3,43
grp4,34
grp9,28


## Q2 Get top 5 users with more groups assigned.

In [61]:
df = pd.DataFrame(df_ad3.groupby(['SamAccountName'])["Group"].count().sort_values(ascending=False))
df.head(5)

Unnamed: 0_level_0,Group
SamAccountName,Unnamed: 1_level_1
XONGEMP32x,18
COFFEEEMP40x,15
DEBRAEMP21x,15
FAYEMP27x,15
CLAREEMP24x,15


## Q3 Get user details

In [62]:
lst= ['CLAREEMP24x','COFFEEEMP40x','XONGEMP32x']
df_ad1[df_ad1['SamAccountName'].isin(lst)]

Unnamed: 0,WhenChanged,OUpath,DisplayName,AdminCount,Department,AccountExpiry,SamAccountName,Firstname,Passwordset,WhenCreated,Lastname,Description,CreateTimeStamp,UserAccountControl,LastLogonDate,Company,EmailAddress,Enabled
23,2021-09-20 13:25:07,Test path,Clare Emp24,,,,CLAREEMP24x,Clare,2021-09-17 12:31:40,2021-09-18 12:31:40,Emp24,Request#100,2021-09-18 12:31:40,512,2021-09-20 13:07:35,,Clare.Emp24@xyz.com,True
25,2021-09-20 13:25:07,Test path,Maurice Emp26,,xyz,,XONGEMP32x,Maurice,2021-09-17 12:31:40,2021-09-18 12:31:40,Emp26,Request#98,2021-09-18 12:31:40,512,2021-09-20 13:07:35,,Maurice.Emp26@xyz.com,True
31,2021-09-20 13:25:07,Test path,Xong Emp32,,xyz,,XONGEMP32x,Xong,2021-09-17 12:31:40,2021-09-18 12:31:40,Emp32,Request#92,2021-09-18 12:31:40,512,2021-09-20 13:07:35,,Xong.Emp32@xyz.com,True
39,2021-09-20 13:25:07,Test path,Coffee Emp40,,XYZ,,COFFEEEMP40x,Coffee,2021-09-17 12:31:40,2021-09-18 12:31:40,Emp40,Request#84,2021-09-18 12:31:40,514,2021-09-20 13:07:35,,Coffee.Emp40@xyz.com,False
