In [2]:
import pandas as pd
import os

# Show every column when a DataFrame is rendered in this notebook
# (None lifts pandas' default cap on the number of displayed columns).
pd.set_option("display.max_columns", None)

def read_data_files(addin_data_directory):
    """Load every ``.parquet`` file in a directory into a single DataFrame.

    Each matching file is read with the pyarrow engine, its ``rowversion``
    column is dropped when present, and the pieces are stacked row-wise.

    Parameters
    ----------
    addin_data_directory : str or path-like
        Directory to scan. Only direct children whose name ends with
        ``.parquet`` are read; every other entry is ignored.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all files, in file-name order. Each file
        keeps its own row index (the result is not renumbered). An empty
        DataFrame is returned when the directory holds no parquet files.
    """
    # os.listdir makes no ordering guarantee; sorting keeps the row order of
    # the result reproducible from run to run and machine to machine.
    parquet_names = sorted(
        name for name in os.listdir(addin_data_directory)
        if name.endswith('.parquet')
    )

    # Collect the pieces and concatenate once at the end: calling pd.concat
    # inside the loop re-copies all previously loaded rows on every iteration.
    chunks = []
    for name in parquet_names:
        filepath = os.path.join(addin_data_directory, name)

        # Read in the parquet file
        chunk = pd.read_parquet(filepath, engine='pyarrow')

        # Drop row version; errors='ignore' lets a file that lacks the
        # column load instead of aborting the whole directory read.
        chunks.append(chunk.drop(columns='rowversion', errors='ignore'))

    # pd.concat raises on an empty list, so handle "no files" explicitly.
    if not chunks:
        return pd.DataFrame()

    return pd.concat(chunks, axis=0)

# Load the two inputs used by the feature cell below:
#   u_profile - all parquet chunks found in the user-profile directory
#   u_disc    - the single user-disc parquet file
u_profile = read_data_files('assets/user_profile_data')
u_disc = pd.read_parquet('assets/Persist_USER_DISC.parquet')

In [7]:
def get_machine_login_features(user_disc=None, user_profile=None):
    """Count distinct user names per machine and effective date.

    The user-disc and user-profile tables are inner-joined on the user SID
    (``SID0`` on the disc side, ``SID00`` on the profile side) together with
    ``RWB_EFFECTIVE_DATE``. The joined rows are then grouped by
    ``MachineID`` and ``RWB_EFFECTIVE_DATE`` and the distinct ``User_Name0``
    values in each group are counted.

    Parameters
    ----------
    user_disc : pandas.DataFrame, optional
        Table providing ``SID0`` and ``RWB_EFFECTIVE_DATE``. Defaults to the
        notebook-level ``u_disc`` frame.
    user_profile : pandas.DataFrame, optional
        Table providing ``SID00`` and ``RWB_EFFECTIVE_DATE``. Defaults to the
        notebook-level ``u_profile`` frame.

    Returns
    -------
    pandas.DataFrame
        One row per (``MachineID``, ``RWB_EFFECTIVE_DATE``) pair with the
        columns ``MachineID``, ``RWB_EFFECTIVE_DATE`` and ``num_users``.

    Notes
    -----
    ``MachineID`` and ``User_Name0`` must be present, un-suffixed, in the
    merged frame; if both inputs carry a column with one of those names the
    merge renames them (``_x`` / ``_y``) and the group-by fails with a
    ``KeyError``.
    """
    # Fall back to the frames loaded earlier in the notebook so existing
    # zero-argument calls keep working.
    if user_disc is None:
        user_disc = u_disc
    if user_profile is None:
        user_profile = u_profile

    # Inner join user disc and user profile
    user_df = pd.merge(
        user_disc,
        user_profile,
        how='inner',
        left_on=['SID0', 'RWB_EFFECTIVE_DATE'],
        right_on=['SID00', 'RWB_EFFECTIVE_DATE'],
    )

    # Get username count. The built-in nunique replaces a per-group Python
    # lambda; dropna=False keeps a missing user name counted as one distinct
    # value, matching len(Series.unique()).
    out_gb = (
        user_df
        .groupby(['MachineID', 'RWB_EFFECTIVE_DATE'])['User_Name0']
        .nunique(dropna=False)
        .reset_index(name='num_users')
    )

    return out_gb

# Build the per-machine, per-date user-count table from the frames loaded above
out = get_machine_login_features()

# Persist the feature table as parquet; index=False keeps the DataFrame index
# out of the written file.
out.to_parquet(
    'assets/user_count_feature.parquet',
    index=False,
)