## Feature Engineering: Addins, Programs, and Updates
This notebook engineers features from three sources: `OFFICE_ADDIN_DATA`, `Add_Remove_Programs`, and the update events in `EventRawResultItem`.

### Engineer update event features
Here I will create three features, each counted per client (`ClientItemKey`) per day:
1. Number of updates installed
2. Number of Windows 10 updates installed
3. Number of Office updates installed

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, date
import os
import dtale

# Lift the column display limit so every column of a DataFrame is shown
pd.set_option('display.max_columns', None)

def handle_none_values(in_val):
    """Map missing-value placeholders to NaN.

    Returns ``np.nan`` when ``in_val`` is ``None`` or the literal string
    ``'None'``; every other value is handed back unchanged.
    """
    is_placeholder = in_val is None or in_val == 'None'
    return np.nan if is_placeholder else in_val
    
def read_json_fill_attr(jsonfile, in_df, attr):
    """Replace the encoded ids in ``in_df[attr]`` with labels from a JSON lookup.

    The JSON file is read with ``orient='index'``, so each top-level key becomes
    a label and its record must carry an ``id`` field. Every value in
    ``in_df[attr]`` is looked up by that id and replaced with the label.

    Parameters:
        jsonfile: path of the JSON lookup file.
        in_df: DataFrame holding the encoded column; it is modified in place.
        attr: name of the column to decode.

    Returns:
        The same ``in_df`` object with ``attr`` decoded.

    Raises:
        KeyError: if a value in ``in_df[attr]`` has no matching id.
    """
    lookup_df = pd.read_json(jsonfile, orient='index').reset_index()

    # After reset_index the original JSON keys live in the 'index' column
    id_to_record = lookup_df.set_index('id').to_dict(orient='index')

    def decode(encoded_id):
        return id_to_record[encoded_id]['index']

    in_df[attr] = in_df[attr].apply(decode)

    return in_df

def read_update_data(infile):
    """Load the update-event parquet file and decode its update titles.

    Every column is passed through ``handle_none_values`` so that ``None`` and
    the string ``'None'`` become NaN, then the ``updateTitle`` column is decoded
    using the lookup stored in ``assets/updateTitle.json``.

    Parameters:
        infile: path of the parquet file to read.

    Returns:
        The cleaned DataFrame with ``updateTitle`` decoded.
    """
    events = pd.read_parquet(infile)

    # Normalise the None placeholders to NaN, one column at a time
    for column in events.columns:
        events[column] = events[column].apply(handle_none_values)

    return read_json_fill_attr('assets/updateTitle.json', events, 'updateTitle')

# Load the update event data
#df = read_update_data('assets/update_events.parquet')

In [2]:
def get_update_counts(in_dat):
    """Summarise the update events of one group.

    Parameters:
        in_dat: DataFrame of update events carrying the 0/1 indicator columns
            ``win_64_os_update`` and ``office_update``.

    Returns:
        A Series with ``num_updates`` (row count),
        ``num_windows_64_os_updates`` and ``num_office_updates`` (indicator sums).
    """
    counts = {
        'num_updates': len(in_dat),
        'num_windows_64_os_updates': in_dat['win_64_os_update'].sum(),
        'num_office_updates': in_dat['office_update'].sum(),
    }

    return pd.Series(counts, index=list(counts))
 
def get_update_features(in_df):
    """Flag Windows/Office updates and count updates per client per day.

    ``in_df`` is modified in place: ``updateTitle`` is lower-cased,
    ``TimeCreatedSystemTime`` is converted to datetime, and the columns
    ``created_date`` (``YYYY-MM-DD`` string), ``win_64_os_update`` and
    ``office_update`` (0/1 flags) are added.

    A title is flagged as a Windows OS update when it contains
    ``'cumulative update for'`` or ``'windows 10'``, and as an Office update
    when it contains ``'office'``. Missing titles are flagged 0 for both
    instead of raising.

    Parameters:
        in_df: DataFrame with ``ClientItemKey``, ``updateTitle`` and
            ``TimeCreatedSystemTime`` columns.

    Returns:
        A tuple ``(in_df, out_gb)`` where ``out_gb`` has one row per
        ``ClientItemKey`` / ``created_date`` with the columns ``num_updates``,
        ``num_windows_64_os_updates`` and ``num_office_updates``.
    """
    key_fields = ['ClientItemKey', 'created_date']

    # Lower-case the titles so the substring checks are case-insensitive
    in_df['updateTitle'] = in_df['updateTitle'].str.lower()

    # Derive the calendar day each event was created on
    in_df['TimeCreatedSystemTime'] = pd.to_datetime(in_df['TimeCreatedSystemTime'])
    in_df['created_date'] = in_df['TimeCreatedSystemTime'].dt.strftime('%Y-%m-%d')

    titles = in_df['updateTitle']

    # na=False makes a missing title count as "no match" rather than an error
    is_windows_update = (
        titles.str.contains('cumulative update for', regex=False, na=False)
        | titles.str.contains('windows 10', regex=False, na=False)
    )
    in_df['win_64_os_update'] = is_windows_update.astype('int64')

    is_office_update = titles.str.contains('office', regex=False, na=False)
    in_df['office_update'] = is_office_update.astype('int64')

    # Vectorised equivalent of applying a per-group counting function:
    # 'size' counts rows (missing titles included), 'sum' adds up the flags
    out_gb = (
        in_df.groupby(key_fields)
        .agg(
            num_updates=('updateTitle', 'size'),
            num_windows_64_os_updates=('win_64_os_update', 'sum'),
            num_office_updates=('office_update', 'sum'),
        )
        .reset_index()
    )

    return in_df, out_gb

# Get features
# processed_df, grouped_df = get_update_features(df)

# Confirm results are expected
# dtale.show(grouped_df).open_browser()

# Save result to parquet
# grouped_df.to_parquet('assets/update_summary_features.parquet')

### Engineer Addin Features 

In [3]:

def read_addin_data(addin_data_directory):
    """Load every parquet chunk of Office add-in data into one DataFrame.

    For each ``*.parquet`` file in the directory the ``rowversion`` column is
    dropped and the ``FriendlyName00``, ``CompanyName00`` and ``ProductName00``
    columns are decoded through the JSON lookup files stored in the same
    directory.

    Parameters:
        addin_data_directory: directory holding the parquet chunks and the
            ``FriendlyName.json``, ``CompanyName.json`` and ``ProductName.json``
            lookup files.

    Returns:
        The row-wise concatenation of all chunks, or an empty DataFrame when
        the directory contains no parquet files.
    """
    # (lookup file, column it decodes); the paths do not depend on the chunk
    lookups = [
        (os.path.join(addin_data_directory, 'FriendlyName.json'), 'FriendlyName00'),
        (os.path.join(addin_data_directory, 'CompanyName.json'), 'CompanyName00'),
        (os.path.join(addin_data_directory, 'ProductName.json'), 'ProductName00'),
    ]

    chunks = []

    # os.listdir order is arbitrary; sorting makes the row order reproducible
    for file in sorted(os.listdir(addin_data_directory)):
        if not file.endswith('.parquet'):
            continue

        filepath = os.path.join(addin_data_directory, file)
        chunk = pd.read_parquet(filepath, engine='pyarrow')

        # Drop row version
        chunk = chunk.drop('rowversion', axis=1)

        # Fill friendly name, company name, and product name with values
        for lookup_path, column in lookups:
            chunk = read_json_fill_attr(lookup_path, chunk, column)

        chunks.append(chunk)

    if not chunks:
        return pd.DataFrame()

    # Concatenate once at the end instead of re-copying the frame per file
    return pd.concat(chunks, axis=0)

# Load the add-in data (read_addin_data returns one concatenated DataFrame)
# df = read_addin_data('assets/office_addin_data')
# addin_df = df

# Optional, view result in dtale
# dtale.show(addin_df).open_browser()

In [4]:
import re 

def identify_addin(in_dat, attr_name, re_pat):
    """Add a 0/1 column marking rows whose friendly name matches a pattern.

    Parameters:
        in_dat: DataFrame with a ``FriendlyName00`` column; modified in place.
        attr_name: name of the indicator column to create.
        re_pat: regular expression searched for in each friendly name.

    Returns:
        The same ``in_dat`` object with the new indicator column.
    """
    find_match = re.compile(re_pat).search

    in_dat[attr_name] = in_dat['FriendlyName00'].apply(
        lambda friendly_name: int(find_match(friendly_name) is not None)
    )

    return in_dat


def get_specific_addin_presence(group, addin_attrs):
    """Report, for one group, which add-in indicator columns have any hit.

    Parameters:
        group: DataFrame holding the indicator columns named in ``addin_attrs``.
        addin_attrs: names of the indicator columns to check.

    Returns:
        A Series indexed by ``addin_attrs`` whose value is 1 when the column
        sums to more than zero within the group and 0 otherwise.
    """
    presence = {
        attribute: int(group[attribute].sum() > 0)
        for attribute in addin_attrs
    }

    return pd.Series(presence, index=list(presence))

def get_addins_features(in_dat):
    """Build per-machine, per-day add-in counts, load times and presence flags.

    For every ``MachineID`` / ``RWB_EFFECTIVE_DATE`` the result holds, per
    architecture, the number of distinct ``Id00`` values
    (``num_<arch>addins``) and the mean of ``AverageLoadTimeInMilliseconds00``
    (``<arch>add_avgloadtime``), followed by 0/1 flags for the specific
    add-ins.

    Column names are derived from the architecture labels found in the data.
    The ``x64`` and ``x86`` columns are always present (NaN when a machine or
    the whole input has no add-in of that architecture), so the output schema
    does not depend on which architectures happen to occur.

    Parameters:
        in_dat: DataFrame with ``MachineID``, ``RWB_EFFECTIVE_DATE``,
            ``Architecture00``, ``Id00``, ``AverageLoadTimeInMilliseconds00``
            and the ``has_*_add`` indicator columns.

    Returns:
        DataFrame with one row per machine and effective date.
    """
    key_fields = ['MachineID', 'RWB_EFFECTIVE_DATE']
    value_fields = ['num_addins', 'avg_loadtime']

    # Define addins to check presence
    addin_attrs = ['has_cap_iq_add', 'has_factset_add', 'has_bluematrix_add',
                   'has_bloomberg_add', 'has_acrobat_add']

    # Architecture-specific add-in count and mean load time
    per_arch = (
        in_dat.groupby(key_fields + ['Architecture00'])
        .agg(
            # len(unique()) keeps a missing Id00 as its own distinct value
            num_addins=('Id00', lambda ids: len(ids.unique())),
            avg_loadtime=('AverageLoadTimeInMilliseconds00', 'mean'),
        )
        .reset_index()
    )

    # One column per (measure, architecture) pair
    wide = per_arch.pivot(index=key_fields,
                          columns='Architecture00',
                          values=value_fields)

    # Guarantee the x64/x86 columns and fix the column order explicitly
    # instead of assuming what the pivot happened to produce
    archs = sorted(set(wide.columns.get_level_values(1)) | {'x64', 'x86'})
    wide = wide.reindex(columns=pd.MultiIndex.from_product([value_fields, archs]))
    wide.columns = ([f'num_{arch}addins' for arch in archs]
                    + [f'{arch}add_avgloadtime' for arch in archs])
    wide = wide.reset_index()

    # A specific add-in is present when any row of the group carries its flag
    presence = (
        (in_dat.groupby(key_fields)[addin_attrs].sum() > 0)
        .astype('int64')
        .reset_index()
    )

    # Join addin presence and architecture-specific addin features
    output_df = pd.merge(wide, presence, how='outer', on=key_fields)

    return output_df

def get_addin_filesize(in_dat):
    """Total add-in file size per machine and day, split by app and architecture.

    ``FileSize00`` is summed for every ``MachineID`` / ``RWB_EFFECTIVE_DATE`` /
    ``OfficeApp00`` / ``Architecture00`` combination and the result is pivoted
    so each app/architecture pair becomes one column named
    ``<OfficeApp00><Architecture00>_addin_filesize``.

    Parameters:
        in_dat: DataFrame with ``MachineID``, ``RWB_EFFECTIVE_DATE``,
            ``OfficeApp00``, ``Architecture00`` and ``FileSize00`` columns.

    Returns:
        DataFrame with one row per machine and effective date; combinations a
        machine does not have are NaN.
    """
    key_fields = ['MachineID', 'RWB_EFFECTIVE_DATE']

    # Vectorised group sum; no Python-level function call per group
    totals = (
        in_dat.groupby(key_fields + ['OfficeApp00', 'Architecture00'])['FileSize00']
        .sum()
        .rename('total_filesize')
        .reset_index()
    )

    # Pivot results
    totals['summary_vals'] = (totals['OfficeApp00'] + totals['Architecture00']
                              + '_addin_filesize')
    out_gb = totals.pivot(index=key_fields,
                          columns='summary_vals',
                          values='total_filesize')

    return out_gb.reset_index()

def create_addin_features(in_df):
    """Assemble the complete add-in feature table.

    Formats ``RWB_EFFECTIVE_DATE`` as a ``YYYY-MM-DD`` string, adds the 0/1
    indicator columns for the specific add-ins (matched on ``FriendlyName00``
    or ``CompanyName00``), then joins the features from
    ``get_addins_features`` with the file-size features from
    ``get_addin_filesize``. NaNs in the joined table are replaced with 0.

    Parameters:
        in_df: add-in DataFrame; the date and indicator columns are written
            onto it.

    Returns:
        DataFrame with one row per ``MachineID`` / ``RWB_EFFECTIVE_DATE``.
    """
    key_fields = ['MachineID', 'RWB_EFFECTIVE_DATE']

    # Change effective date to a date string
    effective_dates = pd.to_datetime(in_df['RWB_EFFECTIVE_DATE'])
    in_df['RWB_EFFECTIVE_DATE'] = effective_dates.dt.strftime('%Y-%m-%d')

    # Add-ins recognised by a pattern in their friendly name (Cap IQ, FactSet)
    name_patterns = {
        'has_cap_iq_add': 'Cap IQ|Capital IQ|cap iq|capital iq',
        'has_factset_add': 'FactSet|factset',
    }
    for flag_name, pattern in name_patterns.items():
        in_df = identify_addin(in_df, flag_name, pattern)

    # Add-ins recognised by an exact company name (BlueMatrix, Bloomberg, Acrobat)
    company_names = {
        'has_bluematrix_add': 'BlueMatrix I LLC',
        'has_bloomberg_add': 'Bloomberg LP',
        'has_acrobat_add': 'Adobe Systems Incorporated',
    }
    for flag_name, company in company_names.items():
        in_df[flag_name] = in_df['CompanyName00'].apply(
            lambda name, company=company: 1 if name == company else 0
        )

    # Counts, load times and presence flags
    addin_features = get_addins_features(in_df)

    # File size of addins per office app
    filesize_features = get_addin_filesize(in_df)

    # Join all addin features
    output_df = pd.merge(addin_features, filesize_features,
                         how='outer', on=key_fields)

    # Replace NaNs with zeros for machines that do not have addins
    return output_df.replace(np.nan, 0)

# grouped_df = create_addin_features(df)

# Optional, show results of processed dataframe
# dtale.show(grouped_df).open_browser()

# Send results to parquet
# grouped_df.to_parquet('assets/office_addin_features.parquet', index=False)

### Engineer Program Features
I'm going to engineer the following features from `Add_Remove_Programs`, per machine and effective date:
1. Average software age (in days)
2. Number of installed programs
3. Whether Power BI is installed

In [None]:
def handle_exceptions(in_val, indict):
    """Look up the label for ``in_val``, falling back to ``in_val`` itself.

    Parameters:
        in_val: the encoded value to decode.
        indict: mapping of encoded value -> record with an ``'index'`` entry.

    Returns:
        ``indict[in_val]['index']`` when that lookup succeeds, otherwise
        ``in_val`` unchanged.
    """
    try:
        return indict[in_val]['index']

    # KeyError: value (or the 'index' entry) is missing.
    # TypeError: value is unhashable or the record is not subscriptable.
    # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
    except (KeyError, TypeError):
        return in_val

def read_json_fill_attr(jsonfile, in_df, attr):
    """Decode ``in_df[attr]`` through a JSON lookup, tolerating unknown values.

    The JSON file is read with ``orient='index'``; each top-level key is a
    label and its record must carry an ``id`` field. Values of ``in_df[attr]``
    are replaced through ``handle_exceptions``, which leaves a value untouched
    when its lookup fails.

    Parameters:
        jsonfile: path of the JSON lookup file.
        in_df: DataFrame holding the encoded column; it is modified in place.
        attr: name of the column to decode.

    Returns:
        The same ``in_df`` object with ``attr`` decoded where possible.
    """
    lookup_df = pd.read_json(jsonfile, orient='index').reset_index()

    # After reset_index the original JSON keys live in the 'index' column
    id_to_record = lookup_df.set_index('id').to_dict(orient='index')

    def decode(encoded_value):
        return handle_exceptions(encoded_value, id_to_record)

    in_df[attr] = in_df[attr].apply(decode)

    return in_df

def read_programs_data(data_directory):
    """Load every parquet chunk of add/remove-programs data into one DataFrame.

    For each ``*.parquet`` file in the directory only the columns needed for
    feature engineering are kept, and ``DisplayName00`` and ``ProdID00`` are
    decoded through the JSON lookup files stored in the same directory.

    Parameters:
        data_directory: directory holding the parquet chunks and the
            ``DisplayName00.json`` and ``ProdID00.json`` lookup files.

    Returns:
        The row-wise concatenation of all chunks, or an empty DataFrame when
        the directory contains no parquet files.
    """
    # Columns needed downstream; everything else is dropped
    read_columns = ['MachineID', 'RWB_EFFECTIVE_DATE', 'ProdID00',
                    'DisplayName00', 'InstallDate00']

    # (lookup file, column it decodes); the paths do not depend on the chunk
    lookups = [
        (os.path.join(data_directory, 'DisplayName00.json'), 'DisplayName00'),
        (os.path.join(data_directory, 'ProdID00.json'), 'ProdID00'),
    ]

    chunks = []

    # os.listdir order is arbitrary; sorting makes the row order reproducible
    for file in sorted(os.listdir(data_directory)):
        if not file.endswith('.parquet'):
            continue

        filepath = os.path.join(data_directory, file)
        chunk = pd.read_parquet(filepath, engine='pyarrow')

        # Copy so the decoding below writes to an independent frame rather
        # than to a slice of the full chunk
        chunk = chunk[read_columns].copy()

        # Fill display name and product id with values
        for lookup_path, column in lookups:
            chunk = read_json_fill_attr(lookup_path, chunk, column)

        chunks.append(chunk)

    if not chunks:
        return pd.DataFrame()

    # Concatenate once at the end instead of re-copying the frame per file
    return pd.concat(chunks, axis=0)

# Load all add/remove-programs parquet chunks into a single DataFrame
df = read_programs_data('assets/add_remove_programs')

In [35]:

def get_group_data(group):
    """Summarise the installed programs of one group.

    Parameters:
        group: DataFrame with ``has_powerbi``, ``ProdID00`` and
            ``SoftwareAge`` columns.

    Returns:
        A Series with ``has_powerbi`` (1 when any row carries the flag, else
        0), ``num_installed_programs`` (number of distinct ``ProdID00``
        values) and ``avg_software_age`` (mean of ``SoftwareAge``).
    """
    # Power BI counts as present for the group when any row is flagged
    powerbi_present = 1 if group['has_powerbi'].sum() > 0 else 0

    summary = {
        'has_powerbi': powerbi_present,
        'num_installed_programs': len(group['ProdID00'].unique()),
        'avg_software_age': group['SoftwareAge'].mean(),
    }

    return pd.Series(summary, index=list(summary))

def parse_dates(val):
    """Normalise a raw install-date value to a ``YYYY-MM-DD`` string.

    Only values that are 8 characters long or contain a ``/`` are treated as
    dates. Everything else yields NaN: missing values, values in another
    shape, and values that look like dates but cannot be parsed.

    Parameters:
        val: raw install date; non-string values are converted with ``str``.

    Returns:
        The date formatted as ``YYYY-MM-DD``, or ``np.nan``.
    """
    if pd.isnull(val):
        return np.nan

    text = val if isinstance(val, str) else str(val)

    # Not one of the recognised date shapes
    if len(text) != 8 and '/' not in text:
        return np.nan

    # errors='coerce' turns an unparseable value into NaT instead of raising
    parsed = pd.to_datetime(text, format='mixed', errors='coerce')
    if pd.isnull(parsed):
        return np.nan

    return parsed.strftime('%Y-%m-%d')

def calculate_age_in_days(target_date_str, in_date):
    """Return the number of days from ``target_date_str`` to ``in_date``.

    Parameters:
        target_date_str: date string in ``YYYY-MM-DD`` format.
        in_date: ``date`` to measure the age against.

    Returns:
        ``in_date`` minus the parsed target date, in whole days; negative when
        the target date lies after ``in_date``.
    """
    parsed_target = datetime.strptime(target_date_str, '%Y-%m-%d')
    elapsed = in_date - parsed_target.date()

    return elapsed.days

def create_program_features(in_df, as_of_date=date(2023, 7, 4)):
    """Build per-machine, per-day features from add/remove-programs data.

    ``in_df`` is modified in place: ``RWB_EFFECTIVE_DATE`` and
    ``InstallDate00`` are normalised to ``YYYY-MM-DD`` strings,
    ``DisplayName00`` is cast to ``str``, and the ``SoftwareAge`` and
    ``has_powerbi`` columns are added.

    Parameters:
        in_df: DataFrame with ``MachineID``, ``RWB_EFFECTIVE_DATE``,
            ``ProdID00``, ``DisplayName00`` and ``InstallDate00`` columns.
        as_of_date: ``date`` against which software age is measured. Defaults
            to 2023-07-04, the reference date this notebook was written with.

    Returns:
        DataFrame with one row per ``MachineID`` / ``RWB_EFFECTIVE_DATE`` and
        the columns ``has_powerbi``, ``num_installed_programs`` and
        ``avg_software_age``.
    """
    # Imported locally so this cell does not rely on an earlier cell's import
    import re

    # Change effective date to a date string
    in_df['RWB_EFFECTIVE_DATE'] = pd.to_datetime(in_df['RWB_EFFECTIVE_DATE']).dt.strftime('%Y-%m-%d')

    # Normalise install dates to 'YYYY-MM-DD' strings (missing when unusable)
    in_df['InstallDate00'] = in_df['InstallDate00'].apply(parse_dates)

    # Get age of software in days relative to the reference date
    in_df['SoftwareAge'] = in_df['InstallDate00'].apply(
        lambda install_date: calculate_age_in_days(install_date, as_of_date)
        if pd.notnull(install_date) else np.nan
    )

    # Create identifier for Power BI
    power_bi_pattern = re.compile(
        'Power BI Desktop|Microsoft Power BI Report Server|Microsoft PowerBI Desktop'
    )
    in_df['DisplayName00'] = in_df['DisplayName00'].astype(str)
    in_df['has_powerbi'] = in_df['DisplayName00'].apply(
        lambda name: 1 if power_bi_pattern.search(name) else 0
    )

    # Get the main groupby features
    out_gb = in_df.groupby(['MachineID', 'RWB_EFFECTIVE_DATE']).apply(get_group_data)
    out_gb = out_gb.reset_index()

    # Handle negative software ages: a negative group average is treated as
    # unknown. Individual negative ages still contribute to the average.
    out_gb['avg_software_age'] = out_gb['avg_software_age'].where(
        out_gb['avg_software_age'] >= 0
    )

    return out_gb

# Build the per-machine, per-day program features from the loaded data
grouped_df = create_program_features(df)

# Optional, show results of processed dataframe
# dtale.show(grouped_df).open_browser()

# Send results to parquet (index=False leaves the row index out of the file)
grouped_df.to_parquet('assets/add_remove_programs_features.parquet', index=False)