In [1]:
import pandas as pd
import numpy as np
import time, tqdm
import warnings
warnings.filterwarnings("ignore")

import itertools
import os, glob, re
from collections import Counter

from sklearn import preprocessing

In [2]:
def print_data_summary(df):
    """Print a short, high-level report about the collected typing data.

    The report covers: number of distinct `name` values, number of distinct
    `user_id` values, mean number of rows per `user_id`, share of rows whose
    `false_character` equals 't', mean of `long_pressed_equivalent`, and the
    share of rows whose `long_pressed_equivalent` is greater than 1.

    Nothing is returned; everything is written to stdout.
    """
    rows_per_user_id = df.user_id.value_counts()
    distinct_names = df.name.unique()
    mean_rows_per_user_id = np.mean(rows_per_user_id.values).round(2)
    flagged_false = df[df['false_character'] == 't']
    pressed_long = df[df['long_pressed_equivalent'] > 1]

    def share_of_all_rows(subset):
        # Percentage of the full frame represented by `subset`, 2 decimals.
        return round((subset.shape[0] / df.shape[0]) * 100, 2)

    print("******** Data at a high level ********")
    print("Number of users              : ", len(distinct_names))
    print("Unique user_ids              : ", len(rows_per_user_id))
    print("Avg rows collected/user_id   : ", mean_rows_per_user_id)
    print("Rows with false characters   : {}%".format(share_of_all_rows(flagged_false)))
    print("Avg long_pressed_equivalent  : ", np.mean(df['long_pressed_equivalent']).round(2))
    print("Rows with long_pressed > 1   : {} %".format(share_of_all_rows(pressed_long)))

    print("*********************************************")

In [13]:
def find_files(path, pattern):
    """ Expand the glob expression `path` and keep only the entries whose
        base name matches the regular expression `pattern` (via `re.search`).
        Returns the surviving paths as a list, in the order glob produced them.
    """
    matched = []
    for candidate in glob.glob(path):
        base_name = os.path.basename(candidate)
        if re.search(pattern, base_name):
            matched.append(candidate)
    return matched

def convert_to_pandas(files):
    """ Load every CSV path in `files` with `pd.read_csv` and stack the
        resulting frames into one DataFrame.

        Each file keeps its own row labels, so the merged index can contain
        repeated values.
    """
    frames = list(map(pd.read_csv, files))
    merged = pd.concat(frames)
    return merged

def get_valid_dataset(path_pattern, invalid_platform, user_count_threshold=600):
    """
        Build the working dataset from raw CSV files.

        1. For every `path -> regex` entry of `path_pattern`, collect the files matching
           the glob `path` whose base name matches the regex, and merge them into one DataFrame.
        2. Keep only the rows where `long_pressed_equivalent == 1`.
        3. Remove rows whose `platform` is listed in `invalid_platform`.
        4. Remove users (`name`) whose total `long_pressed_equivalent` is not strictly
           greater than `user_count_threshold`. The per-user total is computed on the rows
           left after step 2, i.e. before the platform filter is applied.

        Raises:
            ValueError: when no file matches any of the given patterns.
    """
    desired_files = list(itertools.chain.from_iterable(
        find_files(path, path_pattern[path]) for path in path_pattern
    ))
    if not desired_files:
        # pd.concat would fail with an opaque "No objects to concatenate";
        # raise the same exception type with an actionable message instead.
        raise ValueError("No files matched the given path patterns: {}".format(path_pattern))

    df = convert_to_pandas(desired_files)
    df = df[df['long_pressed_equivalent'] == 1]

    cond1 = ~df['platform'].isin(invalid_platform)

    # Use the built-in 'sum' aggregation: passing np.sum to agg is deprecated in recent pandas.
    user_typed_count = df.groupby('name')['long_pressed_equivalent'].sum()
    valid_users = user_typed_count[user_typed_count > user_count_threshold].index
    cond2 = df['name'].isin(valid_users)

    return df[cond1 & cond2]
    

In [226]:
def get_unique_users_subset(df):
    """
        Given a dataset with multiple user_id(s) per user/name, retrieve only one instance for a user.

        For every `name` that appears with more than one `user_id`, the `user_id` with the
        largest summed `long_pressed_equivalent` is kept and the others are discarded.
        When several instances tie for the largest total, the one with the greatest
        `user_id` is kept. Rows of users with a single `user_id` are returned untouched.

        The totals are computed with one groupby over (`name`, `user_id`) rather than by
        re-filtering the whole frame once per user.

        Returns a filtered view of `df` (same columns, original row labels).
    """
    # One row per (name, user_id) instance with its total, ordered by name then user_id.
    instance_totals = (
        df.groupby(['name', 'user_id'])['long_pressed_equivalent']
        .sum()
        .reset_index()
    )

    # Instances that belong to a name owning more than one user_id.
    contested = instance_totals[instance_totals.duplicated('name', keep=False)]
    users = contested['name'].unique()
    print(f"*** {len(users)} users with multiple user_ids, taking most significant instance for each user")

    # Stable sort keeps the user_id order among equal totals, so the winner of a tie is deterministic.
    ranked = contested.sort_values('long_pressed_equivalent', kind='stable')
    winners = ranked.groupby('name').tail(1).index
    discard = ranked.drop(index=winners)['user_id'].tolist()

    return df[~df['user_id'].isin(discard)]

def encode_to_labels(df, feature_list) -> (pd.DataFrame, dict):
    """ Label-encode every column named in `feature_list`.

        Each listed column of `df` is overwritten with the integer codes produced by
        sklearn's LabelEncoder (note: `df` itself is modified, not a copy).
        Returns the dataframe together with a map of feature -> {label : encoded_class}.
    """
    encoder = preprocessing.LabelEncoder()
    mappings = {}
    for column in feature_list:
        df[column] = encoder.fit_transform(df[column])
        # classes_ is ordered by encoded value, so position == encoded class.
        mappings[column] = dict(zip(encoder.classes_, range(len(encoder.classes_))))
    return df, mappings

def get_num_instances_per_user(df):
    """ Count how many distinct `user_id` values each `name` has.
        Returns a DataFrame with the columns `name` and `user_id` (the count).
    """
    distinct_ids = df.groupby('name')['user_id'].nunique()
    return distinct_ids.reset_index()
        

In [11]:
def prepare_data(raw_df, conditions):
    """
        Takes raw data and applies conditions.
        Displays some information on the "prepared" data.

        `conditions` is used to index `raw_df` (`raw_df[conditions]`). The selected rows are
        copied, so `raw_df` is never modified, and a `type` column is added that classifies
        each `key_pressed` value:
            'lower' - the key string is lowercase,
            'upper' - a single uppercase character,
            'other' - anything else.

        Printed summary:
            - shapes of the raw and the prepared frames,
            - distribution of the character types over the distinct keys
              (each distinct key counts once, regardless of how often it was pressed),
            - distribution of `false_character` over the prepared rows.

        An empty selection is handled: the summary tables are empty and an empty frame
        with the `type` column is returned.
    """
    # Explicit copy: the `type` column is added below, and assigning into a filtered
    # slice is a chained assignment.
    df = raw_df[conditions].copy()

    def classify(key):
        if key.islower():
            return 'lower'
        if len(key) == 1 and key.isupper():
            return 'upper'
        return 'other'

    char_freq = Counter(df['key_pressed'])
    # Build from (char, freq) pairs with explicit column names so that an empty
    # counter still yields a two-column frame.
    char_df = pd.DataFrame(list(char_freq.items()), columns=['char', 'freq'])
    char_df['type'] = char_df['char'].apply(classify)
    char_distribution = pd.DataFrame((char_df['type'].value_counts(normalize=True)*100).round(3))
    char_type = dict(zip(char_df['char'], char_df['type']))

    df['type'] = df['key_pressed'].map(char_type)
    false_char_distribution = pd.DataFrame((df['false_character'].value_counts(normalize=True)*100).round(3))

    print("********* DATA PREPARATION FOR FEATURE ENGINEERING *********")
    print("\tRAW VS PREPARED SHAPES\n{}\t\t{}\n".format(raw_df.shape, df.shape))
    print("----------")
    print("\tCHARACTER TYPE DISTRIBUTION\n", char_distribution)
    print("----------")
    print("\tFALSE CHARACTER DISTRIBUTION\n", false_char_distribution)
    print("*************************************************************")

    return df
    
def combine_characters_v1(zipped_rows : list, columns : list) -> pd.DataFrame:
    """
        Feature engineering support on zipped rows.

        Every element of `zipped_rows` is a group of consecutive raw rows; it is turned
        into a small DataFrame using `columns` and collapsed into one feature row.
        The feature row starts as a copy of the first raw row of the group, with:

        `false_character` : Joins all the false_character readings into one
        `hold_for` : Sum up
        `key_pressed` : Joins all characters pressed
        `long_pressed_equivalent` : Sum up
        `pressed_after` : Sum up (all rows of the group, including the first)
        `type_combination` : Comprised of joining the character types (l = lower, u = upper, o = other)
        `effort` : Total time spent typing the sequence of N characters:
                   sum of `hold_for` plus `pressed_after` of every row except the first.
        `speed` : How efficiently and quickly was the sequence typed: mean over consecutive
                  key pairs of |ord(lower(k_i)) - ord(lower(k_i+1))| / pressed_after of the
                  second key, rounded to 4 decimals.

        Groups that span more than one `box_id` or more than one `name` are dropped
        without being counted. Groups whose `speed` evaluates to `inf` are dropped and
        counted in the printed "Skipped" message.

        An empty `zipped_rows` yields an empty DataFrame.

        NOTE(review): `ord` only accepts single characters, so `speed` presumably expects
        every `key_pressed` value to be one character long - confirm against the data.
    """
    feature_rows = []
    skipped = 0
    for row in tqdm.tqdm(zipped_rows):
        row = pd.DataFrame(row, columns=columns)
        if len(row['box_id'].unique()) == 1 and len(row['name'].unique()) == 1:

            # Work on an explicit copy of the first row so the assignments below
            # never target a slice of `row`.
            result = row.loc[0].copy()
            result['false_character'] = ''.join(row['false_character'])
            result['hold_for'] = sum(row['hold_for'])
            result['key_pressed'] = ''.join(row['key_pressed'])
            result['long_pressed_equivalent'] = sum(row['long_pressed_equivalent'])
            result['pressed_after'] = sum(row['pressed_after'])
            result['type_combination'] = ''.join(['l' if x.islower() else 'u' if x.isupper() else 'o' for x in row['key_pressed']])
            # The first key's `pressed_after` belongs to the gap before the sequence started.
            result['effort'] = sum(row['hold_for']) + sum(row['pressed_after'][1:])
            result['speed'] = np.mean([(abs(ord(x[0].lower()) - ord(x[1].lower())) / row['pressed_after'][i+1]) for i,x in enumerate(zip(row['key_pressed'][:-1], row['key_pressed'][1:]))]).round(4)
            if result['speed'] == float('inf'):
                # A zero `pressed_after` between two different keys makes the ratio infinite.
                skipped += 1
                continue

            feature_rows.append(result)
    print("****** Skipped {} rows as `speed` evaluates to `inf`".format(skipped))
    return pd.DataFrame(feature_rows)
    

In [7]:
def user_summary(df : pd.DataFrame) -> pd.DataFrame:
    """
        At a high level, derive the characteristics of a user split by `TRUE_CHAR` and `FALSE_CHAR`.
        Total characters captured, time spent and respective percentages.

        For every distinct `name` two rows are produced, in this order:
            TRUE_CHAR  - rows of the user where `false_character == 'f'`
            FALSE_CHAR - rows of the user where `false_character == 't'`

        Output columns:
            name, device_type, platform : the user and the first device_type / platform seen for them
            type                        : 'TRUE_CHAR' or 'FALSE_CHAR'
            total_characters            : sum of `long_pressed_equivalent` in the split
            total_time_spent            : sum of `hold_for + pressed_after` in the split
            character_perc              : total_characters as a percentage of the user's overall total
            character_time_perc         : total_time_spent as a percentage of the user's overall total

        Rows whose `false_character` is neither 'f' nor 't' count towards the user's
        overall totals but towards neither split.

        NOTE(review): a user whose overall totals are zero would produce NaN/inf
        percentages - presumably this cannot happen in the collected data; confirm.
    """
    columns = ['name','device_type','platform','type','total_characters','total_time_spent','character_perc','character_time_perc']
    summary_rows = []

    for user in tqdm.tqdm(df.name.unique()):
        u_df = df[df['name'] == user]
        user_row = [user, u_df['device_type'].unique()[0], u_df['platform'].unique()[0]]

        chars_captured = u_df['long_pressed_equivalent'].sum()
        time_spent = np.sum(u_df['hold_for'] + u_df['pressed_after'])

        # 'f' marks a correctly typed (true) character, 't' a false one.
        for label, flag in (('TRUE_CHAR', 'f'), ('FALSE_CHAR', 't')):
            part = u_df[u_df['false_character'] == flag]
            part_chars = part['long_pressed_equivalent'].sum()
            part_time = np.sum(part['hold_for'] + part['pressed_after'])
            summary_rows.append(user_row + [
                label, part_chars, part_time,
                (part_chars/chars_captured)*100,
                (part_time/time_spent)*100,
            ])

    return pd.DataFrame(summary_rows, columns=columns)

def clean_raw_data(raw : pd.DataFrame, true_perc_threshold=60) -> pd.DataFrame:
    """
        Drop users whose share of correctly typed characters is too low.

        Only rows with `long_pressed_equivalent == 1` are considered. A per-user summary
        is built with `user_summary`; users whose TRUE_CHAR `character_perc` is at least
        `true_perc_threshold` are kept, the rest are removed. The averages of the kept,
        removed and overall groups are printed along the way.

        Returns the filtered rows of the kept users.
    """
    single_press = raw[raw['long_pressed_equivalent'] == 1]
    per_user = user_summary(single_press)

    # Only the TRUE_CHAR half of the summary drives the decision.
    true_rows = per_user[per_user['type'] == 'TRUE_CHAR']
    rejected = true_rows[true_rows['character_perc'] < true_perc_threshold]
    accepted = true_rows[true_rows['character_perc'] >= true_perc_threshold]

    print("******** TRUE_CHAR_MEAN ********")
    print(f"Removing users with avg character perc for TRUE less than {true_perc_threshold} (TRUE PERC THRESHOLD)")
    print("ALL USERS AVG    : ", true_rows['character_perc'].mean())
    print("INVALID USERS AVG : ", rejected['character_perc'].mean())
    print("VALID USERS AVG   : ", accepted['character_perc'].mean())
    print("********************************")

    kept_names = accepted['name'].unique()
    raw_clean = single_press[single_press['name'].isin(kept_names)]
    unique_users = len(raw_clean['name'].unique())
    print(f"{unique_users} valid users found, data is cleaned for exploration and feature engineering!")
    print("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")
    return raw_clean
                
    