In [1]:
import numpy as np
import pandas as pd
import re
import dill as pickle
import itertools
from collections import Counter, defaultdict
import seaborn as sns
import matplotlib.pyplot as plt
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

sns.set_style('whitegrid')
pd.options.display.max_colwidth = 200
pd.options.display.max_rows = 300

Using TensorFlow backend.


In [2]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'retina'

## Collapse data on a per-user basis

In [3]:
trn = pd.read_pickle('data/trn.pkl')
dev = pd.read_pickle('data/dev.pkl')
test = pd.read_pickle('data/test.pkl')

trn.shape, dev.shape, test.shape

((5523173, 34), (813809, 34), (804310, 34))

In [4]:
# Select categorical variables
cat_vars = ['track', 'user', 'countries', 'client', 'session', 'format', 'token_w_l2', 'part_of_speech', 'dependency_label'] + [col for col in trn if col.startswith('morphological_features')]
len(cat_vars)

27

In [5]:
# Select continuous variables
cont_vars = ['days', 'dependency_edge_head', 'time']

In [6]:
# Combine
cols = cat_vars + cont_vars + ['label']

In [7]:
# Convert labels to string (to work with padding below)
trn['label'] = trn['label'].astype(int).astype(str)
dev['label'] = dev['label'].astype(int).astype(str)
test['label'] = test['label'].astype(int).astype(str)

In [None]:
# Combine train and dev sets (used to later train the final model)


In [8]:
def collapse_to_list(df, cols):
    return df[cols].groupby('user').apply(lambda grp: pd.Series({col: grp[col].tolist() for col in grp}))

In [9]:
trn_collapsed = collapse_to_list(trn, cols)
dev_collapsed = collapse_to_list(dev, cols)
test_collapsed = collapse_to_list(test, cols)

trn_collapsed.shape, dev_collapsed.shape, test_collapsed.shape

((6447, 31), (6437, 31), (6439, 31))

In [10]:
# Save
trn_collapsed.to_pickle('data/trn_collapsed.pkl')
dev_collapsed.to_pickle('data/dev_collapsed.pkl')
test_collapsed.to_pickle('data/test_collapsed.pkl')

## Extract categorical features

In [11]:
# Create universal mappers to map categorical values from strings and indices
UNK = '_unk_'
PAD = '_pad_'

def create_mapper(toks, max_vocab=100000, min_freq=1, UNK=UNK, PAD=PAD):
    """Create mappers between tokens and numerical indices.

    Parameters
    ----------
    toks : A list containing all raw tokens.

    max_vocab : The maximum vocabulary size.

    min_freq : The minimum frequency for a token to be included in the vocabulary.

    UNK : Special token for unknown word (default to '_unk_').

    PAD : Special token for paddings (default to '_pad_').

    Returns
    -------
    stoi : A dictionary that maps tokens to indices.

    itos : A list that maps indices to tokens.
    """
    
    toks_freq = Counter(toks)
    
    itos = [s for s, c in toks_freq.most_common(max_vocab) if c > min_freq]
    
    if PAD:
        if UNK:
            itos.insert(0, UNK)
            itos.insert(0, PAD)  # Note the index for UNK is 1 and the index for PAD is 0

            stoi = defaultdict(lambda: 1, {v: k for k, v in enumerate(itos)})
        else:
            itos.insert(0, PAD)  # Note the index for PAD is 0
            stoi = {v: k for k, v in enumerate(itos)}
    else:
        if UNK:
            itos.insert(0, UNK)  # Note the index for UNK is 0
            stoi = defaultdict(lambda: 0, {v: k for k, v in enumerate(itos)})
        else:
            stoi = {v: k for k, v in enumerate(itos)}
    
    return stoi, itos

In [12]:
# Create mappers for each categorical variable using the training data
cat_mappers = {}

for var in cat_vars + ['label']:
    if var == 'token_w_l2':
        min_freq = 2
    else:
        min_freq = 1
    
    if var == 'label':
        UNK = None
    else:
        UNK = UNK
    
    cat_mappers[var] = {}
    cat_mappers[var]['stoi'], cat_mappers[var]['itos'] = create_mapper(trn[var].tolist(), min_freq=min_freq, UNK=UNK)

In [13]:
# Check the vocabulary size for each variable
{var: len(cat_mappers[var]['itos']) for var in cat_mappers}

{'client': 5,
 'countries': 128,
 'dependency_label': 43,
 'format': 5,
 'label': 3,
 'morphological_features_case': 8,
 'morphological_features_definite': 5,
 'morphological_features_degree': 7,
 'morphological_features_foreign': 4,
 'morphological_features_fpos': 65,
 'morphological_features_gender': 6,
 'morphological_features_mood': 7,
 'morphological_features_number': 5,
 'morphological_features_numtype': 6,
 'morphological_features_person': 6,
 'morphological_features_polite': 4,
 'morphological_features_poss': 4,
 'morphological_features_prepcase': 5,
 'morphological_features_prontype': 12,
 'morphological_features_reflex': 4,
 'morphological_features_tense': 7,
 'morphological_features_verbform': 7,
 'morphological_features_voice': 4,
 'part_of_speech': 18,
 'session': 5,
 'token_w_l2': 5240,
 'track': 5,
 'user': 6449}

In [4]:
# Determine the maximum sequence length
trn['user'].value_counts().max(), dev['user'].value_counts().max(), test['user'].value_counts().max()

(8894, 1288, 1284)

In [10]:
trn['user'].value_counts().quantile(np.arange(0, 1, .1))

0.0      64.0
0.1     357.0
0.2     441.0
0.3     510.0
0.4     588.0
0.5     679.0
0.6     782.0
0.7     919.0
0.8    1131.0
0.9    1551.0
Name: user, dtype: float64

In [15]:
max_len = dev['user'].value_counts().max()
max_len

1288

In [16]:
# Encode all categorical variables
def encode_cat_var(seqs, stoi, max_len, padding_pos='pre', truncating='pre', padding_token=PAD):
    # Index
    ixs = [[stoi[str_val] for str_val in seq] for seq in seqs]
    
    # Pad
    ixs = pad_sequences(ixs, max_len, padding=padding_pos, truncating=truncating, value=stoi[padding_token])
    
    return ixs

In [17]:
cat_ix_trn = {var: encode_cat_var(trn_collapsed[var], cat_mappers[var]['stoi'], max_len) for var in cat_vars}
cat_ix_dev = {var: encode_cat_var(dev_collapsed[var], cat_mappers[var]['stoi'], max_len) for var in cat_vars}
cat_ix_test = {var: encode_cat_var(test_collapsed[var], cat_mappers[var]['stoi'], max_len) for var in cat_vars}

In [18]:
# Sanity check encodings
def sanity_check_encodings(original_df, cat_ix, cat_vars=cat_vars, cat_mappers=cat_mappers, seed=0, last_n=5):
    # Randomly select an observation
    np.random.seed(seed)
    ix = np.random.choice(np.arange(len(original_df)), 1)[0]
    
    # Print out both the original and encoded variables
    for var in cat_vars:
        print('Variable {}:'.format(var))
        print('\tOriginal last {} values:\t{}'.format(last_n, original_df[var].values[ix][-last_n:]))
        print('\tEncoded last {} values:\t{}'.format(last_n, [cat_mappers[var]['itos'][i] for i in cat_ix[var][ix][-last_n:]]))

In [None]:
sanity_check_encodings(trn_collapsed, cat_ix_trn)
sanity_check_encodings(dev_collapsed, cat_ix_dev)
sanity_check_encodings(test_collapsed, cat_ix_test)

## Extract numerical features

In [20]:
# Encode all continuous variables
def encode_cont_var(seqs, max_len, padding_pos='pre', truncating='pre', padding_value=-1):
    # Pad with -1 (because all these variables are positive)
    ixs = pad_sequences(seqs, max_len, padding=padding_pos, truncating=truncating, value=padding_value, dtype='float')
    return ixs

In [21]:
cont_trn = {var: encode_cont_var(trn_collapsed[var], max_len) for var in cont_vars}
cont_dev = {var: encode_cont_var(dev_collapsed[var], max_len) for var in cont_vars}
cont_test = {var: encode_cont_var(test_collapsed[var], max_len) for var in cont_vars}

## Extract labels

In [24]:
y_trn = encode_cat_var(trn_collapsed['label'], cat_mappers['label']['stoi'], max_len)
y_dev = encode_cat_var(dev_collapsed['label'], cat_mappers['label']['stoi'], max_len)
y_test = encode_cat_var(test_collapsed['label'], cat_mappers['label']['stoi'], max_len)

In [37]:
# One-hot encode
y_trn_oh = to_categorical(y_trn)
y_dev_oh = to_categorical(y_dev)
y_test_oh = to_categorical(y_test)

y_trn_oh.shape, y_dev_oh.shape, y_test_oh.shape

((6447, 1288, 3), (6437, 1288, 3), (6439, 1288, 3))

## Save all extracted features

In [56]:
feats = {
    # Categorical features
    'cat_ix_trn': cat_ix_trn,
    'cat_ix_dev': cat_ix_dev,
    'cat_ix_test': cat_ix_test,
    
    # Numerical features
    'cont_trn': cont_trn,
    'cont_dev': cont_dev,
    'cont_test': cont_test,
    
    # Labels
    'y_trn_oh': y_trn_oh,
    'y_dev_oh': y_dev_oh,
    'y_test_oh': y_test_oh,
    
    # Supporting data
    'cat_vars': cat_vars,
    'cont_vars': cont_vars,
    'cat_mappers': cat_mappers,
    'max_len': max_len
}

pickle.dump(feats, open('data/feats.pkl', 'wb'))