In [1]:
import numpy as np
import pandas as pd
import re
import glob
from copy import deepcopy
from pandas.io.json import json_normalize
from baseline import *
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_style('whitegrid')
pd.options.display.max_colwidth = 200
pd.options.display.max_rows = 300

In [2]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'retina'

## Load data as dataframes

In [4]:
def load_data(filename, return_df=True):
    """
    This is a modified version of the `load_data` function provided by Duolingo.
    The main difference is that this one returns the data in a dataframe.
    """
    
    print('Loading {}...'.format(filename))    
    data = []
    
    if 'key' in filename:
        print('Loading labels...')
        with open(filename, 'rt', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                instance_properties = {}
                instance_properties['instance_id'], instance_properties['label'] = line.split()
                instance_properties['label'] = float(instance_properties['label'])
                data.append(instance_properties)
    else:
        # If this is training data, then 'labels' is a dict that contains instance_ids as keys and labels as values.
        training = False
        if filename.find('train') != -1:
            training = True

        if training:
            labels = dict()

        num_exercises = 0
        print('Loading instances...')

        with open(filename, 'rt', encoding='utf-8') as f:
            for line in f:
                line = line.strip()

                # If there's nothing in the line, then we're done with the exercise. Print if needed, otherwise continue
                if len(line) == 0:
                    num_exercises += 1
                    if num_exercises % 100000 == 0:
                        print('Loaded ' + str(len(data)) + ' instances across ' + str(num_exercises) + ' exercises...')

                # If the line starts with #, then we're beginning a new exercise
                elif line[0] == '#':
                    list_of_exercise_parameters = line[2:].split()
                    instance_properties = dict()
                    for exercise_parameter in list_of_exercise_parameters:
                        [key, value] = exercise_parameter.split(':')
                        if key == 'countries':
                            value = value.split('|')
                        elif key == 'days':
                            value = float(value)
                        elif key == 'time':
                            if value == 'null':
                                value = None
                            else:
                                assert '.' not in value
                                value = int(value)
                        instance_properties[key] = value

                # Otherwise we're parsing a new Instance for the current exercise
                else:
                    line = line.split()
                    if training:
                        assert len(line) == 7
                    else:
                        assert len(line) == 6
                    assert len(line[0]) == 12
                    
                    instance_properties['instance_id'] = line[0]
                    instance_properties['token'] = line[1]
                    instance_properties['part_of_speech'] = line[2]
                    
                    instance_properties['morphological_features'] = dict()
                    for l in line[3].split('|'):
                        [key, value] = l.split('=')
#                         if key == 'Person':
#                             value = int(value)
                        instance_properties['morphological_features'][key.lower()] = value

                    instance_properties['dependency_label'] = line[4]
                    instance_properties['dependency_edge_head'] = int(line[5])
                    if training:
                        instance_properties['label'] = float(line[6])
                    
                    # Add track
                    instance_properties['track'] = re.findall('[a-z]{2}_[a-z]{2}', filename)[0]
                    data.append(deepcopy(instance_properties))
                    
            print('Done loading ' + str(len(data)) + ' instances across ' + str(num_exercises) +
                  ' exercises.\n')
        
    if return_df:
        data = json_normalize(data, sep='_')
        assert data['instance_id'].is_unique
        return data
    else:
        return data

In [5]:
# Load all data
filenames = glob.glob('data/*.slam*')
data = {fn: load_data(fn) for fn in filenames}

Loading data/fr_en.slam.20171218.train...
Loading instances...
Loaded 285973 instances across 100000 exercises...
Loaded 567856 instances across 200000 exercises...
Loaded 850511 instances across 300000 exercises...
Done loading 926657 instances across 326792 exercises.

Loading data/en_es.slam.20171218.test.key...
Loading labels...
Loading data/en_es.slam.20171218.train...
Loading instances...
Loaded 317049 instances across 100000 exercises...
Loaded 635368 instances across 200000 exercises...
Loaded 951536 instances across 300000 exercises...
Loaded 1271940 instances across 400000 exercises...
Loaded 1591345 instances across 500000 exercises...
Loaded 1911213 instances across 600000 exercises...
Loaded 2227445 instances across 700000 exercises...
Loaded 2546705 instances across 800000 exercises...
Done loading 2622958 instances across 824012 exercises.

Loading data/fr_en.slam.20171218.dev.key...
Loading labels...
Loading data/fr_en.slam.20171218.dev...
Loading instances...
Done load

In [6]:
# Merge labels with features for dev and test data
for fn in data:
    if re.search(r'[dev|test]$', fn):
        data[fn] = pd.merge(data[fn], data[fn + '.key'], on='instance_id')
        assert data[fn]['instance_id'].is_unique

In [7]:
# Combine all 3 tracks togethe
trn = pd.concat([data[fn] for fn in data if fn.endswith('train')], ignore_index=True, sort=True)
dev = pd.concat([data[fn] for fn in data if fn.endswith('dev')], ignore_index=True, sort=True)
test = pd.concat([data[fn] for fn in data if fn.endswith('test')], ignore_index=True, sort=True)

trn.shape, dev.shape, test.shape

((5523173, 32), (813809, 32), (804310, 32))

In [9]:
# Check the distribution of tracks
trn['track'].value_counts(normalize=True)

en_es    0.474901
es_en    0.357323
fr_en    0.167776
Name: track, dtype: float64

In [10]:
# Check the % of positive labels
trn['label'].mean(), dev['label'].mean(), test['label'].mean()

(0.13735492261422919, 0.1535642884263998, 0.15884671333192427)

In [11]:
# Check the % of positive labels per tracks
trn.groupby('track')['label'].mean()

track
en_es    0.126113
es_en    0.140597
fr_en    0.162273
Name: label, dtype: float64

## Clean data

In [12]:
# Check missing values
trn.isnull().mean()

client                             0.000000
countries                          0.000000
days                               0.000000
dependency_edge_head               0.000000
dependency_label                   0.000000
format                             0.000000
instance_id                        0.000000
label                              0.000000
morphological_features_case        0.907641
morphological_features_definite    0.869148
morphological_features_degree      0.960974
morphological_features_foreign     0.999908
morphological_features_fpos        0.000000
morphological_features_gender      0.713314
morphological_features_mood        0.824631
morphological_features_number      0.341225
morphological_features_numtype     0.993484
morphological_features_person      0.725570
morphological_features_polite      0.999998
morphological_features_poss        0.967216
morphological_features_prepcase    0.997607
morphological_features_prontype    0.719426
morphological_features_reflex   

In [16]:
# Compute the medium of `time` using the training data
time_p50 = trn['time'].quantile(.5)
time_p50

11.0

In [17]:
# Check the number of countries a user can have
pd.Series([len(countries) for countries in trn['countries']]).value_counts(normalize=True)

1    0.955765
2    0.037551
3    0.004819
6    0.000844
4    0.000542
5    0.000480
dtype: float64

In [19]:
def clean_data(df, time_p50):
    # Fill missing values
    for col in df:
        if col.startswith('morphological_features'):
            df[col].fillna('', inplace=True)
    
    df['time'].fillna(time_p50, inplace=True)
    
    # Keep `time` above 0 (those that are less than 0 are due to race conditions)
    df['time'].clip(lower=0, inplace=True)
    
    # Only keep the first country for a given user
    df['countries'] = [countries[0] for countries in df['countries']]
    
    # Add learning languages
    df['l2'] = df['track'].str.replace('_.+$', '')
    df['token_w_l2'] = (df['l2'] + ':' + df['token']).str.lower().str.strip()
    
    return df

In [20]:
trn = clean_data(trn, time_p50)
dev = clean_data(dev, time_p50)
test = clean_data(test, time_p50)

In [21]:
# Check out a few rows
trn.head().transpose()

Unnamed: 0,0,1,2,3,4
client,web,web,web,web,web
countries,CA,CA,CA,CA,CA
days,0.005,0.005,0.005,0.005,0.005
dependency_edge_head,2,0,4,4,4
dependency_label,det,ROOT,nsubj,cop,det
format,reverse_translate,reverse_translate,reverse_translate,reverse_translate,reverse_translate
instance_id,8XTyQUAl0101,8XTyQUAl0102,8XTyQUAl0201,8XTyQUAl0202,8XTyQUAl0203
label,0,0,0,0,0
morphological_features_case,,,,,
morphological_features_definite,Def,,,,Ind


In [24]:
# Check the sharing of users across the 3 datasets
users_trn = trn['user'].unique()
users_dev = dev['user'].unique()
users_test = test['user'].unique()

len(users_trn), len(users_dev), len(users_test)

(6447, 6437, 6439)

In [25]:
print('{}% of users in the dev set are in the training set.'.format(len(np.intersect1d(users_trn, users_dev)) / len(users_dev) * 100))
print('{}% of users in the test set are in the training set.'.format(len(np.intersect1d(users_trn, users_test)) / len(users_test) * 100))

100.0% of users in the dev set are in the training set.
100.0% of users in the test set are in the training set.


In [36]:
# Save
trn.to_pickle('data/trn.pkl')
dev.to_pickle('data/dev.pkl')
test.to_pickle('data/test.pkl')