# Form Field Type Detection Using Machine Learning

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
import re
from itertools import chain
from collections import Counter

import numpy as np
import scipy.stats
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.grid_search import RandomizedSearchCV
from sklearn_crfsuite.metrics import flat_classification_report, sequence_accuracy_score
from sklearn_crfsuite import scorers
from sklearn_crfsuite import CRF

from formasaurus.annotation import get_annotation_folds
from formasaurus.storage import Storage
from formasaurus import formtype_model, fieldtype_model

## Loading dataset

In [3]:
storage = Storage('C:/Users/raoumer/Desktop/form_classifier_ml/data')
index = storage.get_index()
field_schema = storage.get_field_schema()
form_schema = storage.get_form_schema()

In [4]:
_ann_iter = storage.iter_annotations(
    index=index, 
    simplify_form_types=True,
    simplify_field_types=True
)

annotations_all = list(a for a in _ann_iter if True)
annotations_complete = [a for a in annotations_all if a.fields_annotated]
len(annotations_complete), len(annotations_all)

(1355, 1418)

The model is two-stage:

1. First, we train Formasaurus form type detector.
2. Second, we use form type detector results to improve quality of field type detection.

We have form types available directly in training data, but in reality form type detecor will make mistakes. It is better for field type detector to account for this and not rely on form types blindly. So it should be trained on input where form type detection quality is roughly the same it'll be in real life. 

To get 'realistic' form type labels we split data into 10 folds, and then for each fold we predict its labels using form type detector trained on the rest 9 folds.

In [5]:
%%time
form_types_realistic = formtype_model.get_realistic_form_labels(annotations_complete, n_folds=10)

Wall time: 12.8 s


In [6]:
form_types_realistic[100]

u'contact/comment'

In [7]:
form_types_correct = np.asarray([a.type_full for a in annotations_complete])
accuracy_score(form_types_correct, form_types_realistic)

0.8929889298892989

## Train / Test Data

Prepare training/testing data for field type detection. We use 1/4 (25%) of data for testing.

In [8]:
%%time
def get_annotation_train_test_indices(annotations, n_folds=4):
    """
    Split annotations into train and test parts, return train and test indices.
    The size of test part is approximately ``len(annotations)/n_folds``.
    it is guaranteed forms from the same website can't be both
    in train and test parts.
    """
    for idx_train, idx_test in get_annotation_folds(annotations, n_folds):
        break
    return idx_train, idx_test

def select_by_index(arr, index):
    """
    Like numpy indexing, but for lists. This is for cases
    conversion to numpy array is problematic.

    >>> select_by_index(['a', 'b', 'c', 'd'], [0, 3])
    ['a', 'd']
    """
    return [arr[i] for i in index]

idx_train, idx_test = get_annotation_train_test_indices(annotations_complete, 4)

annotations_train = select_by_index(annotations_complete, idx_train)
annotations_test = select_by_index(annotations_complete, idx_test)

# form_types_train = form_types_realistic[idx_train]
form_types_train = form_types_correct[idx_train]
form_types_test = form_types_realistic[idx_test]

X_train, y_train = fieldtype_model.get_Xy(annotations_train, form_types_train, full_type_names=True)
X_test, y_test = fieldtype_model.get_Xy(annotations_test, form_types_test, full_type_names=True)
X, y = X_train + X_test, y_train + y_test
print(len(X_train), len(X_test))

#print "X:",X
#print "y",y

(1016, 339)
Wall time: 1.82 s


## Optimal Parameter Selection Using `RandomizedSearchCV`

Find regularization parameters for field type detector using randomized search. 
We're optimizing for weighted average of F1 metric. Only training data is used; 5-fold cross-validation is performed at each iteration.

In [9]:
%%time
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
#     'max_iterations': [25, 30, 35, 100]
#     'min_freq': [None, 2],
}

crf = CRF(all_possible_transitions=True, max_iterations=100)
rs = RandomizedSearchCV(crf, params_space, 
    cv=get_annotation_folds(annotations_train, 5), 
    verbose=1, 
    n_jobs=-1, 
    n_iter=50,
    iid=False, 
    scoring=scorers.sequence_accuracy #fieldtype_model.scorer
)

Wall time: 27 ms


In [13]:
print X_train[0], len(X_train)
print y_train[0], len(y_train)

[{'help': ['search'], 'tag': 'input', 'text-after': [], 'input-type': 'text', 'id': ['search_field'], 'is-last': True, 'name': ['q'], 'value-ngrams': [], 'text-before': [], 'form-type': u'search', 'css-class-ngrams': [], 'value': [], 'id-ngrams': ['sear', 'earc', 'arch', 'rch_', 'ch_f', 'h_fi', '_fie', 'fiel', 'ield'], 'is-first': True, 'bias': 1, 'name-ngrams-3-5': []}] 1016
[u'search query'] 1016


In [11]:
rs.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:  9.5min finished


RandomizedSearchCV(cv=sklearn.cross_validation.LabelKFold(n_labels=1016, n_folds=5),
          error_score='raise',
          estimator=CRF(algorithm=None, all_possible_states=None, all_possible_transitions=True,
  averaging=None, c=None, c1=None, c2=None, calibration_candidates=None,
  calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sens...e,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False),
          fit_params={}, iid=False, n_iter=50, n_jobs=-1,
          param_distributions={'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000000001A089CC0>, 'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000000001A089E48>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring=make_scorer(sequence_accuracy_score), verbose=1)

In [14]:
crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best score:', rs.best_score_)

('best params:', {'c2': 0.018810789758563505, 'c1': 0.06520442886522007})
('best score:', 0.7716748768472905)


In [15]:
rs.best_params_, crf.score(X_test, y_test)

({'c1': 0.06520442886522007, 'c2': 0.018810789758563505}, 0.85956006768189508)

## Train Model

In [17]:
%%time
crf = rs.best_estimator_
crf.fit(X_train, y_train)
crf.score(X_test, y_test)

Wall time: 6.37 s


## Test Model

In [18]:
y_pred = crf.predict(X_test)
print y_pred

[['search query'], ['search query'], ['username', 'password', 'password confirmation', 'email', 'captcha'], ['username', 'password'], ['email', 'captcha'], ['comment title or subject', 'comment text', 'captcha'], ['search query'], ['other'], ['other'], ['remember me checkbox', 'submit button'], ['search query'], ['username or email', 'submit button'], ['search category / refinement', 'search category / refinement', 'search category / refinement', 'search query', 'submit button'], ['search category / refinement', 'search category / refinement', 'search category / refinement', 'search query', 'submit button'], ['full name', 'email', 'comment text'], ['search query'], ['search query'], ['sorting option', 'sorting option', 'sorting option', 'submit button'], ['other'], ['search query', 'submit button'], ['email'], ['search query'], ['search query'], ['email', 'comment text', 'email'], ['email', 'comment text', 'email'], ['email', 'password', 'remember me checkbox', 'submit button'], ['emai

In [20]:
print(flat_classification_report(y_test, y_pred, digits=3, labels=crf.classes_, target_names=crf.classes_))

                              precision    recall  f1-score   support

                search query      0.843     0.980     0.907        99
                       email      0.945     0.987     0.966       156
                    password      1.000     0.966     0.983        88
            product quantity      1.000     0.875     0.933         8
               submit button      0.895     1.000     0.944        68
                    username      0.767     0.767     0.767        43
       password confirmation      1.000     1.000     1.000        24
 receive emails confirmation      0.909     0.370     0.526        27
                  first name      0.913     0.840     0.875        25
                   last name      0.870     0.800     0.833        25
           organization name      1.000     0.417     0.588        12
                     address      0.706     0.667     0.686        18
                        city      0.909     0.714     0.800        14
                   

In [21]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-30s -> %-30s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(100))

Top likely transitions:
password                       -> password confirmation          4.850814
month                          -> day                            4.269847
first name                     -> last name                      4.197286
security question              -> answer to security question    4.150840
email                          -> email confirmation             3.249438
username                       -> password                       3.238952
sorting option                 -> sorting option                 3.191673
postal code                    -> city                           3.061328
last name                      -> first name                     3.003666
comment title or subject       -> comment text                   2.875847
first name                     -> middle name                    2.792112
comment title or subject       -> comment title or subject       2.684398
phone                          -> fax                            2.452416
other number  

In [22]:
print("\nTop unlikely transitions:") 
print_transitions(Counter(crf.transition_features_).most_common()[-10:])


Top unlikely transitions:
comment text                   -> comment text                   -1.000960
email                          -> password confirmation          -1.119661
other                          -> search category / refinement   -1.136573
password confirmation          -> other                          -1.156820
search category / refinement   -> other                          -1.269673
remember me checkbox           -> remember me checkbox           -1.380078
submit button                  -> submit button                  -2.082952
TOS confirmation               -> other                          -2.146346
password                       -> password                       -2.502929
email                          -> email                          -4.131105


In [23]:
def _filtered_state_features(state_features, query, k=1):
    return Counter({
        (attr, label): weight
        for ((attr, label), weight) in state_features.items()
        if (query in attr or query in label) and k*weight >= 0
    })


def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-30s %s" % (weight, label, attr))    
        

def print_top_positive(crf, N, query=''):
    print("\nTop positive:")
    cnt = _filtered_state_features(crf.state_features_, query, 1)
    print_state_features(cnt.most_common(N))
    

def print_top_negative(crf, N, query=''):
    print("\nTop negative:")
    cnt = _filtered_state_features(crf.state_features_, query, -1)
    print_state_features(cnt.most_common()[-N:])
    

def print_top(crf, N, query=''):
    cnt = _filtered_state_features(crf.state_features_, query, 0)
    print_state_features(cnt.most_common(N))
    

print_top(crf, 150, 'form-type')

6.082718 search category / refinement   form-type:search
5.461345 search query                   form-type:search
3.370666 product quantity               form-type:order/add to cart
3.248102 remember me checkbox           form-type:login
2.941115 comment title or subject       form-type:contact/comment
2.933746 comment text                   form-type:contact/comment
2.386734 email                          form-type:join mailing list
2.327087 full name                      form-type:contact/comment
2.109714 username                       form-type:password/login recovery
2.072472 other number                   form-type:search
1.982849 TOS confirmation               form-type:other
1.861928 sorting option                 form-type:search
1.839960 other                          form-type:other
1.679222 receive emails confirmation    form-type:registration
1.517986 other                          form-type:join mailing list
1.506806 password                       form-type:login
1.388499 

In [25]:
print("\nTop negative:")
print_top_negative(crf, 30, 'input-type')


Top negative:

Top negative:
-0.038681 country                        input-type:text
-0.172698 other                          input-type:button
-0.235398 other                          input-type:password
-0.702573 comment title or subject       input-type:text
-1.009200 other                          input-type:text
-3.053583 search category / refinement   input-type:text
