In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.insert(0, '..')

from formasaurus import formtype_features as features
from formasaurus import formtype_model
from formasaurus.storage import Storage, load_html

### Available training data

In [3]:
# Open the annotation storage kept under the package data folder, run its
# consistency check and print how many forms of each simplified type it has.
DATA_PATH = "../formasaurus/data"
storage = Storage(DATA_PATH)
storage.check()
storage.print_form_type_counts(simplify=True)

Checking:  27%|######8                   | 253/954 [00:02<00:06, 110.61 files/s]


Invalid form count for entry 'html/cafephim.vn-1.html': expected 0, got 2
Invalid number of form field annotations for entry 'html/cafephim.vn-1.html'

Invalid form count for entry 'html/postr.hu-1.html': expected 0, got 6
Invalid number of form field annotations for entry 'html/postr.hu-1.html'


Checking:  85%|######################8    | 808/954 [00:07<00:01, 89.40 files/s]


Invalid form count for entry 'html/postr.hu-2.html': expected 0, got 6
Invalid number of form field annotations for entry 'html/postr.hu-2.html'


Checking: 100%|##########################| 954/954 [00:08<00:00, 169.67 files/s]
Loading: 12 files [00:00, 181.28 files/s]


Invalid form count for entry 'html/ddl-warez.in-0.html': expected 0, got 1
Invalid number of form field annotations for entry 'html/ddl-warez.in-0.html'

Invalid form count for entry 'html/www.elandroidelibre.com-0.html': expected 0, got 1
Invalid number of form field annotations for entry 'html/www.elandroidelibre.com-0.html'
Status: 10 error(s) found
Annotated HTML forms (simplified classes):



                                          

413   search                    (s)
246   login                     (l)
164   registration              (r)
146   other                     (o)
138   contact/comment           (c)
132   join mailing list         (m)
105   password/login recovery   (p)
74    order/add to cart         (b)

Total form count: 1418




## Load training / evaluation data

In [4]:
# Materialise all annotations (with simplified form and field types) so the
# same list can be reused for training and for cross-validation reports.
annotation_iter = storage.iter_annotations(
    simplify_form_types=True,
    simplify_field_types=True,
    verbose=True,
    leave=True,
)
annotations = list(annotation_iter)

# X holds the forms, y their form type labels (full type names).
X, y = formtype_model.get_Xy(annotations, full_type_names=True)

Loading: 954 files [00:06, 124.52 files/s]







## Ideas for useful features

### Search forms

* a single query field
* a field named "q" or "s"
* "search" in URL
* "search" in submit button text (submit value)
* "search" in form css class or id
* no password field
* method == GET?

### Login forms

* username or email and password
* 2 passwords - likely not a login form
* "login" or "sign in" (or variations) in URL
* "login" or "sign in" (or variations) in form css class or id
* "login" or "sign in" in submit button text
* "Remember me" checkbox (or any single checkbox)
* no select elements
* no textarea elements
* openid?
* method == POST

### Registration forms

* 2 passwords 
* "register" / "sign up" in URL, form css class / id or submit button text
* "agree" checkbox
* email
* username
* method == POST

### Contact forms

* feedback in URL/class
* textarea
* "Send" button
* email
* method == POST

### Password reset

* a single email or username field
* "password" in URL/css class/ submit button text

### Join Mailing List

* a single email field
* subscribe/join/newsletter words
* a short form

The main problem with "join mailing list" forms is to distinguish them from search forms.

## How to handle them

Instead of hardcoding the features above, many of them are generalized. For example, instead of writing "search in URL" we extract all 5-character substrings from the URL and use "`urlsubstring<N>` in URL" as features. This approach has some disadvantages, but it provides a good starting point.

The feature extractors are stored in the `formasaurus.formtype_features` module.

In [5]:
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, precision_recall_fscore_support

from eli5 import explain_weights, format_as_html

In [6]:
%%time
from formasaurus.formtype_model import _create_feature_union


def _char_ngram_tfidf(ngram_range, min_df):
    """
    Return a binary TF-IDF vectorizer over word-bounded character n-grams.

    ``ngram_range`` is the ``(min_n, max_n)`` range of n-gram lengths and
    ``min_df`` is the minimum document frequency an n-gram needs to be kept.
    """
    return TfidfVectorizer(ngram_range=ngram_range, min_df=min_df,
                           binary=True, analyzer="char_wb")


# ======= define the model ========

# features should be kept in sync with formasaurus.formtype_features
# a list of 3-tuples with default features:
# (feature_name, form_transformer, vectorizer)
FEATURES = [
    (
        # bias feature is for easier debugging, it should be removed
        # in production!
        "bias",
        features.Bias(),
        DictVectorizer(),
    ),
    (
        "form elements",
        features.FormElements(),
        DictVectorizer(),
    ),
    (
        "<input type=submit value=...>",
        features.SubmitText(),
        CountVectorizer(ngram_range=(1,2), min_df=1, binary=True),
    ),
    (
        "<a> TEXT </a>",
        features.FormLinksText(),
        # stop_words is a list (not a set): a list is the documented type, and
        # scikit-learn versions with parameter validation reject other
        # containers when the vectorizer is fitted.
        TfidfVectorizer(ngram_range=(1,2), min_df=4, binary=True,
                        stop_words=['and', 'or', 'of']),
    ),
    (
        "<label> TEXT </label>",
        features.FormLabelText(),
        TfidfVectorizer(ngram_range=(1,2), min_df=3, binary=True,
                        stop_words="english"),
    ),

    # The remaining text features use character n-grams instead of words.
    (
        "<form action=...>",
        features.FormUrl(),
        _char_ngram_tfidf(ngram_range=(5,6), min_df=4),
    ),
    (
        "<form class=... id=...>",
        features.FormCss(),
        _char_ngram_tfidf(ngram_range=(4,5), min_df=3),
    ),
    (
        "<input class=... id=...>",
        features.FormInputCss(),
        _char_ngram_tfidf(ngram_range=(4,5), min_df=5),
    ),
    (
        "<input name=...>",
        features.FormInputNames(),
        _char_ngram_tfidf(ngram_range=(5,6), min_df=3),
    ),
    (
        "<input title=...>",
        features.FormInputTitle(),
        _char_ngram_tfidf(ngram_range=(5,6), min_df=3),
    ),
]


# Alternative classifiers, kept for reference:
#
# clf = SGDClassifier(
#     penalty='elasticnet',
#     loss='log',
#     alpha=0.0002,
#     fit_intercept=False,
#     shuffle=True,
#     random_state=0,
#     n_iter=50,
# )
# clf = LinearSVC(C=0.5, random_state=0, fit_intercept=False)

# No intercept is fitted by the classifier itself; FEATURES contains
# an explicit "bias" feature instead.
clf = LogisticRegression(
    penalty='l2',
    C=5,
    fit_intercept=False,
    random_state=0,
    tol=0.01,
)

fe = _create_feature_union(FEATURES)
model = Pipeline([
    ('fe', fe),
    ('clf', clf),
])

model.fit(X, y)
formtype_model.print_classification_report(annotations, n_splits=10, model=model)

                         precision    recall  f1-score   support

                 search       0.90      0.96      0.93       413
                  login       0.96      0.97      0.96       246
           registration       0.95      0.86      0.90       164
password/login recovery       0.86      0.85      0.86       105
        contact/comment       0.87      0.93      0.90       138
      join mailing list       0.87      0.88      0.87       132
      order/add to cart       0.93      0.57      0.71        74
                  other       0.65      0.66      0.66       146

            avg / total       0.88      0.88      0.88      1418

88.1% forms are classified correctly.
CPU times: user 21.1 s, sys: 335 ms, total: 21.4 s
Wall time: 22 s


## Check what classifier learned

For linear classifiers like Logistic Regression or an SVM without a kernel we can check coefficient values to understand better how the decision is made. 

For correlated features (notably, n-grams) the weight will be spread across all correlated features, so just checking coefficients is not enough, but looking at them is useful anyway.

In [7]:
from IPython.display import HTML

# Fit the pipeline on all the data; this fits `fe` and `clf` in place, so
# they can be handed to eli5 directly.
model.fit(X, y)

# Explain the 100 top weights per class and render the result as HTML.
expl = explain_weights(clf, vec=fe, top=100)
weights_html = format_as_html(
    expl,
    highlight_spaces=False,
    horizontal_layout=False,
)
HTML(weights_html)

Weight,Feature
+5.857,form elements__has <textarea>
+2.440,<label> TEXT </label>__question
+1.618,<form action=...>__ kont
+1.570,<a> TEXT </a>__agent
+1.567,<a> TEXT </a>__affiliate
+1.521,<a> TEXT </a>__mobile
+1.509,<a> TEXT </a>__forums
+1.345,<input type=submit value=...>__send
+1.321,<a> TEXT </a>__top
+1.220,<a> TEXT </a>__privacy statement

Weight,Feature
+1.681,<input name=...>__ email
+1.681,<input name=...>__ emai
+1.473,<input name=...>__email
+1.351,form elements__has <input type=email>
+1.348,<input name=...>__ go
+1.310,<input type=submit value=...>__подписаться
+1.263,<input type=submit value=...>__subscribe
+1.250,<input type=submit value=...>__cadastrar
+1.209,<input name=...>__news
+1.192,<form class=... id=...>__mail

Weight,Feature
+3.315,form elements__exactly one <input type=password>
+2.071,<input type=submit value=...>__로그인
+2.053,<form action=...>__login
+1.837,<input type=submit value=...>__login
+1.559,<input type=submit value=...>__submit
+1.535,<input type=submit value=...>__in
+1.277,<input type=submit value=...>__sign in
+1.198,<input name=...>__next
+1.155,<form action=...>__pauth
+1.132,<form action=...>__oginc

Weight,Feature
+3.273,<input name=...>__ qty
+2.244,<form action=...>__ cart
+1.937,<input type=submit value=...>__заказ
+1.633,<a> TEXT </a>__email
+1.532,<form class=... id=...>__ frm
+1.525,<form class=... id=...>__frm
+1.465,<input name=...>__list
+1.426,<input name=...>__quant
+1.419,<input name=...>__oncode
+1.419,<input name=...>__oncod

Weight,Feature
+1.880,<input type=submit value=...>__跳转到
+1.792,<input class=... id=...>__from
+1.789,<input type=submit value=...>__enter
+1.766,<input name=...>__surl
+1.728,<form class=... id=...>__ xe
+1.490,<input type=submit value=...>__validate
+1.463,<a> TEXT </a>__conditions
+1.412,<input type=submit value=...>__keresés mentése
+1.412,<input type=submit value=...>__mentése
+1.367,<input name=...>__code

Weight,Feature
+2.395,<input type=submit value=...>__password
+2.288,<input type=submit value=...>__reset
+1.806,<input type=submit value=...>__priminti
+1.704,<input type=submit value=...>__odošli
+1.666,<form class=... id=...>__pass
+1.507,<input type=submit value=...>__reset password
+1.439,<label> TEXT </label>__email
+1.369,<input name=...>__mail
+1.327,<form action=...>__asswor
+1.327,<form action=...>__asswo

Weight,Feature
+3.221,form elements__exactly two <input type=password>
+2.622,<input type=submit value=...>__register
+1.834,<input type=submit value=...>__create
+1.662,<form action=...>__regis
+1.639,<form action=...>__egist
+1.639,<form action=...>__regist
+1.539,<form action=...>__egiste
+1.539,<form action=...>__giste
+1.539,<form action=...>__gister
+1.539,<form action=...>__ister

Weight,Feature
+2.916,<input name=...>__ q
+2.473,<input name=...>__ s
+1.945,<label> TEXT </label>__sort
+1.859,<input type=submit value=...>__search
+1.686,<form action=...>__&sid=
+1.623,<input name=...>__searc
+1.623,<input name=...>__search
+1.623,<input name=...>__earch
+1.521,form elements__has <select>
+1.503,<label> TEXT </label>__year


## Compare results with "loginform" library

It is not possible to compare the results with "loginform" library directly because loginform

* always tries to return a login form even if the score is low;
* only detects login forms;
* in case of several forms returns a single form with the best score instead of deciding for each form whether to return it or not.

So we used two approaches:

1. Use `loginform._form_score` with different thresholds; assume that if score is greater than or equal to a threshold `loginform` detected a login form.
2. Train the same model, but using features from loginform library (weights will be learned instead of being hardcoded as 'score' increments/decrements).



### 1. loginform scores + thresholds

* **score >= -100** means "simply treat all forms as login forms".

* **score >= 0** all (or most) login forms are captured, but there are many false positives. 
  It is only slightly better than treating all forms as login forms.

* **score >= 10** F1 score is the best among all thresholds, 
  but the quality is significantly worse than F1 of ML-based models.
  
* **score >= 20** ~90% of detected login forms are correct, but most 
  login forms are not detected. Also, this ~90% precision is still lower than what ML-based models give us.


In [8]:
%%time 
import loginform

def labels_to_binary(y):
    """ Map each label to True if it is 'login' and to False otherwise """
    is_login = []
    for form_type in y:
        is_login.append(form_type == 'login')
    return is_login

    
def predict_loginform(X, threshold):
    """
    Decide for every form in X whether it is a login form: a form is
    reported as a login form when its loginform library score is
    greater than or equal to the threshold.
    """
    predictions = []
    for form in X:
        score = loginform._form_score(form)
        predictions.append(score >= threshold)
    return predictions


def print_threshold_metrics(X, y, threshold):
    """
    Print precision, recall and F1 of login form detection based on
    loginform scores and a threshold.

    X is an iterable of forms and y holds their form type labels; a form
    is predicted to be a login form if its score is >= threshold.
    """
    y_true = labels_to_binary(y)
    y_pred = predict_loginform(X, threshold)

    # average='binary' makes scikit-learn report the metrics of the positive
    # (login) class only. Without it pos_label is ignored and per-class arrays
    # are returned, so the login class had to be picked by position; that
    # fails when only one class is present and causes an undefined-metric
    # warning when no form is predicted to be a non-login form.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, pos_label=True, average='binary'
    )
    print(
        "score >= %4d:    precision = %0.3f    recall = %0.3f    F1 = %0.3f" % (
        threshold, precision, recall, f1
    ))


# Score thresholds to evaluate, from "accept everything" to very strict.
THRESHOLDS = (-100, -10, 0, 10, 20, 30)
for score_threshold in THRESHOLDS:
    print_threshold_metrics(X, y, score_threshold)

  'precision', 'predicted', average, warn_for)


score >= -100:    precision = 0.173    recall = 1.000    F1 = 0.296
score >=  -10:    precision = 0.200    recall = 0.992    F1 = 0.333
score >=    0:    precision = 0.325    recall = 0.984    F1 = 0.489
score >=   10:    precision = 0.710    recall = 0.886    F1 = 0.788
score >=   20:    precision = 0.869    recall = 0.350    F1 = 0.499
score >=   30:    precision = 1.000    recall = 0.020    F1 = 0.040
CPU times: user 3.87 s, sys: 38.1 ms, total: 3.9 s
Wall time: 4.01 s


### 2. Use loginform features, but autodetect scores

The following ML-based model is trained using original loginform features (conditions used to increase or decrease the score). Roughly speaking, it uses the same information as loginform library, but instead of hardcoding `score += 10` and `score -= 10` the numbers are adjusted based on training data.

Note that the login form detection quality is significantly better than the quality of threshold-based model; it is only slightly worse than the quality of a "full" forms detection model. This means original loginform features are quite good at detecting login forms. But for other form types these features are not enough: other scores are bad.

In [9]:
%%time

# Only a bias term plus the features of the original loginform library.
LOGINFORM_FEATURES = [
    ('bias', features.Bias(), DictVectorizer()),
    ('loginform', features.OldLoginformFeatures(), DictVectorizer()),
]

# loginform_clf = LinearSVC(C=0.5, fit_intercept=False)
loginform_clf = LogisticRegression(
    penalty='l2',
    C=5,
    fit_intercept=False,
    random_state=0,
)

# NOTE: `fe` and `model` are rebound here, so from this cell on they refer
# to the loginform-only pipeline and no longer to the full model above.
fe = _create_feature_union(LOGINFORM_FEATURES)
model = make_pipeline(fe, loginform_clf)

formtype_model.print_classification_report(annotations, n_splits=10, model=model)

                 precision    recall  f1-score   support

         search       0.54      0.80      0.65       413
          login       0.92      0.96      0.94       246
   registration       0.96      0.66      0.78       164
contact/comment       0.46      0.74      0.57       138
          other       0.40      0.60      0.48       146

    avg / total       0.66      0.78      0.70      1107

60.9% forms are classified correctly.
CPU times: user 6.67 s, sys: 66.3 ms, total: 6.73 s
Wall time: 6.88 s


In [10]:
# Fit the current pipeline on all the data, then show the weights learned
# by its classifier; `fe` provides the feature names.
model.fit(X, y)
explain_weights(loginform_clf, vec=fe)

y=contact/comment  top features,y=contact/comment  top features,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0
Weight,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
y=join mailing list  top features,y=join mailing list  top features,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Weight,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3
y=login  top features,y=login  top features,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4
Weight,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5
y=order/add to cart  top features,y=order/add to cart  top features,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6
Weight,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7
y=other  top features,y=other  top features,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8
Weight,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9
y=password/login recovery  top features,y=password/login recovery  top features,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10
Weight,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11
y=registration  top features,y=registration  top features,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12
Weight,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13
y=search  top features,y=search  top features,Unnamed: 2_level_14,Unnamed: 3_level_14,Unnamed: 4_level_14,Unnamed: 5_level_14,Unnamed: 6_level_14,Unnamed: 7_level_14
Weight,Feature,Unnamed: 2_level_15,Unnamed: 3_level_15,Unnamed: 4_level_15,Unnamed: 5_level_15,Unnamed: 6_level_15,Unnamed: 7_level_15
+3.690,loginform__typecount_password_0,,,,,,
+3.011,loginform__typecount_text_gt1,,,,,,
+0.045,loginform__typecount_text_0,,,,,,
+0.022,loginform__typecount_radio_gt0,,,,,,
-1.186,loginform__typecount_checkbox_gt1,,,,,,
-1.492,loginform__typecount_password_eq1,,,,,,
-1.521,loginform__2_or_3_inputs,,,,,,
-6.627,bias__bias,,,,,,
+2.484,loginform__typecount_password_0,,,,,,
+0.835,loginform__typecount_checkbox_gt1,,,,,,

y=contact/comment  top features,y=contact/comment  top features
Weight,Feature
3.69,loginform__typecount_password_0
3.011,loginform__typecount_text_gt1
0.045,loginform__typecount_text_0
0.022,loginform__typecount_radio_gt0
-1.186,loginform__typecount_checkbox_gt1
-1.492,loginform__typecount_password_eq1
-1.521,loginform__2_or_3_inputs
-6.627,bias__bias

y=join mailing list  top features,y=join mailing list  top features
Weight,Feature
2.484,loginform__typecount_password_0
0.835,loginform__typecount_checkbox_gt1
0.203,loginform__2_or_3_inputs
-0.35,loginform__typecount_text_0
-0.814,loginform__typecount_text_gt1
-0.871,loginform__typecount_radio_gt0
-2.101,loginform__typecount_password_eq1
-4.284,bias__bias

y=login  top features,y=login  top features
Weight,Feature
5.689,loginform__typecount_password_eq1
0.805,loginform__2_or_3_inputs
-0.246,loginform__typecount_text_0
-1.909,loginform__typecount_password_0
-1.956,loginform__typecount_checkbox_gt1
-2.165,loginform__typecount_text_gt1
-2.221,loginform__typecount_radio_gt0
-2.699,bias__bias

y=order/add to cart  top features,y=order/add to cart  top features
Weight,Feature
1.848,loginform__typecount_password_0
0.512,loginform__typecount_radio_gt0
0.276,loginform__typecount_text_0
-0.442,loginform__2_or_3_inputs
-0.564,loginform__typecount_checkbox_gt1
-0.76,loginform__typecount_text_gt1
-2.03,loginform__typecount_password_eq1
-4.264,bias__bias

y=other  top features,y=other  top features
Weight,Feature
2.552,loginform__typecount_text_0
1.98,loginform__typecount_password_0
0.968,loginform__typecount_radio_gt0
0.956,loginform__typecount_text_gt1
-0.322,loginform__2_or_3_inputs
-0.523,loginform__typecount_checkbox_gt1
-1.14,loginform__typecount_password_eq1
-4.958,bias__bias

y=password/login recovery  top features,y=password/login recovery  top features
Weight,Feature
1.278,loginform__typecount_password_0
0.747,loginform__2_or_3_inputs
-0.286,loginform__typecount_text_gt1
-0.641,loginform__typecount_text_0
-2.33,loginform__typecount_checkbox_gt1
-2.437,loginform__typecount_radio_gt0
-2.796,loginform__typecount_password_eq1
-3.494,bias__bias

y=registration  top features,y=registration  top features
Weight,Feature
2.017,loginform__typecount_text_gt1
1.581,bias__bias
0.762,loginform__typecount_checkbox_gt1
0.588,loginform__typecount_radio_gt0
0.396,loginform__typecount_text_0
0.062,loginform__2_or_3_inputs
-4.509,loginform__typecount_password_eq1
-6.013,loginform__typecount_password_0

y=search  top features,y=search  top features
Weight,Feature
3.781,loginform__typecount_password_0
1.039,loginform__typecount_checkbox_gt1
-0.014,loginform__2_or_3_inputs
-0.384,loginform__typecount_radio_gt0
-1.197,loginform__typecount_text_0
-2.221,loginform__typecount_text_gt1
-2.463,loginform__typecount_password_eq1
-3.596,bias__bias
