In [1]:
import sys
sys.path.insert(0, '..')

from formasaurus import formtype_features as features
from formasaurus import formtype_model
from formasaurus.storage import Storage, load_html

### Available training data

In [2]:
storage = Storage("../formasaurus/data")
storage.check()
storage.print_form_type_counts(simplify=True)

Checking: 100%|##########################| 832/832 [00:04<00:00, 162.55 files/s]
                                          

Status: OK
Annotated HTML forms (simplified classes):

364   search                    (s)
221   login                     (l)
153   registration              (r)
122   other                     (o)
120   contact/comment           (c)
107   join mailing list         (m)
95    password/login recovery   (p)
62    order/add to cart         (b)

Total form count: 1244




## Load training / evaluation data

In [3]:
annotations = list(storage.iter_annotations(
    simplify_form_types=True,
    simplify_field_types=True,
    verbose=True,
    leave=True,        
))
X, y = formtype_model.get_Xy(annotations, full_type_names=True)

Loading: 832 files [00:03, 259.92 files/s]







## Ideas for useful features

### Search forms

* a single query field
* a field named "q" or "s"
* "search" in URL
* "search" in submit button text (submit value)
* "search" in form css class or id
* no password field
* method == GET?

### Login forms

* username or email and password
* 2 passwords - likely not a login form
* "login" or "sign in" (or variations) in URL
* "login" or "sign in" (or variations) in form css class or id
* "login" or "sign in" in submit button text
* "Remember me" checkbox (or any single checkbox)
* no select elements
* no textarea elements
* openid?
* method == POST

### Registration forms

* 2 passwords 
* "register" / "sign up" in URL, form css class / id or submit button text
* "agree" checkbox
* email
* username
* method == POST

### Contact forms

* feedback in URL/class
* textarea
* "Send" button
* email
* method == POST

### Password reset

* a single email or username field
* "password" in URL/css class/ submit button text

### Join Mailing List

* a single email field
* subscribe/join/newsletter words
* a short form

The main problem with "join mailing list" forms is to distinguish them from search forms.

## How to handle them

Instead of hardcoding the features above many of them are generalized. For exmaple, instead of writing "search in URL" we extract all 5-character substrings from the URL and use "`urlsubstring<N>` in URL" as features. There are some disadvantages in this approach, but it provides a good starting point.

The feature extractors are stored in formtype.features module.

In [4]:
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, precision_recall_fscore_support

In [5]:
%%time
# reload(features)
from formasaurus.formtype_model import _create_feature_union
# ======= define the model ========

# features should be kept in sync with formasaurus.formtype_features
# a list of 3-tuples with default features:
# (feature_name, form_transformer, vectorizer)
FEATURES = [
    (
        "bias",
        features.Bias(),
        DictVectorizer(),
    ),
    (
        "form elements",
        features.FormElements(),
        DictVectorizer()
    ),
    (
        "<input type=submit value=...>",
        features.SubmitText(),
        CountVectorizer(ngram_range=(1,2), min_df=1, binary=True)
    ),
    (
        "<a> TEXT </a>",
        features.FormLinksText(),
        TfidfVectorizer(ngram_range=(1,2), min_df=4, binary=True,
                        stop_words={'and', 'or', 'of'})
    ),
    (
        "<label> TEXT </label>",
        features.FormLabelText(),
        TfidfVectorizer(ngram_range=(1,2), min_df=3, binary=True,
                        stop_words="english")
    ),

    (
        "<form action=...>",
        features.FormUrl(),
        TfidfVectorizer(ngram_range=(5,6), min_df=4, binary=True,
                        analyzer="char_wb")
    ),
    (
        "<form class=... id=...>",
        features.FormCss(),
        TfidfVectorizer(ngram_range=(4,5), min_df=3, binary=True,
                        analyzer="char_wb")
    ),
    (
        "<input class=... id=...>",
        features.FormInputCss(),
        TfidfVectorizer(ngram_range=(4,5), min_df=5, binary=True,
                        analyzer="char_wb")
    ),
    (
        "<input name=...>",
        features.FormInputNames(),
        TfidfVectorizer(ngram_range=(5,6), min_df=3, binary=True,
                        analyzer="char_wb")
    ),
    (
        "<input title=...>",
        features.FormInputTitle(),
        TfidfVectorizer(ngram_range=(5,6), min_df=3, binary=True,
                        analyzer="char_wb")
    ),
]


# clf = SGDClassifier(
#     penalty='elasticnet', 
#     loss='log', 
#     alpha=0.0002,
#     fit_intercept=False, 
#     shuffle=True, 
#     random_state=0,
#     n_iter=50,
# )
clf = LogisticRegression(penalty='l2', C=5, fit_intercept=False, random_state=0, tol=0.01)

# clf = LinearSVC(C=0.5, random_state=0, fit_intercept=False)
model = Pipeline([
    ('fe', _create_feature_union(FEATURES)),
    ('clf', clf),
])

formtype_model.print_classification_report(annotations, n_folds=10, model=model)

                         precision    recall  f1-score   support

                 search       0.90      0.96      0.93       364
                  login       0.96      0.96      0.96       221
           registration       0.96      0.84      0.90       153
password/login recovery       0.89      0.87      0.88        95
        contact/comment       0.86      0.95      0.90       120
      join mailing list       0.92      0.86      0.89       107
      order/add to cart       0.95      0.66      0.78        62
                  other       0.65      0.70      0.67       122

            avg / total       0.89      0.89      0.89      1244

88.8% forms are classified correctly.
CPU times: user 12.7 s, sys: 70 ms, total: 12.7 s
Wall time: 12.9 s


## Check what classifier learned

For linear classifiers like Logistic Regression or an SVM without a kernel we can check coefficient values to understand better how the decision is made. 

For correlated features (notably, n-grams) weight will be spread across all correlated features, so just checking coefficients is not enough, but looking at them is useful anyways.

In [6]:
def get_informative_features(vectorizers, clf, class_labels, N):
    """
    Return text with features with the highest absolute coefficient
    values, per class.
    """
    feature_names = []
    for vec_name, vec in vectorizers:
        feature_names.extend(
            "%30s  %s" % (vec_name, name) for name in vec.get_feature_names()
        )
    features_by_class = []
    for i, class_label in enumerate(class_labels):
        topN = np.argsort(clf.coef_[i])[-N:]
        bottomN = np.argsort(clf.coef_[i])[:N]
        res = []

        for j in reversed(topN):
            coef = clf.coef_[i][j]
            if coef > 0:
                res.append("+%0.4f: %s" % (coef, feature_names[j]))

        if (len(topN) >= N) or (len(bottomN) >= N):
            res.append("   ...")

        for j in reversed(bottomN):
            coef = clf.coef_[i][j]
            if coef < 0:
                res.append("%0.4f: %s" % (coef, feature_names[j]))
        features_by_class.append((class_label, '\n'.join(res)))
    return features_by_class


def print_informative_features(features, clf, top, classes=None, class_map=None):
    vectorizers = [(name, vec) for (name, fe, vec) in features]
    feat_info = get_informative_features(vectorizers, clf, clf.classes_, top)
    for cls, report in feat_info:
        if classes is not None and cls not in classes:
            continue
        if class_map is not None:
            print(class_map[cls])
        else:
            print(cls)
        print(report)
        print("-"*80)

In [7]:
model.fit(X, y)
print_informative_features(FEATURES, clf, 25)

contact/comment
+5.7038:                  form elements  has <textarea>
+2.4165:          <label> TEXT </label>  question
+1.6252:              <form action=...>   kont
+1.5018:                  <a> TEXT </a>  agent
+1.4977:                  <a> TEXT </a>  affiliate
+1.4516:                  <a> TEXT </a>  mobile
+1.4413:                  <a> TEXT </a>  forums
+1.4255:                  <a> TEXT </a>  privacy statement
+1.3809:                  <a> TEXT </a>  statement
+1.2500:                  <a> TEXT </a>  contact
+1.2029:          <label> TEXT </label>  message
+1.1769:  <input type=submit value=...>  send
+1.1669:                  <a> TEXT </a>  contact us
+1.0759:                  <a> TEXT </a>  us
+0.8678:                  <a> TEXT </a>  privacy
+0.8337:        <form class=... id=...>  ques
+0.8337:        <form class=... id=...>  uest
+0.8337:        <form class=... id=...>  quest
+0.7952:                  <a> TEXT </a>  http
+0.7952:                  <a> TEXT </a>  http www
+0.

## Compare results with "loginform" library

It is not possible to compare the results with "loginform" library directly because loginform

* always tries to return a login form even if the score is low;
* only detects login forms;
* in case of several forms returns a single form with the best score instead of deciding for each form whether to return it or not.

So we used two approaches:

1. Use `loginform._form_score` with different thresholds; assume that if score is greater than or equal to a threshold `loginform` detected a login form.
2. Train the same model, but using features from loginform library (weights will be learned instead of being hardcoded as 'score' increments/decrements).



### 1. loginform scores + thresholds

* **score >= -100** means "simply treat all forms as login forms".

* **score >= 0** all (or most) login forms are captured, but there are many false positives. 
  It is only slightly better than treating all forms as login forms.

* **score >= 10** F1 score is the best among all thresholds, 
  but the quality is significantly worse than F1 of ML-based models.
  
* **score >= 20** ~90% of detected login forms are correct, but most 
  login forms are not detected. Also, ~90% number is still lower than what ML-based models give us.


In [8]:
%%time 
import loginform

def labels_to_binary(y):
    """ Convert labels to 2-classes: login forms and non-login forms """
    return [tp == 'login' for tp in y]

    
def predict_loginform(X, threshold):
    """
    Return if forms are login or not using loginform
    library scores and a threshold.
    """
    return [
        (loginform._form_score(form) >= threshold)
        for form in X
    ]


def print_threshold_metrics(X, y, threshold):
    y = labels_to_binary(y)
    y_pred = predict_loginform(X, threshold)

    precision, recall, f1, support = precision_recall_fscore_support(y, y_pred, pos_label=True)
    print(
        "score >= %4d:    precision = %0.3f    recall = %0.3f    F1 = %0.3f" % (
        threshold, precision[1], recall[1], f1[1]
    ))


for threshold in [-100, -10, 0, 10, 20, 30]:
    print_threshold_metrics(X, y, threshold)

score >= -100:    precision = 0.178    recall = 1.000    F1 = 0.302
score >=  -10:    precision = 0.201    recall = 0.973    F1 = 0.333
score >=    0:    precision = 0.321    recall = 0.968    F1 = 0.483
score >=   10:    precision = 0.708    recall = 0.869    F1 = 0.780
score >=   20:    precision = 0.854    recall = 0.317    F1 = 0.462
score >=   30:    precision = 1.000    recall = 0.009    F1 = 0.018
CPU times: user 2.3 s, sys: 6.62 ms, total: 2.31 s
Wall time: 2.37 s


  'precision', 'predicted', average, warn_for)


### 2. Use loginform features, but autodetect scores

The following ML-based model is trained using original loginform features (conditions used to increase or decrease the score). Roughly speaking, it uses the same information as loginform library, but instead of hardcoding `score += 10` and `score -= 10` the numbers are adjusted based on training data.

Note that the login form detection quality is significantly better than the quality of threshold-based model; it is only slightly worse than the quality of a "full" forms detection model. This means original loginform features are quite good at detecting login forms. But for other form types these features are not enough: other scores are bad.

In [9]:
%%time

LOGINFORM_FEATURES = [
    ('bias', features.Bias(), DictVectorizer()),
    ('loginform', features.OldLoginformFeatures(), DictVectorizer())
]
# loginform_clf = LinearSVC(C=0.5, fit_intercept=False)
loginform_clf = LogisticRegression(penalty='l2', C=5, fit_intercept=False, random_state=0)

model = make_pipeline(
    _create_feature_union(LOGINFORM_FEATURES), 
    loginform_clf,
)

formtype_model.print_classification_report(annotations, n_folds=10, model=model)

model.fit(X, y)
print_informative_features(LOGINFORM_FEATURES, loginform_clf, 25)

                 precision    recall  f1-score   support

         search       0.55      0.80      0.65       364
          login       0.92      0.95      0.93       221
   registration       0.94      0.67      0.78       153
contact/comment       0.48      0.74      0.58       120
          other       0.38      0.59      0.46       122

    avg / total       0.66      0.78      0.70       980

61.6% forms are classified correctly.
contact/comment
+3.6031:                      loginform  typecount_password_0
+2.9595:                      loginform  typecount_text_gt1
+0.1378:                      loginform  typecount_text_0
+0.0472:                      loginform  typecount_radio_gt0
-1.3174:                      loginform  typecount_checkbox_gt1
-1.4609:                      loginform  typecount_password_eq1
-1.5214:                      loginform  2_or_3_inputs
-6.4988:                           bias  bias
--------------------------------------------------------------------------