### Using BERT
https://nbviewer.org/github/interpretml/interpret-text/blob/master/notebooks/text_classification/text_classification_unified_information_explainer.ipynb

In [1]:
import sys
sys.path.append("../../")
import os
import json
import pandas as pd
import numpy as np
import scrapbook as sb
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn

from interpret_text.experimental.common.utils_bert import Language, Tokenizer, BERTSequenceClassifier
from interpret_text.experimental.common.timer import Timer


# Locations of the input CSV files, relative to the notebook's working directory.
PATH_SENTENCE_TEXT = "../dataset/concat_sentence_text.csv"
PATH_SEGMENT_TEXT = "../dataset/concat_segment_text.csv"



In [2]:
from interpret_text.experimental.unified_information import UnifiedInformationExplainer

In [3]:
# Set parameters

# --- Run size ---
TRAIN_DATA_FRACTION = 1
TEST_DATA_FRACTION = 1
NUM_EPOCHS = 1

# Training batch size: 1 when CUDA is available, otherwise 8.
# NOTE(review): the recorded training run shows 4789 iterations over 4789
# training examples (i.e. batch size 1), and the recorded evaluation assigns
# every test row to a single class. Consider revisiting batch size / epochs.
BATCH_SIZE = 1 if torch.cuda.is_available() else 8

# --- Paths ---
DATA_FOLDER = "../temp"
BERT_CACHE_DIR = "../temp"

# --- Tokenisation / model ---
LANGUAGE = Language.ENGLISH
TO_LOWER = True
MAX_LEN = 150          # passed to preprocess_classification_tokens
BATCH_SIZE_PRED = 512  # batch size used by classifier.predict

# --- Data split and column names ---
TRAIN_SIZE = 0.6       # train_size passed to train_test_split
LABEL_COL = "practice"
TEXT_COL = "sentence_text"

In [4]:
# Load the sentence-level dataset and preview the first five rows.
df = pd.read_csv(PATH_SENTENCE_TEXT)
df.head(5)

Unnamed: 0,sentence_text,practice,modality
0,"IP ADDRESS, COOKIES, AND WEB BEACONS",Identifier_Cookie_or_similar_Tech_1stParty,PERFORMED
1,"IP ADDRESS, COOKIES, AND WEB BEACONS",Identifier_IP_Address_1stParty,PERFORMED
2,"IP addresses will be collected, along with inf...",Identifier_IP_Address_1stParty,PERFORMED
3,The information that our products collect incl...,Identifier_Cookie_or_similar_Tech_1stParty,PERFORMED
4,The information that our products collect incl...,Identifier_IP_Address_1stParty,PERFORMED


In [5]:
# Cast the text and label columns and drop the unused "modality" column in one
# chained expression instead of mutating `df` step by step.
# errors="ignore" keeps the cell re-runnable: on a second execution "modality"
# is already gone, and a plain drop would raise KeyError.
df = (
    df
    .astype({"sentence_text": "string", "practice": "category"})
    .drop(columns="modality", errors="ignore")
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18829 entries, 0 to 18828
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   sentence_text  18829 non-null  string  
 1   practice       18829 non-null  category
dtypes: category(1), string(1)
memory usage: 168.1 KB


### Restrict to predicting the top 5 categories to make the task easier for the classifier, since there may not be enough training samples per category

In [6]:
# Five most frequent practice labels (value_counts sorts by descending count).
df["practice"].value_counts().head(5)

Identifier_Cookie_or_similar_Tech_1stParty    2107
Contact_E_Mail_Address_1stParty               2106
Location_1stParty                             1514
Identifier_Cookie_or_similar_Tech_3rdParty    1250
Identifier_IP_Address_1stParty                1005
Name: practice, dtype: int64

In [7]:
# The five most frequent practice labels, in the order reported by value_counts() above.
top_5_cats = [
    "Identifier_Cookie_or_similar_Tech_1stParty",
    "Contact_E_Mail_Address_1stParty",
    "Location_1stParty",
    "Identifier_Cookie_or_similar_Tech_3rdParty",
    "Identifier_IP_Address_1stParty",
]

In [8]:
# Keep only the rows whose label is one of the top 5 categories.
df_filtered = df.loc[df["practice"].isin(top_5_cats)]
df_filtered.head(5)

Unnamed: 0,sentence_text,practice
0,"IP ADDRESS, COOKIES, AND WEB BEACONS",Identifier_Cookie_or_similar_Tech_1stParty
1,"IP ADDRESS, COOKIES, AND WEB BEACONS",Identifier_IP_Address_1stParty
2,"IP addresses will be collected, along with inf...",Identifier_IP_Address_1stParty
3,The information that our products collect incl...,Identifier_Cookie_or_similar_Tech_1stParty
4,The information that our products collect incl...,Identifier_IP_Address_1stParty


In [9]:
# Sanity check: the combined count of the five most frequent labels should
# equal the number of rows kept by the filter.
print(df["practice"].value_counts().head(5).sum())
print(df_filtered.shape[0])

7982
7982


In [10]:
# Split with a fixed seed, then give each partition a fresh 0..n-1 index so
# rows can later be addressed by position-like labels.
df_train, df_test = (
    part.reset_index(drop=True)
    for part in train_test_split(df_filtered, train_size=TRAIN_SIZE, random_state=0)
)

In [11]:
# Preview the first five rows of the training partition.
df_train.head(5)

Unnamed: 0,sentence_text,practice
0,"For example, you can access information regist...",Contact_E_Mail_Address_1stParty
1,Other Cookies are more performance-related suc...,Identifier_Cookie_or_similar_Tech_1stParty
2,"Information processed could include your name,...",Contact_E_Mail_Address_1stParty
3,"In addition, we will cache data we receive fro...",Contact_E_Mail_Address_1stParty
4,A cookie is a small data file that we transfer...,Identifier_Cookie_or_similar_Tech_1stParty


### Encode labels

In [12]:
# Fit the encoder on the training labels only, then map both partitions to
# integer ids with the same fitted mapping.
label_encoder = LabelEncoder().fit(df_train[LABEL_COL])
labels_train = label_encoder.transform(df_train[LABEL_COL])
labels_test = label_encoder.transform(df_test[LABEL_COL])

# The encoder was fitted on the training labels, so its classes are exactly
# the distinct training labels.
num_labels = len(label_encoder.classes_)

In [13]:
# Report the size of the label space and of each partition.
for template, value in (
    ("Number of unique labels: {}", num_labels),
    ("Number of training examples: {}", df_train.shape[0]),
    ("Number of testing examples: {}", df_test.shape[0]),
):
    print(template.format(value))

Number of unique labels: 5
Number of training examples: 4789
Number of testing examples: 3193


### Tokenise and preprocess

In [14]:
tokenizer = Tokenizer(LANGUAGE, to_lower=TO_LOWER, cache_dir=BERT_CACHE_DIR)

# Tokenise the raw sentences of each partition (training first, then test).
tokens_train, tokens_test = (
    tokenizer.tokenize(frame[TEXT_COL].tolist())
    for frame in (df_train, df_test)
)

100%|██████████| 4789/4789 [00:02<00:00, 2078.89it/s]
100%|██████████| 3193/3193 [00:01<00:00, 2014.35it/s]


In [15]:
# Preprocess both token lists with the MAX_LEN setting; the first two returned
# values are kept (tokens and input mask) and the third is discarded.
# NOTE(review): tokens_train / tokens_test are rebound to the preprocessed
# result, so re-running this cell alone feeds already-preprocessed values back
# into the tokenizer. Re-run the tokenise cell first.
tokens_train, mask_train, _ = tokenizer.preprocess_classification_tokens(tokens_train, MAX_LEN)
tokens_test, mask_test, _ = tokenizer.preprocess_classification_tokens(tokens_test, MAX_LEN)

### Sequence classifier model

In [16]:
# Build the sequence classifier with one output per encoded label; model files
# are cached under BERT_CACHE_DIR.
classifier = BERTSequenceClassifier(
    language=LANGUAGE,
    num_labels=num_labels,
    cache_dir=BERT_CACHE_DIR,
)

100%|██████████| 407873900/407873900 [04:34<00:00, 1488142.18B/s]


### Train model

In [17]:
# Fine-tune the classifier on the training partition and time the run.
with Timer() as t:
    classifier.fit(
        token_ids=tokens_train,
        input_mask=mask_train,
        labels=labels_train,
        num_epochs=NUM_EPOCHS,
        batch_size=BATCH_SIZE,
        verbose=True,
    )

# t.interval is divided by 3600 to report the elapsed time in hours.
print("[Training time: {:.3f} hrs]".format(t.interval / 3600))

t_total value of -1 results in schedule not being applied
Iteration: 100%|██████████| 4789/4789 [22:24<00:00,  3.56it/s]

[Training time: 0.374 hrs]





### Score model

In [26]:
# Predict encoded labels for the test partition, using the larger prediction
# batch size.
preds = classifier.predict(
    token_ids=tokens_test,
    input_mask=mask_test,
    batch_size=BATCH_SIZE_PRED,
)

Iteration: 100%|██████████| 7/7 [00:34<00:00,  4.88s/it]


### Evaluate model

In [27]:
# Overall accuracy plus a per-class precision/recall/F1 report keyed by the
# original label names.
# NOTE(review): in the recorded output one class has recall 1.0 and every
# other class 0.0, i.e. the model predicts a single class for all test rows.
# Treat the downstream explanation with that in mind.
accuracy = accuracy_score(labels_test, preds)
report = classification_report(
    labels_test,
    preds,
    target_names=label_encoder.classes_,
    output_dict=True,
)

print("accuracy: {}".format(accuracy))
print(json.dumps(report, indent=4, sort_keys=True))

accuracy: 0.2640150328844347
{
    "Contact_E_Mail_Address_1stParty": {
        "f1-score": 0.41774033696729435,
        "precision": 0.2640150328844347,
        "recall": 1.0,
        "support": 843
    },
    "Identifier_Cookie_or_similar_Tech_1stParty": {
        "f1-score": 0.0,
        "precision": 0.0,
        "recall": 0.0,
        "support": 837
    },
    "Identifier_Cookie_or_similar_Tech_3rdParty": {
        "f1-score": 0.0,
        "precision": 0.0,
        "recall": 0.0,
        "support": 502
    },
    "Identifier_IP_Address_1stParty": {
        "f1-score": 0.0,
        "precision": 0.0,
        "recall": 0.0,
        "support": 419
    },
    "Location_1stParty": {
        "f1-score": 0.0,
        "precision": 0.0,
        "recall": 0.0,
        "support": 592
    },
    "accuracy": 0.2640150328844347,
    "macro avg": {
        "f1-score": 0.08354806739345887,
        "precision": 0.052803006576886934,
        "recall": 0.2,
        "support": 3193
    },
    "weighted

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Explain model

In [28]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the trained model to that device and freeze every parameter so no
# gradients are tracked for the model weights.
classifier.model.to(device)
for weight in classifier.model.parameters():
    weight.requires_grad = False

# Switch to evaluation mode; as the last expression, the returned module's
# repr is shown as the cell output.
classifier.model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1)
            )
          )
          (intermediat

In [29]:
# Build the explainer around the frozen model, giving it the raw training
# sentences and the label names.
# NOTE(review): target_layer=14 is presumably the classification layer of the
# explainer's layer numbering. Confirm against the interpret-text docs.
interpreter_unified = UnifiedInformationExplainer(
    model=classifier.model,
    train_dataset=df_train[TEXT_COL].tolist(),
    device=device,
    target_layer=14,
    classes=label_encoder.classes_,
)

In [30]:
# Pick one test row and collect its text, true label and predicted label.
idx = 7
text = df_test.loc[idx, TEXT_COL]
true_label = df_test.loc[idx, LABEL_COL]

# inverse_transform takes a sequence of encoded ids and returns an array of
# label names, so predicted_label is a one-element array.
predicted_label = label_encoder.inverse_transform([preds[idx]])
print(text, true_label, predicted_label)

When you launch any of our applications, we collect information regarding your device type, operating system and version, carrier provider, IP address, Media Access Control (MAC) address, International Equipment Mobile ID (IMEI), whether you are using a point package, the game version, the device's geo-location, language settings, and unique device ID. Identifier_IP_Address_1stParty ['Contact_E_Mail_Address_1stParty']


In [31]:
# Compute a local explanation for the selected test sentence. The label passed
# in is the row's true label, not the model's predicted label.
explanation_unified = interpreter_unified.explain_local(text, true_label)

100%|██████████| 1000/1000 [00:00<00:00, 2089.16it/s]
100%|██████████| 150/150 [00:25<00:00,  5.94it/s]


### Visualise explanation

In [32]:
from interpret_text.experimental.widget import ExplanationDashboard

In [33]:
# TODO: Why does it not show?
# NOTE(review): the recorded output is the plain-text ExplanationWidget(...)
# repr followed by the ExplanationDashboard object repr, not a rendered
# dashboard. Presumably the notebook's widget front-end extension is missing or
# disabled in this environment; verify before debugging the explanation itself.
ExplanationDashboard(explanation_unified)

ExplanationWidget(value={'text': ['when', 'you', 'launch', 'any', 'of', 'our', 'applications', ',', 'we', 'col…

<interpret_text.experimental.widget.ExplanationDashboard.ExplanationDashboard at 0x254bd86ea58>