forked from deepset-ai/FARM
-
Notifications
You must be signed in to change notification settings - Fork 0
/
doc_classification_word_embedding_LM.py
111 lines (93 loc) · 3.81 KB
/
doc_classification_word_embedding_LM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# fmt: off
import logging
from pathlib import Path
import time
from farm.data_handler.data_silo import DataSilo, StreamingDataSilo
from farm.data_handler.processor import TextClassificationProcessor
from farm.modeling.optimization import initialize_optimizer
from farm.infer import Inferencer
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.language_model import LanguageModel
from farm.modeling.prediction_head import TextClassificationHead
from farm.modeling.tokenization import Tokenizer
from farm.train import Trainer
from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings
def doc_classifcation():
    """Train a two-class (OTHER / OFFENSE) text classifier on GermEval 2018.

    The example wires the FARM building blocks together in order: logging and
    MLflow experiment tracking, seeding, tokenizer, data processor, data silo,
    a word-embedding language model with a text classification head,
    optimizer, and finally the trainer. All settings are hard-coded inside
    the function; it takes no arguments and returns nothing.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
    )

    # Report this run to the public MLflow tracking server.
    tracker = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    tracker.init_experiment(
        experiment_name="Public_FARM",
        run_name="Run_doc_classification_glove",
    )

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    num_epochs = 3
    train_batch_size = 32
    eval_interval = 100  # passed to the Trainer as `evaluate_every`

    # The embedding model is loaded from a local path. Alternatively, pass the
    # plain model name "glove-german-uncased" to load it through s3.
    embedding_model_path = Path("../saved_models/glove-german-uncased")
    lower_case = True

    device, gpu_count = initialize_device_settings(use_cuda=True)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=embedding_model_path,
        do_lower_case=lower_case,
    )

    # 2. Create a DataProcessor that handles all the conversion from raw text
    #    into a pytorch Dataset.
    #    Here we load GermEval 2018 data automatically if it is not available.
    #    GermEval 2018 only has train.tsv and test.tsv datasets - no dev.tsv,
    #    and dev_split=0 is passed below.
    labels = ["OTHER", "OFFENSE"]
    eval_metric = "f1_macro"
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=Path("../data/germeval18"),
        label_list=labels,
        dev_split=0,
        test_filename="test.tsv",
        train_filename="train.tsv",
        metric=eval_metric,
        label_column_name="coarse_label",
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test) and
    #    provides DataLoaders for them.
    data_silo = DataSilo(
        processor=processor,
        batch_size=train_batch_size,
        max_processes=1,
    )

    # 4. Create an AdaptiveModel
    # a) which consists of an embedding model as a basis.
    #    A word embedding model only converts words it has seen during
    #    training to embedding vectors.
    embedding_model = LanguageModel.load(embedding_model_path)

    # b) and a prediction head on top that is suited for our task
    #    => text classification.
    #    NOTE(review): the leading 300 in layer_dims presumably has to match
    #    the embedding size of the loaded model - confirm.
    label_weights = data_silo.calculate_class_weights(task_name="text_classification")
    classification_head = TextClassificationHead(
        layer_dims=[300, 600, len(labels)],
        class_weights=label_weights,
        num_labels=len(labels),
    )

    model = AdaptiveModel(
        language_model=embedding_model,
        prediction_heads=[classification_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=num_epochs,
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model
    #    into a powerful plant and evaluates it from time to time.
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=num_epochs,
        n_gpu=gpu_count,
        lr_schedule=lr_schedule,
        evaluate_every=eval_interval,
        device=device,
    )

    # 7. Let it grow
    trainer.train()
# Run the example only when this file is executed directly as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    doc_classifcation()
# fmt: on