Synchronize your local pydoxtools directory to Google Drive:

- rclone sync pydoxtools/ xyntopia_gdrive:/pydoxtools -P --size-only --fast-list

In [None]:
%load_ext autoreload
%autoreload 2
#import os
#assert os.environ['COLAB_TPU_ADDR'], 'Make sure to select TPU from Edit > Notebook settings > Hardware accelerator'

In [None]:
#import torch
#import torch_xla
#import torch_xla.core.xla_model as xm
#
#t = torch.randn(2, 2, device=xm.xla_device())
#print(t.device)
#print(t)

In [None]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

## setup pydoxtools code access, data & gdrive

In [None]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

In [None]:
PDX_DIR="/content/pydoxtools"

In [None]:
%%capture
!pip install pytorch-lightning faker evaluate transformers datasets accelerate nvidia-ml-py3 sklearn

In [None]:
!ls "/content/gdrive/My Drive/pydoxtools/pydoxtools"
!rm -r $PDX_DIR
!cp -r "/content/gdrive/My Drive/pydoxtools/pydoxtools" $PDX_DIR

The main issue when using Google Colab for our training is that only Python 3.9 is available, while pydoxtools currently needs Python 3.10 to function.

Because of this, we try to use only the relevant parts of the library, which is mainly the data generation part that is used for
the training.

In [None]:
!rm $PDX_DIR/__init__.py
!touch $PDX_DIR/__init__.py
!pwd

Finally, run pydoxtools:

In [None]:
# make sure we set the environment variable before loading pydoxtools for the first time
%env TRAINING_DATA_DIR=/content/gdrive/MyDrive/pydoxtools/training_data
from pydoxtools import random_data_generators, training

## prepare training

In [None]:
bg = random_data_generators.TextBlockGenerator.std_generator()
bg.classmap, bg.classmap_inv, bg.num_generators, bg.class_gen, bg.gen_mapping, bg.weights

In [None]:
bg.gen_mapping

In [None]:
bg.single(1000, convert_labels=True)

In [None]:
df,y = training.load_labeled_text_block_data(classmap=bg.classmap_inv)
df["label"]=y
df = df.rename(columns={"txt":"text"}).drop(columns="filename")

In [None]:
df.columns

In [None]:
df.label.unique()

In [None]:
from datasets import Dataset, Features, Value, ClassLabel
import random

dataset_size=200000

# generate datasets for finetuning
def my_gen():
    """Yield ``dataset_size`` generated samples for fine-tuning.

    A random offset is drawn once per call and added to the running index
    that is passed to ``bg.single``.

    Yields:
        dict: ``{"label": ..., "text": ...}`` for each generated sample.
    """
    seed = random.randint(0, 10000000)
    # range(1, dataset_size) stopped one sample short of dataset_size;
    # keep the indices starting at 1 but include the final one.
    for i in range(1, dataset_size + 1):
        text, label = bg.single(i + seed, convert_labels=True)
        yield {"label": label, "text": text}

# Label names for the ClassLabel feature below.
# NOTE(review): presumably the list order has to agree with the ids in bg.classmap / bg.classmap_inv — confirm
class_names = ["address", "unknown"]
# schema (raw text + class label) used for both the generated training set and the validation set
features = Features({'text': Value('string'), 'label': ClassLabel(names=class_names)})

# build the training dataset from the my_gen generator with the schema above
dataset = Dataset.from_generator(my_gen, features=features)
# switch the dataset's output format to "torch"
dataset.set_format("torch")

In [None]:
val_dataset = Dataset.from_pandas(df,features=features) # this will already convert our labels!
val_dataset.set_format("torch")

In [None]:
dataset, val_dataset

In [None]:
dataset.features

In [None]:
model_name="sentence-transformers/all-MiniLM-L6-v2"
# model_name="bert-base-cased"

In [None]:
%%capture
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
def tokenize_function(batch):
    """Tokenize the ``text`` column of a batch with the notebook-level tokenizer.

    The tokenizer is called with ``padding="max_length"`` and
    ``truncation=True``; its output is returned unchanged.
    """
    return tokenizer(batch["text"], truncation=True, padding="max_length")

In [None]:
train_dataset = dataset.map(tokenize_function, batched=True)
validation_dataset = val_dataset.map(tokenize_function, batched=True)

In [None]:
# select smaller subdataset
#train_dataset = tokenized_datasets.shuffle(seed=42).select(range(1000))
#eval_dataset = tokenized_datasets.shuffle(seed=42).select(range(1000))

In [None]:
from transformers import AutoModelForSequenceClassification

In [None]:
# Load the pretrained checkpoint with a 2-label sequence-classification head and move it to
# the device selected earlier (GPU when available, otherwise CPU) instead of hard-coding "cuda",
# which raises on CPU-only runtimes.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

In [None]:
from transformers import TrainingArguments

In [None]:
import numpy as np
import evaluate

In [None]:
f1_metric = evaluate.load("f1")
acc = evaluate.load("accuracy")
rec = evaluate.load("recall")
prec = evaluate.load("precision")

In [None]:
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

In [None]:
def compute_metrics(eval_pred):
    """Merge the f1, accuracy, recall and precision metric results into one dict.

    NOTE: a second ``compute_metrics`` defined further down in this cell
    replaces this one when the cell is executed.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    scores = {}
    # same merge order as before: later metrics overwrite equal keys of earlier ones
    for metric in (f1_metric, acc, rec, prec):
        scores.update(metric.compute(predictions=predictions, references=labels))
    return scores

from sklearn.metrics import classification_report
def compute_metrics(eval_pred):
    """Compute overall accuracy and precision/recall/F1 of the "address" class.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        dict with the keys ``"address.f1"``, ``"accuracy"``,
        ``"address.precision"`` and ``"address.recall"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # classification_report expects (y_true, y_pred); with the arguments the
    # other way round, precision and recall are reported swapped.
    metrics = classification_report(labels, predictions, output_dict=True)
    # the report dict is keyed by the stringified class id
    label = str(bg.classmap_inv['address'])
    return {
        "address.f1": metrics[label]["f1-score"],
        "accuracy": metrics["accuracy"],
        "address.precision": metrics[label]["precision"],
        "address.recall": metrics[label]["recall"]
    }

In [None]:
#!nvidia-smi

## run actual training

In [None]:
#metric

In [None]:
# Keyword arguments shared by the TrainingArguments built below:
# evaluation, logging and checkpointing are all step-based (logging and saving every 200 steps),
# and "address.f1" (a key returned by compute_metrics) is the metric for the best model.
default_args = dict(
    output_dir="/content/gdrive/MyDrive/models/txtblock",
    evaluation_strategy="steps",
    num_train_epochs=10,
    log_level="info",
    # label_names="label",
    logging_strategy="steps",
    logging_steps=200,
    save_strategy="steps",
    save_steps=200,
    report_to="none",
    metric_for_best_model="address.f1",
)

In [None]:
from transformers import TrainingArguments, Trainer

# Final training configuration: the shared default_args plus a per-device train batch size of 44
training_args = TrainingArguments(
    per_device_train_batch_size=44,
    #gradient_accumulation_steps=4,
    #gradient_checkpointing=True,
    **default_args)
# Wire the model to the tokenized generated training set and the tokenized labeled
# validation set; compute_metrics supplies the evaluation metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    compute_metrics=compute_metrics,
)
# training itself is started from a separate cell
#result = trainer.train()
#print_summary(result)

In [None]:
trainer.train()

In [None]:
trainer.save_model()

In [None]:
#%load_ext tensorboard
#%tensorboard --logdir logs