<a href="https://colab.research.google.com/github/Samoed/text_classification_hackaton/blob/main/notebooks/bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!gdown 1xPUpT6jMCwd-R8cxr6rZhfIJ1Z4r0RMt

Downloading...
From: https://drive.google.com/uc?id=1xPUpT6jMCwd-R8cxr6rZhfIJ1Z4r0RMt
To: /content/train_dataset_dataset.zip
  0% 0.00/16.6M [00:00<?, ?B/s] 57% 9.44M/16.6M [00:00<00:00, 91.2MB/s]100% 16.6M/16.6M [00:00<00:00, 134MB/s] 


In [2]:
!unzip -q train_dataset_dataset.zip

In [3]:
!pip install -q accelerate evaluate -U

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/297.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━[0m [32m204.8/297.4 kB[0m [31m6.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.4/297.4 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
from transformers import BertForSequenceClassification, BertTokenizerFast, BertModel
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from transformers import Trainer, TrainingArguments
import numpy as np
from tqdm.autonotebook import tqdm
from torch import nn
import re

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MAX_LEN = 512

In [6]:
class BaseDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_seq_len):
        self.data = dataframe
        self.text = dataframe['text'].tolist()
        self.targets = None
        if 'class' in dataframe:
            self.targets = dataframe['class'].tolist()
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __getitem__(self, index):
        text = str(self.text[index])
        text = ' '.join(text.split())

        inputs = self.tokenizer(
            "query: "+text,
            add_special_tokens=True,
            max_length=self.max_seq_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_tensors="pt",
        )
        text = inputs['input_ids'].squeeze(0)
        attention = inputs['attention_mask'].squeeze(0)
        token_type_ids = inputs["token_type_ids"].squeeze(0)


        if self.targets is not None:
            return {"input_ids": text, "attention_mask": attention, "token_type_ids": token_type_ids, 'labels': torch.tensor(self.targets[index], dtype=torch.long)}
        else:
            return {"input_ids": text, "attention_mask": attention, "token_type_ids": token_type_ids}


    def __len__(self) -> int:
        return len(self.text)

In [7]:
train_data = pd.read_csv("dataset/sample.csv")

train_data.head()

Unnamed: 0,class,text
0,arrangement,СОГЛАШЕНИЕ N 8\nо расторжении трудового догово...
1,arrangement,Соглашение о предоставлении опциона на заключе...
2,arrangement,Соглашение\nо реструктуризации задолженности\n...
3,arrangement,Дополнительное соглашение\r\nк договору купли-...
4,arrangement,Соглашение\nо расторжении договора об оказании...


In [8]:
train_data["class"].value_counts()

class
proxy             71
contract          70
act               69
application       61
order             50
invoice           43
bill              41
arrangement       40
contract offer    25
statute           21
determination     10
Name: count, dtype: int64

In [9]:
# train_data["text"] = train_data["text"].str.replace(r"\s+", " ").str.replace("\r", " ").str.replace("\t", "")

In [10]:
# text = train_data[train_data["text"].str.len() == train_data["text"].str.len().max()]["text"].values[0].strip()
# remove extra spaces
# re.sub(r"\s+", " ", text)

In [11]:
data = []

paragraphs = 0
for idx, row in train_data.iterrows():
    target = row["class"]
    text = row["text"]

    for paragraph in text.split("\n"):
        if len(paragraph) > 0:
            paragraph = paragraph.replace("\r", " ").replace("\t", " ")
            paragraph = re.sub(r"\s+", " ", paragraph)
            if len(paragraph.split(" ")) < 10:
                continue
            data.append({"id": idx, "class": target, "text": paragraph})

In [12]:
df = pd.DataFrame(data)
df.head()

Unnamed: 0,id,class,text
0,0,arrangement,"Общество с ограниченной ответственностью ""Стро..."
1,0,arrangement,1. Прекратить трудовой договор от 18.10.2010 N...
2,0,arrangement,2. Последним рабочим днем Работника считать 16...
3,0,arrangement,3.1. Перед увольнением предоставить Работнику ...
4,0,arrangement,3.2. 19 октября 2016 г. выплатить Работнику вы...


In [13]:
df.shape, train_data.shape

((7772, 3), (501, 2))

In [14]:
le = LabelEncoder()

df["class"] = le.fit_transform(df["class"])
df.head()

Unnamed: 0,id,class,text
0,0,2,"Общество с ограниченной ответственностью ""Стро..."
1,0,2,1. Прекратить трудовой договор от 18.10.2010 N...
2,0,2,2. Последним рабочим днем Работника считать 16...
3,0,2,3.1. Перед увольнением предоставить Работнику ...
4,0,2,3.2. 19 октября 2016 г. выплатить Работнику вы...


In [15]:
train_split, val_split = train_test_split(df, train_size=0.8, random_state=42, stratify=df["class"])

In [16]:
def read_model(model_name: str = "DeepPavlov/rubert-base-cased"):
    model = BertForSequenceClassification.from_pretrained(model_name, return_dict=True, num_labels=len(le.classes_))
    tokenizer = BertTokenizerFast.from_pretrained(model_name)
    return model, tokenizer

In [17]:
model, tokenizer = read_model("intfloat/multilingual-e5-small")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at intfloat/multilingual-e5-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLMRobertaTokenizer'. 
The class this function is called from is 'BertTokenizerFast'.


In [18]:
train_dataset = BaseDataset(train_split, tokenizer, MAX_LEN)
val_dataset = BaseDataset(val_split, tokenizer, MAX_LEN)

In [19]:
import evaluate
metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels, average='macro')

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [20]:
from transformers import SchedulerType

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=4,
    # warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    evaluation_strategy="epoch",
    # per_device_train_batch_size=8,
    # per_device_eval_batch_size=64,
    auto_find_batch_size=True,
    save_strategy="epoch",
    report_to=None,
    lr_scheduler_type=SchedulerType.COSINE_WITH_RESTARTS,
    logging_first_step=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [21]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.6378,0.593295,0.557606
2,0.2868,0.366107,0.699351
3,0.1598,0.297888,0.794218
4,0.0722,0.288772,0.818826


TrainOutput(global_step=3112, training_loss=0.45027889280668565, metrics={'train_runtime': 1241.2192, 'train_samples_per_second': 20.035, 'train_steps_per_second': 2.507, 'total_flos': 1638387759353856.0, 'train_loss': 0.45027889280668565, 'epoch': 4.0})

In [22]:
!pip install -q huggingface_hub

In [23]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [27]:
repo = "e5-small-hackaton"

In [29]:
trainer.push_to_hub(repo)

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.86k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Samoed/results/commit/1d037ee146337cd9bb2a654eb0aebaa304e7f739', commit_message='e5-small-hackaton', commit_description='', oid='1d037ee146337cd9bb2a654eb0aebaa304e7f739', pr_url=None, pr_revision=None, pr_num=None)

In [30]:
tokenizer.push_to_hub(repo)

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

unigram.json:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Samoed/e5-small-hackaton/commit/690c13cda97638670491a919f8d0d21b3a0f1483', commit_message='Upload tokenizer', commit_description='', oid='690c13cda97638670491a919f8d0d21b3a0f1483', pr_url=None, pr_revision=None, pr_num=None)