In [None]:
!pip install datasets

In [2]:
import numpy as np
import pandas as pd

In [None]:
!pip install belt-nlp
from belt_nlp.bert_classifier_with_pooling import BertClassifierWithPooling

In [None]:
# Download the dataset archive from Google Drive into /content/.
# The following cell unzips /content/df_with_labels.zip, so that is the file
# name this download is expected to produce.
import gdown
url = 'https://drive.google.com/uc?export=download&confirm=no_antivirus&id=1LhHBKx2wzJrT7XXVbE6Yk9AHqtcCo2w8'
gdown.download(url, '/content/')

In [5]:
!unzip -q /content/df_with_labels.zip -d data

In [6]:
def replace_label_column(label):
    '''Return the string form of a single label value.

    Args:
        label: raw value taken from the ``Label`` column

    Returns:
        str: the result of ``str(label)``
    '''
    return f'{label!s}'

In [7]:
# Seven label names as strings, '0' through '6'.
# NOTE(review): the name `classes` is rebound further down to the output of
# model2.predict(...), so this list is not reachable after that cell runs.
classes = ['0', '1', '2', '3', '4', '5', '6']

In [8]:
def train_test_split(full_df,
                     fraction,
                     split_by_class=False,
                     random=None):
    '''Split a dataframe into train, validation and test parts.

    ``fraction`` is applied twice. First the rows are divided into a
    train+validation pool (``fraction`` of the rows) and a test part (the
    remaining ``1 - fraction``). Then ``1 - fraction`` of that pool becomes
    the validation part and the rest of the pool becomes the train part.

    Rows are selected by position, not by index label, so a ``full_df``
    with a non-unique index is split without losing any rows.

    Args:
        full_df (pd.DataFrame): full dataset to split. Must contain a
            ``Label`` column when ``split_by_class`` is True.
        fraction (float): share of rows kept for the train+validation
            pool, between 0 and 1.
        split_by_class (bool, optional): Split every ``Label`` group
            separately and concatenate the results; the returned frames
            are then re-indexed from 0. When False the whole frame is
            split at once and the original index labels are kept.
            Defaults to False.
        random (int, optional): Random seed to use. Defaults to None.

    Returns:
        tuple: (train, val, test) dataframes

    Note:
        No check is made that every class ends up in every part.
    '''

    print(f'Random seed: {random}')
    holdout = 1 - fraction
    if split_by_class:
        train_parts = []
        val_parts = []
        test_parts = []
        for _, class_df in full_df.groupby('Label'):
            # ``drop`` below removes rows by label; a positional index
            # guarantees that exactly the sampled rows are removed even if
            # the caller's index has duplicates. The labels are discarded
            # by ``ignore_index=True`` anyway.
            class_df = class_df.reset_index(drop=True)
            pool = class_df.sample(frac=fraction, random_state=random)
            test_parts.append(class_df.drop(pool.index))
            val_part = pool.sample(frac=holdout, random_state=random)
            val_parts.append(val_part)
            train_parts.append(pool.drop(val_part.index))
        train_df = pd.concat(train_parts, axis=0, ignore_index=True)
        val_df = pd.concat(val_parts, axis=0, ignore_index=True)
        test_df = pd.concat(test_parts, axis=0, ignore_index=True)
    else:
        # Sample row positions instead of rows: the same positions are
        # drawn as by ``full_df.sample`` for a given seed, but removing
        # them can never hit other rows that share an index label.
        positions = pd.Series(range(len(full_df)))
        test_pos = positions.sample(frac=holdout, random_state=random)
        pool_pos = positions.drop(test_pos.index)
        val_pos = pool_pos.sample(frac=holdout, random_state=random)
        train_pos = pool_pos.drop(val_pos.index)

        test_df = full_df.take(test_pos.to_numpy())
        val_df = full_df.take(val_pos.to_numpy())
        train_df = full_df.take(train_pos.to_numpy())

    return (train_df, val_df, test_df)

In [9]:
def make_dataframes(ds_path,
                    fraction,
                    split_by_class=False,
                    random=None,
                    low_threshold=200):
    '''Read the dataset CSV and split it into train, val and test frames.

    The ``Label`` column is passed through ``replace_label_column`` before
    the split. Only the train frame is shuffled and re-indexed afterwards;
    val and test are returned exactly as ``train_test_split`` produced
    them.

    Args:
        ds_path (str): path to the dataset CSV file
        fraction (float): split fraction, forwarded to
            ``train_test_split``
        split_by_class (bool, optional): forwarded to
            ``train_test_split``. Defaults to False.
        random (int, optional): Random seed used for the split and for
            shuffling the train frame. Defaults to None.
        low_threshold (int, optional): Not read anywhere in this
            function. Defaults to 200.

    Returns:
        tuple: (train, val, test) dataframes
    '''

    dataset = pd.read_csv(ds_path)
    dataset['Label'] = dataset['Label'].apply(replace_label_column)

    splits = train_test_split(dataset, fraction, split_by_class,
                              random=random)
    for part in splits:
        assert part.index.is_unique

    # frac=1 keeps every row and only permutes their order.
    shuffled_train = splits[0].sample(frac=1, random_state=random)
    shuffled_train = shuffled_train.reset_index(drop=True)

    return (shuffled_train, splits[1], splits[2])

In [10]:
# Build the class-wise train/val/test split with a fixed seed.
# NOTE(review): make_dataframes accepts low_threshold but never reads it, so
# passing 500 here has no effect on the result.
train_df, val_df, test_df = make_dataframes('/content/data/df_with_labels(2).csv',
                                            fraction=0.85,
                                            random=1682284394,
                                            split_by_class=True,
                                            low_threshold=500)

Random seed: 1682284394


In [11]:
# Write the train and test splits to CSV in the current working directory.
# NOTE(review): val_df is not saved and is not used again in the visible part
# of this notebook. The files are read back later via absolute /content/ paths,
# which only matches when the working directory is /content.
train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)

# Example - Model BERT with pooling

In this notebook we will show how to use basic methods `fit` and `predict` for the BERT model with pooling.

## Load data - classification of resumes

In [12]:
from datasets import load_dataset

# Load the two CSV files as the 'train' and 'test' splits of one dataset object.
# NOTE(review): these absolute paths assume the CSVs written with relative
# names ('train.csv', 'test.csv') ended up in /content — confirm the working
# directory.
dataset = load_dataset('csv', data_files={'train': "/content/train.csv",'test': "/content/test.csv"})

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'Category', 'Resume', 'Label'],
        num_rows: 2478
    })
    test: Dataset({
        features: ['Unnamed: 0', 'Category', 'Resume', 'Label'],
        num_rows: 515
    })
})

## Split into train and test sets

In [13]:
# Take the "Resume" column as model input and the "Label" column as target,
# separately for the train and the test split.
X_train, y_train = dataset["train"]["Resume"], dataset["train"]["Label"]
X_test, y_test = dataset["test"]["Resume"], dataset["test"]["Label"]

## Fit the model

In [14]:
# Keyword arguments passed unchanged to BertClassifierWithPooling.
# NOTE(review): the per-key remarks below are inferred from the parameter
# names — confirm them against the belt-nlp documentation.
MODEL_PARAMS = {
    "num_labels": 7,  # presumably one output per label; the label list above has 7 entries
    "batch_size": 16,
    "learning_rate": 5e-5,
    "epochs": 5,  # the same value is passed again explicitly to model.fit below
    "chunk_size": 510,
    "stride": 510,  # equal to chunk_size — presumably non-overlapping chunks, TODO confirm
    "minimal_chunk_length": 510,
    "maximal_text_length": 510 * 3,  # 1530, i.e. three times chunk_size
    "pooling_strategy": "mean",
    "device": "cuda",  # hard-coded; presumably requires a GPU runtime
    "many_gpus": True,
}
model = BertClassifierWithPooling(**MODEL_PARAMS)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
model.fit(X_train, y_train, epochs=5)  # A warning about tokenizing text that is too long is expected here

In [None]:
# Imported in this cell because the notebook's own `import torch` cell comes
# later; without it a fresh-kernel top-to-bottom run fails with NameError.
import torch

# Saves the whole model object (pickled), not only its weights, to
# 'belt.model' in the current working directory.
torch.save(model, 'belt.model')

In [15]:
import torch

In [16]:
# Download an archive from Google Drive into /content/ (saved as
# /content/belt.zip according to the output below). It is unzipped in the next
# cell and then loaded from /content/belt.model.
import gdown
url = 'https://drive.google.com/uc?export=download&confirm=no_antivirus&id=1c0eEFLtwdWGYxk-f9CshNPd07iN1doYR'
gdown.download(url, '/content/')

Downloading...
From: https://drive.google.com/uc?export=download&confirm=no_antivirus&id=1c0eEFLtwdWGYxk-f9CshNPd07iN1doYR
To: /content/belt.zip
100%|██████████| 406M/406M [00:03<00:00, 104MB/s]


'/content/belt.zip'

In [18]:
!unzip -q /content/belt.zip -d /content/

In [19]:
# SECURITY: torch.load unpickles the file, and unpickling can execute
# arbitrary code — only load archives that come from a trusted source.
# belt.model is presumably a whole pickled model object (as written by
# torch.save(model, ...)), so weights_only=False is passed explicitly: recent
# PyTorch releases no longer default to it and would refuse to load the file.
model2 = torch.load('/content/belt.model',
                    map_location=torch.device('cuda'),
                    weights_only=False)

  model2 = torch.load('/content/belt.model', map_location=torch.device('cuda'))


## Get predictions

In [20]:
# Run the loaded model on all test texts: predicted class ids and the
# per-class scores.
# NOTE(review): this rebinds `classes`, which earlier held the list of label
# names.
classes = model2.predict(X_test)
probabilities = model2.predict_scores(X_test)

In [21]:
# Move the predictions to host memory so they can be compared with the
# reference labels in the accuracy cell.
classes = classes.cpu()

## Calculate model accuracy on the test data

In [1]:
# Compare predictions and reference labels as plain NumPy arrays, so the
# result is an ordinary int/float no matter how a torch tensor and an ndarray
# would be compared, and the count is vectorized instead of a Python-level sum.
# `classes` must already be on the CPU at this point.
accurate = int((np.asarray(classes) == np.asarray(y_test)).sum())
accuracy = accurate / len(y_test)

print(f"Test accuracy: {accuracy}")

NameError: name 'classes' is not defined

In [38]:
# Predict the class of a single example (training text at index 2017); the
# text is wrapped in a one-element list before being passed to predict.
# NOTE(review): this overwrites `classes`, which held the test-set predictions.
classes = model2.predict([X_train[2017]])
print(classes)

tensor([3], device='cuda:0')


In [39]:
# Per-class scores for the same single example (training text at index 2017).
predict_scores = model2.predict_scores([X_train[2017]])
print(predict_scores)

tensor([[0.0650, 0.1974, 0.0052, 0.7048, 0.0252, 0.0012, 0.0012]],
       device='cuda:0')
