In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from scipy.sparse import hstack
import re
from tqdm.notebook import tqdm
import json
import gzip

In [2]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizerFast, BertForSequenceClassification, AdamW
from torch.nn import BCELoss

In [3]:
def f1score(y_true, y_pred):
    """Set-based F1 score for a single sample.

    Both arguments are converted to sets, so duplicates and ordering are ignored.

    Args:
        y_true: iterable of true label ids.
        y_pred: iterable of predicted label ids.

    Returns:
        float: harmonic mean of precision and recall, in [0.0, 1.0].
        0.0 is returned when either set is empty or when the sets do not overlap.
    """
    true_set = set(y_true)
    pred_set = set(y_pred)

    tp = len(true_set & pred_set)
    # Guard both denominators: an empty prediction set or an empty true set
    # yields 0.0 instead of raising ZeroDivisionError.
    precision = tp / len(pred_set) if pred_set else 0.0
    recall = tp / len(true_set) if true_set else 0.0

    if precision == 0.0 and recall == 0.0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

def mean_f1score(y_true, y_pred):
    """Average the per-sample ``f1score`` over paired true/predicted label collections.

    Args:
        y_true: iterable whose items are the true label ids of one sample each.
        y_pred: iterable whose items are the predicted label ids of one sample each.

    Returns:
        The ``np.mean`` of the per-sample scores; pairs are formed with ``zip``,
        so any surplus items in the longer input are ignored.
    """
    per_sample = [f1score(true_specs, pred_specs) for true_specs, pred_specs in zip(y_true, y_pred)]
    return np.mean(per_sample)

In [4]:
def read_vacancies_part(part, data_dir='dataset_headhunter'):
    """Load one gzipped JSON shard of the vacancies dataset.

    Args:
        part: shard number; it is zero-padded to two digits in the file name
            (``vacancies-01.json.gz``, ``vacancies-02.json.gz``, ...).
        data_dir: directory containing the shards. Defaults to the original
            hard-coded location so existing calls behave as before.

    Returns:
        The parsed JSON content of the shard.
    """
    shard_path = f'{data_dir}/vacancies-{part:02}.json.gz'
    with gzip.open(shard_path, 'r') as fp:
        return json.load(fp)

In [5]:
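# NOTE: disabled variant of read_vacancies_part that deletes the 'description'
# field of every vacancy after loading; kept for reference only.
# def read_vacancies_part(part):
#     with gzip.open(f'dataset_headhunter/vacancies-{part:02}.json.gz', 'r') as fp:
#         result = json.loads(fp.read())
#         for i in result:
#             del result[i]['description']
#         return result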

In [6]:
# Combine shards 1..10 into a single mapping keyed by vacancy id.
data = {}
for part in range(1, 11):
    data.update(read_vacancies_part(part))

# One row per vacancy: the id as an int, and the whole record rendered with str()
# with simple tags such as <p> or </li> stripped out.
data = pd.DataFrame([
    {'vacancy_id': int(vac_id), 'vacancy': re.sub(r'<[/\w]*>', '', str(record))}
    for vac_id, record in data.items()
])

In [19]:
# maxl: token length every encoded vacancy is padded/limited to by the tokenizer calls.
# batch_size: number of samples per mini-batch in every DataLoader.
maxl, batch_size = 300, 8

In [3]:
# Load the pretrained tokenizer; the checkpoint name is the same one the model is loaded from.
tokenizer = BertTokenizerFast.from_pretrained('DeepPavlov/rubert-base-cased-conversational')

# Подготовка train датасета

In [9]:
# Training labels: one row per vacancy_id with its list of specialization ids stored as a JSON string.
train_specializations = pd.read_csv('dataset_headhunter/train_labels.csv.gz', compression='gzip')

In [10]:
# Attach the vacancy text to every labelled id, then discard rows with any missing value
# (e.g. ids that have no vacancy record).
train_merged = pd.merge(train_specializations, data, how='left')
train_data = train_merged.dropna()

In [11]:
# Preview the merged training frame (vacancy_id, specializations, vacancy).
train_data

Unnamed: 0,vacancy_id,specializations,vacancy
0,1,"[242, 256, 302, 324, 358, 440]","{'name': 'Администратор торгового зала', 'desc..."
1,3,[211],"{'name': 'Системный администратор', 'descripti..."
2,4,"[389, 412, 437]","{'name': 'Специалист по закупкам', 'descriptio..."
3,6,[445],"{'name': 'Ведущий инженер', 'description': 'Об..."
4,9,[503],{'name': 'Автомеханик /Автослесарь (Моторист)'...
...,...,...,...
1456320,2912638,"[80, 191, 470, 472]",{'name': 'Оператор (ВП) вращающегося превентор...
1456321,2912640,[153],"{'name': 'Инженер (обогатитель)', 'description..."
1456322,2912648,"[87, 91, 221]","{'name': 'Методист курса JavaScript', 'descrip..."
1456323,2912649,"[227, 437, 574]",{'name': 'Менеджер по продажам и закупкам изде...


In [12]:
# Parse the JSON-encoded label lists, then hold out 10% of the vacancies for validation
# (random_state=0 fixes the split).
specialization_lists = train_data.specializations.apply(json.loads)
X_train, X_val, y_train, y_val = train_test_split(
    train_data.vacancy,
    specialization_lists,
    test_size=0.1,
    random_state=0,
)

In [None]:
# Encode every training vacancy to token ids with max_length=maxl and padding to that length.
train_inputs = torch.tensor([
    tokenizer.encode(text, max_length=maxl, pad_to_max_length=True)
    for text in tqdm(X_train)
])

# Multi-hot label matrix with 764 columns: collect (row, label) coordinate pairs
# and set them all to 1 with a single indexed assignment.
train_labels = torch.zeros(y_train.shape[0], 764)
xs, ys = [], []
for row, specs in enumerate(y_train.tolist()):
    xs.extend([row] * len(specs))
    ys.extend(specs)
train_labels[xs, ys] = 1

In [None]:
# Pair each encoded vacancy with its label row and serve them in randomly ordered mini-batches.
# NOTE: this rebinds `train_data`, which previously named the merged DataFrame.
train_data = TensorDataset(train_inputs, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

In [13]:
# Encode every validation vacancy to token ids with max_length=maxl and padding to that length.
validation_inputs = torch.tensor([
    tokenizer.encode(text, max_length=maxl, pad_to_max_length=True)
    for text in tqdm(X_val)
])

# Multi-hot label matrix with 764 columns: collect (row, label) coordinate pairs
# and set them all to 1 with a single indexed assignment.
validation_labels = torch.zeros(y_val.shape[0], 764)
xs, ys = [], []
for row, specs in enumerate(y_val.tolist()):
    xs.extend([row] * len(specs))
    ys.extend(specs)
validation_labels[xs, ys] = 1

HBox(children=(FloatProgress(value=0.0, max=145633.0), HTML(value='')))




In [21]:
# Validation batches are served in their original order, so predictions line up with labels.
validation_data = TensorDataset(validation_inputs, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

# Инициализация модели

In [15]:
def forward(
    self,
    input_ids=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    labels=None,
):
    """Multi-label forward pass meant to replace ``BertForSequenceClassification.forward``.

    The pooled encoder output goes through dropout and the classification head;
    a sigmoid turns each of the ``self.num_labels`` scores into an independent
    probability.

    Args:
        input_ids, attention_mask, token_type_ids, position_ids, head_mask,
        inputs_embeds: passed straight through to ``self.bert``.
        labels: optional multi-hot target tensor reshaped to
            ``(-1, self.num_labels)``; when given, the loss is prepended to
            the returned tuple.

    Returns:
        tuple: ``(loss,)`` if ``labels`` is given, then the per-label
        probabilities, then whatever ``self.bert`` returned after its first two
        items (hidden states / attentions, if present).
    """
    outputs = self.bert(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
    )

    pooled_output = self.dropout(outputs[1])
    raw_logits = self.classifier(pooled_output)
    # Callers threshold this value directly, so the returned item stays a probability.
    probabilities = torch.sigmoid(raw_logits)

    result = (probabilities,) + outputs[2:]  # add hidden states and attention if they are here

    if labels is not None:
        # Compute binary cross-entropy from the raw logits rather than from the
        # sigmoid output: same loss mathematically, but it neither saturates nor
        # loses its gradient when a prediction is confidently wrong.
        loss = torch.nn.functional.binary_cross_entropy_with_logits(
            raw_logits.view(-1, self.num_labels),
            labels.view(-1, self.num_labels).to(raw_logits.dtype),
        )
        result = (loss,) + result

    return result  # (loss), probabilities, (hidden_states), (attentions)

In [16]:
# Patch the custom `forward` defined earlier in this notebook into BertForSequenceClassification,
# load the pretrained checkpoint with a 764-output classification head, and move the model
# to the GPU. The cell ends with model.cuda(), so its return value (the model) is displayed.
BertForSequenceClassification.forward = forward
model = BertForSequenceClassification.from_pretrained('DeepPavlov/rubert-base-cased-conversational', num_labels=764)
model.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [17]:
# Restore weights from the checkpoint file 'bert_epoch_8.pth'; the training loop in this
# notebook saves checkpoints under this naming pattern.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
model.load_state_dict(torch.load('bert_epoch_8.pth'))

<All keys matched successfully>

# Обучение

In [None]:
# Build two parameter groups so weight decay is applied to the weight matrices only.
param_optimizer = list(model.named_parameters())
# Parameters whose name contains any of these substrings are excluded from weight decay:
# biases and the LayerNorm scale ('LayerNorm.weight'). 'gamma'/'beta' are kept for
# checkpoints that use the older LayerNorm parameter naming.
no_decay = ['bias', 'LayerNorm.weight', 'gamma', 'beta']
# The per-group option AdamW reads is 'weight_decay'. The former key 'weight_decay_rate'
# is not an option the optimizer reads, so every group fell back to the default and no
# decay was applied.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)

In [None]:
# Fine-tune for nine epochs. After each epoch: print the mean training loss, score the
# validation split with mean_f1score, and save a checkpoint named after the epoch index.
for epoch in range(9):
    # The last epoch runs with the learning rate lowered to 2e-6.
    if epoch == 8:
        for g in optimizer.param_groups:
            g['lr'] = 2e-6

    model.train()
    train_loss = 0

    # tqdm takes its total from len(train_dataloader), which also counts the final
    # partial batch (the former float `n_samples / batch_size` did not).
    for batch in tqdm(train_dataloader):
        batch = tuple(t.to('cuda') for t in batch)
        b_input_ids, b_labels = batch

        optimizer.zero_grad()

        # With labels supplied, the first element of the model output is the loss.
        outputs = model(b_input_ids.long(), token_type_ids=None, labels=b_labels.float())
        loss = outputs[0]
        loss.backward()

        optimizer.step()

        train_loss += loss.item()

    print("Loss на обучающей выборке: {0:.5f}".format(train_loss / len(train_dataloader)))

    model.eval()

    valid_preds, valid_labels = [], []

    for batch in tqdm(validation_dataloader):
        batch = tuple(t.to('cuda') for t in batch)
        b_input_ids, b_labels = batch

        with torch.no_grad():
            outputs = model(b_input_ids.long(), token_type_ids=None)

        # Without labels, the first output element holds the per-label probabilities.
        probs = outputs[0].detach().cpu()

        # nonzero() yields (row, label) pairs for every probability >= 0.3;
        # grouping by row gives {row: [labels]}.
        batch_preds = pd.DataFrame((probs >= 0.3).cpu().nonzero().numpy()).groupby(0)[1].apply(list).to_dict()
        # Rows with nothing above the threshold fall back to their single most probable label.
        batch_preds = [batch_preds.get(i, [int(torch.argmax(probs[i]))]) for i in range(probs.shape[0])]
        # Recover the true label lists from the multi-hot rows the same way.
        batch_labels = pd.DataFrame((b_labels >= 0.5).cpu().nonzero().numpy()).groupby(0)[1].apply(list).to_dict()
        batch_labels = [batch_labels.get(i, []) for i in range(probs.shape[0])]
        valid_preds.extend(batch_preds)
        valid_labels.extend(batch_labels)

    print("f1-score: {0:.3f}".format(
        mean_f1score(valid_labels, valid_preds)
    ))

    torch.save(model.state_dict(), f'bert_epoch_{epoch}.pth')

# Загрузка теста и предикт

In [None]:
# Ids of the vacancies that need predictions.
test_ids = pd.read_csv('dataset_headhunter/test_vacancy_ids.csv.gz', compression='gzip')

In [None]:
# Keep every test id; where no vacancy record matches, the missing text becomes an empty string.
test_merged = pd.merge(test_ids, data, how='left')
test_data_df = test_merged.fillna('')

In [None]:
# Encode the test vacancies with the same tokenizer settings as the training data.
test_inputs = torch.tensor([
    tokenizer.encode(text, max_length=maxl, pad_to_max_length=True)
    for text in tqdm(test_data_df.vacancy)
])

In [None]:
# Sequential batches keep predictions in the same order as the rows of test_data_df.
test_data = TensorDataset(test_inputs)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [None]:
# Predict specialization ids for every test vacancy, batch by batch.
test_preds = []

# Put the model in evaluation mode explicitly, so inference does not depend on
# which cells happened to run before this one.
model.eval()

for batch in tqdm(test_dataloader):
    # The test dataset holds a single tensor per sample, so each batch is a 1-element sequence.
    b_input_ids = batch[0].to('cuda')

    with torch.no_grad():
        outputs = model(b_input_ids.long(), token_type_ids=None)

    # Without labels, the first output element holds the per-label probabilities.
    probs = outputs[0].detach().cpu()

    # nonzero() yields (row, label) pairs for every probability >= 0.3;
    # grouping by row gives {row: [labels]}.
    above_threshold = pd.DataFrame((probs >= 0.3).nonzero().numpy()).groupby(0)[1].apply(list).to_dict()
    # Rows with nothing above the threshold fall back to their single most probable label.
    batch_preds = [above_threshold.get(i, [int(torch.argmax(probs[i]))]) for i in range(probs.shape[0])]
    test_preds.extend(batch_preds)

In [None]:
# Submission frame: one row per test vacancy id with its list of predicted specialization ids.
sub = pd.DataFrame({'vacancy_id': test_data_df.vacancy_id, 'specializations': test_preds})

In [None]:
# Write the submission as a gzip-compressed CSV without the index column.
sub.to_csv('submission.csv.gz', index=False, compression='gzip')