In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, accuracy_score

from transformers import AutoModel, AutoTokenizer

import torch


import warnings
warnings.filterwarnings('ignore')
from tqdm.auto import tqdm


In [28]:
# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [29]:
# Load the training table and keep only rows whose target is not -1
# (presumably -1 marks unlabeled rows; verify against the dataset description).
train_raw = (
    pd.read_csv('/kaggle/input/bytedatahack-22/train.csv')
    .loc[lambda frame: frame['target'] != -1]
)
# Raw label values for the remaining rows, in row order.
targets = train_raw['target'].values

In [30]:
# Try pretrained embeddings: load the rubert-tiny2 tokenizer and encoder,
# placing the encoder on the selected device right away.
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
emb_model = AutoModel.from_pretrained("cointegrated/rubert-tiny2").to(device)

def embed_bert_cls(text, model=emb_model, tokenizer=tokenizer):
    """Embed text with the encoder and return normalised first-token vectors.

    Args:
        text: Input passed straight to the tokenizer (the notebook calls this
            with a list of strings).
        model: Encoder whose output exposes ``last_hidden_state``; inputs are
            moved to ``model.device`` before the forward pass.
        tokenizer: Tokenizer called with padding and truncation enabled and
            returning PyTorch tensors.

    Returns:
        A NumPy array with one row per input text holding the normalised
        hidden state at sequence position 0.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    on_device = {name: tensor.to(model.device) for name, tensor in encoded.items()}
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        output = model(**on_device)
    # Position 0 of every sequence is the first token ([CLS] for BERT-style tokenizers).
    first_token = output.last_hidden_state[:, 0, :]
    return torch.nn.functional.normalize(first_token).cpu().numpy()

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [31]:
# Smoke test: embed two short texts and check the shape of the result
# (one row per text, one column per hidden dimension).
embed_bert_cls(['hello world', 'hi how are you']).shape

(2, 312)

In [36]:
# Converts texts into embeddings
class BertVectorizer(BaseEstimator, TransformerMixin):
    """Turn the ``full_description`` column into normalised first-token embeddings.

    Parameters
    ----------
    model
        Encoder whose forward output exposes ``last_hidden_state`` and which
        has a ``device`` attribute (e.g. a Hugging Face ``AutoModel``).
    tokenizer
        Tokenizer called on a list of strings with ``padding=True``,
        ``truncation=True`` and ``return_tensors='pt'``.
    batch_size : int, default 64
        Number of texts pushed through the encoder per forward pass. Bounds
        peak memory; each batch is padded to its own longest sequence.
    """

    def __init__(self, model, tokenizer, batch_size=64):
        self.model = model
        self.tokenizer = tokenizer
        self.batch_size = batch_size

    def fit(self, X, y=None):
        """Nothing to learn; present for scikit-learn pipeline compatibility."""
        return self

    def transform(self, X):
        """Embed ``X['full_description']`` and return a 2-D NumPy array.

        Parameters
        ----------
        X
            Frame-like object with a ``full_description`` column of strings.

        Returns
        -------
        numpy.ndarray
            One row per input text holding the normalised hidden state at
            sequence position 0.

        Raises
        ------
        ValueError
            If ``batch_size`` is smaller than 1, or if ``X`` has no rows
            (there is nothing to concatenate).
        """
        if self.batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        texts = list(X['full_description'].values)
        self.model.eval()
        device = self.model.device

        chunks = []
        for start in range(0, len(texts), self.batch_size):
            batch = texts[start:start + self.batch_size]
            # Use the tokenizer given to the constructor, not a module-level global.
            encoded = self.tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
            with torch.no_grad():
                output = self.model(**{k: v.to(device) for k, v in encoded.items()})
            # Position 0 of every sequence is the first token ([CLS] for BERT-style tokenizers).
            first_token = output.last_hidden_state[:, 0, :]
            # Move each batch to host memory immediately so device memory stays bounded.
            chunks.append(torch.nn.functional.normalize(first_token).cpu().numpy())

        return np.concatenate(chunks, axis=0)
    
# Builds the full description from the vacancy title and its responsibilities
class FullDescriptionCreator(BaseEstimator, TransformerMixin):
    """Add a ``full_description`` column with the full vacancy description.

    Parameters
    ----------
    responsibilities
        Series of responsibility texts that is assigned to the
        ``responsibilities`` column of the transformed frame.
    """

    # Matches any character that is neither whitespace nor a word character.
    # Raw string so "\s" / "\w" reach the regex engine instead of being parsed
    # as (invalid) Python string escapes.
    patt = re.compile(r"[^\s\w]")

    def __init__(self, responsibilities):
        self.responsibilities = responsibilities

    def fit(self, X, y=None):
        """Nothing to learn; present for scikit-learn pipeline compatibility."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``responsibilities`` and ``full_description`` columns.

        ``full_description`` is ``name`` and ``responsibilities`` joined by a
        space, lower-cased, with every non-word, non-whitespace character
        replaced by a space. Missing values on either side are treated as "".
        """
        X = X.copy()
        # NOTE(review): assigning a Series aligns on X's index, so this only
        # attaches the right text if X's index equals the keys of
        # `responsibilities` — confirm against the caller.
        X["responsibilities"] = self.responsibilities
        combined = X["name"].fillna("") + " " + X["responsibilities"].fillna("")
        X["full_description"] = combined.str.lower().str.replace(
            self.patt, " ", regex=True
        )
        return X
   

In [37]:
# Load the responsibilities that are available for some of the vacancies
import json
import os

# Path to the parsed vacancy descriptions (a single JSON file despite the name).
DIR_PATH = '/kaggle/input/bytedatahack-22/vacancy_descriptions/2_parsed.json'

with open(DIR_PATH,'r', encoding='utf8') as fp:
    descriptions = json.load(fp)


def _first_responsibility(description):
    """Return the first "Обязанности" entry of a parsed vacancy, or None.

    None is returned when the key is absent, its value is None, or its value
    is empty (indexing an empty value with [0] would raise IndexError).
    """
    entries = description["Content"].get("Обязанности")
    return entries[0] if entries else None


# Map vacancy ID -> first responsibilities entry (None where unavailable).
responsibilities = pd.Series(
    {
        description["ID"]: _first_responsibility(description)
        for description in descriptions
    },
    name="responsibilities",
)

In [38]:
# Number of parsed vacancy descriptions loaded from the JSON file.
len(descriptions)

10000

In [40]:
# Preprocessing pipeline: build the full description text, then embed it.
pipeline = make_pipeline(
    FullDescriptionCreator(responsibilities=responsibilities),
    BertVectorizer(model=emb_model, tokenizer=tokenizer),
)

In [41]:
# Load the OKZ reference file (tab-separated). The separator is written as an
# explicit escape so it cannot be silently turned into spaces by an editor.
okz = pd.read_csv('/kaggle/input/bytedatahack-22/okz_3_4_professions.csv', sep='\t')
n_classes = len(okz['code'].unique())

# Label preprocessing: fit the encoder on every code listed in the reference
# file, not only on the codes that occur in the training data.
le = LabelEncoder()
le.fit(okz['code'].values)

LabelEncoder()

In [43]:
# Run the preprocessing pipeline on the training frame to get the embedding matrix,
# then display its shape.
train_data = pipeline.fit_transform(train_raw)
train_data.shape

(15650, 312)

In [44]:
# Hold out a validation split. The seed is fixed so the split, and therefore
# the reported metrics, are reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    train_data, le.transform(targets), random_state=42
)

In [45]:
# Try gradient boosting on top of the embeddings
from catboost import CatBoostClassifier

model = CatBoostClassifier(iterations=1000, task_type="GPU", devices='0:1')

# verbose=3 prints the training log every third iteration.
model.fit(X_train, y_train, verbose=3)



Learning rate set to 0.107269
0:	learn: 3.3170260	total: 145ms	remaining: 2m 24s
3:	learn: 2.4153064	total: 545ms	remaining: 2m 15s
6:	learn: 1.8598403	total: 959ms	remaining: 2m 16s
9:	learn: 1.5521734	total: 1.36s	remaining: 2m 14s
12:	learn: 1.3101753	total: 1.77s	remaining: 2m 14s
15:	learn: 1.1491580	total: 2.17s	remaining: 2m 13s
18:	learn: 1.0076889	total: 2.6s	remaining: 2m 13s
21:	learn: 0.8933145	total: 3s	remaining: 2m 13s
24:	learn: 0.8104676	total: 3.5s	remaining: 2m 16s
27:	learn: 0.7412081	total: 3.99s	remaining: 2m 18s
30:	learn: 0.6864063	total: 4.4s	remaining: 2m 17s
33:	learn: 0.6378748	total: 4.8s	remaining: 2m 16s
36:	learn: 0.6052545	total: 5.19s	remaining: 2m 15s
39:	learn: 0.5771144	total: 5.59s	remaining: 2m 14s
42:	learn: 0.5430812	total: 5.99s	remaining: 2m 13s
45:	learn: 0.5172077	total: 6.39s	remaining: 2m 12s
48:	learn: 0.4937567	total: 6.79s	remaining: 2m 11s
51:	learn: 0.4709702	total: 7.43s	remaining: 2m 15s
54:	learn: 0.4502920	total: 8.03s	remaining: 

<catboost.core.CatBoostClassifier at 0x7f7dd35e9cd0>

In [46]:
def compute_metrics(preds, targets):
    """Score predictions against true labels.

    Args:
        preds: Predicted labels.
        targets: True labels, in the same order as ``preds``.

    Returns:
        dict with the macro-averaged F1 under ``'F1'`` and the accuracy under
        ``'accuracy_score'``.
    """
    return {
        'F1': f1_score(targets, preds, average='macro'),
        'accuracy_score': accuracy_score(targets, preds),
    }

In [47]:
# Predict encoded class labels for the held-out split and display the result's shape.
val_predictions = model.predict(X_test)
val_predictions.shape

(3913, 1)

In [48]:
# The predictions come back as an (n, 1) column, so flatten them to 1-D before scoring.
compute_metrics(val_predictions.flatten(), y_test)

{'F1': 0.6762891545830143, 'accuracy_score': 0.9412215691285458}

# Тренировка модели на полном датасете

In [49]:
# Final model: same settings as the validation model, fitted on every
# available training row (no hold-out).
eval_model = CatBoostClassifier(iterations=1000, task_type="GPU", devices='0:1')

eval_model.fit(train_data, le.transform(targets), verbose=True)



Learning rate set to 0.113753
0:	learn: 3.1488728	total: 178ms	remaining: 2m 57s
1:	learn: 2.7331589	total: 340ms	remaining: 2m 49s
2:	learn: 2.4572117	total: 509ms	remaining: 2m 49s
3:	learn: 2.2235810	total: 665ms	remaining: 2m 45s
4:	learn: 2.0737018	total: 816ms	remaining: 2m 42s
5:	learn: 1.9076021	total: 979ms	remaining: 2m 42s
6:	learn: 1.7533312	total: 1.15s	remaining: 2m 42s
7:	learn: 1.6541032	total: 1.29s	remaining: 2m 40s
8:	learn: 1.5541969	total: 1.44s	remaining: 2m 38s
9:	learn: 1.4591256	total: 1.63s	remaining: 2m 41s
10:	learn: 1.3881134	total: 1.79s	remaining: 2m 41s
11:	learn: 1.3096871	total: 1.96s	remaining: 2m 41s
12:	learn: 1.2559537	total: 2.12s	remaining: 2m 40s
13:	learn: 1.1904905	total: 2.28s	remaining: 2m 40s
14:	learn: 1.1321407	total: 2.45s	remaining: 2m 40s
15:	learn: 1.0831555	total: 2.6s	remaining: 2m 40s
16:	learn: 1.0330240	total: 2.78s	remaining: 2m 40s
17:	learn: 0.9877021	total: 2.95s	remaining: 2m 40s
18:	learn: 0.9461247	total: 3.12s	remaining: 

<catboost.core.CatBoostClassifier at 0x7f7dd32c9310>

In [50]:
# Load the test set and inspect its columns, dtypes and non-null counts.
test_raw = pd.read_csv('/kaggle/input/bytedatahack-22/test.csv')
test_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1090 entries, 0 to 1089
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   index        1090 non-null   int64 
 1   name         1090 non-null   object
 2   description  1090 non-null   object
dtypes: int64(1), object(2)
memory usage: 25.7+ KB


In [51]:
# Embed the test rows with the same pipeline that produced the training features.
# NOTE(review): FullDescriptionCreator attaches responsibilities by DataFrame index;
# test_raw has a default RangeIndex plus a separate 'index' column — confirm that
# the row index really matches the vacancy IDs here.
test_data = pipeline.transform(test_raw)

In [52]:
# Predict encoded labels for the test rows, map them back to the original
# codes and write the submission file.
preds = eval_model.predict(test_data)
result = pd.DataFrame(
    {
        'index': test_raw['index'],
        # predict() output is flattened to 1-D before decoding with the label encoder.
        'target': le.inverse_transform(preds.flatten()),
    }
)
# `index` is a boolean flag; False states explicitly that the DataFrame's row
# index must not be written as an extra column.
result.to_csv('submission.csv', index=False)