In [2]:
import pandas as pd
import numpy as np

In [3]:
df_train_X = pd.read_csv('resources/X_train.csv')
df_test_X = pd.read_csv('resources/X_test.csv')
df_train_Y = pd.read_csv('resources/y_train.csv')

df = df_train_Y.merge(df_train_X)

df['salary_from'] = df['salary_from'].fillna(0)
df['salary_gross'] = df['salary_gross'].fillna(True)
df['description'] = df['description'].fillna("")
df.isnull().sum()

id                          0
salary_to                   0
name                        0
has_test                    0
response_letter_required    0
salary_from                 0
salary_currency             0
salary_gross                0
published_at                0
created_at                  0
employer_name               0
description                 0
area_id                     0
area_name                   0
dtype: int64

In [4]:
from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')


# Функция для векторизации текста с использованием BERT
def get_bert_vectors(text):
    tokens = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = bert_model(**tokens)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()


df['description_vectors'] = df['description'].apply(get_bert_vectors)

# Создание отдельных столбцов для каждой компоненты вектора
df[['vector_dim_' + str(i) for i in range(df['description_vectors'][0].shape[0])]] = pd.DataFrame(
    df['description_vectors'].to_list(), index=df.index)

# Удаление столбца с векторами текста
df = df.drop('description_vectors', axis=1)

KeyboardInterrupt: 

In [None]:
import spacy

# Загрузка предобученной модели spaCy для работы с векторными представлениями
nlp = spacy.load("ru_core_news_lg")

# Применение spaCy для векторизации текста в столбце 'description'
df['description_vectors'] = df['description'].apply(lambda x: nlp(x).vector)

# Создание отдельных столбцов для каждой компоненты вектора
df[['vector_dim_' + str(i) for i in range(df['description_vectors'][0].shape[0])]] =(
    pd.DataFrame(df['description_vectors'].to_list(), index=df.index))

# Удаление столбца с векторами текста
df = df.drop('description_vectors', axis=1)
df.info()

In [58]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor

train, test = train_test_split(df[df['salary_from'] < 500000], train_size=0.6, random_state=42)
val, test = train_test_split(test, train_size=0.5, random_state=42)

X = ['name', 'has_test', 'response_letter_required', 'salary_from', 'employer_name', 'area_name']
cat_features = ['name', 'employer_name', 'area_name']
y = ['salary_to']

model = CatBoostRegressor(
    cat_features=cat_features,
    eval_metric='SMAPE',
    learning_rate=0.33,
    random_seed=42,
    verbose=100
)

model.fit(df[X], df[y])

test['salary_pred'] = model.predict(test[X])
predict = model.predict(df_test_X[X])


def smape(y_true, y_pred):
    return 100 / len(y_true) * np.sum(2 * np.abs(y_true - y_pred) / (np.abs(y_true) + np.abs(y_pred)))


submit_id = df_test_X.id.to_list()
result = pd.DataFrame({'id': submit_id, 'salary_to': np.round(predict)})
result.to_csv('submission.csv', index=False)

smape(test['salary_pred'], test['salary_to'])


0:	learn: 51.2130451	total: 66.8ms	remaining: 1m 6s
100:	learn: 28.0168501	total: 3.88s	remaining: 34.5s
200:	learn: 27.4213972	total: 7.45s	remaining: 29.6s
300:	learn: 27.2878215	total: 11s	remaining: 25.6s
400:	learn: 27.0837745	total: 14.9s	remaining: 22.3s
500:	learn: 26.8877383	total: 19.4s	remaining: 19.3s
600:	learn: 26.7096342	total: 23.1s	remaining: 15.3s
700:	learn: 26.5692220	total: 26.9s	remaining: 11.5s
800:	learn: 26.4043663	total: 30.5s	remaining: 7.57s
900:	learn: 26.3342207	total: 34.6s	remaining: 3.8s
999:	learn: 26.2788466	total: 38.2s	remaining: 0us


25.142078344326702