In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.ensemble import VotingRegressor, StackingRegressor
from sklearn.metrics import mean_absolute_error

from category_encoders import TargetEncoder
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor



def parse_json_column(df, column):
    """Parse a column of Python-literal strings into Python objects.

    Cells are parsed with ``ast.literal_eval`` instead of ``eval``:
    ``literal_eval`` accepts literals only (dicts, lists, strings, numbers,
    ``True``/``False``/``None``), so text coming from the data file cannot
    execute arbitrary code.

    Args:
        df: DataFrame that contains ``column``.
        column: Name of the column to parse.

    Returns:
        A Series with one parsed object per row; missing cells become an
        empty dict.

    Raises:
        ValueError, SyntaxError: If a non-missing cell is not a valid literal.
    """
    import ast  # local import so the function does not depend on file-level imports

    return df[column].apply(
        lambda cell: ast.literal_eval(cell) if pd.notnull(cell) else {}
    )


def _parse_literal(value, default_factory):
    """Parse one cell holding a Python literal; missing cells yield ``default_factory()``."""
    import ast  # local import so the helper does not depend on file-level imports

    if pd.isnull(value):
        return default_factory()
    # literal_eval accepts literals only, so text from the data file cannot
    # execute code the way eval() would.
    return ast.literal_eval(value)


def preprocess_data(df, is_train=True):
    """Turn the raw vacancy table into model-ready features.

    Works on a copy of ``df``; the caller's frame is not modified.

    Args:
        df: Raw DataFrame. Must contain ``area``, ``address``, ``experience``,
            ``key_skills``, ``specializations``, ``description``, ``snippet``,
            ``published_at`` and ``created_at``; with ``is_train=True`` it must
            also contain ``mean_salary``.
        is_train: When True, also build the target ``log1p(mean_salary)``.

    Returns:
        ``(X, y, num_feats, cat_feats, text_feats)`` when ``is_train`` is True,
        otherwise ``(X, num_feats, cat_feats, text_feats)``. ``y`` is on the
        ``log1p`` scale; the three lists name the numeric, categorical and
        text feature columns.

    Raises:
        KeyError: If one of the required columns is missing.
        ValueError, SyntaxError: If a literal-encoded cell cannot be parsed.
    """
    data = df.copy()
    # drop columns that are not used as features
    data.drop(columns=[
        'response_url','sort_point_distance',
        'immediate_redirect_url','url','alternate_url'
    ], inplace=True, errors='ignore')

    # parse the literal-encoded columns
    for col in ['area','address','experience']:
        data[col] = data[col].apply(lambda x: _parse_literal(x, dict))
    for col in ['key_skills','specializations']:
        data[col] = data[col].apply(
            lambda x: [item['name'] for item in _parse_literal(x, list)]
        )

    # derived features
    data['area_id'] = data['area'].apply(lambda d: d.get('id',''))
    data['city'] = data['address'].apply(lambda d: d.get('city',''))
    data['lat'] = data['address'].apply(lambda d: d.get('lat', np.nan))
    data['lng'] = data['address'].apply(lambda d: d.get('lng', np.nan))
    data['exp_level'] = data['experience'].apply(lambda d: d.get('id',''))
    data['skill_count'] = data['key_skills'].apply(len)
    data['spec_count'] = data['specializations'].apply(len)

    # text fields and their lengths; the snippet is parsed once and reused
    data['desc_len'] = data['description'].fillna('').str.len()
    snippet = data['snippet'].apply(lambda x: _parse_literal(x, dict))
    data['snippet_req'] = snippet.apply(lambda d: d.get('requirement',''))
    data['snippet_resp'] = snippet.apply(lambda d: d.get('responsibility',''))
    data['req_len'] = data['snippet_req'].str.len()
    data['resp_len'] = data['snippet_resp'].str.len()

    # date/time features
    data['published_at'] = pd.to_datetime(data['published_at'], utc=True)
    data['created_at'] = pd.to_datetime(data['created_at'],   utc=True)
    data['pub_month'] = data['published_at'].dt.month
    data['pub_weekday'] = data['published_at'].dt.weekday
    data['pub_hour'] = data['published_at'].dt.hour
    data['days_active'] = (data['published_at'] - data['created_at']).dt.days

    # drop the source columns the features were derived from
    data.drop(columns=[
        'area','address','experience',
        'key_skills','specializations',
        'description','snippet',
        'published_at','created_at'
    ], inplace=True)

    if is_train:
        data['log_mean_salary'] = np.log1p(data['mean_salary'])

    # fill missing values; plain assignment instead of the chained
    # `data[c].fillna(..., inplace=True)`, which pandas warns will stop
    # updating `data` in pandas 3.0
    for c in data.select_dtypes(['number','bool']).columns:
        data[c] = data[c].fillna(0)
    for c in data.select_dtypes(['object']).columns:
        data[c] = data[c].fillna('')

    # feature lists
    num_feats = [
        'skill_count','spec_count','desc_len',
        'req_len','resp_len','lat','lng',
        'pub_month','pub_weekday','pub_hour','days_active'
    ]
    cat_feats = ['area_id','city','exp_level','department','region','schedule']
    text_feats = ['name','snippet_req','snippet_resp']

    if is_train:
        X = data.drop(columns=['mean_salary','log_mean_salary'])
        y = data['log_mean_salary']
        return X, y, num_feats, cat_feats, text_feats
    else:
        X = data.drop(columns=['mean_salary'], errors='ignore')
        return X, num_feats, cat_feats, text_feats


def get_preprocessor(numerical_features, categorical_features, text_features):
    """Build the ColumnTransformer that feeds the regressor.

    Numeric columns go through ``StandardScaler``, categorical columns through
    a ``TargetEncoder`` (smoothing 0.3), and every text column gets its own
    ``TfidfVectorizer`` (up to 5000 features, uni- and bigrams). Columns not
    listed in any group are dropped.

    Args:
        numerical_features: Names of the numeric columns.
        categorical_features: Names of the categorical columns.
        text_features: Names of the text columns.

    Returns:
        An unfitted ``ColumnTransformer``.
    """
    steps = [
        ('num', StandardScaler(), numerical_features),
        (
            'cat',
            TargetEncoder(cols=categorical_features, smoothing=0.3),
            categorical_features,
        ),
    ]
    # one independent TF-IDF vocabulary per text field; the column is passed
    # as a bare name (not a list) so the vectorizer receives a 1-D input
    steps.extend(
        (
            f'tfidf_{field}',
            TfidfVectorizer(max_features=5000, ngram_range=(1,2)),
            field,
        )
        for field in text_features
    )
    return ColumnTransformer(transformers=steps, remainder='drop')


def build_ensemble(preprocessor):
    """Wrap ``preprocessor`` and a weighted voting ensemble in one Pipeline.

    The ensemble averages XGBoost, CatBoost and LightGBM regressors with
    weights 3:1:1.

    Args:
        preprocessor: Transformer used as the ``'preproc'`` pipeline step.

    Returns:
        An unfitted ``Pipeline`` with steps ``'preproc'`` and ``'reg'``.
    """
    boosters = [
        (
            'xgb',
            XGBRegressor(n_estimators=1000, learning_rate=0.05,
                         max_depth=6, objective='reg:squarederror',
                         random_state=42),
        ),
        (
            'cat',
            CatBoostRegressor(iterations=1000, learning_rate=0.05, depth=6,
                              eval_metric='MAE', random_seed=42,
                              verbose=False),
        ),
        (
            'lgbm',
            LGBMRegressor(n_estimators=1000, learning_rate=0.05,
                          num_leaves=31, objective='regression_l1',
                          random_state=42),
        ),
    ]
    # weights follow the order of `boosters`: xgb=3, cat=1, lgbm=1
    ensemble = VotingRegressor(estimators=boosters, weights=[3,1,1])
    return Pipeline([('preproc', preprocessor), ('reg', ensemble)])


def train_and_evaluate(path):
    """Train the pipeline on the CSV at ``path`` and report hold-out MAE.

    The data is split 80/20 (``random_state=42``); the model is fitted on the
    larger part, and the MAE of the smaller part is printed after mapping
    both predictions and targets back through ``expm1``.

    Args:
        path: Path to the training CSV.

    Returns:
        The fitted pipeline.
    """
    raw = pd.read_csv(path)
    features, target, num_cols, cat_cols, text_cols = preprocess_data(raw, is_train=True)
    X_fit, X_hold, y_fit, y_hold = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    pipeline = build_ensemble(get_preprocessor(num_cols, cat_cols, text_cols))
    pipeline.fit(X_fit, y_fit)

    # the target is on the log1p scale, so invert it before scoring
    predicted = np.expm1(pipeline.predict(X_hold))
    actual = np.expm1(y_hold)
    mae = mean_absolute_error(actual, predicted)
    print(f"Validation MAE: {mae:.2f}")
    return pipeline



# Fit the pipeline on the contest training file; the call prints the validation MAE.
trained_model = train_and_evaluate("train_contest.csv")


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[c].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.554850 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 228403
[LightGBM] [Info] Number of data points in the train set: 24057, number of used features: 9057
[LightGBM] [Info] Start training from score 11.156265




Validation MAE: 28972.15


In [2]:
# Load the rows to score.
data_for_pred = pd.read_csv("for_prediction.csv")
# Remove link columns and the row identifier; errors="ignore" tolerates
# columns that are absent from the file.
data_for_pred = data_for_pred.drop(
    columns=[
        "response_url",
        "immediate_redirect_url",
        "url",
        "alternate_url",
        "Id",
    ],
    errors="ignore",
)
data_for_pred

Unnamed: 0,premium,name,department,has_test,response_letter_required,area,type,address,sort_point_distance,published_at,...,schedule,working_days,working_time_intervals,working_time_modes,accept_temporary,description,experience,key_skills,specializations,region
0,False,Помощник маркетолога,,False,False,"{'id': '1', 'name': 'Москва', 'url': 'https://...","{'id': 'open', 'name': 'Открытая'}",,,2022-04-30T00:02:25+0300,...,"{'id': 'remote', 'name': 'Удаленная работа'}",[],[],[],False,<p>В IT-компанию Ищем Интернет-маркетолога (уд...,"{'id': 'between1And3', 'name': 'От 1 года до 3...","[{'name': 'Контекстная реклама'}, {'name': 'Ин...","[{'id': '3.206', 'name': 'Печатная реклама', '...",Москва
1,False,Менеджер по персоналу,,False,False,"{'id': '2', 'name': 'Санкт-Петербург', 'url': ...","{'id': 'open', 'name': 'Открытая'}","{'city': 'Санкт-Петербург', 'street': 'Новочер...",,2022-05-18T12:58:27+0300,...,"{'id': 'fullDay', 'name': 'Полный день'}",[],[],[],False,<p>Мы приглашаем на работу соискателей на долж...,"{'id': 'noExperience', 'name': 'Нет опыта'}","[{'name': 'Подбор персонала'}, {'name': 'Работ...","[{'id': '6.254', 'name': 'Рекрутмент', 'profar...",Санкт-Петербург
2,False,"Инструктор на детскую игровую площадку (ТЦ ""Ме...",,False,False,"{'id': '99', 'name': 'Уфа', 'url': 'https://ap...","{'id': 'open', 'name': 'Открытая'}","{'city': 'Уфа', 'street': 'Рубежная улица', 'b...",,2022-05-06T16:07:14+0300,...,"{'id': 'fullDay', 'name': 'Полный день'}",[],[],[],False,"<strong>Если ты любишь играть, прыгать и бегат...","{'id': 'noExperience', 'name': 'Нет опыта'}",[],"[{'id': '24.378', 'name': 'Тренерский состав',...",Республика Башкортостан
3,False,Программист Delphi,,False,False,"{'id': '4', 'name': 'Новосибирск', 'url': 'htt...","{'id': 'open', 'name': 'Открытая'}",,,2022-05-24T10:25:45+0300,...,"{'id': 'remote', 'name': 'Удаленная работа'}",[],[],[],False,<p>Компания AMS Software - разработчик популяр...,"{'id': 'between3And6', 'name': 'От 3 до 6 лет'}","[{'name': 'Delphi'}, {'name': 'Разработка ПО'}...","[{'id': '1.221', 'name': 'Программирование, Ра...",Новосибирская область
4,False,Ведущий специалист ВКС,,False,False,"{'id': '1', 'name': 'Москва', 'url': 'https://...","{'id': 'open', 'name': 'Открытая'}","{'city': 'Москва', 'street': 'Чистопрудный бул...",,2022-05-18T09:56:41+0300,...,"{'id': 'fullDay', 'name': 'Полный день'}",[],[],[],False,<p><strong>Компания &quot;ТехноКад&quot; - оди...,"{'id': 'between1And3', 'name': 'От 1 года до 3...","[{'name': 'Настройка ПК'}, {'name': 'Техническ...","[{'id': '1.172', 'name': 'Начальный уровень, М...",Москва
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7513,False,HR менеджер,,False,False,"{'id': '131', 'name': 'Симферополь', 'url': 'h...","{'id': 'open', 'name': 'Открытая'}","{'city': 'Симферополь', 'street': 'улица Жени ...",,2022-05-19T10:39:51+0300,...,"{'id': 'fullDay', 'name': 'Полный день'}",[],[],[],False,<strong>Обязанности:</strong> <ul> <li>Поиск и...,"{'id': 'between1And3', 'name': 'От 1 года до 3...","[{'name': 'Подбор персонала'}, {'name': 'Работ...","[{'id': '6.254', 'name': 'Рекрутмент', 'profar...",Республика Крым
7514,False,Руководитель HR отдела,,False,False,"{'id': '1', 'name': 'Москва', 'url': 'https://...","{'id': 'open', 'name': 'Открытая'}","{'city': 'Москва', 'street': 'Электрический пе...",,2022-05-04T20:15:16+0300,...,"{'id': 'fullDay', 'name': 'Полный день'}",[],[],[],False,<strong>Обязанности:</strong> <ul> <li>Организ...,"{'id': 'between1And3', 'name': 'От 1 года до 3...","[{'name': 'Ведение отчетности'}, {'name': 'Вед...","[{'id': '6.254', 'name': 'Рекрутмент', 'profar...",Москва
7515,False,Системный аналитик (Стажер),"{'id': '80-80-bank', 'name': ' Альфа-Банк'}",False,False,"{'id': '3', 'name': 'Екатеринбург', 'url': 'ht...","{'id': 'open', 'name': 'Открытая'}","{'city': 'Екатеринбург', 'street': 'Горького',...",,2022-05-24T09:08:55+0300,...,"{'id': 'flexible', 'name': 'Гибкий график'}",[],[],[],False,<p>Альфа-Банк — крупнейший частный банк России...,"{'id': 'noExperience', 'name': 'Нет опыта'}","[{'name': 'Разработка технических заданий'}, {...","[{'id': '17.751', 'name': 'Другое', 'profarea_...",Свердловская область
7516,False,Руководитель Отдела развития банковских технол...,,False,False,"{'id': '22', 'name': 'Владивосток', 'url': 'ht...","{'id': 'open', 'name': 'Открытая'}","{'city': 'Владивосток', 'street': 'Партизански...",,2022-05-05T10:33:05+0300,...,"{'id': 'fullDay', 'name': 'Полный день'}",[],[],[],False,<strong>Обязанности:</strong> <ul> <li> <p>Ана...,"{'id': 'between1And3', 'name': 'От 1 года до 3...","[{'name': 'MS PowerPoint'}, {'name': 'Бизнес-а...","[{'id': '1.221', 'name': 'Программирование, Ра...",Приморский край


In [None]:
# Build features for the prediction set (no target is produced with is_train=False).
X_pred, num_feats, cat_feats, text_feats = preprocess_data(data_for_pred, is_train=False)

# The model outputs predictions on the log1p scale it was trained on.
preds_log = trained_model.predict(X_pred)

# Invert log1p to get salaries on the original scale.
preds = np.expm1(preds_log)

# Attach the predictions to the loaded frame (this mutates data_for_pred).
data_for_pred["predicted_mean_salary"] = preds

#data_for_pred.to_csv("predictions.csv", index=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[c].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[c].fillna('', inplace=True)


In [4]:
# Two-column submission frame built from the predictions.
# NOTE(review): "Id" here is the positional index 0..len(preds)-1, while the
# "Id" column of for_prediction.csv is dropped when the file is loaded —
# confirm the positional ids match the ids the submission is expected to use.
submission = pd.DataFrame({
    "Id": np.arange(len(preds)),
    "Predicted": preds
})
#submission.to_csv("predictions.csv", index=False)

### Сравним с одним CatBoost

In [5]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.ensemble import VotingRegressor, StackingRegressor
from sklearn.metrics import mean_absolute_error

from category_encoders import TargetEncoder
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor



def parse_json_column(df, column):
    """Parse a column of Python-literal strings into Python objects.

    Cells are parsed with ``ast.literal_eval`` instead of ``eval``:
    ``literal_eval`` accepts literals only (dicts, lists, strings, numbers,
    ``True``/``False``/``None``), so text coming from the data file cannot
    execute arbitrary code.

    Args:
        df: DataFrame that contains ``column``.
        column: Name of the column to parse.

    Returns:
        A Series with one parsed object per row; missing cells become an
        empty dict.

    Raises:
        ValueError, SyntaxError: If a non-missing cell is not a valid literal.
    """
    import ast  # local import so the function does not depend on file-level imports

    return df[column].apply(
        lambda cell: ast.literal_eval(cell) if pd.notnull(cell) else {}
    )


def _parse_literal(value, default_factory):
    """Parse one cell holding a Python literal; missing cells yield ``default_factory()``."""
    import ast  # local import so the helper does not depend on file-level imports

    if pd.isnull(value):
        return default_factory()
    # literal_eval accepts literals only, so text from the data file cannot
    # execute code the way eval() would.
    return ast.literal_eval(value)


def preprocess_data(df, is_train=True):
    """Turn the raw vacancy table into model-ready features.

    Works on a copy of ``df``; the caller's frame is not modified.

    Args:
        df: Raw DataFrame. Must contain ``area``, ``address``, ``experience``,
            ``key_skills``, ``specializations``, ``description``, ``snippet``,
            ``published_at`` and ``created_at``; with ``is_train=True`` it must
            also contain ``mean_salary``.
        is_train: When True, also build the target ``log1p(mean_salary)``.

    Returns:
        ``(X, y, num_feats, cat_feats, text_feats)`` when ``is_train`` is True,
        otherwise ``(X, num_feats, cat_feats, text_feats)``. ``y`` is on the
        ``log1p`` scale; the three lists name the numeric, categorical and
        text feature columns.

    Raises:
        KeyError: If one of the required columns is missing.
        ValueError, SyntaxError: If a literal-encoded cell cannot be parsed.
    """
    data = df.copy()
    # drop columns that are not used as features
    data.drop(columns=[
        'response_url','sort_point_distance',
        'immediate_redirect_url','url','alternate_url'
    ], inplace=True, errors='ignore')

    # parse the literal-encoded columns
    for col in ['area','address','experience']:
        data[col] = data[col].apply(lambda x: _parse_literal(x, dict))
    for col in ['key_skills','specializations']:
        data[col] = data[col].apply(
            lambda x: [item['name'] for item in _parse_literal(x, list)]
        )

    # derived features
    data['area_id'] = data['area'].apply(lambda d: d.get('id',''))
    data['city'] = data['address'].apply(lambda d: d.get('city',''))
    data['lat'] = data['address'].apply(lambda d: d.get('lat', np.nan))
    data['lng'] = data['address'].apply(lambda d: d.get('lng', np.nan))
    data['exp_level'] = data['experience'].apply(lambda d: d.get('id',''))
    data['skill_count'] = data['key_skills'].apply(len)
    data['spec_count'] = data['specializations'].apply(len)

    # text fields and their lengths; the snippet is parsed once and reused
    data['desc_len'] = data['description'].fillna('').str.len()
    snippet = data['snippet'].apply(lambda x: _parse_literal(x, dict))
    data['snippet_req'] = snippet.apply(lambda d: d.get('requirement',''))
    data['snippet_resp'] = snippet.apply(lambda d: d.get('responsibility',''))
    data['req_len'] = data['snippet_req'].str.len()
    data['resp_len'] = data['snippet_resp'].str.len()

    # date/time features
    data['published_at'] = pd.to_datetime(data['published_at'], utc=True)
    data['created_at'] = pd.to_datetime(data['created_at'],   utc=True)
    data['pub_month'] = data['published_at'].dt.month
    data['pub_weekday'] = data['published_at'].dt.weekday
    data['pub_hour'] = data['published_at'].dt.hour
    data['days_active'] = (data['published_at'] - data['created_at']).dt.days

    # drop the source columns the features were derived from
    data.drop(columns=[
        'area','address','experience',
        'key_skills','specializations',
        'description','snippet',
        'published_at','created_at'
    ], inplace=True)

    if is_train:
        data['log_mean_salary'] = np.log1p(data['mean_salary'])

    # fill missing values; plain assignment instead of the chained
    # `data[c].fillna(..., inplace=True)`, which pandas warns will stop
    # updating `data` in pandas 3.0
    for c in data.select_dtypes(['number','bool']).columns:
        data[c] = data[c].fillna(0)
    for c in data.select_dtypes(['object']).columns:
        data[c] = data[c].fillna('')

    # feature lists
    num_feats = [
        'skill_count','spec_count','desc_len',
        'req_len','resp_len','lat','lng',
        'pub_month','pub_weekday','pub_hour','days_active'
    ]
    cat_feats = ['area_id','city','exp_level','department','region','schedule']
    text_feats = ['name','snippet_req','snippet_resp']

    if is_train:
        X = data.drop(columns=['mean_salary','log_mean_salary'])
        y = data['log_mean_salary']
        return X, y, num_feats, cat_feats, text_feats
    else:
        X = data.drop(columns=['mean_salary'], errors='ignore')
        return X, num_feats, cat_feats, text_feats


def get_preprocessor(numerical_features, categorical_features, text_features):
    """Build the ColumnTransformer that feeds the regressor.

    Numeric columns go through ``StandardScaler``, categorical columns through
    a ``TargetEncoder`` (smoothing 0.3), and every text column gets its own
    ``TfidfVectorizer`` (up to 5000 features, uni- and bigrams). Columns not
    listed in any group are dropped.

    Args:
        numerical_features: Names of the numeric columns.
        categorical_features: Names of the categorical columns.
        text_features: Names of the text columns.

    Returns:
        An unfitted ``ColumnTransformer``.
    """
    steps = [
        ('num', StandardScaler(), numerical_features),
        (
            'cat',
            TargetEncoder(cols=categorical_features, smoothing=0.3),
            categorical_features,
        ),
    ]
    # one independent TF-IDF vocabulary per text field; the column is passed
    # as a bare name (not a list) so the vectorizer receives a 1-D input
    steps.extend(
        (
            f'tfidf_{field}',
            TfidfVectorizer(max_features=5000, ngram_range=(1,2)),
            field,
        )
        for field in text_features
    )
    return ColumnTransformer(transformers=steps, remainder='drop')


def build_ensemble(preprocessor):
    """Wrap ``preprocessor`` and a single CatBoost regressor in one Pipeline.

    Args:
        preprocessor: Transformer used as the ``'preproc'`` pipeline step.

    Returns:
        An unfitted ``Pipeline`` with steps ``'preproc'`` and ``'reg'``.
    """
    regressor = CatBoostRegressor(
        iterations=1000,
        learning_rate=0.05,
        depth=6,
        eval_metric='MAE',
        random_seed=42,
        verbose=False,
    )
    steps = [('preproc', preprocessor), ('reg', regressor)]
    return Pipeline(steps)


def train_and_evaluate(path):
    """Train the pipeline on the CSV at ``path`` and report hold-out MAE.

    The data is split 80/20 (``random_state=42``); the model is fitted on the
    larger part, and the MAE of the smaller part is printed after mapping
    both predictions and targets back through ``expm1``.

    Args:
        path: Path to the training CSV.

    Returns:
        The fitted pipeline.
    """
    raw = pd.read_csv(path)
    features, target, num_cols, cat_cols, text_cols = preprocess_data(raw, is_train=True)
    X_fit, X_hold, y_fit, y_hold = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    pipeline = build_ensemble(get_preprocessor(num_cols, cat_cols, text_cols))
    pipeline.fit(X_fit, y_fit)

    # the target is on the log1p scale, so invert it before scoring
    predicted = np.expm1(pipeline.predict(X_hold))
    actual = np.expm1(y_hold)
    mae = mean_absolute_error(actual, predicted)
    print(f"Validation MAE: {mae:.2f}")
    return pipeline



# Fit the CatBoost-only pipeline on the same training file; the call prints the validation MAE.
trained_model = train_and_evaluate("train_contest.csv")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[c].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[c].fillna('', inplace=True)


Validation MAE: 30493.52
