In [1]:
import ast
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.manifold import TSNE
from umap import UMAP

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
vacancies = pd.read_csv(
    '../data/prepared_positions_with_emmbedings.csv')
vacancies['work_skills'] = vacancies['work_skills'].apply(
    lambda x: ast.literal_eval(x) if pd.notna(x) else [])
vacancies_all = vacancies


In [3]:
vacancies = vacancies[['custom_position', 'title_embedding']]
vacancies.head(10)

Unnamed: 0,custom_position,title_embedding
0,продавец кассир,[-1.8447946 0.53536004 1.4867423 3.949570...
1,продавец кассир,[-1.8447946 0.53536004 1.4867423 3.949570...
2,кассир,[-0.79767126 0.24670209 1.7672167 5.054442...
3,грузчик,[ 0.91264826 -1.5279536 -2.3753495 5.660104...
4,продавец,[-2.4975226 0.88552547 2.2982452 6.313721...
5,продавец,[-2.4975226 0.88552547 2.2982452 6.313721...
6,мерчендайзер грузчик,[-0.30694836 -0.3114081 -2.3399024 3.872964...
7,мерчендайзер грузчик,[-0.30694836 -0.3114081 -2.3399024 3.872964...
8,продавец,[-2.4975226 0.88552547 2.2982452 6.313721...
9,мерчендайзер грузчик,[-0.30694836 -0.3114081 -2.3399024 3.872964...


In [4]:
def dataframe_string_to_embedings(dataframe):
    embeddings = []
    for item in dataframe.title_embedding:
        digits = re.sub(r'\s+', item.strip('[]'), ' ').split()
        if len(digits) != 100:
            raise Exception('Wrong amount of numbers')
        embeddings.append([float(d) for d in digits])
    return embeddings

In [5]:
vacancies = vacancies.drop_duplicates(subset='title_embedding')
positions = vacancies.custom_position
pos_vec100 = dataframe_string_to_embedings(vacancies)
array_embedings = np.array(pos_vec100)

In [6]:
# tsne = TSNE(n_components=2)
# vec_reduced = tsne.fit_transform(array_embedings)
# print(vec_reduced.shape)
umap_model = UMAP(n_components=2, random_state=17)
vec_reduced = umap_model.fit_transform(array_embedings)
print(vec_reduced.shape)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


(16628, 2)


In [7]:
def get_dataframe(embeddings, labels, positions):
    df = pd.DataFrame({
        'X1': embeddings[:, 0],
        'X2': embeddings[:, 1],
        'Class': labels,
        'position': positions,
    })
    df['Class'] = df['Class'].astype(str)
    return df

In [8]:
def get_fig(df, method_name):
    return px.scatter(df, x='X1', y='X2', 
                      color='Class', 
                      title=f'Scatter Plot with 2D Embedding Space {method_name}',
                      labels={'X1': 'Feature 1', 
                              'X2': 'Feature 2',
                              'Class': 'Class'},
                      category_orders={'Class': df['Class'].unique()},
                      hover_data=['position'])

In [9]:
def get_clustering_labels(model, n_clusters=50):
    model.fit(vec_reduced)
    return model.labels_

def get_figures(methods):
    figures = []
    for method in methods:
        labels = get_clustering_labels(method, 50)
        df = get_dataframe(vec_reduced, labels, positions)
        method_name = method.__class__.__name__
        figures.append(get_fig(df, method_name))
    return figures

In [10]:
from sklearn.cluster import KMeans

model = KMeans(n_clusters=41, random_state=37)

labels = get_clustering_labels(model, 50)
df = get_dataframe(vec_reduced, labels, positions)
fig = get_fig(df, model.__class__.__name__)
fig.update_layout(width=860, height=530)
fig.show()

  super()._check_params_vs_input(X, default_n_init=10)


In [11]:
print('Мы берем класс номер 2, это водители, уникальных позиций:', df.Class.value_counts()[2])

Мы берем класс номер 2, это водители, уникальных позиций: 740



Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`



In [12]:
unique_positions = df[df.Class == '2'].position.unique()

In [13]:
filtered_df = vacancies_all[vacancies_all['custom_position'].isin(
    unique_positions)].drop(['Unnamed: 0', 'id'], axis=1)
print(filtered_df.shape)
filtered_df.head()

(10087, 20)


Unnamed: 0,custom_position,schedule,salary_from,salary_to,education_name,city_id,work_skills,count_skills,required_experience,skill_1,skill_2,skill_3,skill_4,skill_5,skill_6,skill_7,skill_8,skill_9,skill_10,title_embedding
52,водитель категории,полный рабочий день,55000.0,95000.0,любое,8,"[перевозка грузов, работа с сопроводительной д...",10,От 1 года до 3 лет,перевозка грузов,работа с сопроводительной документацией,правила дорожного движения,вежливость,пунктуальность,обучаемость,содержание автомобиля в чистоте,спокойный стиль вождения,экспедирование,сдача груза,[ 0.5440114 -0.16116399 1.0065441 1.148866...
53,водитель экспедитор кат,полный рабочий день,90000.0,90000.0,любое,1,[],0,,,,,,,,,,,,[ 0.10713582 -0.0115784 0.2086894 0.945820...
61,водитель категории,вахта,120000.0,140000.0,любое,9,[],0,От 1 года до 3 лет,,,,,,,,,,,[ 0.5440114 -0.16116399 1.0065441 1.148866...
155,водитель экспедитор категории,полный рабочий день,80000.0,140000.0,любое,2,"[работа с документами, экспедирование, перевоз...",5,От 1 года до 3 лет,работа с документами,экспедирование,перевозка грузов,правила дорожного движения,безаварийное вождение,,,,,,[ 6.6381365e-02 -3.1727878e-03 1.4653969e-01 ...
356,водитель категории,полный рабочий день,70000.0,150000.0,любое,6,"[пунктуальность, спокойный стиль вождения, пра...",4,От 1 года до 3 лет,пунктуальность,спокойный стиль вождения,правила дорожного движения,перевозка грузов,,,,,,,[ 0.5440114 -0.16116399 1.0065441 1.148866...


In [14]:
from pprint import pprint

cluster_skills = filtered_df.filter(regex='^skill_')
cluster_skills = cluster_skills.applymap(str)
skills_count = cluster_skills.apply(cluster_skills.value_counts).sum(axis=1)
skills_count = skills_count.sort_values(ascending=False)
for item in skills_count.items():
    pprint(item)

('nan', 79044.0)
('перевозка грузов', 2919.0)
('правила дорожного движения', 1840.0)
('безаварийное вождение', 1259.0)
('работа с сопроводительной документацией', 1224.0)
('спокойный стиль вождения', 1083.0)
('содержание автомобиля в чистоте', 910.0)
('ответственность', 873.0)
('устройство автомобилей', 815.0)
('экспедирование', 803.0)
('техническое обслуживание автомобилей', 705.0)
('перевозка пассажиров', 592.0)
('водительские права BC', 582.0)
('пунктуальность', 458.0)
('знание Москвы и Московской области', 444.0)
('стаж вождения от 3 лет', 423.0)
('погрузочно-разгрузочные работы', 360.0)
('работа на погрузчике', 301.0)
('междугородние перевозки', 279.0)
('перевозка грузов по России', 252.0)
('удостоверения тракториста-машиниста', 235.0)
('ремонт автомобилей', 214.0)
('стрессоустойчивость', 213.0)
('сдача груза', 204.0)
('сопровождение груза', 198.0)
('стаж вождения от 5 лет', 196.0)
('медицинская справка', 190.0)
('ВОДИТЕЛЬ ПОГРУЗЧИКА', 169.0)
('работа с документами', 164.0)
('хоро


DataFrame.applymap has been deprecated. Use DataFrame.map instead.



In [15]:
skills = [
    ['перевозка грузов', 'транспортная логистика', 'экспедирование', 'доставка заказов', 'доставка товаров', 'автомобильные грузоперевозки', 'экспедитор'],
    ['работа с сопроводительной документацией', 'работа с накладными', 'работа с документами', 'складской документооборот', 'оформление документации'],
    ['перевозка пассажиров', 'перевозка детей', ],
    ['техническое обслуживание автомобилей', 'ремонт автомобилей', 'устройство автомобилей', 'знание устройства автомобиля', ],
    # ['сопровождение груза', 'перевозка опасных грузов',  ],
    ['работа на погрузчике', 'погрузочно-разгрузочные работы', 'водитель погрузчика', 'управление электропогрузчиком', 'права на погрузчик', 'погрузочные работы', ],
]

In [16]:
import fasttext
from scipy.spatial.distance import cosine

ft = fasttext.load_model('../data/cc.ru.300.bin')

v1 = ft.get_sentence_vector('перевозка грузов')
v2 = ft.get_sentence_vector('перевозка пассажиров')
cosine(v1, v2)



0.16281461715698242

Определим порог как 0.15

In [17]:
skills_vec = []
i = 0
for skill in skills:
    skills_vec.append([])
    for item in skill:
        skills_vec[i].append(ft.get_sentence_vector(item))
    i += 1

In [18]:
work_skills = filtered_df.work_skills
skills_ohe = []
for skills in work_skills:
    vec_ohe = np.zeros(len(skills_vec))
    for skill in skills:
        skill = ' '.join(re.findall(r'\w+', skill))
        skill_vec = ft.get_sentence_vector(skill.lower())
        for i in range(len(skills_vec)):
            for prep_vec in skills_vec[i]:
                if cosine(skill_vec, prep_vec) < 0.15:
                    vec_ohe[i] = 1
    skills_ohe.append(vec_ohe)
# skills_ohe

In [19]:
columns = ['грузоперевозка', 'документооборот', 'пассажиры', 'ремонт', 'погрузка']
skills_OHE = pd.DataFrame(skills_ohe, columns=columns)
print(skills_OHE.shape)
skills_OHE

(10087, 5)


Unnamed: 0,грузоперевозка,документооборот,пассажиры,ремонт,погрузка
0,1.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...
10082,0.0,1.0,0.0,0.0,0.0
10083,0.0,0.0,0.0,0.0,0.0
10084,0.0,0.0,0.0,0.0,0.0
10085,0.0,0.0,0.0,0.0,0.0


In [20]:
filtered_df = filtered_df[[
    'city_id', 
    'custom_position', 
    'schedule', 
    'salary_from', 
    'salary_to', 
    'education_name', 
    ]]

In [21]:
columns_to_remove = filtered_df.filter(regex='^skill_').columns
filtered_df = filtered_df.drop(columns_to_remove, axis=1)
drivers_ohe = pd.concat([filtered_df.reset_index(), skills_OHE], axis=1).drop('index', axis=1)
drivers_ohe.head()

Unnamed: 0,city_id,custom_position,schedule,salary_from,salary_to,education_name,грузоперевозка,документооборот,пассажиры,ремонт,погрузка
0,8,водитель категории,полный рабочий день,55000.0,95000.0,любое,1.0,1.0,0.0,0.0,0.0
1,1,водитель экспедитор кат,полный рабочий день,90000.0,90000.0,любое,0.0,0.0,0.0,0.0,0.0
2,9,водитель категории,вахта,120000.0,140000.0,любое,0.0,0.0,0.0,0.0,0.0
3,2,водитель экспедитор категории,полный рабочий день,80000.0,140000.0,любое,1.0,1.0,0.0,0.0,0.0
4,6,водитель категории,полный рабочий день,70000.0,150000.0,любое,1.0,0.0,0.0,0.0,0.0


In [36]:
df_test = drivers_ohe.drop(['custom_position', 'city_id'], axis=1)
df_test = pd.get_dummies(df_test, columns=['schedule', 'education_name'], dtype=int)
df_test

Unnamed: 0,salary_from,salary_to,грузоперевозка,документооборот,пассажиры,ремонт,погрузка,schedule_вахта,schedule_полный рабочий день,schedule_свободный график,schedule_сменный график,schedule_удаленная работа,schedule_частичная занятость,education_name_высшее,education_name_любое,education_name_неполное высшее,education_name_среднее,education_name_среднее профессиональное
0,55000.000000,95000.000000,1.0,1.0,0.0,0.0,0.0,0,1,0,0,0,0,0,1,0,0,0
1,90000.000000,90000.000000,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,1,0,0,0
2,120000.000000,140000.000000,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,1,0,0,0
3,80000.000000,140000.000000,1.0,1.0,0.0,0.0,0.0,0,1,0,0,0,0,0,1,0,0,0
4,70000.000000,150000.000000,1.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10082,60000.000000,65000.000000,0.0,1.0,0.0,0.0,0.0,0,1,0,0,0,0,0,1,0,0,0
10083,40000.000000,40000.000000,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,1,0,0,0
10084,114942.528736,149425.287356,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,1,0,0,0
10085,110000.000000,110000.000000,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,1,0,0,0


In [22]:
positions_vec = drivers_ohe.custom_position.apply(lambda x: ft.get_sentence_vector(x))
# print(positions_vec)
df_positions = pd.DataFrame(positions_vec.to_numpy().tolist(), columns=[f'position_{i}' for i in range(len(positions_vec.values[0]))])
df_positions

Unnamed: 0,position_0,position_1,position_2,position_3,position_4,position_5,position_6,position_7,position_8,position_9,...,position_290,position_291,position_292,position_293,position_294,position_295,position_296,position_297,position_298,position_299
0,0.037722,-0.026277,-0.083610,0.039702,0.023920,-0.017857,0.052328,0.042038,-0.058359,0.002514,...,-0.055645,-0.017912,0.062783,-0.028709,-0.045368,-0.018462,-0.019988,0.031160,-0.034890,-0.012783
1,0.057113,-0.036494,-0.057906,-0.009426,0.050849,-0.059959,0.012705,0.029293,-0.066008,0.006297,...,-0.026878,-0.008690,0.035961,-0.019801,-0.009858,-0.029855,-0.027322,-0.003753,0.009796,-0.018155
2,0.037722,-0.026277,-0.083610,0.039702,0.023920,-0.017857,0.052328,0.042038,-0.058359,0.002514,...,-0.055645,-0.017912,0.062783,-0.028709,-0.045368,-0.018462,-0.019988,0.031160,-0.034890,-0.012783
3,0.048318,-0.038399,-0.077475,0.010614,0.027379,-0.025955,0.045775,0.032823,-0.043736,0.029078,...,-0.041101,-0.017975,0.049148,-0.017079,-0.018976,-0.015754,-0.021058,0.001742,0.000186,-0.004655
4,0.037722,-0.026277,-0.083610,0.039702,0.023920,-0.017857,0.052328,0.042038,-0.058359,0.002514,...,-0.055645,-0.017912,0.062783,-0.028709,-0.045368,-0.018462,-0.019988,0.031160,-0.034890,-0.012783
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10082,0.068219,-0.030418,-0.086025,0.006649,0.041626,-0.069877,0.042986,0.030692,-0.072538,0.017961,...,-0.042967,-0.011246,0.036777,0.011921,0.000618,-0.005842,-0.031306,-0.030486,0.030940,-0.002282
10083,0.024755,-0.011593,-0.034155,0.038152,0.037316,-0.055205,0.040712,0.046663,-0.102554,-0.038263,...,-0.071965,0.067809,-0.001512,0.024822,-0.019607,0.001784,-0.007610,0.031606,-0.052274,-0.040917
10084,0.037722,-0.026277,-0.083610,0.039702,0.023920,-0.017857,0.052328,0.042038,-0.058359,0.002514,...,-0.055645,-0.017912,0.062783,-0.028709,-0.045368,-0.018462,-0.019988,0.031160,-0.034890,-0.012783
10085,0.057113,-0.036494,-0.057906,-0.009426,0.050849,-0.059959,0.012705,0.029293,-0.066008,0.006297,...,-0.026878,-0.008690,0.035961,-0.019801,-0.009858,-0.029855,-0.027322,-0.003753,0.009796,-0.018155


In [45]:
df_full = pd.concat([drivers_ohe, df_positions], axis=1).drop('custom_position', axis=1)
print(df_full.shape)
df_full.city_id.value_counts(normalize=True).nlargest(10).sum()
# len(df_full.city_id.value_counts(normalize=True))

(10087, 310)


0.5142262317834837

In [24]:
df_full.city_id = df_full.city_id.apply(str)
df_full.head()

Unnamed: 0,city_id,schedule,salary_from,salary_to,education_name,грузоперевозка,документооборот,пассажиры,ремонт,погрузка,...,position_290,position_291,position_292,position_293,position_294,position_295,position_296,position_297,position_298,position_299
0,8,полный рабочий день,55000.0,95000.0,любое,1.0,1.0,0.0,0.0,0.0,...,-0.055645,-0.017912,0.062783,-0.028709,-0.045368,-0.018462,-0.019988,0.03116,-0.03489,-0.012783
1,1,полный рабочий день,90000.0,90000.0,любое,0.0,0.0,0.0,0.0,0.0,...,-0.026878,-0.00869,0.035961,-0.019801,-0.009858,-0.029855,-0.027322,-0.003753,0.009796,-0.018155
2,9,вахта,120000.0,140000.0,любое,0.0,0.0,0.0,0.0,0.0,...,-0.055645,-0.017912,0.062783,-0.028709,-0.045368,-0.018462,-0.019988,0.03116,-0.03489,-0.012783
3,2,полный рабочий день,80000.0,140000.0,любое,1.0,1.0,0.0,0.0,0.0,...,-0.041101,-0.017975,0.049148,-0.017079,-0.018976,-0.015754,-0.021058,0.001742,0.000186,-0.004655
4,6,полный рабочий день,70000.0,150000.0,любое,1.0,0.0,0.0,0.0,0.0,...,-0.055645,-0.017912,0.062783,-0.028709,-0.045368,-0.018462,-0.019988,0.03116,-0.03489,-0.012783


In [25]:
one_hot_columns = ['city_id', 'schedule', 'education_name']
df_full = pd.get_dummies(df_full, columns=one_hot_columns, dtype=int)
df_full

Unnamed: 0,salary_from,salary_to,грузоперевозка,документооборот,пассажиры,ремонт,погрузка,position_0,position_1,position_2,...,schedule_полный рабочий день,schedule_свободный график,schedule_сменный график,schedule_удаленная работа,schedule_частичная занятость,education_name_высшее,education_name_любое,education_name_неполное высшее,education_name_среднее,education_name_среднее профессиональное
0,55000.000000,95000.000000,1.0,1.0,0.0,0.0,0.0,0.037722,-0.026277,-0.083610,...,1,0,0,0,0,0,1,0,0,0
1,90000.000000,90000.000000,0.0,0.0,0.0,0.0,0.0,0.057113,-0.036494,-0.057906,...,1,0,0,0,0,0,1,0,0,0
2,120000.000000,140000.000000,0.0,0.0,0.0,0.0,0.0,0.037722,-0.026277,-0.083610,...,0,0,0,0,0,0,1,0,0,0
3,80000.000000,140000.000000,1.0,1.0,0.0,0.0,0.0,0.048318,-0.038399,-0.077475,...,1,0,0,0,0,0,1,0,0,0
4,70000.000000,150000.000000,1.0,0.0,0.0,0.0,0.0,0.037722,-0.026277,-0.083610,...,1,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10082,60000.000000,65000.000000,0.0,1.0,0.0,0.0,0.0,0.068219,-0.030418,-0.086025,...,1,0,0,0,0,0,1,0,0,0
10083,40000.000000,40000.000000,0.0,0.0,0.0,0.0,0.0,0.024755,-0.011593,-0.034155,...,1,0,0,0,0,0,1,0,0,0
10084,114942.528736,149425.287356,0.0,0.0,0.0,0.0,0.0,0.037722,-0.026277,-0.083610,...,0,0,0,0,0,0,1,0,0,0
10085,110000.000000,110000.000000,0.0,0.0,0.0,0.0,0.0,0.057113,-0.036494,-0.057906,...,1,0,0,0,0,0,1,0,0,0


In [30]:
from sklearn.model_selection import train_test_split

y = df_full['salary_from']
X = df_full.drop(columns=['salary_from', 'salary_to'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2023)

In [31]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import (mean_absolute_percentage_error,
                             mean_squared_error, r2_score)

ridge_model = Ridge(alpha=0.7).fit(X_train, y_train)
y_pred = ridge_model.predict(X_test)
print('MSE = ', mean_squared_error(y_test, y_pred))
print('R2 = ', r2_score(y_test, y_pred))
print('MAPE = ', mean_absolute_percentage_error(y_test, y_pred))

MSE =  884845913.5607736
R2 =  0.4752485679902122
MAPE =  0.2832367900862105


In [33]:
X_train

Unnamed: 0,грузоперевозка,документооборот,пассажиры,ремонт,погрузка,position_0,position_1,position_2,position_3,position_4,...,schedule_полный рабочий день,schedule_свободный график,schedule_сменный график,schedule_удаленная работа,schedule_частичная занятость,education_name_высшее,education_name_любое,education_name_неполное высшее,education_name_среднее,education_name_среднее профессиональное
7150,0.0,0.0,0.0,0.0,0.0,0.048318,-0.038399,-0.077475,0.010614,0.027379,...,0,0,0,0,0,0,1,0,0,0
374,0.0,0.0,0.0,0.0,0.0,0.004050,-0.024270,0.021373,0.050033,0.022982,...,0,0,1,0,0,0,1,0,0,0
5344,1.0,0.0,0.0,1.0,0.0,0.039067,0.031114,-0.034662,0.061687,0.027555,...,0,0,0,0,0,0,1,0,0,0
8520,0.0,0.0,0.0,0.0,0.0,0.068219,-0.030418,-0.086025,0.006649,0.041626,...,1,0,0,0,0,0,1,0,0,0
5505,0.0,0.0,0.0,0.0,0.0,0.037722,-0.026277,-0.083610,0.039702,0.023920,...,0,0,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5852,1.0,0.0,1.0,1.0,0.0,0.065700,0.023771,-0.040222,0.080796,0.017857,...,0,0,0,0,0,0,1,0,0,0
6049,1.0,1.0,0.0,1.0,0.0,0.066927,0.001807,-0.106845,0.060860,0.048954,...,1,0,0,0,0,0,1,0,0,0
2743,0.0,0.0,0.0,1.0,0.0,0.037722,-0.026277,-0.083610,0.039702,0.023920,...,1,0,0,0,0,0,1,0,0,0
5657,1.0,1.0,1.0,0.0,0.0,0.031791,-0.056728,0.004739,0.063229,0.024619,...,0,1,0,0,0,0,1,0,0,0


In [34]:
ridge_model.coef_[:5]

array([ 3965.47412616, -1529.52186678,  -488.94300504, -7983.40342218,
        2580.36032187])

In [28]:
y = df_full['salary_to']
X = df_full.drop(columns=['salary_from', 'salary_to'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2023)

In [29]:
ridge_model = Ridge(alpha=0.7).fit(X_train, y_train)
y_pred = ridge_model.predict(X_test)
print('MSE = ', mean_squared_error(y_test, y_pred))
print('R2 = ', r2_score(y_test, y_pred))
print('MAPE = ', mean_absolute_percentage_error(y_test, y_pred))

MSE =  2043104062.6735604
R2 =  0.4120364960529318
MAPE =  0.3246194750922604


In [41]:
y = df_test['salary_from']
X = df_test.drop(columns=['salary_from', 'salary_to'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2023)

ridge_model = Ridge(alpha=0.7).fit(X_train, y_train)
y_pred = ridge_model.predict(X_test)
print('MSE = ', mean_squared_error(y_test, y_pred))
print('R2 = ', r2_score(y_test, y_pred))
print('MAPE = ', mean_absolute_percentage_error(y_test, y_pred))

X_train

MSE =  1183502809.0640538
R2 =  0.2981322687644262
MAPE =  0.3373951909884348


Unnamed: 0,грузоперевозка,документооборот,пассажиры,ремонт,погрузка,schedule_вахта,schedule_полный рабочий день,schedule_свободный график,schedule_сменный график,schedule_удаленная работа,schedule_частичная занятость,education_name_высшее,education_name_любое,education_name_неполное высшее,education_name_среднее,education_name_среднее профессиональное
7150,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,1,0,0,0
374,0.0,0.0,0.0,0.0,0.0,0,0,0,1,0,0,0,1,0,0,0
5344,1.0,0.0,0.0,1.0,0.0,1,0,0,0,0,0,0,1,0,0,0
8520,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,1,0,0,0
5505,0.0,0.0,0.0,0.0,0.0,0,0,0,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5852,1.0,0.0,1.0,1.0,0.0,1,0,0,0,0,0,0,1,0,0,0
6049,1.0,1.0,0.0,1.0,0.0,0,1,0,0,0,0,0,1,0,0,0
2743,0.0,0.0,0.0,1.0,0.0,0,1,0,0,0,0,0,1,0,0,0
5657,1.0,1.0,1.0,0.0,0.0,0,0,1,0,0,0,0,1,0,0,0


In [42]:
ridge_model.coef_[0:5]


array([  9581.15125953,  -2102.45954297,  -2303.68407277, -10092.12468251,
       -14167.01048749])