In [30]:
import ast

import pandas as pd
import numpy as np

import fasttext
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (mean_absolute_percentage_error,
                             mean_squared_error, r2_score)
from sklearn.neighbors import NearestNeighbors

from sklearn.linear_model import Ridge

In [2]:
dataset = pd.read_csv('../data/data_vacancies.csv')
df = dataset[[
    'city_id', 
    'custom_position', 
    'schedule', 
    'salary_from', 
    'salary_to', 
    'education_name', 
    'work_skills'
    ]]
top5_cityids = df.city_id.value_counts(normalize=True).nlargest(5).keys()
df = df[df.city_id.isin(top5_cityids)]
df = df.reset_index().drop('index', axis=1)
df.work_skills = df.work_skills.apply(ast.literal_eval)
df

Unnamed: 0,city_id,custom_position,schedule,salary_from,salary_to,education_name,work_skills
0,2,Сварщик-сборщик,полный рабочий день,60000,120000,любое,"[сварочные работы, сборка изделий по чертежам,..."
1,2,Сварщик-монтажник,полный рабочий день,60000,120000,любое,"[монтажные работы, строительные работы, электр..."
2,2,Слесарь-сборщик,полный рабочий день,60000,80000,любое,"[работа на фрезерных станках, слесарный ремонт..."
3,1,Грузчик-упаковщик,частичная занятость,30000,35000,любое,"[комплектация товара, маркировка, стрессоустой..."
4,57,Грузчик-упаковщик,частичная занятость,30000,35000,любое,"[маркировка, стрессоустойчивость, погрузочно-р..."
...,...,...,...,...,...,...,...
18410,1,Кладовщик,полный рабочий день,45000,70000,среднее профессиональное,"[комплектация заказов, работа с документацией,..."
18411,1,Кассир,сменный график,35000,58000,любое,"[ответственность, контроль срока годности, раб..."
18412,1,Инженер по медицинской технике,полный рабочий день,77000,77000,высшее,"[уверенный пользователь ПК, ремонт оборудовани..."
18413,2,Автомеханик-автослесарь,полный рабочий день,80000,120000,любое,"[устройство автомобилей, ремонт тормозной сист..."


In [3]:
ft = fasttext.load_model('../data/cc.ru.300.bin')
ft.get_dimension()



300

In [4]:
positions_vec = df.custom_position.apply(lambda x: ft.get_sentence_vector(x))
df_positions = pd.DataFrame(positions_vec.to_numpy().tolist(), columns=[f'position_{i}' for i in range(len(positions_vec.values[0]))])
df_positions

Unnamed: 0,position_0,position_1,position_2,position_3,position_4,position_5,position_6,position_7,position_8,position_9,...,position_290,position_291,position_292,position_293,position_294,position_295,position_296,position_297,position_298,position_299
0,0.050771,-0.007349,-0.076555,-0.006560,0.027061,0.013700,0.014645,-0.049583,-0.011111,0.002839,...,0.053989,0.031275,-0.020565,0.062088,0.137415,-0.044907,0.055137,0.052176,-0.007833,-0.067442
1,0.036317,-0.028934,-0.049667,0.019598,0.004137,-0.005131,0.031874,-0.085778,0.033659,-0.018037,...,0.083372,-0.015954,-0.027596,0.074016,0.126700,-0.024558,0.042158,0.053539,0.004708,-0.052585
2,0.069421,-0.014271,-0.063945,-0.014081,0.041252,0.061979,-0.000641,-0.047326,-0.018930,-0.011699,...,0.066686,0.063099,0.013671,0.051973,0.102922,-0.009312,0.027799,0.040270,-0.017081,-0.015839
3,0.014442,0.030691,-0.065059,-0.012545,0.040037,-0.007176,0.062810,0.032525,-0.002185,0.085377,...,0.072154,0.034601,-0.024409,0.024460,0.088771,0.030615,0.020678,0.028920,0.060850,-0.051959
4,0.014442,0.030691,-0.065059,-0.012545,0.040037,-0.007176,0.062810,0.032525,-0.002185,0.085377,...,0.072154,0.034601,-0.024409,0.024460,0.088771,0.030615,0.020678,0.028920,0.060850,-0.051959
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18410,0.049371,-0.056621,-0.043403,-0.099676,0.044521,0.013236,0.002104,0.045774,-0.047097,0.071743,...,0.012399,-0.019462,-0.032409,0.024379,0.171492,-0.009100,-0.053378,0.013966,0.035721,0.041110
18411,0.105399,-0.119790,0.034709,-0.068842,0.085813,-0.065957,0.015720,-0.084088,-0.008549,0.048369,...,0.041495,0.024418,-0.069329,0.021259,0.159714,0.085343,-0.088587,0.029054,0.051901,0.057364
18412,0.038625,-0.005139,-0.013067,0.012266,0.008623,-0.003373,-0.006893,0.005935,0.005261,-0.014566,...,0.024653,0.003901,0.011163,-0.001966,0.028986,-0.017348,0.018914,0.012256,-0.004895,0.021321
18413,0.062741,-0.060041,0.023153,0.050334,-0.043190,-0.072364,0.025282,0.045926,-0.041479,0.002487,...,0.070804,0.053139,-0.069804,-0.016172,0.033209,0.045326,0.022228,0.030130,0.017900,-0.024232


In [5]:
salary_from = df.salary_from
X_train, X_test, y_train, y_test = train_test_split(
    df_positions, salary_from, test_size=0.2, random_state=1)

In [6]:
knn_regressor = KNeighborsRegressor(n_neighbors=5)
knn_regressor.fit(X_train, y_train)
y_pred = knn_regressor.predict(X_test)
print('R2 = ', r2_score(y_test, y_pred))
print('MAPE = ', mean_absolute_percentage_error(y_test, y_pred))

R2 =  0.4545597538759243
MAPE =  0.22790182513976298


In [7]:
knn = NearestNeighbors(n_neighbors=6, algorithm='brute')  # 6, чтобы включить 5 ближайших соседей и сам объект

knn.fit(df_positions)

distances, indices = knn.kneighbors(df_positions)

# Исключите сам объект из списка соседей
neighbor_indices = indices[:, 1:]
# Усредните таргеты соседей для каждого объекта
predicted_targets = np.mean(salary_from.to_numpy()[neighbor_indices], axis=1)

print("Predicted Targets:", predicted_targets)

Predicted Targets: [70000. 78000. 57000. ... 66800. 90000. 90000.]


In [8]:
print('R2 = ', r2_score(salary_from, predicted_targets))
print('MAPE = ', 
      mean_absolute_percentage_error(salary_from, predicted_targets))

R2 =  0.4851637362935204
MAPE =  0.22215735988268012


In [9]:
df['knn_salary_from'] = predicted_targets
df[['salary_from', 'knn_salary_from']]

Unnamed: 0,salary_from,knn_salary_from
0,60000,70000.0
1,60000,78000.0
2,60000,57000.0
3,30000,41600.0
4,30000,41600.0
...,...,...
18410,45000,56100.0
18411,35000,75368.0
18412,77000,66800.0
18413,80000,90000.0


In [10]:
df = df.drop('custom_position', axis=1)
df

Unnamed: 0,city_id,schedule,salary_from,salary_to,education_name,work_skills,knn_salary_from
0,2,полный рабочий день,60000,120000,любое,"[сварочные работы, сборка изделий по чертежам,...",70000.0
1,2,полный рабочий день,60000,120000,любое,"[монтажные работы, строительные работы, электр...",78000.0
2,2,полный рабочий день,60000,80000,любое,"[работа на фрезерных станках, слесарный ремонт...",57000.0
3,1,частичная занятость,30000,35000,любое,"[комплектация товара, маркировка, стрессоустой...",41600.0
4,57,частичная занятость,30000,35000,любое,"[маркировка, стрессоустойчивость, погрузочно-р...",41600.0
...,...,...,...,...,...,...,...
18410,1,полный рабочий день,45000,70000,среднее профессиональное,"[комплектация заказов, работа с документацией,...",56100.0
18411,1,сменный график,35000,58000,любое,"[ответственность, контроль срока годности, раб...",75368.0
18412,1,полный рабочий день,77000,77000,высшее,"[уверенный пользователь ПК, ремонт оборудовани...",66800.0
18413,2,полный рабочий день,80000,120000,любое,"[устройство автомобилей, ремонт тормозной сист...",90000.0


In [11]:
def summed_skill_embedding(skills: list[str]):
    sentence_vectors = []

    for skill in skills:
        sentence_vector = ft.get_sentence_vector(skill)
        sentence_vectors.append(sentence_vector)

    if sentence_vectors:
        average_vector = np.sum(sentence_vectors, axis=0)
        return average_vector
    else:
        return None

In [12]:
skills_vec = df.work_skills.apply(summed_skill_embedding)
df_skills = pd.DataFrame(skills_vec.to_numpy().tolist(), columns=[f'skill_{i}' for i in range(len(skills_vec.values[0]))])
df_skills

Unnamed: 0,skill_0,skill_1,skill_2,skill_3,skill_4,skill_5,skill_6,skill_7,skill_8,skill_9,...,skill_290,skill_291,skill_292,skill_293,skill_294,skill_295,skill_296,skill_297,skill_298,skill_299
0,0.496555,0.057306,0.114850,0.554914,0.581040,-0.497559,0.433427,-0.062471,0.048226,-0.003185,...,0.423141,0.043040,-0.156141,0.142049,0.207225,-0.465708,0.666333,-0.104790,-0.446345,-0.415346
1,0.455160,0.154789,0.101754,0.230861,0.348718,-0.549088,0.346809,-0.110286,0.211650,-0.084932,...,0.410208,-0.035330,-0.090265,0.073296,0.218671,-0.468301,0.724818,-0.133032,-0.569143,-0.250594
2,0.089320,0.113591,0.293412,0.179086,0.579596,-0.694003,0.370526,0.301742,-0.059800,0.090789,...,0.273359,-0.047166,-0.048604,-0.087584,0.319142,-0.172678,0.135131,-0.214116,-0.392513,-0.209285
3,1.073623,-0.301291,0.523090,-0.139433,0.729587,-0.235452,0.891929,0.976602,-0.491665,0.863185,...,-0.273604,-0.659561,0.588310,-0.244965,0.343399,0.113444,0.010813,-0.179356,-0.890568,-0.545186
4,1.073623,-0.301291,0.523090,-0.139433,0.729587,-0.235452,0.891929,0.976602,-0.491665,0.863185,...,-0.273604,-0.659561,0.588310,-0.244965,0.343399,0.113444,0.010813,-0.179356,-0.890568,-0.545186
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18410,0.098419,-0.073784,0.128757,-0.077470,0.119577,-0.116098,0.045353,0.167039,-0.062040,0.100396,...,-0.100881,-0.067481,0.080198,-0.080725,-0.019916,-0.088795,-0.035186,-0.160902,-0.110632,-0.046684
18411,0.135777,-0.104747,0.151913,-0.071176,0.228948,0.039386,0.131393,0.144104,-0.177951,-0.024684,...,-0.008580,-0.215827,-0.024955,-0.052108,0.135202,0.018936,-0.009083,0.149588,-0.210943,-0.065488
18412,0.199614,-0.123595,0.122611,0.070004,0.047072,-0.047563,0.159157,0.057610,0.075998,0.089094,...,-0.131467,-0.061653,0.227522,-0.108873,0.027370,-0.058918,0.091712,0.073994,-0.176446,-0.143105
18413,0.215416,-0.123778,0.163658,0.285030,0.112806,-0.432400,0.264421,0.181222,-0.212729,-0.113759,...,-0.054859,-0.023220,0.121892,0.022679,0.032787,-0.031328,0.094059,0.092891,-0.271978,-0.201329


In [34]:
df_full = pd.concat([df, df_skills], axis=1)
df_full = df_full.drop('work_skills', axis=1)
df_full.city_id = df_full.city_id.apply(lambda x: str(x))
one_hot_columns = ['city_id', 'schedule', 'education_name']
df_full = pd.get_dummies(df_full, columns=one_hot_columns, dtype=int)
df_full
# for column in df_full.columns:
#     print(column)

Unnamed: 0,salary_from,salary_to,knn_salary_from,skill_0,skill_1,skill_2,skill_3,skill_4,skill_5,skill_6,...,schedule_полный рабочий день,schedule_свободный график,schedule_сменный график,schedule_удаленная работа,schedule_частичная занятость,education_name_высшее,education_name_любое,education_name_неполное высшее,education_name_среднее,education_name_среднее профессиональное
0,60000,120000,70000.0,0.496555,0.057306,0.114850,0.554914,0.581040,-0.497559,0.433427,...,1,0,0,0,0,0,1,0,0,0
1,60000,120000,78000.0,0.455160,0.154789,0.101754,0.230861,0.348718,-0.549088,0.346809,...,1,0,0,0,0,0,1,0,0,0
2,60000,80000,57000.0,0.089320,0.113591,0.293412,0.179086,0.579596,-0.694003,0.370526,...,1,0,0,0,0,0,1,0,0,0
3,30000,35000,41600.0,1.073623,-0.301291,0.523090,-0.139433,0.729587,-0.235452,0.891929,...,0,0,0,0,1,0,1,0,0,0
4,30000,35000,41600.0,1.073623,-0.301291,0.523090,-0.139433,0.729587,-0.235452,0.891929,...,0,0,0,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18410,45000,70000,56100.0,0.098419,-0.073784,0.128757,-0.077470,0.119577,-0.116098,0.045353,...,1,0,0,0,0,0,0,0,0,1
18411,35000,58000,75368.0,0.135777,-0.104747,0.151913,-0.071176,0.228948,0.039386,0.131393,...,0,0,1,0,0,0,1,0,0,0
18412,77000,77000,66800.0,0.199614,-0.123595,0.122611,0.070004,0.047072,-0.047563,0.159157,...,1,0,0,0,0,1,0,0,0,0
18413,80000,120000,90000.0,0.215416,-0.123778,0.163658,0.285030,0.112806,-0.432400,0.264421,...,1,0,0,0,0,0,1,0,0,0


In [26]:
X = df_full.drop(['salary_from', 'salary_to'], axis=1)
y_from = df_full.salary_from
y_to = df_full.salary_to

In [40]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y_from, test_size=0.2, random_state=1)

In [44]:
ridge_model = Ridge(alpha=1.0).fit(X_train, y_train)
y_pred = ridge_model.predict(X_test)
print('R2 = ', r2_score(y_test, y_pred))
print('MAPE = ', mean_absolute_percentage_error(y_test, y_pred))

R2 =  0.5162767752579538
MAPE =  0.214341180549193


In [48]:
ridge_model.coef_[0] # Коэф. для knn_salary_from

0.7086314498557752

In [49]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y_to, test_size=0.2, random_state=1)

In [52]:
ridge_model = Ridge(alpha=1).fit(X_train, y_train)
y_pred = ridge_model.predict(X_test)
print('R2 = ', r2_score(y_test, y_pred))
print('MAPE = ', mean_absolute_percentage_error(y_test, y_pred))

R2 =  0.4523618990213689
MAPE =  0.28124900694261573


In [53]:
ridge_model.coef_[0] # Коэф. для knn_salary_from

0.9966645180744976