In [1]:
import ast

import fasttext
import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import (mean_absolute_percentage_error,
                             mean_squared_error, r2_score)
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw vacancies table and preview its first rows.
vacancies_csv = '../data/data_vacancies.csv'
dataset = pd.read_csv(vacancies_csv)
dataset.head()

Unnamed: 0,id,custom_position,schedule,salary_from,salary_to,salary_pay_type,offer_education_id,education_name,education_is_base,education_order_num,city_id,list_regions,work_skills,tags_id
0,48202096,Сварщик-сборщик,полный рабочий день,60000,120000,,0,любое,True,0,2,[4],"['сварочные работы', 'сборка изделий по чертеж...",
1,48202097,Сварщик-монтажник,полный рабочий день,60000,120000,,0,любое,True,0,2,[4],"['монтажные работы', 'строительные работы', 'э...",
2,48202098,Слесарь-сборщик,полный рабочий день,60000,80000,,0,любое,True,0,2,[4],"['работа на фрезерных станках', 'слесарный рем...",
3,48202356,Грузчик-упаковщик,частичная занятость,30000,35000,,0,любое,True,0,1,[3],"['комплектация товара', 'маркировка', 'стрессо...","[6, 9]"
4,48202357,Грузчик-упаковщик,частичная занятость,30000,35000,,0,любое,True,0,57,"[181, 182, 183, 185, 186, 187, 188, 189, 190, ...","['маркировка', 'стрессоустойчивость', 'погрузо...","[6, 9]"


In [3]:
# Columns kept for the baseline: city, job title, schedule, both salary
# bounds, required education and the stringified list of skills.
baseline_columns = [
    'city_id',
    'custom_position',
    'schedule',
    'salary_from',
    'salary_to',
    'education_name',
    'work_skills',
]
df = dataset[baseline_columns]

In [4]:
# Restrict the data to the five most frequent cities and report how many
# rows remain after the filter.
city_shares = df['city_id'].value_counts(normalize=True)
top5_cityids = city_shares.nlargest(5).keys()
print(top5_cityids)
print('Все регионы', len(df))
df = df[df['city_id'].isin(top5_cityids)]
print('Топ 5 регионы', len(df))
# Renumber rows 0..n-1 and discard the old index in a single step.
df = df.reset_index(drop=True)
df

Index([1, 57, 2, 102, 174], dtype='int64', name='city_id')
Все регионы 19489
Топ 5 регионы 18415


Unnamed: 0,city_id,custom_position,schedule,salary_from,salary_to,education_name,work_skills
0,2,Сварщик-сборщик,полный рабочий день,60000,120000,любое,"['сварочные работы', 'сборка изделий по чертеж..."
1,2,Сварщик-монтажник,полный рабочий день,60000,120000,любое,"['монтажные работы', 'строительные работы', 'э..."
2,2,Слесарь-сборщик,полный рабочий день,60000,80000,любое,"['работа на фрезерных станках', 'слесарный рем..."
3,1,Грузчик-упаковщик,частичная занятость,30000,35000,любое,"['комплектация товара', 'маркировка', 'стрессо..."
4,57,Грузчик-упаковщик,частичная занятость,30000,35000,любое,"['маркировка', 'стрессоустойчивость', 'погрузо..."
...,...,...,...,...,...,...,...
18410,1,Кладовщик,полный рабочий день,45000,70000,среднее профессиональное,"['комплектация заказов', 'работа с документаци..."
18411,1,Кассир,сменный график,35000,58000,любое,"['ответственность', 'контроль срока годности',..."
18412,1,Инженер по медицинской технике,полный рабочий день,77000,77000,высшее,"['уверенный пользователь ПК', 'ремонт оборудов..."
18413,2,Автомеханик-автослесарь,полный рабочий день,80000,120000,любое,"['устройство автомобилей', 'ремонт тормозной с..."


Видно, что мы теряем всего примерно 5% датасета. Думаю, на этапе бейзлайна нас это устраивает.

In [5]:
# The 20 most frequent job titles.
df['custom_position'].value_counts().nlargest(20)

custom_position
Продавец-кассир                    409
Менеджер по продажам               283
Продавец-консультант               238
Курьер                             193
Охранник                           134
Повар                              130
Разнорабочий                       127
Водитель по доставке документов    118
Грузчик                            118
Комплектовщик                      112
Работник торгового зала            105
Продавец                            96
Менеджер по работе с клиентами      95
Кладовщик                           95
Мерчандайзер-грузчик                94
Водитель-экспедитор                 87
Оператор входящих звонков           82
Копирайтер                          75
Кассир                              71
Швея                                70
Name: count, dtype: int64

In [6]:
# The 20 least frequent job titles.
df['custom_position'].value_counts().nsmallest(20)

custom_position
Бариста в торговую зону                                      1
Инженер по 3d-моделированию (моделирования для 3d-печати)    1
Врач травматолог-ортопед                                     1
Мастер по установке бытовой техники                          1
Врач мануальный терапевт                                     1
Менеджер по программам страхования (удаленно)                1
Специалист по уборке прилегающей территории                  1
Домработница вахта                                           1
Повар (м. Митино)                                            1
Сборщик интернет заказов                                     1
Консультант на входящие обращения                            1
Специалист ввода данных/редактор текста (удаленно)           1
Специалист ввода данных/сортировщик заявок (удаленно)        1
Упаковщик на склад одежды                                    1
Водитель экскаватора-погрузчика                              1
Сотрудник ввода данных и приема заявок 

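[Скачал фасттекст здесь](https://fasttext.cc/docs/en/pretrained-vectors.html) (модель `cc.ru.300.bin`, используемая ниже, опубликована на странице [crawl-vectors](https://fasttext.cc/docs/en/crawl-vectors.html))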

In [7]:
# Load the pretrained fastText model from disk and show its vector size.
fasttext_model_path = '../data/cc.ru.300.bin'
ft = fasttext.load_model(fasttext_model_path)
ft.get_dimension()



300

In [8]:
# Take the most frequent job title and look at its fastText word vector.
most_common_positions = df['custom_position'].value_counts().nlargest(20)
seller = most_common_positions.keys()[0]
print(seller)
ft.get_word_vector(seller)

Продавец-кассир


array([ 0.0306605 , -0.01950333,  0.02881816, -0.06943367, -0.02349308,
       -0.03814335,  0.02185529,  0.008966  , -0.00766545,  0.0434695 ,
       -0.00082192,  0.02592272, -0.01957024,  0.03838321, -0.03806059,
        0.00613764,  0.01588907,  0.00906658, -0.03683465,  0.02239164,
        0.01028472, -0.06023714, -0.02575636,  0.01723426,  0.01511259,
       -0.00232358, -0.01489399, -0.02019401,  0.05304326,  0.06486466,
        0.02080936, -0.05129497, -0.00389587,  0.003278  , -0.01736712,
       -0.01042855,  0.01571086, -0.03753878,  0.00104838, -0.00619643,
       -0.01720199, -0.00244347, -0.06890276, -0.00073298, -0.03587189,
        0.04738117,  0.00839593,  0.0012039 ,  0.05042906,  0.04763807,
        0.03691753,  0.0413822 ,  0.03577876,  0.00641947, -0.03715985,
        0.02430377, -0.0004829 ,  0.02820999, -0.00910002, -0.0629483 ,
       -0.00434958,  0.03800977, -0.03776591,  0.00634689, -0.01462634,
        0.00818167,  0.03072939,  0.01398441, -0.05762712,  0.01

In [9]:
# Nearest neighbours of the lower-cased title in the embedding space.
seller_lowercase = seller.lower()
ft.get_nearest_neighbors(seller_lowercase)

[(0.8002034425735474, 'Продавец-кассир'),
 (0.7729693651199341, 'продавец-консультант'),
 (0.7511923909187317, 'кассир-продавец'),
 (0.7381469011306763, 'продавец-'),
 (0.7343067526817322, 'Продавец-консультант'),
 (0.7251340746879578, 'бухгалтер-кассир'),
 (0.7235550880432129, 'оператор-кассир'),
 (0.7220218777656555, 'Кассир-продавец'),
 (0.6989647150039673, 'кассир-операционист'),
 (0.6903725266456604, 'кассир')]

In [10]:
# Inspect the first vacancy: its title and its skills. `work_skills` is
# stored as the string form of a Python list, so it is parsed with
# ast.literal_eval before use.
first_vacancy = df.iloc[0]
postion = first_vacancy.custom_position
skills = ast.literal_eval(first_vacancy.work_skills)

for inspected in (postion, skills):
    print(type(inspected))
print(postion)
print(skills)

<class 'str'>
<class 'list'>
Сварщик-сборщик
['сварочные работы', 'сборка изделий по чертежам', 'ручная дуговая сварка', 'электродуговая сварка', 'аргонодуговая сварка', 'автоматическая  сварка', 'сварка на полуавтомате', 'изготовление металлоконструкций', 'чтение чертежей', 'сборка металлоконструкций', 'работа с электроинструментом']


In [11]:
# Cosine distance between the job-title vector and each skill vector.
position_vec = ft.get_sentence_vector(postion)

print("Косинусное расстояние:")
for skill_vec in map(ft.get_sentence_vector, skills):
    print(cosine(skill_vec, position_vec))

Косинусное расстояние:
0.6375634074211121
0.641302227973938
0.6166938841342926
0.594580203294754
0.5893785655498505
0.6758362948894501
0.6010209023952484
0.5351738333702087
0.7861718684434891
0.5309136509895325
0.75563845038414


In [12]:
# Compare a single skill with a single job title.
# scipy's `cosine` returns the cosine *distance*; `1 - distance` printed
# below is therefore the cosine *similarity*, and the label says so.
# Dedicated names are used so that `position_vec` from the previous cell
# is not overwritten.
python_skill_vec = ft.get_sentence_vector('python')
programmer_vec = ft.get_sentence_vector('программист')

cosine_distance = cosine(python_skill_vec, programmer_vec)
print("Косинусная близость:")
print(1 - cosine_distance)

# Illustrates a problem with some tokens: the nearest neighbours returned
# for 'c++' are unrelated strings.
ft.get_nearest_neighbors('c++')

Косинусное расстояние:
0.40474915504455566


[(0.3940935432910919, 'РИСКОВАТЬ'),
 (0.3654024302959442, 'разряда-4'),
 (0.36446481943130493, 'rabotarabotaidinafedota.ru'),
 (0.36392560601234436,
  'STRATTONChampionEndressEuropowerFubagGeneracHuterHyundaiKiporMirkon'),
 (0.36349737644195557, 'StrattonChampionGreenfieldHondaKohler'),
 (0.3557996153831482, 'разряда-6'),
 (0.35346558690071106, 'Красносельскстройматериалы'),
 (0.35338646173477173, 'ПОДЪЕМНО-ТРАНСПОРТНОЕ'),
 (0.35280171036720276, 'ПСКОВА'),
 (0.3519258499145508,
  'RSSаварияавтоармияАртвидеогифкиГифки-анимашкиГрубасДевочкидевушкиДемотиваторыдетидтпдуматьдушевножестьживотныежизненнозагадкаинтереснойошкар-олакартинкакартинкикинокомикскомиксыкоткотикикотятакрасивокрасотакреативмарий')]

In [13]:
def _parse_skills(raw):
    """Parse a stringified Python list; return already-parsed values as is."""
    return ast.literal_eval(raw) if isinstance(raw, str) else raw


# Convert the stringified skill lists into real Python lists. Values that
# are no longer strings are passed through unchanged, so re-running this
# cell does not raise on already-parsed lists.
df['work_skills'] = df['work_skills'].apply(_parse_skills)
df

Unnamed: 0,city_id,custom_position,schedule,salary_from,salary_to,education_name,work_skills
0,2,Сварщик-сборщик,полный рабочий день,60000,120000,любое,"[сварочные работы, сборка изделий по чертежам,..."
1,2,Сварщик-монтажник,полный рабочий день,60000,120000,любое,"[монтажные работы, строительные работы, электр..."
2,2,Слесарь-сборщик,полный рабочий день,60000,80000,любое,"[работа на фрезерных станках, слесарный ремонт..."
3,1,Грузчик-упаковщик,частичная занятость,30000,35000,любое,"[комплектация товара, маркировка, стрессоустой..."
4,57,Грузчик-упаковщик,частичная занятость,30000,35000,любое,"[маркировка, стрессоустойчивость, погрузочно-р..."
...,...,...,...,...,...,...,...
18410,1,Кладовщик,полный рабочий день,45000,70000,среднее профессиональное,"[комплектация заказов, работа с документацией,..."
18411,1,Кассир,сменный график,35000,58000,любое,"[ответственность, контроль срока годности, раб..."
18412,1,Инженер по медицинской технике,полный рабочий день,77000,77000,высшее,"[уверенный пользователь ПК, ремонт оборудовани..."
18413,2,Автомеханик-автослесарь,полный рабочий день,80000,120000,любое,"[устройство автомобилей, ремонт тормозной сист..."


In [14]:
def average_sentence_embedding(skills: list[str]) -> "np.ndarray | None":
    """Average the fastText sentence vectors of a list of skill phrases.

    Each phrase is embedded with ``ft.get_sentence_vector`` (``ft`` is the
    module-level fastText model) and the resulting vectors are averaged
    element-wise.

    Args:
        skills: Skill phrases of a single vacancy.

    Returns:
        The element-wise mean of the phrase vectors, or ``None`` when
        ``skills`` is empty.
    """
    # fastText's get_sentence_vector raises ValueError on text containing a
    # newline, so embedded line breaks are replaced with spaces first.
    sentence_vectors = [
        ft.get_sentence_vector(skill.replace('\n', ' ')) for skill in skills
    ]

    if not sentence_vectors:
        return None
    return np.mean(sentence_vectors, axis=0)
    

In [15]:
# One averaged skill embedding per vacancy.
skills_vec = df['work_skills'].apply(average_sentence_embedding)
skills_vec

0        [0.045141395, 0.005209608, 0.0104408935, 0.050...
1        [0.045515973, 0.015478872, 0.0101753725, 0.023...
2        [0.009924493, 0.012621231, 0.032601323, 0.0198...
3        [0.056506485, -0.015857443, 0.02753105, -0.007...
4        [0.056506485, -0.015857443, 0.02753105, -0.007...
                               ...                        
18410    [0.032806203, -0.024594657, 0.04291911, -0.025...
18411    [0.027155465, -0.020949401, 0.030382698, -0.01...
18412    [0.049903445, -0.030898858, 0.030652665, 0.017...
18413    [0.03077372, -0.017682595, 0.023379734, 0.0407...
18414    [0.03077372, -0.017682595, 0.023379734, 0.0407...
Name: work_skills, Length: 18415, dtype: object

In [16]:
# Expand the per-row embedding arrays into one numeric column per dimension.
# The source index is passed explicitly so the later column-wise concat
# aligns rows by label instead of relying on both frames having a fresh
# 0..n-1 index.
skill_columns = [f'skill_{i}' for i in range(len(skills_vec.iloc[0]))]
df_skills = pd.DataFrame(
    skills_vec.tolist(),
    index=skills_vec.index,
    columns=skill_columns,
)
df_skills

Unnamed: 0,skill_0,skill_1,skill_2,skill_3,skill_4,skill_5,skill_6,skill_7,skill_8,skill_9,...,skill_290,skill_291,skill_292,skill_293,skill_294,skill_295,skill_296,skill_297,skill_298,skill_299
0,0.045141,0.005210,0.010441,0.050447,0.052822,-0.045233,0.039402,-0.005679,0.004384,-0.000290,...,0.038467,0.003913,-0.014195,0.012914,0.018839,-0.042337,0.060576,-0.009526,-0.040577,-0.037759
1,0.045516,0.015479,0.010175,0.023086,0.034872,-0.054909,0.034681,-0.011029,0.021165,-0.008493,...,0.041021,-0.003533,-0.009027,0.007330,0.021867,-0.046830,0.072482,-0.013303,-0.056914,-0.025059
2,0.009924,0.012621,0.032601,0.019898,0.064400,-0.077111,0.041170,0.033527,-0.006644,0.010088,...,0.030373,-0.005241,-0.005400,-0.009732,0.035460,-0.019186,0.015015,-0.023791,-0.043613,-0.023254
3,0.056506,-0.015857,0.027531,-0.007339,0.038399,-0.012392,0.046944,0.051400,-0.025877,0.045431,...,-0.014400,-0.034714,0.030964,-0.012893,0.018074,0.005971,0.000569,-0.009440,-0.046872,-0.028694
4,0.056506,-0.015857,0.027531,-0.007339,0.038399,-0.012392,0.046944,0.051400,-0.025877,0.045431,...,-0.014400,-0.034714,0.030964,-0.012893,0.018074,0.005971,0.000569,-0.009440,-0.046872,-0.028694
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18410,0.032806,-0.024595,0.042919,-0.025823,0.039859,-0.038699,0.015118,0.055680,-0.020680,0.033465,...,-0.033627,-0.022494,0.026733,-0.026908,-0.006639,-0.029598,-0.011729,-0.053634,-0.036877,-0.015561
18411,0.027155,-0.020949,0.030383,-0.014235,0.045790,0.007877,0.026279,0.028821,-0.035590,-0.004937,...,-0.001716,-0.043165,-0.004991,-0.010422,0.027040,0.003787,-0.001817,0.029918,-0.042189,-0.013098
18412,0.049903,-0.030899,0.030653,0.017501,0.011768,-0.011891,0.039789,0.014402,0.019000,0.022273,...,-0.032867,-0.015413,0.056881,-0.027218,0.006843,-0.014730,0.022928,0.018499,-0.044111,-0.035776
18413,0.030774,-0.017683,0.023380,0.040719,0.016115,-0.061771,0.037774,0.025889,-0.030390,-0.016251,...,-0.007837,-0.003317,0.017413,0.003240,0.004684,-0.004475,0.013437,0.013270,-0.038854,-0.028761


In [17]:
# Embed every job title with fastText (the method is passed directly; no
# lambda wrapper is needed).
positions_vec = df['custom_position'].apply(ft.get_sentence_vector)
print(positions_vec)

# Expand the embeddings into one column per dimension, keeping the source
# index so the later column-wise concat aligns rows by label.
position_columns = [f'position_{i}' for i in range(len(positions_vec.iloc[0]))]
df_positions = pd.DataFrame(
    positions_vec.tolist(),
    index=positions_vec.index,
    columns=position_columns,
)
df_positions

0        [0.050771426, -0.0073487004, -0.07655495, -0.0...
1        [0.03631743, -0.028933693, -0.04966745, 0.0195...
2        [0.069421284, -0.014270519, -0.063944936, -0.0...
3        [0.014441796, 0.030691095, -0.06505858, -0.012...
4        [0.014441796, 0.030691095, -0.06505858, -0.012...
                               ...                        
18410    [0.049370885, -0.056620527, -0.0434035, -0.099...
18411    [0.10539878, -0.119790435, 0.034708977, -0.068...
18412    [0.03862549, -0.005139487, -0.013067292, 0.012...
18413    [0.062741496, -0.06004133, 0.02315333, 0.05033...
18414    [0.062741496, -0.06004133, 0.02315333, 0.05033...
Name: custom_position, Length: 18415, dtype: object


Unnamed: 0,position_0,position_1,position_2,position_3,position_4,position_5,position_6,position_7,position_8,position_9,...,position_290,position_291,position_292,position_293,position_294,position_295,position_296,position_297,position_298,position_299
0,0.050771,-0.007349,-0.076555,-0.006560,0.027061,0.013700,0.014645,-0.049583,-0.011111,0.002839,...,0.053989,0.031275,-0.020565,0.062088,0.137415,-0.044907,0.055137,0.052176,-0.007833,-0.067442
1,0.036317,-0.028934,-0.049667,0.019598,0.004137,-0.005131,0.031874,-0.085778,0.033659,-0.018037,...,0.083372,-0.015954,-0.027596,0.074016,0.126700,-0.024558,0.042158,0.053539,0.004708,-0.052585
2,0.069421,-0.014271,-0.063945,-0.014081,0.041252,0.061979,-0.000641,-0.047326,-0.018930,-0.011699,...,0.066686,0.063099,0.013671,0.051973,0.102922,-0.009312,0.027799,0.040270,-0.017081,-0.015839
3,0.014442,0.030691,-0.065059,-0.012545,0.040037,-0.007176,0.062810,0.032525,-0.002185,0.085377,...,0.072154,0.034601,-0.024409,0.024460,0.088771,0.030615,0.020678,0.028920,0.060850,-0.051959
4,0.014442,0.030691,-0.065059,-0.012545,0.040037,-0.007176,0.062810,0.032525,-0.002185,0.085377,...,0.072154,0.034601,-0.024409,0.024460,0.088771,0.030615,0.020678,0.028920,0.060850,-0.051959
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18410,0.049371,-0.056621,-0.043403,-0.099676,0.044521,0.013236,0.002104,0.045774,-0.047097,0.071743,...,0.012399,-0.019462,-0.032409,0.024379,0.171492,-0.009100,-0.053378,0.013966,0.035721,0.041110
18411,0.105399,-0.119790,0.034709,-0.068842,0.085813,-0.065957,0.015720,-0.084088,-0.008549,0.048369,...,0.041495,0.024418,-0.069329,0.021259,0.159714,0.085343,-0.088587,0.029054,0.051901,0.057364
18412,0.038625,-0.005139,-0.013067,0.012266,0.008623,-0.003373,-0.006893,0.005935,0.005261,-0.014566,...,0.024653,0.003901,0.011163,-0.001966,0.028986,-0.017348,0.018914,0.012256,-0.004895,0.021321
18413,0.062741,-0.060041,0.023153,0.050334,-0.043190,-0.072364,0.025282,0.045926,-0.041479,0.002487,...,0.070804,0.053139,-0.069804,-0.016172,0.033209,0.045326,0.022228,0.030130,0.017900,-0.024232


In [18]:
# Put the original columns and both embedding blocks side by side, then
# drop the two text columns that the embeddings replace.
embedded_text_columns = ['custom_position', 'work_skills']
df_full = (
    pd.concat([df, df_skills, df_positions], axis=1)
    .drop(embedded_text_columns, axis=1)
)

In [19]:
# One-hot encode the categorical columns. `city_id` is numeric, so it is
# converted to strings before encoding.
one_hot_columns = ['city_id', 'schedule', 'education_name']
df_full['city_id'] = df_full['city_id'].astype(str)
df_full = pd.get_dummies(df_full, columns=one_hot_columns, dtype=int)
df_full

Unnamed: 0,salary_from,salary_to,skill_0,skill_1,skill_2,skill_3,skill_4,skill_5,skill_6,skill_7,...,schedule_полный рабочий день,schedule_свободный график,schedule_сменный график,schedule_удаленная работа,schedule_частичная занятость,education_name_высшее,education_name_любое,education_name_неполное высшее,education_name_среднее,education_name_среднее профессиональное
0,60000,120000,0.045141,0.005210,0.010441,0.050447,0.052822,-0.045233,0.039402,-0.005679,...,1,0,0,0,0,0,1,0,0,0
1,60000,120000,0.045516,0.015479,0.010175,0.023086,0.034872,-0.054909,0.034681,-0.011029,...,1,0,0,0,0,0,1,0,0,0
2,60000,80000,0.009924,0.012621,0.032601,0.019898,0.064400,-0.077111,0.041170,0.033527,...,1,0,0,0,0,0,1,0,0,0
3,30000,35000,0.056506,-0.015857,0.027531,-0.007339,0.038399,-0.012392,0.046944,0.051400,...,0,0,0,0,1,0,1,0,0,0
4,30000,35000,0.056506,-0.015857,0.027531,-0.007339,0.038399,-0.012392,0.046944,0.051400,...,0,0,0,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18410,45000,70000,0.032806,-0.024595,0.042919,-0.025823,0.039859,-0.038699,0.015118,0.055680,...,1,0,0,0,0,0,0,0,0,1
18411,35000,58000,0.027155,-0.020949,0.030383,-0.014235,0.045790,0.007877,0.026279,0.028821,...,0,0,1,0,0,0,1,0,0,0
18412,77000,77000,0.049903,-0.030899,0.030653,0.017501,0.011768,-0.011891,0.039789,0.014402,...,1,0,0,0,0,1,0,0,0,0
18413,80000,120000,0.030774,-0.017683,0.023380,0.040719,0.016115,-0.061771,0.037774,0.025889,...,1,0,0,0,0,0,1,0,0,0


In [20]:
# Target: lower salary bound. Both salary columns are removed from the
# features so the model cannot read the answer from `salary_to`.
salary_columns = ['salary_from', 'salary_to']
X = df_full.drop(columns=salary_columns)
y = df_full['salary_from']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=2023,
)

In [21]:
# Baseline: ordinary least-squares linear regression. Despite the "logreg"
# in its name, this is not a logistic regression; the name is kept because
# later cells refer to it.
logreg_model = LinearRegression()
logreg_model.fit(X_train, y_train)

In [22]:
# Score of the linear model on the training split.
train_score = logreg_model.score(X_train, y_train)
train_score

0.4916419440374704

In [23]:
# Evaluate the linear model on the held-out split.
y_pred = logreg_model.predict(X_test)
for metric_label, metric_fn in (
    ('MSE', mean_squared_error),
    ('R2', r2_score),
    ('MAPE', mean_absolute_percentage_error),
):
    print(f'{metric_label} = ', metric_fn(y_test, y_pred))

MSE =  476429296.2413119
R2 =  0.44602533902493335
MAPE =  0.26521265638560443


In [25]:
# Same evaluation for a ridge regression with alpha=0.7.
ridge_model = Ridge(alpha=0.7)
ridge_model.fit(X_train, y_train)
y_pred = ridge_model.predict(X_test)
for metric_label, metric_fn in (
    ('MSE', mean_squared_error),
    ('R2', r2_score),
    ('MAPE', mean_absolute_percentage_error),
):
    print(f'{metric_label} = ', metric_fn(y_test, y_pred))

MSE =  468389088.6629372
R2 =  0.4553742000260068
MAPE =  0.25916687745756856
