# Алгоритм для сопоставления резюме и вакансии [ML TalentMatch]

## Импорт и парсинг данных

In [1]:
import pandas as pd
from datetime import datetime

data_test = pd.read_json('data_test.json')

In [2]:
resumes = ['confirmed_resumes', 'failed_resumes']
for resume in resumes:
  for i in range(len(data_test[resume])):
    for j in range(len(data_test[resume][i])):
      if resume == 'confirmed_resumes':
        data_test[resume][i][j]['passed'] = 1
      else:
        data_test[resume][i][j]['passed'] = 0

for i in range(len(data_test['failed_resumes'])):
  data_test['failed_resumes'][i] += data_test['confirmed_resumes'][i]
data_test.drop('confirmed_resumes', axis=1, inplace=True)
data_test.rename(columns={'failed_resumes': 'resumes'}, inplace=True)

In [3]:
def calculate_experience(start_date_str, end_date_str=None):
    start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
    end_date = datetime.now() if not end_date_str else datetime.strptime(end_date_str, '%Y-%m-%d')
    experience = (end_date - start_date).days / 365
    return round(experience, 1)

def process_data(input_data, good_keys, data_type):
    df = input_data.copy()
    translate_dict = {'about': "О себе",
                      'experienceItem': 'Опыт работы',
                      'educationItem': 'Образование',
                      'key_skills': "Стэк",
                      'uuid': "id",
                      'first_name': "Имя",
                      'last_name': "Фамилия",
                      'country': "Страна",
                      'city': "Город",
                      }
    experience_item_good_keys = ['position', 'description']

    for i, resume_list in enumerate(df['resumes']):
        for j, resume_dict in enumerate(resume_list):
            string = ""
            for key, value in resume_dict.items():
                if key in good_keys and value:
                    string += f"{translate_dict[key]}: "
                    if key == 'experienceItem':
                        for exp_item in resume_dict['experienceItem']:
                            for k, v in exp_item.items():
                                if k in experience_item_good_keys and v:
                                    string += f"{v} "
                                if k == "starts":
                                    num = calculate_experience(v, exp_item.get('ends', ''))
                                    string += f"Время работы: {num} года "
                    elif key == 'educationItem':
                        for edu_item in resume_dict['educationItem']:
                            for k in ['faculty', 'specialty']:
                                if edu_item[k]:
                                    string += f"{edu_item[k]} "
                                    break
                            for k in ['education_type', 'education_level', 'result']:
                                if edu_item[k]:
                                    string += f"{edu_item[k]} "
                                    break
                    else:
                        string += f"{value} "
            if data_type == "test_input":
              df['resumes'][i][j] = [resume_dict['uuid'], string, 1 if resume_dict['passed'] == 1 else 0]
            elif data_type == "result_input":
              df['resumes'][i][j] = [resume_dict['uuid'], string]

    for i, vacancy_dict in enumerate(df['vacancy']):
        string = ""
        for key, value in vacancy_dict.items():
            if key != "uuid":
                if key == 'name' and value:
                    string += f"Название вакансии: {value} "
                elif key == 'keywords' and value:
                    string += f"Стэк: {value} "
                elif value:
                    string += f"{value} "
        df['vacancy'][i] = [vacancy_dict['uuid'], string]

    df = pd.DataFrame(df)

    return df


In [4]:
keys = ['about', 'experienceItem', 'educationItem', 'key_skills']
df = process_data(data_test, keys, "test_input")

In [5]:
df

Unnamed: 0,vacancy,resumes
0,"[779f3a59-206a-3241-adc4-d7db504f960b, Названи...","[[74392e00-ecfb-335b-9fc1-c2652dca06e5, Стэк: ..."
1,"[7a4813fc-43bc-3896-a607-4c8682b01002, Названи...","[[254487e1-81ba-3f2b-9f15-eba98d891efc, Стэк: ..."
2,"[c03085c3-9b1e-3564-bb1e-59aa72e5fbca, Названи...","[[8746a855-022c-34d4-9b55-58da5483c255, О себе..."
3,"[a8dd83c3-178d-3c70-90c2-7c3648f6b96a, Названи...","[[557c9b5b-9707-360b-bb1f-18c3c1b94391, О себе..."
4,"[9d98eba0-13bb-38d3-b742-4fd445954b3d, Названи...","[[821b6466-f3e2-37c9-b44f-676d91bde045, О себе..."
5,"[4e2299c0-13fc-301d-8f3c-3ccfd0281ce6, Названи...","[[fa3c0058-5e01-3908-8aa0-148b57913695, Стэк: ..."
6,"[a8f56ed3-3ef3-365d-ade4-2df4db5d4af8, Названи...","[[222fd583-7cb5-3d68-9dbe-54eba687f1c5, Стэк: ..."
7,"[cdbe4d64-991a-35a1-8b71-a05a5ce24123, Названи...","[[7f3113b5-01a9-3dc0-b10c-0b513f443790, Стэк: ..."
8,"[7eca3dc1-2108-3152-85be-cd0ee7aa1493, Названи...","[[66b942de-5351-3721-a35b-3d008127a248, О себе..."
9,"[01713376-e04d-3f9d-9287-7a5ff74918c3, Названи...","[[75ff2b68-a4d2-357e-955f-094c47fb741c, Стэк: ..."


In [6]:
def remove_empty_str(x):
    return list(filter(lambda y: y[1], x))

df['resumes'] = df.resumes.apply(remove_empty_str)

In [7]:
sentences = []
for k in range(29):
    sentence = []
    for i in range(len(df['resumes'][k])):
        sentence.append(df['resumes'][k][i][1])
    sentence.append(df['vacancy'][k][1])
    sentences.append(sentence)
k = 0
for i in range(29):
    k += len(sentences[i])
print(k)

684


In [8]:
jobs_vector = []

## Преобразование данных с помощью SBERT

In [9]:
#!pip install -U sentence-transformers

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

from transformers import AutoTokenizer, AutoModel
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask
#Sentences we want sentence embeddings for
#Load AutoModel from huggingface model repository
tokenizer = AutoTokenizer.from_pretrained("ai-forever/sbert_large_mt_nlu_ru")
model = AutoModel.from_pretrained("ai-forever/sbert_large_mt_nlu_ru")
sentence_embeddings = []
for k in range(29): 
#Tokenize sentences
    encoded_input = tokenizer(sentences[k], padding=True, truncation=True, max_length=512, return_tensors='pt')
    #Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)
    #Perform pooling. In this case, mean pooling
    sentence_embedding = mean_pooling(model_output, encoded_input['attention_mask'])
    sentence_embeddings.append(sentence_embedding[:-1])## Добавили без вакансий
    jobs_vector.append(sentence_embedding[-1]) ## наполняем вектор вакансий

In [13]:
data_st = pd.DataFrame()

In [14]:
values = []
vecs = []

for k in range(29):
    for j in range(sentence_embeddings[k].shape[0]):
        vecs.append(np.array(sentence_embeddings[k][j]))
print(len(vecs))
for k in range(29):
    for j in range(sentence_embeddings[k].shape[0]):
        values.append(df['resumes'][k][j][2])
print(len(values))

655
655


In [15]:
data_st['vecs'] = vecs
data_st['vals'] = values
data_st

Unnamed: 0,vecs,vals
0,"[0.7732333, 0.289302, -1.2152073, -1.4086607, ...",0
1,"[0.031669363, 0.30589706, -0.90228844, -1.4447...",0
2,"[0.5407877, 0.2600664, -1.3797959, -2.1094708,...",0
3,"[0.6904063, 0.3117696, -0.8063593, -1.5944325,...",0
4,"[0.34903842, 0.30436012, -1.1898199, -1.492681...",0
...,...,...
650,"[0.529927, 0.3030882, -1.2093245, -2.1595325, ...",1
651,"[0.5190171, 0.33838537, -1.1467922, -1.3140029...",1
652,"[0.35875368, 0.16959004, -0.87094903, -1.05390...",1
653,"[0.8259417, 0.39573658, -1.2202284, -1.620497,...",1


## Обучение моделей

In [17]:
X = np.array(vecs)
y = np.array(values)

In [18]:
type(X)

numpy.ndarray

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

### Логистическая регрессия

In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

class LogisticRegressionModel:
    def __init__(self, X_train, y_train, X_test, y_test):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.model = None
        self.best_params = None
        self.accuracy = None

    def train(self, param_grid):
        model = LogisticRegression()
        grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
        grid_search.fit(self.X_train, self.y_train)
        self.model = grid_search.best_estimator_
        self.best_params = grid_search.best_params_

    def evaluate(self):
        y_pred = self.model.predict(self.X_test)
        self.accuracy = accuracy_score(self.y_test, y_pred)
        print("Best parameters:", self.best_params)
        print(f'Accuracy: {self.accuracy}')
        print(classification_report(self.y_test, y_pred))

    def predict(self, X_new):
        return self.model.predict(X_new)

# Пример использования
param_grid = {
    'penalty': ['l1', 'l2'],  # Регуляризация: l1 - Lasso, l2 - Ridge
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Обратная сила регуляризации
    'solver': ['liblinear', 'saga']  # Алгоритм оптимизации
}

# Создание экземпляра модели и обучение
log_reg_model = LogisticRegressionModel(X_train, y_train, X_test, y_test)
log_reg_model.train(param_grid)

# Оценка модели
log_reg_model.evaluate()




Best parameters: {'C': 0.001, 'penalty': 'l1', 'solver': 'liblinear'}
Accuracy: 0.7633587786259542
              precision    recall  f1-score   support

           0       0.76      1.00      0.87       100
           1       0.00      0.00      0.00        31

    accuracy                           0.76       131
   macro avg       0.38      0.50      0.43       131
weighted avg       0.58      0.76      0.66       131



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Метод опорных векторов

In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

class SVMModel:
    def __init__(self, X_train, y_train, X_test, y_test):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.model = None
        self.best_params = None
        self.accuracy = None

    def train(self, param_grid, cv=5):
        svm = SVC()
        grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=cv)
        grid_search.fit(self.X_train, self.y_train)
        self.model = grid_search.best_estimator_
        self.best_params = grid_search.best_params_

    def evaluate(self):
        y_pred = self.model.predict(self.X_test)
        self.accuracy = accuracy_score(self.y_test, y_pred)
        print("Best parameters:", self.best_params)
        print(f'Accuracy: {self.accuracy}')
        print(classification_report(self.y_test, y_pred))

    def predict(self, X_new):
        return self.model.predict(X_new)

# Пример использования для SVM с перебором гиперпараметров
param_grid = {
    'C': [0.1, 1, 10, 100],  
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto']
}

svm_model = SVMModel(X_train, y_train, X_test, y_test)
svm_model.train(param_grid)
svm_model.evaluate()

Best parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'poly'}
Accuracy: 0.7633587786259542
              precision    recall  f1-score   support

           0       0.76      1.00      0.87       100
           1       0.00      0.00      0.00        31

    accuracy                           0.76       131
   macro avg       0.38      0.50      0.43       131
weighted avg       0.58      0.76      0.66       131



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Случайный лес

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

class RandomForestModel:
    def __init__(self, X_train, y_train, X_test, y_test):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.model = None
        self.best_params = None
        self.accuracy = None

    def train(self, param_grid, cv=5):
        model = RandomForestClassifier(random_state=42)
        grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv)
        grid_search.fit(self.X_train, self.y_train)
        self.model = grid_search.best_estimator_
        self.best_params = grid_search.best_params_

    def evaluate(self):
        y_pred = self.model.predict(self.X_test)
        self.accuracy = accuracy_score(self.y_test, y_pred)
        print("Best parameters:", self.best_params)
        print(f'Accuracy: {self.accuracy}')
        print(classification_report(self.y_test, y_pred))

    def predict(self, X_new):
        return self.model.predict(X_new)

# Пример использования для RandomForest с перебором гиперпараметров
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

random_forest_model = RandomForestModel(X_train, y_train, X_test, y_test)
random_forest_model.train(param_grid)
random_forest_model.evaluate()


Best parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
Accuracy: 0.7022900763358778
              precision    recall  f1-score   support

           0       0.75      0.91      0.82       100
           1       0.10      0.03      0.05        31

    accuracy                           0.70       131
   macro avg       0.43      0.47      0.44       131
weighted avg       0.60      0.70      0.64       131



### Градиентный бустинг

### Ансамбль моделей

In [25]:
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Определение базовых моделей
svm_clf = SVC(kernel='poly', C=0.1, gamma='scale', probability=True)
rf_clf = RandomForestClassifier(n_estimators=50, max_depth=None, min_samples_leaf=4, min_samples_split=10)
lr_clf = LogisticRegression(C=0.01, penalty='l2', solver='liblinear')

# Создание ансамбля моделей
ensemble_clf = VotingClassifier(estimators=[('svm', svm_clf), ('rf', rf_clf), ('lr', lr_clf)], voting='soft')

# Обучение ансамбля моделей
ensemble_clf.fit(X_train, y_train)

# Предсказание классов на тестовом наборе
y_pred = ensemble_clf.predict(X_test)

# Оценка точности модели
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7557251908396947


# Проверка работоспособности моделей на тестовых данных

## Считывание новых данных и парсинг

In [26]:
import json

# Открываем файл на чтение
with open('data_result.json', 'r') as file:
    # Загружаем данные из файла
    data = json.load(file)

In [27]:
df2 = pd.DataFrame({'vacancy': [data['vacancy']], 'resumes': [data['resumes']]})

In [28]:
goodKeys = ['about', 'key_skills', 'experienceItem', 'educationItem']
translateDictionary = {'about': "О себе", 'experienceItem': 'Опыт работы', 'educationItem': 'Образование', 'key_skills': "Стэк"}
experienceItemGoodKeys = ['position', 'description']
resumes = ['resumes']
for resume in resumes:
  for i in range(1):
    for j in range(len(df2[resume][i])):
      string = ""
      for key in df2[resume][i][j].keys():
        if key in goodKeys and df2[resume][i][j][key]:
          string += str(translateDictionary[key]) + ": "
          if key == 'experienceItem':
            for val in df2[resume][i][j]['experienceItem']:
              for k in val.keys():
                if k in experienceItemGoodKeys and val[k]:
                  string += str(val[k]) + " "
                if k == "starts":
                  if val['starts'] and val['ends']:
                    num = str(calculate_experience(val['starts'], val['ends']))
                  elif val['starts']:
                    num = str(calculate_experience(val['starts'], ""))
                  string += "Время работы: " + str(num) + " года "
          elif key == 'educationItem':
            for val in df2[resume][i][j]['educationItem']:
              for k in ['faculty', 'specialty']:
                if val[k]:
                  string += str(val[k]) + " "
                  break
              for k in ['education_type', 'education_level', 'result']:
                if val[k]:
                  string += str(val[k]) + " "
                  break
          else:
            string += str(df2[resume][i][j][key]) + " "
      df2[resume][i][j] = [df2[resume][i][j]['uuid'], string]

In [29]:
for i in range(len(df2['vacancy'])):
    string = ""
    for key in df2['vacancy'][i].keys():
      if key != "uuid":
        if key == 'name' and df2['vacancy'][i][key]:
            string += "Название вакансии: " + df2['vacancy'][i][key] + " "
        elif key == 'keywords' and df2['vacancy'][i][key]:
            string += "Стэк: " + df2['vacancy'][i][key] + " "
        elif df2['vacancy'][i][key]:
            string += str(df2['vacancy'][i][key]) + " "
    df2['vacancy'][i] = [df2['vacancy'][i]['uuid'], string]

In [30]:
df2

Unnamed: 0,vacancy,resumes
0,"[8b9c8d16-c7f0-38a2-b80c-d94030c15a6f, Названи...","[[0dfe8e63-d7a3-3fe4-b9d7-1b8122158f33, О себе..."


In [31]:
sentences = []
for i in range(len(df2['resumes'][0])):
    sentences.append(df2['resumes'][0][i][1])
sentences.append(df2['vacancy'][0][1])

## Преобразование данных с помощью SBERT

In [32]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask
#Sentences we want sentence embeddings for
#Load AutoModel from huggingface model repository
tokenizer = AutoTokenizer.from_pretrained("ai-forever/sbert_large_mt_nlu_ru")
model = AutoModel.from_pretrained("ai-forever/sbert_large_mt_nlu_ru")
sentence_embeddings = []

#Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=512, return_tensors='pt')
#Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)
#Perform pooling. In this case, mean pooling
sentence_embedding = mean_pooling(model_output, encoded_input['attention_mask'])
sentence_embeddings.append(sentence_embedding[:-1])## Добавили без вакансий

In [33]:
job_vector = np.array(sentence_embedding[-1])
job_vector

array([ 0.08710935,  0.15967016, -0.966236  , ...,  0.04721457,
        0.24707209, -0.3176238 ], dtype=float32)

In [34]:
sentence_embedding

tensor([[ 0.2465,  0.2464, -1.0799,  ...,  0.0639,  0.1531, -0.1566],
        [ 0.3442,  0.2103, -0.9089,  ...,  0.1197,  0.2454, -0.0529],
        [ 0.4926,  0.3405, -0.6838,  ...,  0.0087,  0.1944, -0.2104],
        ...,
        [ 0.6508,  0.3867, -1.3045,  ...,  0.1611,  0.5513,  0.0436],
        [ 0.9174,  0.1345, -0.7396,  ..., -0.1968,  0.4457, -0.2316],
        [ 0.0871,  0.1597, -0.9662,  ...,  0.0472,  0.2471, -0.3176]])

In [35]:
vecs = []
for j in range(sentence_embeddings[0].shape[0]):
        vecs.append(np.array(sentence_embeddings[0][j]))

In [36]:
vecs

[array([ 0.24652943,  0.2464189 , -1.0799193 , ...,  0.06387492,
         0.15311268, -0.15657374], dtype=float32),
 array([ 0.34422022,  0.2102894 , -0.9089198 , ...,  0.11972912,
         0.24536158, -0.05291133], dtype=float32),
 array([ 0.4926394 ,  0.3404511 , -0.68377084, ...,  0.00868226,
         0.19442463, -0.2103905 ], dtype=float32),
 array([ 0.30837828,  0.3804361 , -0.94617987, ...,  0.21000925,
        -0.06710762, -0.31272864], dtype=float32),
 array([ 0.65831983,  0.45235175, -1.1508372 , ..., -0.35638547,
         0.2377165 , -0.00732666], dtype=float32),
 array([ 0.59139186,  0.1259082 , -0.98164046, ...,  0.15558776,
         0.16797107, -0.21483791], dtype=float32),
 array([ 0.580642  ,  0.3478492 , -1.0144283 , ..., -0.04908693,
         0.39683747, -0.1964718 ], dtype=float32),
 array([ 0.6190639 ,  0.24929254, -1.0699365 , ..., -0.30053523,
         0.2657002 , -0.0011801 ], dtype=float32),
 array([ 0.7152965 ,  0.35398978, -1.0054626 , ..., -0.12492006,
       

## Классификация данных

Прописать логику прогона данных через классификатор. Не забыть обработать, чтобы выводило id вакансии и результат работы классификатора

In [37]:
X_new = vecs
vac_sum_res = []

### Логистическая регрессия

In [38]:
y_new_pred_log = log_reg_model.predict(X_new)
vac_sum_res.append(y_new_pred_log)
print("Predictions for new data:", y_new_pred_log)

Predictions for new data: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0]


### Метод опорных векторов

In [39]:
y_new_pred_svm = svm_model.predict(X_new)
vac_sum_res.append(y_new_pred_svm)
print("Predictions for new data:", y_new_pred_svm)

Predictions for new data: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0]


### Случайный лес

In [40]:
y_new_pred_rf = random_forest_model.predict(X_new)
vac_sum_res.append(y_new_pred_rf)
print("Predictions for new data:", y_new_pred_rf)

Predictions for new data: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 1]


### Градиентный бустинг

In [41]:
y_new_pred_gb = gb_model.predict(X_new)
vac_sum_res.append(y_new_pred_gb)
print("Predictions for new data:", y_new_pred_gb)

Predictions for new data: [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0
 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0]


### Ансамбль моделей

In [42]:
y_new_pred_ans = ensemble_clf.predict(X_new)
vac_sum_res.append(y_new_pred_ans)
print("Predictions for new data:", y_new_pred_ans)

Predictions for new data: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0]


## Создание результата

In [43]:
vectors_array = np.array(vac_sum_res)
ones_count = np.sum(vectors_array, axis=0)
result_vector = np.where(ones_count >= 1, 1, 0)

### Получение таблицы с итогам по отсмотру резюме

In [44]:
result = pd.DataFrame()
uuid = []
for i in range(len(df2['resumes'][0])):
    uuid.append(df2['resumes'][0][i][0])
result['uuid'] = uuid
result['Rejected(0)/Accepted(1)'] = result_vector
result

Unnamed: 0,uuid,Rejected(0)/Accepted(1)
0,0dfe8e63-d7a3-3fe4-b9d7-1b8122158f33,0
1,f8b69e24-e2c0-3186-9578-380835eb2ee7,0
2,e3976e74-e71b-34db-8e98-08dc422fa567,0
3,9a9c3ff1-49f8-30dd-a294-e56fc60cae64,0
4,6561771c-7ef3-3e50-ab3a-ba8547201480,0
...,...,...
108,82df355a-235e-3046-9e6e-782ddf1600eb,0
109,915597ce-24e5-31fa-8dca-29437f49f839,0
110,f288a532-0b58-30cb-ac3c-f87e53984719,0
111,3e3a379f-226e-305e-b7d8-cf341e00cbd7,0


In [45]:
result_vecs = result
result_vecs['vecs'] = vecs
result_vecs

Unnamed: 0,uuid,Rejected(0)/Accepted(1),vecs
0,0dfe8e63-d7a3-3fe4-b9d7-1b8122158f33,0,"[0.24652943, 0.2464189, -1.0799193, -1.1649997..."
1,f8b69e24-e2c0-3186-9578-380835eb2ee7,0,"[0.34422022, 0.2102894, -0.9089198, -1.7640398..."
2,e3976e74-e71b-34db-8e98-08dc422fa567,0,"[0.4926394, 0.3404511, -0.68377084, -1.1540236..."
3,9a9c3ff1-49f8-30dd-a294-e56fc60cae64,0,"[0.30837828, 0.3804361, -0.94617987, -1.582027..."
4,6561771c-7ef3-3e50-ab3a-ba8547201480,0,"[0.65831983, 0.45235175, -1.1508372, -1.436009..."
...,...,...,...
108,82df355a-235e-3046-9e6e-782ddf1600eb,0,"[0.31780303, 0.39202797, -1.0043111, -1.627238..."
109,915597ce-24e5-31fa-8dca-29437f49f839,0,"[0.51568246, 0.40074402, -0.93600553, -1.82872..."
110,f288a532-0b58-30cb-ac3c-f87e53984719,0,"[0.63192165, 0.39823768, -1.152894, -1.2465897..."
111,3e3a379f-226e-305e-b7d8-cf341e00cbd7,0,"[0.65080595, 0.38674492, -1.3045287, -1.967828..."


### Получение таблицы только принятых участников

In [46]:
result_accepted = result[result['Rejected(0)/Accepted(1)'] == 1]
result_accepted.reset_index(drop=True, inplace=True)
result_accepted

Unnamed: 0,uuid,Rejected(0)/Accepted(1),vecs
0,5785c202-6744-3e1b-994a-d5bffc6aad14,1,"[0.1849089, 0.24451229, -1.0020154, -1.3733037..."
1,a71b0749-1ebc-3099-b0a1-342ca64d1575,1,"[0.7269409, 0.29399747, -0.9463735, -1.2645509..."
2,fd0ccbd0-3a58-3818-8691-98f31de17527,1,"[0.83842385, 0.17105225, -0.96567845, -1.23559..."
3,d9847b13-dc30-36e7-8d75-c632495a7eec,1,"[0.76192975, 0.21210292, -0.9566865, -1.718751..."
4,73592479-12bf-38d4-84f0-91fe33518b47,1,"[0.83359784, 0.16159025, -0.7055698, -1.526828..."
5,fbab422e-7e0b-38d2-a7bc-cc7286858c10,1,"[0.6285856, 0.3753297, -0.8481231, -1.4256591,..."
6,37cba700-eed6-3018-bad6-f720f8217aeb,1,"[0.56260884, 0.24108998, -0.9691159, -1.281389..."
7,c70de373-9f3a-3647-ab66-f25e98c29409,1,"[0.21727204, 0.28287935, -0.9965392, -1.327295..."
8,9a9b0a97-3514-3137-b8b8-129474b24528,1,"[0.7417369, 0.45123923, -1.036248, -1.8223083,..."
9,ebcd86ef-6e1f-39cf-8af3-85adaec6d3b3,1,"[0.60794646, 0.5423548, -0.7852259, -1.3598261..."


In [47]:
result_accepted_vecs = result_vecs[result_vecs['Rejected(0)/Accepted(1)'] == 1]
result_accepted_vecs.reset_index(drop=True, inplace=True)
result_accepted_vecs

Unnamed: 0,uuid,Rejected(0)/Accepted(1),vecs
0,5785c202-6744-3e1b-994a-d5bffc6aad14,1,"[0.1849089, 0.24451229, -1.0020154, -1.3733037..."
1,a71b0749-1ebc-3099-b0a1-342ca64d1575,1,"[0.7269409, 0.29399747, -0.9463735, -1.2645509..."
2,fd0ccbd0-3a58-3818-8691-98f31de17527,1,"[0.83842385, 0.17105225, -0.96567845, -1.23559..."
3,d9847b13-dc30-36e7-8d75-c632495a7eec,1,"[0.76192975, 0.21210292, -0.9566865, -1.718751..."
4,73592479-12bf-38d4-84f0-91fe33518b47,1,"[0.83359784, 0.16159025, -0.7055698, -1.526828..."
5,fbab422e-7e0b-38d2-a7bc-cc7286858c10,1,"[0.6285856, 0.3753297, -0.8481231, -1.4256591,..."
6,37cba700-eed6-3018-bad6-f720f8217aeb,1,"[0.56260884, 0.24108998, -0.9691159, -1.281389..."
7,c70de373-9f3a-3647-ab66-f25e98c29409,1,"[0.21727204, 0.28287935, -0.9965392, -1.327295..."
8,9a9b0a97-3514-3137-b8b8-129474b24528,1,"[0.7417369, 0.45123923, -1.036248, -1.8223083,..."
9,ebcd86ef-6e1f-39cf-8af3-85adaec6d3b3,1,"[0.60794646, 0.5423548, -0.7852259, -1.3598261..."


# Построение модели регрессии резюме

## Ранжирование резюме, основываясь на значениях метрик сравнения резюме с вакансией

### Используем несколько метрик

In [48]:
import numpy as np
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity

# Заданный вектор вакансии - задан ранее, job_vector

# Векторы резюме
resume_vectors = result_accepted_vecs['vecs']

# Определение метрик
def manhattan_distance(vec1, vec2):
    return np.sum(np.abs(vec1 - vec2))

def euclidean_distance(vec1, vec2):
    return np.sqrt(np.sum((vec1 - vec2) ** 2))

def pearson_correlation(vec1, vec2):
    corr, _ = pearsonr(vec1, vec2)
    return corr

def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    similarity = dot_product / (norm_vec1 * norm_vec2)
    return similarity

# Вычисление минимальных и максимальных значений метрик
min_manhattan_dist = np.inf
max_manhattan_dist = -np.inf
min_euclidean_dist = np.inf
max_euclidean_dist = -np.inf
min_pearson_corr = np.inf
max_pearson_corr = -np.inf
min_cosine_sim = np.inf
max_cosine_sim = -np.inf

for resume_vector in resume_vectors:
    manhattan_dist = manhattan_distance(resume_vector, job_vector)
    euclidean_dist = euclidean_distance(resume_vector, job_vector)
    pearson_corr = pearson_correlation(resume_vector, job_vector)
    cosine_sim = cosine_similarity(resume_vector, job_vector)
    
    min_manhattan_dist = min(min_manhattan_dist, manhattan_dist)
    max_manhattan_dist = max(max_manhattan_dist, manhattan_dist)
    min_euclidean_dist = min(min_euclidean_dist, euclidean_dist)
    max_euclidean_dist = max(max_euclidean_dist, euclidean_dist)
    min_pearson_corr = min(min_pearson_corr, pearson_corr)
    max_pearson_corr = max(max_pearson_corr, pearson_corr)
    min_cosine_sim = min(min_cosine_sim, cosine_sim)
    max_cosine_sim = max(max_cosine_sim, cosine_sim)

# Нормализация значений метрик
def normalize(score, min_val, max_val):
    return (score - min_val) / (max_val - min_val) if max_val != min_val else 0

# Ранжирование резюме
ranked_resumes = []
counter = 0
for resume_vector in resume_vectors:
    manhattan_dist = manhattan_distance(resume_vector, job_vector)
    euclidean_dist = euclidean_distance(resume_vector, job_vector)
    pearson_corr = pearson_correlation(resume_vector, job_vector)
    cosine_sim = cosine_similarity(resume_vector, job_vector)
    
    # Нормализация значений метрик
    manhattan_dist_normalized = normalize(manhattan_dist, min_manhattan_dist, max_manhattan_dist)
    euclidean_dist_normalized = normalize(euclidean_dist, min_euclidean_dist, max_euclidean_dist)
    pearson_corr_normalized = normalize(pearson_corr, min_pearson_corr, max_pearson_corr)
    cosine_sim_normalized = normalize(cosine_sim, min_cosine_sim, max_cosine_sim)
    
    # Объединение и нормализация значений метрик
    combined_score = (manhattan_dist_normalized + euclidean_dist_normalized +
                      pearson_corr_normalized + cosine_sim_normalized) / 4
    
    ranked_resumes.append((resume_vector, combined_score, counter))
    counter += 1

# Сортировка резюме по убыванию значения объединенной метрики
ranked_resumes.sort(key=lambda x: x[1], reverse=True)

# Вывод результатов
res_ = []
for resume, score, counter in ranked_resumes:
    print(f"UUID: {result_accepted_vecs['uuid'][counter]}, Combined Score: {score}")
    res_.append(result_accepted_vecs['uuid'][counter])
    # Вывод резюме или его идентификатора
res_df = pd.DataFrame(res_)

UUID: 37cba700-eed6-3018-bad6-f720f8217aeb, Combined Score: 0.5407156250346308
UUID: 93d61f89-b8d0-3187-8c90-888be29e68dd, Combined Score: 0.5375462573199682
UUID: a71b0749-1ebc-3099-b0a1-342ca64d1575, Combined Score: 0.5363825693869887
UUID: ebcd86ef-6e1f-39cf-8af3-85adaec6d3b3, Combined Score: 0.5315931625041452
UUID: 9a9b0a97-3514-3137-b8b8-129474b24528, Combined Score: 0.529709035207349
UUID: fd0ccbd0-3a58-3818-8691-98f31de17527, Combined Score: 0.5293467383927596
UUID: 73592479-12bf-38d4-84f0-91fe33518b47, Combined Score: 0.526488866496337
UUID: d9847b13-dc30-36e7-8d75-c632495a7eec, Combined Score: 0.5256917687416331
UUID: fbab422e-7e0b-38d2-a7bc-cc7286858c10, Combined Score: 0.5254889625793107
UUID: 5785c202-6744-3e1b-994a-d5bffc6aad14, Combined Score: 0.5203726954775998
UUID: c70de373-9f3a-3647-ab66-f25e98c29409, Combined Score: 0.5
UUID: cc88bf96-f0b9-313a-abce-dbe60b6f1c98, Combined Score: 0.5


In [49]:
res_df = res_df.rename(columns={0: 'Sorted UUID'})
res_df

Unnamed: 0,Sorted UUID
0,37cba700-eed6-3018-bad6-f720f8217aeb
1,93d61f89-b8d0-3187-8c90-888be29e68dd
2,a71b0749-1ebc-3099-b0a1-342ca64d1575
3,ebcd86ef-6e1f-39cf-8af3-85adaec6d3b3
4,9a9b0a97-3514-3137-b8b8-129474b24528
5,fd0ccbd0-3a58-3818-8691-98f31de17527
6,73592479-12bf-38d4-84f0-91fe33518b47
7,d9847b13-dc30-36e7-8d75-c632495a7eec
8,fbab422e-7e0b-38d2-a7bc-cc7286858c10
9,5785c202-6744-3e1b-994a-d5bffc6aad14


[CV] END learning_rate=0.01, max_depth=9, min_samples_leaf=4, min_samples_split=16, n_estimators=70; total time=  18.1s
[CV] END learning_rate=0.01, max_depth=4, min_samples_leaf=3, min_samples_split=12, n_estimators=70; total time=  10.8s
[CV] END learning_rate=0.01, max_depth=5, min_samples_leaf=8, min_samples_split=5, n_estimators=70; total time=  12.1s
[CV] END learning_rate=0.01, max_depth=6, min_samples_leaf=6, min_samples_split=3, n_estimators=70; total time=  14.3s
[CV] END learning_rate=0.01, max_depth=6, min_samples_leaf=6, min_samples_split=13, n_estimators=70; total time=  14.7s
[CV] END learning_rate=0.01, max_depth=3, min_samples_leaf=1, min_samples_split=11, n_estimators=70; total time=   8.1s
[CV] END learning_rate=0.01, max_depth=9, min_samples_leaf=4, min_samples_split=4, n_estimators=70; total time=  19.0s
[CV] END learning_rate=0.01, max_depth=7, min_samples_leaf=3, min_samples_split=8, n_estimators=70; total time=  16.7s
[CV] END learning_rate=0.01, max_depth=4, mi

In [50]:
res_df.to_csv('result_vector.csv', index=False)