In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in names)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


**Код ниже предназначен для запуска в Kaggle с учётом зависимостей, которые там предустановлены. Для запуска в локальном Jupyter-ноутбуке нужно предварительно установить зависимости из requirements.txt в корне проекта.**


In [None]:
!pip install sentence_transformers

**Предварительно надо загрузить датасет train_dataset.json**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score
import joblib

# Load the data
file_path = '/kaggle/input/train-dataset-fic-flashteam/train_dataset.json'  # Put the path to the dataset here
data = pd.read_json(file_path)

# Data preparation: keep the four model features and the target column.
# .copy() makes `data` an independent frame, so the column assignment below does
# not write into a slice of the loaded frame (avoids SettingWithCopyWarning).
columns_to_keep = ['age', 'semantic_similarity', 'skill_similarity', 'total_months_worked', 'grade_proof']
data = data[columns_to_keep].copy()

# LabelEncoder numbers classes in sorted order; per the original note:
# "подтверждён" -> 1, "не подтверждён" -> 0
le = LabelEncoder()
data['grade_proof'] = le.fit_transform(data['grade_proof'])

X = data.drop(columns=['grade_proof'])
y = data['grade_proof']

# Split BEFORE scaling so that the held-out rows do not influence the scaler.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training part only, then apply the same transform to the
# test part. Fitting on the full dataset would leak test-set statistics into
# training and into the saved scaler, making the reported ROC-AUC optimistic.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train_raw), columns=X.columns, index=X_train_raw.index)
X_test = pd.DataFrame(scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index)

# Estimate feature importance with a Random Forest
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

feature_importance = rf_model.feature_importances_
# Address importances by column name instead of by position so that a change in
# column order cannot silently mix up the groups below.
importance_by_name = pd.Series(feature_importance, index=X_train.columns)

# Adjust the VotingClassifier weights.
# Order matches the `models` dict below: [Logistic Regression, Random Forest, XGBoost].
weights = [1, 3, 1]  # Base weights
semantic_skill_weight = importance_by_name['semantic_similarity'] + importance_by_name['skill_similarity']
age_months_weight = importance_by_name['age'] + importance_by_name['total_months_worked']

weights[1] += int(age_months_weight * 5)  # Boost the Random Forest weight
weights[2] += int(semantic_skill_weight * 5)  # Boost the XGBoost weight

# Define the base models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": rf_model,
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# Wrap every model in a probability calibrator. They are intentionally NOT fitted
# here: VotingClassifier.fit clones and fits each estimator itself, so fitting
# them beforehand would only train every model twice.
calibrated_models = {name: CalibratedClassifierCV(model, cv=5) for name, model in models.items()}

# Soft-voting ensemble with the importance-adjusted weights
voting_ensemble = VotingClassifier(
    estimators=list(calibrated_models.items()),
    voting='soft',
    weights=weights
)
voting_ensemble.fit(X_train, y_train)

# Save the trained VotingClassifier
joblib.dump(voting_ensemble, "voting_ensemble_model.pkl")
print("Сохранено.")

# Save the scaler
joblib.dump(scaler, "scaler.pkl")
print("Сохранено.")

# Save the LabelEncoder
joblib.dump(le, "label_encoder.pkl")
print("Сохранено.")

# ROC-AUC on the held-out split
y_prob = voting_ensemble.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_prob)
print(f"ROC-AUC for Voting Ensemble (Weighted): {roc_auc}")



**Ниже приведён код оценки работы модели. На вход подаётся 1610 строк (805 записей с grade_proof = 'подтверждён' и столько же с grade_proof = 'не подтверждён').
Датасет для проверки — balanced_data.json.  
Его также нужно предварительно загрузить.**



На выходе получается информативный JSON со всеми промежуточными метриками.

In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sentence_transformers import SentenceTransformer, util
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime
import time
import re
from nltk.corpus import stopwords
import nltk
import joblib

# Initialisation: fetch the NLTK stop-word corpus and build the set of Russian
# stop words that preprocess_text() filters out.
nltk.download('stopwords')
russian_stopwords = set(stopwords.words('russian'))

# Load the trained ensemble and its companion utilities. These are the three
# files written with joblib.dump by the training cell, so that cell must have
# run first (or the files must already be in the working directory).
# NOTE(review): joblib.load unpickles its input; only load files you trust.
voting_ensemble = joblib.load("voting_ensemble_model.pkl")
scaler = joblib.load("scaler.pkl")
label_encoder = joblib.load("label_encoder.pkl")

# Load the sentence-transformer models:
#  - model_work_similarity compares the position with each job description
#    inside process_input_data();
#  - model_semantic produces the embeddings used by calculate_semantic_similarity().
model_work_similarity = SentenceTransformer('deepvk/USER-bge-m3')
model_semantic = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# Функции обработки данных
def calculate_months(date_range):
    """Return the number of calendar months spanned by a date-range string.

    Args:
        date_range: Text shaped like ``"YYYY-MM-DD - YYYY-MM-DD"``. The end
            date may be omitted (``"YYYY-MM-DD - "``); today's date is then
            used as the end of the range.

    Returns:
        int: ``(end.year - start.year) * 12 + (end.month - start.month)``,
        clamped to be non-negative. The day of the month is ignored.
        ``0`` is returned when the input is not a string, does not match the
        expected shape, or contains an impossible calendar date.
    """
    # Explicit type guard instead of relying on a catch-all handler.
    if not isinstance(date_range, str):
        return 0

    match = re.match(r'^(\d{4}-\d{2}-\d{2})\s*-\s*(\d{4}-\d{2}-\d{2})?$', date_range)
    if not match:
        return 0

    try:
        start_date = datetime.strptime(match.group(1), '%Y-%m-%d')
        end_date = datetime.strptime(match.group(2), '%Y-%m-%d') if match.group(2) else datetime.today()
    except ValueError:
        # The text has the right shape but is not a real date (e.g. month 13).
        # Only this expected failure is handled; anything else (including
        # KeyboardInterrupt) propagates instead of being silently swallowed.
        return 0

    return max((end_date.year - start_date.year) * 12 + (end_date.month - start_date.month), 0)

def preprocess_text(text):
    """Normalise comma-separated text into a space-joined string of tokens.

    The text is lower-cased, runs of whitespace are collapsed, and the result
    is split on commas. Every non-word character (including spaces) is then
    removed from each piece, so a multi-word piece becomes a single token.
    Empty pieces and pieces found in ``russian_stopwords`` are dropped.

    Args:
        text: Raw text; any non-string value yields an empty string.

    Returns:
        str: The surviving tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''

    normalized = re.sub(r'\s+', ' ', text.lower())

    kept = []
    for chunk in normalized.split(','):
        token = re.sub(r'\W+', '', chunk.strip())
        if not token:
            continue
        if token in russian_stopwords:
            continue
        kept.append(token)

    return ' '.join(kept)

def calculate_skill_similarity(candidate_skills, work_experience):
    """Cosine similarity between the token sets of two whitespace-separated texts.

    Each text is treated as a set of tokens, i.e. a binary bag-of-words vector.
    For such vectors the cosine similarity reduces to
    ``|A & B| / sqrt(|A| * |B|)``, which is computed directly.

    Tokens are obtained with ``str.split()`` so that empty strings and repeated
    spaces never produce an empty-string token. (Splitting on ``" "`` turned an
    empty text into the one-element set ``{''}``, which made the empty-input
    guard unreachable and made two empty texts score 1.0.)

    Args:
        candidate_skills: Space-separated skill tokens; ``None`` or ``""``
            counts as no tokens.
        work_experience: Space-separated experience tokens; ``None`` or ``""``
            counts as no tokens.

    Returns:
        float: Similarity in ``[0.0, 1.0]``; ``0.0`` when either text has no tokens.
    """
    candidate_tokens = set((candidate_skills or "").split())
    experience_tokens = set((work_experience or "").split())

    if not candidate_tokens or not experience_tokens:
        return 0.0

    shared = len(candidate_tokens & experience_tokens)
    return shared / (len(candidate_tokens) * len(experience_tokens)) ** 0.5

def calculate_semantic_similarity(position, key_skills, work_experience):
    """Cosine similarity between a position and the candidate's profile text.

    The profile text is ``key_skills`` and ``work_experience`` joined by a single
    space. Both the position and the profile text are embedded with the
    module-level ``model_semantic`` encoder.

    Args:
        position: Position text.
        key_skills: Candidate's key-skills text.
        work_experience: Candidate's work-experience text.

    Returns:
        The single entry of the 1x1 matrix produced by ``cosine_similarity``.
    """
    profile_text = " ".join((key_skills, work_experience))
    position_vec = model_semantic.encode([position])
    profile_vec = model_semantic.encode([profile_text])
    similarity_matrix = cosine_similarity(position_vec, profile_vec)
    return similarity_matrix[0][0]

def process_input_data(input_data, similarity_threshold=0.649):
    """Build the model's feature table from raw candidate records.

    For every record the function derives:

    * ``age`` - the record's ``"age"`` as a float (missing or null -> 0.0);
    * ``semantic_similarity`` - see ``calculate_semantic_similarity``;
    * ``skill_similarity`` - see ``calculate_skill_similarity``;
    * ``total_months_worked`` - the sum of months over the experience lines
      whose job description is similar enough to the position.

    ``"work_experience"`` is read line by line. A line contributes only when the
    text before its first ``:`` is a date range (``YYYY-MM-DD - YYYY-MM-DD``, end
    date optional). Each distinct date range is counted once. The first four
    words after the first ``:`` are compared with the pre-processed position
    using ``model_work_similarity``.

    Args:
        input_data: Iterable of dict-like records; the keys read are
            ``"work_experience"``, ``"position"``, ``"key_skills"`` and ``"age"``.
        similarity_threshold: A job counts as relevant when its cosine
            similarity to the position is strictly greater than this value.
            Defaults to 0.649, the value that used to be hard-coded.

    Returns:
        pandas.DataFrame: One row per input record with the four columns above.
    """
    date_range_pattern = re.compile(r'^(\d{4}-\d{2}-\d{2})\s*-\s*(\d{4}-\d{2}-\d{2})?$')

    results = []
    for row in input_data:
        # `or ""` also covers an explicit null in the source JSON.
        work_experience = row.get("work_experience") or ""
        position = preprocess_text(row.get("position", ""))
        key_skills = preprocess_text(row.get("key_skills", ""))
        # Pre-processed once and reused by both similarity features below.
        experience_text = preprocess_text(work_experience)

        total_months_worked = 0
        processed_date_ranges = set()  # Ensures every date range is counted once
        position_embedding = None  # Encoded lazily, at most once per record

        # Walk through the work-experience lines
        for line in work_experience.split('\n'):
            line = line.strip()
            if not line:
                continue

            # Extract the date range and the job description
            date_part = line.split(':')[0].strip()
            match = date_range_pattern.match(date_part)
            if not match:
                continue

            date_range = match.group(0)

            # Skip a date range that has already been handled
            if date_range in processed_date_ranges:
                continue
            processed_date_ranges.add(date_range)

            months_worked = calculate_months(date_range)
            if months_worked == 0:
                # Adding zero cannot change the total, so skip the model call.
                continue

            details = line.split(':', 1)[-1].strip()
            comparison_text = ' '.join(details.split()[:4])  # First 4 words are used for the comparison

            # The position does not change between lines: encode it once.
            if position_embedding is None:
                position_embedding = model_work_similarity.encode(position, convert_to_tensor=True)
            job_embedding = model_work_similarity.encode(comparison_text, convert_to_tensor=True)
            similarity = util.pytorch_cos_sim(position_embedding, job_embedding).item()

            # Count only the months of relevant jobs
            if similarity > similarity_threshold:
                total_months_worked += months_worked

        skill_similarity = calculate_skill_similarity(key_skills, experience_text)
        semantic_similarity = calculate_semantic_similarity(position, key_skills, experience_text)

        # Collect the processed features
        results.append({
            "age": float(row.get("age") or 0),
            "semantic_similarity": semantic_similarity,
            "skill_similarity": skill_similarity,
            "total_months_worked": total_months_worked
        })

    return pd.DataFrame(results)

def main(filepath, output_filepath, num_records=1000):
    """Score labelled records with the saved ensemble and write an annotated JSON.

    Steps: load the JSON list at ``filepath``, keep the first ``num_records``
    entries, derive features with ``process_input_data``, scale them with the
    loaded ``scaler``, predict with ``voting_ensemble``, report ROC-AUC against
    the ``"grade_proof"`` labels and write the records, enriched with the
    prediction, its probability and the intermediate features, to
    ``output_filepath``.

    Args:
        filepath: Path to a JSON file holding a list of records; every record
            must contain ``"grade_proof"``.
        output_filepath: Where the enriched records are written (UTF-8 JSON).
        num_records: How many leading records to evaluate. ``None`` keeps all
            of them (plain slice semantics).

    Raises:
        ValueError: If there are no records to evaluate.
    """
    start_time = time.time()

    # Load the source data
    with open(filepath, 'r', encoding='utf-8') as f:
        raw_data = json.load(f)

    # Keep only the first num_records entries
    raw_data = raw_data[:num_records]
    if not raw_data:
        # Fail early with a clear message instead of an obscure KeyError /
        # ZeroDivisionError further down.
        raise ValueError(f"No records to evaluate in {filepath}")

    # Remember the true labels
    true_labels = [row["grade_proof"] for row in raw_data]

    # Feature extraction
    processed_data = process_input_data(raw_data)

    # Scaling
    features = ['age', 'semantic_similarity', 'skill_similarity', 'total_months_worked']
    scaled_data = pd.DataFrame(scaler.transform(processed_data[features]), columns=features)

    # Prediction
    predictions_proba = voting_ensemble.predict_proba(scaled_data)[:, 1]
    predictions = voting_ensemble.predict(scaled_data)
    # Decode all predictions in one call instead of one call per record.
    predicted_labels = label_encoder.inverse_transform(predictions)

    # ROC-AUC is undefined when only one class is present; roc_auc_score would
    # raise and abort the run before the results are saved.
    if len(set(true_labels)) < 2:
        auc_roc = None
    else:
        auc_roc = roc_auc_score(label_encoder.transform(true_labels), predictions_proba)

    # Attach predictions and intermediate values to the records. Values are
    # converted to built-in Python types so that json.dump never meets a NumPy
    # scalar it cannot serialise (e.g. float32).
    for row, prob, label, true_label, intermediate in zip(
        raw_data, predictions_proba, predicted_labels, true_labels, processed_data.to_dict(orient='records')
    ):
        row["predicted_grade_proof"] = label.item() if hasattr(label, "item") else label
        row["probability"] = float(prob)
        row["grade_proof"] = true_label  # Original value

        # Intermediate feature values
        row["semantic_similarity"] = float(intermediate["semantic_similarity"])
        row["skill_similarity"] = float(intermediate["skill_similarity"])
        row["total_months_worked"] = int(intermediate["total_months_worked"])

    # Save the results
    with open(output_filepath, 'w', encoding='utf-8') as f:
        json.dump(raw_data, f, ensure_ascii=False, indent=4)

    end_time = time.time()
    total_time = end_time - start_time
    avg_time_per_record = total_time / len(raw_data)

    if auc_roc is None:
        print("ROC-AUC: undefined (only one class present in the true labels)")
    else:
        print(f"ROC-AUC: {auc_roc}")
    print(f"Results saved to {output_filepath}")
    print(f"Total processing time: {total_time:.2f} seconds")
    print(f"Number of records processed: {len(raw_data)}")
    print(f"Average time per record: {avg_time_per_record:.4f} seconds")


# Run the evaluation
input_filepath = "/kaggle/input/balanced-data-flashteam-fic/balanced_data.json"  # Path to balanced_data.json
output_filepath = "output_results.json"
# num_records sets how many rows are evaluated. The balanced set is described as
# 1610 rows (805 per class); the slice keeps the first num_records rows, so 1609
# would silently drop the last record.
main(input_filepath, output_filepath, num_records=1610)
