In [1]:
import pandas as pd
import numpy as np

import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Dict

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_cases = pd.read_csv('/content/cases_with_roles.csv', sep=';')
df_cases = pd.read_csv('../data/cases_with_roles.csv', sep=';')

In [3]:
df_cases

Unnamed: 0,id,title,description,required_roles
0,37,Карта повреждений территорий,"Алгоритмы по распознаванию карьеров, вырубок, ...","ML engineer, CV engineer, Аналитик, Тестировщик"
1,3,Контроль перемещений строительных конструкций ...,Развитие методик дистанционного контроля дефор...,"ML engineer, CV engineer, Python Backend, Тест..."
2,63,Разработка системы ECM для авиационного двигателя,Имеется техническая документация по авиационны...,"ML engineer, Аналитик, Python Backend, DevOps,..."
3,6,"Система контроля качества продуктов АО ""Медитек""","Система контроля качества продуктов АО ""Медите...","C# Backend, Тестировщик, Аналитик"


In [4]:
role_to_skills_mapping = {
    "Java Backend": [
        "Java", "Spring Boot", "PostgreSQL", "Git", "Построение Rest API", "Умение работать с API",
        "Back-end разработка", "Linux", "Docker", "Kubernetes", "SQL"
    ],
    "C# Backend": [
        "C#", "Back-end разработка", "Docker", "SQL", "Git", "Умение работать с API", "СУБД PostgreSQL",
        ".NET", "Linux", "Kubernetes", "Nginx", "Управление проектами"
    ],
    "Go Backend": [
        "Go", "Back-end разработка", "Docker", "Git", "Kubernetes", "SQL", "Linux", "Helm", "Nginx",
        "Умение работать с API", "Grafana", "Управление проектами"
    ],
    "Python Backend": [
        "Python", "Django", "Docker", "Построение Rest API", "SQL", "СУБД PostgreSQL", "Git",
        "Back-end разработка", "Linux", "Kubernetes", "Nginx", "Умение работать с API"
    ],
    "C++ Backend": [
        "C++", "Back-end разработка", "Git", "Linux", "Docker", "Kubernetes", "SQL", "Умение работать с API",
        "PostgreSQL", "Управление проектами"
    ],
    "Frontend": [
        "React", "CSS", "HTML", "JavaScript", "Tailwind", "Next", "Vue", "Git", "Figma", "Canva",
        "UI/UX", "Zustand", "SSR", "MUI", "Shadcn", "Nginx"
    ],
    "ML engineer": [
        "Machine Learning", "Python", "TensorFlow", "Scikit-Learn", "Pandas", "NumPy", "Deep Learning",
        "PyTorch", "Data Science", "SQL", "Matplotlib", "Seaborn", "Управление проектами", "Jupyter",
        "Keras", "OpenCV", "Computer Vision"
    ],
    "DevOps": [
        "Docker", "Kubernetes", "Linux", "Helm", "Nginx", "Grafana", "AirFlow", "K8S", "Git",
        "Nexus", "ELK", "CDN", "S3", "Hadoop", "Управление проектами", "Умение работать с API"
    ],
    "Тестировщик": [
        "Python", "PostgreSQL", "Умение работать с API", "Atlassian stack [Jira, Confluence]", "Linux",
        "SQL", "Git", "Управление проектами", "Docker", "Automated testing"
    ],
    "Аналитик": [
        "Data Science", "SQL", "Pandas", "Математическая статистика", "Управление проектами",
        "Data Engineering", "Разработка моделей данных", "Python", "Jupyter", "Умение работать с API"
    ],
    "Дизайнер": [
        "Figma", "Canva", "CSS", "UI/UX", "Photoshop", "Adobe XD", "JavaScript", "HTML", "Tailwind", "React"
    ],
    "Инженер БПЛА": [
        "C++", "Python", "Computer Vision", "Linux", "Kubernetes", "Docker", "OpenCV", "TensorFlow",
        "Machine Learning", "Git", "ROS"
    ],
    "CV engineer": [
        "Computer Vision", "OpenCV", "Python", "PyTorch", "TensorFlow", "Machine Learning", "Deep Learning",
        "Scikit-Learn", "NumPy", "Pandas", "Data Science", "Jupyter", "Keras", "ONNX Runtime"
    ],
    "ML Ops Engineer": [
        "DevOps", "AirFlow", "Kubernetes", "ONNX Runtime", "Docker", "Linux", "Grafana", "TensorFlow",
        "Machine Learning", "PyTorch", "Git", "Hadoop", "Nginx", "K8S", "S3", "CDN", "Управление проектами"
    ]
}

# Список всех возможных навыков
all_skills = [
    "AirFlow", "Atlassian stack [Jira, Confluence]", "Back-end разработка", "C", "C#", "C++", "CDN", "CSS",
    "Canva", "Computer Vision", "Data Engineering", "Data Science", "Deep Learning", "DevOps", "Django", "Docker",
    "ETL", "Figma", "Git", "Go", "HTML", "Hadoop", "Jupyter", "Kafka", "Keras", "Kotlin", "Kubernetes", "K8S",
    "Linux", "Machine Learning", "Matplotlib", "NumPy", "ONNX Runtime", "OpenCV", "Pandas", "PyTorch", "Python",
    "React", "SQL", "Scikit-Learn", "Seaborn", "TensorFlow", "Математическая статистика", "Построение Rest API",
    "Разработка моделей данных", "СУБД PostgreSQL", "Умение работать с API", "Управление проектами", "Java",
    "Spring Boot", "S3", "Next", "Vue", "Android разработка", "IOS разработка", "Desktop разработка", "Helm",
    "Nexus", "Nginx", "ELK", "Graphana", "Zustand", "SSR", "Tailwind", "MUI", "Shadcn"
]

# Команда - кейс

### Embedding recommendation

In [5]:
# Функция для получения общего списка скиллов команды
def get_team_skills(team: Dict) -> List:
    team_skills = []
    for role, skills in team.items():
        team_skills.extend(skills)
    team_skills = list(team_skills)
    
    return team_skills

# Функция для получения эмбеддингов текста
def get_text_embedding(text, model, tokenizer, device):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy()

# Функция для вычисления косинусного сходства между командой и кейсом
def compute_similarity(case_embeddings, team_embedding):
    return cosine_similarity(case_embeddings, team_embedding.reshape(1, -1))

In [11]:
# team1 = {
#     "Member 1": ["Python", "Back-end разработка", "Django", "Docker", "Git", "SQL", "Построение Rest API"],
#     "Member 2": ["Python", "Back-end разработка", "Docker", "Git", "СУБД PostgreSQL", "Linux"],
#     "Member 3": ["Machine Learning", "Data Science", "Python", "Pandas", "NumPy", "Scikit-Learn", "TensorFlow"],
#     "Member 4": ["DevOps", "Docker", "Kubernetes", "K8S", "Linux", "Helm", "Nginx", "Graphana"],
#     "Member 5": ["Python", "PostgreSQL", "Умение работать с API", "Atlassian stack [Jira, Confluence]", "Linux"],
#     "Member 6": ["Data Science", "SQL", "Математическая статистика", "Pandas", "Scikit-Learn"],
#     "Member 7": ["DevOps", "Machine Learning", "AirFlow", "Docker", "Kubernetes", "ONNX Runtime"]
# }

# team2 = {
#     "Member 1": ["C#", "Back-end разработка", "Git", "SQL", "Построение Rest API", "Docker"],
#     "Member 2": ["C#", "Back-end разработка", "Docker", "Git", "СУБД PostgreSQL", "Linux"],
#     "Member 3": ["HTML", "CSS", "React", "Git", "Next", "Tailwind", "JavaScript"],
#     "Member 4": ["DevOps", "Docker", "Kubernetes", "K8S", "Linux", "Helm", "Nginx", "Graphana"],
#     "Member 5": ["Умение работать с API", "PostgreSQL", "Python", "Atlassian stack [Jira, Confluence]", "Linux"],
#     "Member 6": ["Data Science", "SQL", "Pandas", "Математическая статистика", "Управление проектами"],
#     "Member 7": ["Canva", "Figma", "CSS", "UI/UX", "Adobe XD", "Photoshop"]
# }

teams = {
    "Team 1": {
        "skills": {
            "Member 1": ["C#", "Back-end разработка", "Git", "SQL", "Построение Rest API", "Docker"],
            "Member 2": ["C#", "Back-end разработка", "Docker", "Git", "СУБД PostgreSQL", "Linux"],
            "Member 3": ["HTML", "CSS", "React", "Git", "Next", "Tailwind", "JavaScript"],
            "Member 4": ["DevOps", "Docker", "Kubernetes", "K8S", "Linux", "Helm", "Nginx", "Graphana"],
            "Member 5": ["Умение работать с API", "PostgreSQL", "Python", "Atlassian stack [Jira, Confluence]", "Linux"],
            "Member 6": ["Data Science", "SQL", "Pandas", "Математическая статистика", "Управление проектами"],
            "Member 7": ["Canva", "Figma", "CSS", "UI/UX", "Adobe XD", "Photoshop"]
        }
    },
    "Team 2": {
        "skills": {
            "Member 1": ["Python", "Back-end разработка", "Django", "Docker", "Git", "SQL", "Построение Rest API"],
            "Member 2": ["Python", "Back-end разработка", "Docker", "Git", "СУБД PostgreSQL", "Linux"],
            "Member 3": ["Machine Learning", "Data Science", "Python", "Pandas", "NumPy", "Scikit-Learn", "TensorFlow"],
            "Member 4": ["DevOps", "Docker", "Kubernetes", "K8S", "Linux", "Helm", "Nginx", "Graphana"],
            "Member 5": ["Python", "PostgreSQL", "Умение работать с API", "Atlassian stack [Jira, Confluence]", "Linux"],
            "Member 6": ["Data Science", "SQL", "Математическая статистика", "Pandas", "Scikit-Learn"],
            "Member 7": ["DevOps", "Machine Learning", "AirFlow", "Docker", "Kubernetes", "ONNX Runtime"]
        }
    }
}

# team_skills = get_team_skills(team1)


In [7]:
# Основная функция для получения рекомендаций по команде
def get_case_to_team_recs_by_embedding(team: Dict, df_cases: pd.DataFrame) -> pd.DataFrame:
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu") # для мака
    
    # Загрузка модели и токенизатора
    model_path = "intfloat/multilingual-e5-large"
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModel.from_pretrained(model_path).to(device)

    # Получаем эмбеддинг команды
    team_skills = get_team_skills(team)
    team_text = " ".join(team_skills) # Объединяем навыки в одну строку
    team_embedding = get_text_embedding(team_text, model, tokenizer, device)

    # Инференс кейсов
    case_embeddings = []
    for idx, row in df_cases.iterrows():
        case_text = row['title'] + " | " + row['description'] + " | Required roles: " + row['required_roles']
        case_embedding = get_text_embedding(case_text, model, tokenizer, device)
        case_embeddings.append(case_embedding)

    case_embeddings = np.vstack(case_embeddings)

    # Вычисляем косинусное сходство между эмбеддингами кейсов и команды
    similarities = compute_similarity(case_embeddings, team_embedding)

    # Добавляем результаты в DataFrame
    df_cases['embedding_similarity'] = similarities

    return df_cases

In [26]:
team_skills = teams['Team 1']['skills']
df_cases_team = get_case_to_team_recs_by_embedding(team_skills, df_cases)
df_cases_team

Unnamed: 0,id,title,description,required_roles,embedding_similarity
0,37,Карта повреждений территорий,"Алгоритмы по распознаванию карьеров, вырубок, ...","ML engineer, CV engineer, Аналитик, Тестировщик",0.817132
1,3,Контроль перемещений строительных конструкций ...,Развитие методик дистанционного контроля дефор...,"ML engineer, CV engineer, Python Backend, Тест...",0.835368
2,63,Разработка системы ECM для авиационного двигателя,Имеется техническая документация по авиационны...,"ML engineer, Аналитик, Python Backend, DevOps,...",0.821934
3,6,"Система контроля качества продуктов АО ""Медитек""","Система контроля качества продуктов АО ""Медите...","C# Backend, Тестировщик, Аналитик",0.83084


### skills-to-roles recommendation

In [27]:
# Преобразование навыков в бинарный вектор
def skills_to_vector(skills, all_skills):
    vector = [1 if skill in skills else 0 for skill in all_skills]
    return np.array(vector)

# Функция для перевода всех необходимых ролей в один вектор навыков
def roles_to_skills_vector(roles, role_to_skills_mapping, all_skills):
    all_case_skills = set()  # Множество всех уникальных навыков для ролей
    for role in roles:
        role_skills = role_to_skills_mapping.get(role, [])
        all_case_skills.update(role_skills)
    # Преобразуем полученные навыки в бинарный вектор
    return skills_to_vector(all_case_skills, all_skills)

# Преобразование навыков всех членов команды в один вектор
def team_to_skills_vector(team, all_skills):
    all_team_skills = set()
    for member, skills in team.items():
        all_team_skills.update(skills)
    return skills_to_vector(all_team_skills, all_skills)

# Основная функция для получения рекомендаций кейсов для команды на основе маппинга навыков
def get_case_to_team_recs_by_mapping(team: Dict, df_cases: pd.DataFrame, role_to_skills_mapping: Dict, all_skills: list) -> pd.DataFrame:
    # Преобразуем навыки команды в вектор
    team_skills_vector = team_to_skills_vector(team, all_skills)

    similarities = []
    
    # Для каждого кейса в датафрейме преобразуем роли в вектор навыков и вычисляем схожесть
    for idx, row in df_cases.iterrows():
        case_roles = row['required_roles'].split(", ")  # Преобразуем строку с ролями в список
        case_skills_vector = roles_to_skills_vector(case_roles, role_to_skills_mapping, all_skills)
        
        # Вычисляем косинусное сходство между векторами навыков команды и навыков для кейса
        similarity = cosine_similarity([case_skills_vector], [team_skills_vector])[0][0]
        similarities.append(similarity)

    # Добавляем новый столбец с результатами схожести
    df_cases['skills_similarity'] = similarities

    return df_cases

In [28]:
df_cases_team = get_case_to_team_recs_by_mapping(teams['Team 1']['skills'], df_cases, role_to_skills_mapping, all_skills)
df_cases_team

Unnamed: 0,id,title,description,required_roles,embedding_similarity,skills_similarity
0,37,Карта повреждений территорий,"Алгоритмы по распознаванию карьеров, вырубок, ...","ML engineer, CV engineer, Аналитик, Тестировщик",0.817132,0.407687
1,3,Контроль перемещений строительных конструкций ...,Развитие методик дистанционного контроля дефор...,"ML engineer, CV engineer, Python Backend, Тест...",0.835368,0.526397
2,63,Разработка системы ECM для авиационного двигателя,Имеется техническая документация по авиационны...,"ML engineer, Аналитик, Python Backend, DevOps,...",0.821934,0.537853
3,6,"Система контроля качества продуктов АО ""Медитек""","Система контроля качества продуктов АО ""Медите...","C# Backend, Тестировщик, Аналитик",0.83084,0.693688


### Hybrid recommendation

In [29]:
# Функция для вычисления общей метрики гибридной рекомендации
def calculate_hybrid_similarity(embedding_similarity, skill_similarity, alpha=0.5, beta=0.5):
    hybrid_similarity = alpha * embedding_similarity + beta * skill_similarity
    return hybrid_similarity

# Основная функция для гибридных рекомендаций
def get_case_to_team_recs(team: Dict, df_cases: pd.DataFrame, role_to_skills_mapping: Dict, all_skills: list, alpha=0.5, beta=0.5) -> pd.DataFrame:
    # Получаем рекомендации через эмбеддинги
    df_cases = get_case_to_team_recs_by_embedding(team, df_cases)
    
    # Получаем рекомендации через маппинг
    df_cases = get_case_to_team_recs_by_mapping(team, df_cases, role_to_skills_mapping, all_skills)
    
    # Вычисляем гибридную метрику
    df_cases['hybrid_similarity'] = calculate_hybrid_similarity(df_cases['embedding_similarity'], df_cases['skills_similarity'], alpha, beta)
    
    return df_cases

In [30]:
df_cases_hybrid = get_case_to_team_recs(teams['Team 1']['skills'], df_cases, role_to_skills_mapping, all_skills)
df_cases_hybrid

Unnamed: 0,id,title,description,required_roles,embedding_similarity,skills_similarity,hybrid_similarity
0,37,Карта повреждений территорий,"Алгоритмы по распознаванию карьеров, вырубок, ...","ML engineer, CV engineer, Аналитик, Тестировщик",0.817132,0.407687,0.61241
1,3,Контроль перемещений строительных конструкций ...,Развитие методик дистанционного контроля дефор...,"ML engineer, CV engineer, Python Backend, Тест...",0.835368,0.526397,0.680883
2,63,Разработка системы ECM для авиационного двигателя,Имеется техническая документация по авиационны...,"ML engineer, Аналитик, Python Backend, DevOps,...",0.821934,0.537853,0.679893
3,6,"Система контроля качества продуктов АО ""Медитек""","Система контроля качества продуктов АО ""Медите...","C# Backend, Тестировщик, Аналитик",0.83084,0.693688,0.762264


# Кейс - команда

In [31]:
# Основная функция для получения рекомендаций по эмбеддингам
def get_team_to_case_recs_by_embedding(case: Dict, teams: Dict, model, tokenizer, device) -> pd.DataFrame:
    case_text = case['title'] + " | " + case['description'] + " | Required roles: " + case['required_roles']
    case_embedding = get_text_embedding(case_text, model, tokenizer, device)
    
    team_embeddings = []
    for team_name, team_data in teams.items():
        team_skills = get_team_skills(team_data['skills'])
        team_text = " ".join(team_skills)
        team_embedding = get_text_embedding(team_text, model, tokenizer, device)
        similarity = compute_similarity(case_embedding, team_embedding)
        team_embeddings.append({'team_name': team_name, 'embedding_similarity': similarity[0][0]})
    
    return pd.DataFrame(team_embeddings)

# Основная функция для получения рекомендаций через маппинг
def get_team_to_case_recs_by_mapping(case: Dict, teams: Dict, role_to_skills_mapping: Dict, all_skills: list) -> pd.DataFrame:
    case_roles = case['required_roles'].split(", ")
    case_skills_vector = roles_to_skills_vector(case_roles, role_to_skills_mapping, all_skills)
    
    similarities = []
    for team_name, team_data in teams.items():
        team_skills_vector = team_to_skills_vector(team_data['skills'], all_skills)
        similarity = cosine_similarity([case_skills_vector], [team_skills_vector])[0][0]
        similarities.append({'team_name': team_name, 'skills_similarity': similarity})
    
    return pd.DataFrame(similarities)

# Основная функция для гибридных рекомендаций
def get_team_to_case_recs(case: Dict, teams: Dict, role_to_skills_mapping: Dict, all_skills: list, alpha=0.5, beta=0.5) -> pd.DataFrame:
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")  # Для мака
    model_path = "intfloat/multilingual-e5-large"
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModel.from_pretrained(model_path).to(device)
    
    # Получаем рекомендации по эмбеддингам
    df_embedding = get_team_to_case_recs_by_embedding(case, teams, model, tokenizer, device)
    
    # Получаем рекомендации по маппингу
    df_mapping = get_team_to_case_recs_by_mapping(case, teams, role_to_skills_mapping, all_skills)
    
    # Объединяем результаты и вычисляем гибридную метрику
    df_hybrid = pd.merge(df_embedding, df_mapping, on='team_name')
    df_hybrid['hybrid_similarity'] = calculate_hybrid_similarity(df_hybrid['embedding_similarity'], df_hybrid['skills_similarity'], alpha, beta)
    
    return df_hybrid.sort_values(by='hybrid_similarity', ascending=False)

# Пример использования
case = {
    'title': "Система контроля качества продуктов АО 'Медитек'",
    'description': "Система контроля качества продуктов требуется для контроля качества и технических параметров приборов.",
    'required_roles': "C# Backend, Тестировщик, Аналитик"
}

teams = {
    "Team 1": {
        "skills": {
            "Member 1": ["C#", "Back-end разработка", "Git", "SQL", "Построение Rest API", "Docker"],
            "Member 2": ["C#", "Back-end разработка", "Docker", "Git", "СУБД PostgreSQL", "Linux"],
            "Member 3": ["HTML", "CSS", "React", "Git", "Next", "Tailwind", "JavaScript"],
            "Member 4": ["DevOps", "Docker", "Kubernetes", "K8S", "Linux", "Helm", "Nginx", "Graphana"],
            "Member 5": ["Умение работать с API", "PostgreSQL", "Python", "Atlassian stack [Jira, Confluence]", "Linux"],
            "Member 6": ["Data Science", "SQL", "Pandas", "Математическая статистика", "Управление проектами"],
            "Member 7": ["Canva", "Figma", "CSS", "UI/UX", "Adobe XD", "Photoshop"]
        }
    },
    "Team 2": {
        "skills": {
            "Member 1": ["Python", "Back-end разработка", "Django", "Docker", "Git", "SQL", "Построение Rest API"],
            "Member 2": ["Python", "Back-end разработка", "Docker", "Git", "СУБД PostgreSQL", "Linux"],
            "Member 3": ["Machine Learning", "Data Science", "Python", "Pandas", "NumPy", "Scikit-Learn", "TensorFlow"],
            "Member 4": ["DevOps", "Docker", "Kubernetes", "K8S", "Linux", "Helm", "Nginx", "Graphana"],
            "Member 5": ["Python", "PostgreSQL", "Умение работать с API", "Atlassian stack [Jira, Confluence]", "Linux"],
            "Member 6": ["Data Science", "SQL", "Математическая статистика", "Pandas", "Scikit-Learn"],
            "Member 7": ["DevOps", "Machine Learning", "AirFlow", "Docker", "Kubernetes", "ONNX Runtime"]
        }
    }
}

# Получаем рекомендации команд для кейса
df_teams_hybrid = get_team_to_case_recs(case, teams, role_to_skills_mapping, all_skills)
print(df_teams_hybrid)

  team_name  embedding_similarity  skills_similarity  hybrid_similarity
0    Team 1              0.832173           0.693688           0.762930
1    Team 2              0.814848           0.629890           0.722369


# Человек - команда

In [32]:
# Функция для получения всех требуемых навыков для команды на основе необходимых ролей
def get_required_skills(roles: List[str]) -> List[str]:
    """
    Get all required skills for a team based on the roles required.

    Args:
    roles (List[str]): A list of roles required for the team.

    Returns:
    List[str]: A list of all required skills for the team.
    """
    required_skills = []
    for role in roles:
        required_skills.extend(role_to_skills_mapping.get(role, []))
    return list(set(required_skills))  # Убираем дубликаты

# Функция для получения общего списка скиллов команды
def get_team_skills(team_skills: Dict) -> List[str]:
    """
    Get all skills of the team members.

    Args:
    team_skills (Dict): A dictionary with team members as keys and their skills as values.

    Returns:
    List[str]: A list of all skills in the team.
    """
    all_skills = []
    for member, skills in team_skills.items():
        all_skills.extend(skills)
    return list(set(all_skills))  # Убираем дубликаты

# Проверка, закрыты ли все необходимые роли в команде
def get_filled_roles(team_skills: Dict, required_roles: List[str], threshold: float = 0.45) -> List[str]:
    """
    Check if all required roles are filled in the team.

    Args:
    team_skills (Dict): A dictionary with team members as keys and their skills as values.
    required_roles (List[str]): A list of roles required for the team.
    threshold (float, optional): Minimum percentage of skills that must be matched to consider a role filled. Defaults to 0.45.

    Returns:
    List[str]: A list of roles that are filled in the team.
    """
    filled_roles = []
    team_skills_flat = get_team_skills(team_skills)
    
    for role in required_roles:
        role_skills = role_to_skills_mapping.get(role, [])
        matched_skills = [skill for skill in role_skills if skill in team_skills_flat]
        
        # Если хотя бы threshold % навыков из необходимых для роли совпадают с навыками команды
        if len(matched_skills) / len(role_skills) >= threshold:
            filled_roles.append(role)
    
    return filled_roles

# Функция для расчета схожести на основе Bag of Skills с весом для незаполненных ролей
def calculate_weighted_similarity(person_skills: List[str], required_skills: List[str], all_skills: List[str], weight: float = 1.0) -> float:
    """
    Calculate a weighted similarity between a person's skills and required skills for a team.

    Args:
    person_skills (List[str]): A list of skills of a person.
    required_skills (List[str]): A list of skills required for a team.
    all_skills (List[str]): A list of all skills in the team.
    weight (float, optional): Weight to be applied to the similarity score. Defaults to 1.0.

    Returns:
    float: Weighted similarity score between the person's skills and required skills.
    """
    person_vector = np.array([1 if skill in person_skills else 0 for skill in all_skills])
    required_vector = np.array([1 if skill in required_skills else 0 for skill in all_skills])
    similarity = cosine_similarity([person_vector], [required_vector])[0][0]
    return similarity * weight


In [33]:
# Пример данных о навыках кандидата
person_skills = ["Python", "Docker", "Kubernetes", "Linux", "Machine Learning"]

# Основная функция для смешанной рекомендации с учетом незаполненных ролей
def recommend_team(person_skills: List[str], teams: Dict[str, Dict], threshold: float = 0.5, unfilled_role_weight: float = 1.5) -> str:
    """
    Recommend a team to a person based on the skills required for a team and the skills of the person.

    Args:
    person_skills (List[str]): A list of skills of a person.
    teams (Dict[str, Dict]): A dictionary with team names as keys and team data as values.
    threshold (float, optional): Minimum percentage of skills that must be matched to consider a role filled. Defaults to 0.5.
    unfilled_role_weight (float, optional): Weight to be applied to the similarity score if the team has unfilled roles. Defaults to 1.5.

    Returns:
    str: The name of the team that is most suitable for the person.
    """
    best_team = None
    best_score = -1
    
    for team_name, team_data in teams.items():
        required_roles = team_data["case"]["required_roles"]
        team_skills = team_data["skills"]
        
        # Проверяем заполненность ролей
        filled_roles = get_filled_roles(team_skills, required_roles, threshold=threshold)
        if len(filled_roles) < len(required_roles):
            # Если не все роли заполнены, приоритизируем кандидатов для заполнения
            unfilled_roles = [role for role in required_roles if role not in filled_roles]
            unfilled_role_skills = get_required_skills(unfilled_roles)
            print(f"Team {team_name}: Roles not filled - {unfilled_roles}")
            
            # Считаем схожесть кандидата с незаполненными ролями, добавляя вес
            all_skills = get_required_skills(required_roles)
            similarity = calculate_weighted_similarity(person_skills, unfilled_role_skills, all_skills, unfilled_role_weight)
        else:
            # Если все роли заполнены, оцениваем, насколько кандидат усилит команду
            current_team_skills = get_team_skills(team_skills)
            similarity = calculate_weighted_similarity(person_skills, current_team_skills, current_team_skills)
        
        print(f"Team {team_name} similarity: {similarity}")
        
        if similarity > best_score:
            best_team = team_name
            best_score = similarity
    
    return best_team

# Пример вызова функции для рекомендаций
teams = {
    "Team 1": {
        "case": {
            "title": "Система контроля качества продуктов АО Медитек",
            "required_roles": ["C# Backend", "Тестировщик", "Аналитик"]
        },
        "skills": {
            "Member 1": ["C#", "Back-end разработка", "Git", "SQL", "Построение Rest API", "Docker"],
            "Member 2": ["C#", "Back-end разработка", "Docker", "Git", "СУБД PostgreSQL", "Linux"],
            "Member 3": ["HTML", "CSS", "React", "Git", "Next", "Tailwind", "JavaScript"],
            "Member 4": ["DevOps", "Docker", "Kubernetes", "K8S", "Linux", "Helm", "Nginx", "Graphana"],
            "Member 5": ["Умение работать с API", "PostgreSQL", "Python", "Atlassian stack [Jira, Confluence]", "Linux"],
            "Member 6": ["Data Science", "SQL", "Pandas", "Математическая статистика", "Управление проектами"],
            "Member 7": ["Canva", "Figma", "CSS", "UI/UX", "Adobe XD", "Photoshop"]
        }
    },
    "Team 2": {
        "case": {
            "title": "Разработка системы ECM для авиационного двигателя",
            "required_roles": ["ML engineer", "Аналитик", "Python Backend", "DevOps", "ML Ops Engineer"]
        },
        "skills": {
            "Member 1": ["Python", "Back-end разработка", "Django", "Docker", "Git", "SQL", "Построение Rest API"],
            "Member 2": ["Python", "Back-end разработка", "Docker", "Git", "СУБД PostgreSQL", "Linux"],
            "Member 3": ["Machine Learning", "Data Science", "Python", "Pandas", "NumPy", "Scikit-Learn", "TensorFlow"],
            "Member 4": ["DevOps", "Docker", "Kubernetes", "K8S", "Linux", "Helm", "Nginx", "Graphana"],
            "Member 5": ["Python", "PostgreSQL", "Умение работать с API", "Atlassian stack [Jira, Confluence]", "Linux"],
            "Member 6": ["Data Science", "SQL", "Математическая статистика", "Pandas", "Scikit-Learn"],
            "Member 7": ["DevOps", "Machine Learning", "AirFlow", "Docker", "Kubernetes", "ONNX Runtime"]
        }
    }
}

best_team = recommend_team(person_skills, teams, threshold=0.6, unfilled_role_weight=1.5)
print(f"Recommended team: {best_team}")


Team Team 1 similarity: 0.3481553119113957
Team Team 2: Roles not filled - ['ML engineer', 'DevOps']
Team Team 2 similarity: 0.5929270612815711
Recommended team: Team 2
