In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.metrics import precision_score, recall_score

  from .autonotebook import tqdm as notebook_tqdm





In [None]:
def rename_columns(df):
    df_renamed = df.rename(columns={
        'ID': 'ID',
        'О себе': 'About',
        'Портфолио': 'Portfolio',
        'Навыки': 'Skills',
        'Специальность->Название': 'Specialization',
        'Навыки (из справочника)': 'Skills_Dictionary',
        'GitHub_Languages': 'GitHub_Tech_Stack' 
    })
    return df_renamed

In [3]:
def prepare_data(df):
    df = df.copy()
    
    # Добавляем колонку GitHub_Tech_Stack, если её нет
    if 'GitHub_Tech_Stack' not in df.columns:
        df['GitHub_Tech_Stack'] = ''  # или другое значение по умолчанию

    df['Combined_Text'] = (
        df['Skills'].fillna('') + ' ' +
        # df['Skills_Dictionary'].fillna('') + ' ' +
        df['About'].fillna('') + ' ' +
        df['Portfolio'].fillna('') + ' ' +
        df['GitHub_Tech_Stack'].fillna('')
    )

    df['Combined_Text'] = df['Combined_Text'].str.lower()
    
    return df

In [4]:
def load_bert_model():
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    model = TFBertModel.from_pretrained('bert-base-multilingual-cased')
    return tokenizer, model

In [5]:
def get_bert_embeddings(text_list, tokenizer, model):
    embeddings = []
    batch_size = 16 

    for i in range(0, len(text_list), batch_size):
        batch_texts = text_list[i:i+batch_size]
        encoded_input = tokenizer(batch_texts, padding=True, truncation=True, max_length=128, return_tensors='tf')
        outputs = model(encoded_input)
        batch_embeddings = tf.reduce_mean(outputs.last_hidden_state, axis=1)
        embeddings.append(batch_embeddings)
    
    embeddings = tf.concat(embeddings, axis=0)
    return embeddings

In [6]:
def find_similar_candidates(candidate_embeddings, user_embedding, df, top_n=10):
    
    similarities = cosine_similarity(user_embedding.numpy(), candidate_embeddings.numpy()).flatten()

    df['Similarity'] = similarities
    
    df_sorted = df.sort_values(by='Similarity', ascending=False)
    
    return df_sorted[['ID', 'About', 'Combined_Text', 'Similarity']].head(top_n)

In [12]:
if __name__ == "__main__":
    
    file_path = 'analys_with_github_data.xlsx'
    df = pd.read_excel(file_path)

    df = rename_columns(df)
    
    df = prepare_data(df)
    
    # user_skills = 'SQL Python Git HTML JavaScript DevOps Photoshop UI/UX Figma'  
    user_skills = 'php laravel mysql vue angular java spring boot postgresql express.js sqlite'
    user_skills = user_skills.lower()
    
    tokenizer, model = load_bert_model()
  
    candidate_texts = df['Combined_Text'].tolist()
    candidate_embeddings = get_bert_embeddings(candidate_texts, tokenizer, model)

    user_embedding = get_bert_embeddings([user_skills], tokenizer, model)
    
    recommended_candidates = find_similar_candidates(candidate_embeddings, user_embedding, df)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

tf.Tensor(
[[ 0.23487659 -0.37765184  0.53153086 ...  0.2806971  -0.213962
  -0.26726216]
 [ 0.3704964  -0.13124703  1.0793505  ...  0.20487551  0.00681353
  -0.02221299]
 [ 0.08903141 -0.20722505  0.6737517  ...  0.19853015 -0.07931003
  -0.10786223]
 ...
 [ 0.3704964  -0.13124703  1.0793505  ...  0.20487551  0.00681353
  -0.02221299]
 [-0.04154399 -0.8923166   0.39751965 ...  0.06989434  0.13987382
  -0.22657323]
 [-0.26908204 -0.34397748  0.8376327  ...  0.18232326 -0.04363351
  -0.4037854 ]], shape=(793, 768), dtype=float32)
tf.Tensor(
[[ 2.37056330e-01 -8.15693140e-01  6.34741843e-01 -2.18471382e-02
   7.68562034e-02 -2.04887062e-01 -5.19897223e-01  1.22636423e-01
  -1.37273252e-01 -2.15448752e-01  2.59985954e-01 -1.50140509e-01
   3.83892119e-01  2.20483646e-01 -1.89840943e-01 -1.86657354e-01
   2.64747411e-01  6.97838515e-02  1.80174053e-01  2.71406919e-01
  -3.69270921e-01  3.14778686e-02 -1.89183027e-01  3.30948502e-01
   3.49754035e-01 -3.68154019e-01 -8.91759217e-01  1.32723

In [8]:
print("\nТоп кандидатов:")
print(recommended_candidates)


Топ кандидатов:
       ID                                              About  \
727  9727                                                NaN   
228  9228                                                NaN   
147  9147                               Инженер, госслужащий   
756  9756                                                NaN   
771  9771                                                NaN   
447  9447                                                NaN   
178  9178                                                NaN   
593  9593                                                NaN   
507  9507                                  Учусь в РТУ МИРЭА   
339  9339  Начинающий специалист, активно прохожу обучени...   

                                         Combined_Text  Similarity  
727  адаптивная верстка,  bootstrap,  html,  devtoo...    0.753594  
228  php,  react,  git,  node.js,  html,  javascrip...    0.749758  
147  sql,  python,  база данных,  postgresql,  mysq...    0.746994  
75

In [9]:
for i in range(len(recommended_candidates)):
    print(recommended_candidates['ID'].iloc[i])
    print(recommended_candidates['Combined_Text'].iloc[i])
    print('__________________')

# user_skills = 'php laravel mysql vue angular java spring boot postgresql express.js sqlite'

9727
адаптивная верстка,  bootstrap,  html,  devtools,  vue.js,  vuex,  css,  jquery,  seo   
__________________
9228
php,  react,  git,  node.js,  html,  javascript,  css,  scss,  typescript,  адаптивная верстка,  mysql,  laravel   
__________________
9147
sql,  python,  база данных,  postgresql,  mysql,  git,  c/c++,  3d-печать,  ms sql server инженер, госслужащий  
__________________
9756
php,  laravel,  linux,  git,  html,  css,  react,  sql,  mysql   
__________________
9771
адаптивная верстка,  api,  webpack,  bootstrap,  git,  html,  vue.js,  ajax,  javascript,  css,  scss,  jquery,  sass   
__________________
9447
python,  sql,  git,  javascript,  html,  css,  c/c++,  linux,  ubuntu,  debian,  shell scripting,  bash,  node.js,  docker,  tcp/ip   
__________________
9178
figma,  web design,  adobe photoshop,  ux/ui,  jira,  python,  mysql,  pandas,  numpy,  matplotlib,  scikit-learn,  css,  html   
__________________
9593
адаптивная верстка,  bootstrap,  git,  html,  vue.js,  ja

In [10]:
# K = 5

# relevant_candidates = {9178, 9756, 9228, 9017, 9037}  

# predicted_candidates = recommended_candidates['ID'].tolist()[:K]

# precision_at_k = len(set(predicted_candidates) & relevant_candidates) / K

# recall_at_k = len(set(predicted_candidates) & relevant_candidates) / len(relevant_candidates)

# def dcg_at_k(relevance, k):
#     return sum([rel / np.log2(idx + 2) for idx, rel in enumerate(relevance[:k])])

# def ndcg_at_k(predicted, relevant, k):
#     relevance = [1 if candidate in relevant else 0 for candidate in predicted[:k]]
#     ideal_relevance = sorted(relevance, reverse=True)
#     dcg = dcg_at_k(relevance, k)
#     idcg = dcg_at_k(ideal_relevance, k)
#     return dcg / idcg if idcg > 0 else 0

# ndcg_at_k_score = ndcg_at_k(predicted_candidates, relevant_candidates, K)

# print(f"Precision@{K}: {precision_at_k:.2f}")
# print(f"Recall@{K}: {recall_at_k:.2f}")
# print(f"NDCG@{K}: {ndcg_at_k_score:.2f}")