**Импорт данных**

In [None]:
!pip install -U sentence-transformers
!pip install mlflow
!pip install optuna
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension

In [None]:
import os
import math
import warnings
import logging
import torch
import mlflow
import optuna
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from optuna.visualization import plot_optimization_history

In [3]:
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", RuntimeWarning)
pd.set_option('max_colwidth', 400)
mlflow.set_experiment("logging_sbert")
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[logging.StreamHandler()])
log_dir = './logs'
os.makedirs(log_dir, exist_ok=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

2023/12/19 14:07:12 INFO mlflow.tracking.fluent: Experiment with name 'logging_sbert' does not exist. Creating a new experiment.


In [3]:
# data = pd.read_csv('/kaggle/input/resume-hh/hh_ru.csv', delimiter=';')
# data['resume'] = data.apply(lambda row: ' '.join(row[data.columns].astype(str)), axis=1)
# df_hh = data['resume']
# df_yandex = pd.read_csv('/kaggle/input/yandex-jobs/vacancies.csv', sep=',')['Raw text']
# df_t = pd.concat([df_hh[:625], df_yandex], axis=1, ignore_index=True).rename(columns={0: 'Resume Description', 1: 'Job Description'})

In [None]:
# df = pd.read_csv('/kaggle/input/annotated-resume/resume_job.csv', sep=',', encoding='utf-8')
# df.head(1)

**Набор данных**

In [4]:
df = pd.read_excel('/kaggle/input/input-dataset/resume_job_cities_fixed.xlsx')
df.head(1)

Unnamed: 0,Resume Description,Job Description,Match,city_resume,city_job
0,"Пол, возраст: Мужчина , 28 лет , родился 29 сентября 1990; ЗП: 25000 руб.; Ищет работу на должность: Продавец-кассир;\nГород, переезд, командировки: Одинцово , не готов к переезду , не готов к командировкам; Занятость: полная занятость;\nГрафик: полный день; Опыт работы: Опыт работы 1 год 11 месяцев Сентябрь 2008 — Июль 2010 1 год 11 месяцев ООО Эльдорадо продавец консультант Продажа,консул...","Название: Продавец-кассир\n, Цена: от 59 000 ₽\n, Условия: {'Сфера деятельности': 'Продажи', 'График работы': 'Сменный', 'Частота выплат': 'Дважды в месяц', 'Опыт работы': 'Без опыта', 'Что получают работники': 'униформа, питание, подарки детям на праздники'}\n, Расположение: Одинцово\n Описание: Tрудoустpоиться в кoмпанию может каждый! Пpинимаeм грaждaн СНГ и РФ - опыт не имeeт знaчeния! Bcем...",1,Одинцово,Одинцово


**Тренировка**

In [None]:
train_df, val_df = train_test_split(df, test_size=0.1, random_state=RANDOM_STATE)
train_data = [
    InputExample(texts=[job_desc, resume_desc], label=float(match))
    for job_desc, resume_desc, match in zip(train_df['Job Description'], train_df['Resume Description'], train_df['Match'])
]
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=2, pin_memory=True)

def create_evaluator(val_df, evaluator_class, name, batch_size):
    return evaluator_class(
        sentences1=val_df['Job Description'].tolist(),
        sentences2=val_df['Resume Description'].tolist(),
        labels=val_df['Match'].tolist(),
        name=name,
        batch_size=batch_size,
        show_progress_bar=True,
        write_csv=True
    )

def train_and_evaluate(model, train_dataloader, evaluator, trial, loss_name):
    model = model.to(device)
    loss_class = getattr(losses, loss_name)
    train_loss = loss_class(model=model) 
    mlflow.end_run()

    with mlflow.start_run(run_name="ml_sbert"):
        lr = trial.suggest_float('lr', 1e-6, 1e-2, log=True)
        weight_decay = trial.suggest_float('weight_decay', 1e-5, 1e-1, log=True)
        batch_size = trial.suggest_int('batch_size', 2, 32, log=True)
        epochs = 15
        warmup_steps = math.ceil(len(train_df) * 0.1)
        checkpoint_path = './checkpoints'
        mlflow.log_params({'lr': lr, 'weight_decay': weight_decay, 'batch_size': batch_size, 'loss_name': loss_name})

        for epoch in range(epochs):
            model.fit(
                train_objectives=[(train_dataloader, train_loss)],
                evaluator=evaluator,
                optimizer_class=torch.optim.AdamW,
                epochs=epochs,
                warmup_steps=warmup_steps,
                optimizer_params={'lr': lr},
                weight_decay=weight_decay,
                show_progress_bar=True,
                save_best_model=True,
                use_amp=True,
                checkpoint_path=checkpoint_path,
                checkpoint_save_steps=500,
            )
            metrics = evaluator(model)
            print(metrics)
            torch.cuda.empty_cache()

    return metrics

def objective(trial):
    model_name = "sberbank-ai/sbert_large_nlu_ru"
    model = SentenceTransformer(model_name)
    loss_name = trial.suggest_categorical('loss_name', ['CosineSimilarityLoss', 'ContrastiveLoss'])
    evaluator = create_evaluator(val_df, evaluation.BinaryClassificationEvaluator, "binary_classification_evaluation", batch_size=32)
    metrics = train_and_evaluate(model, train_dataloader, evaluator, trial, loss_name)
    
    return metrics

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)
best_params = study.best_params
print("Лучшие гиперпараметры:", best_params)

mlflow.log_params(best_params)
print("Тренировка завершена!")

fig = optuna.visualization.matplotlib.plot_optimization_history(study)
plt.show()

**TEST**

In [None]:
df_t = pd.read_csv('/kaggle/input/yandex-jobs/vacancies.csv', sep=',')['Raw text']
df_t = df_t.to_frame().rename(columns={'Raw text': 'Job Description'})

In [2]:
!ls /kaggle/working/checkpoints

Последний лучший чекпоинт

In [None]:
latest_checkpoint = max(os.listdir('/kaggle/working/checkpoints'), key=lambda x: int(x))
checkpoint_filename = f'/kaggle/working/checkpoints/{latest_checkpoint}'
print(checkpoint_filename)
# model.load_state_dict(torch.load(checkpoint_filename))

Без резюме айтишников

In [None]:
from scipy.spatial.distance import cosine

best_model_path = "/kaggle/working/checkpoints/1090"
best_model = SentenceTransformer(best_model_path)
job_descriptions = df_t['Job Description'].tolist()

user_resume = input('Введите своё резюме')

user_resume_embedding = best_model.encode(user_resume)
similarity_scores = [1 - cosine(user_resume_embedding, best_model.encode(job_desc)) for job_desc in job_descriptions]
result_df = pd.DataFrame({'Job Description': job_descriptions, 'Similarity': similarity_scores})
top_10_vacancies = result_df.nlargest(10, 'Similarity')
print("Top-10 Vacancies for User's Resume:")
print(top_10_vacancies[['Job Description', 'Similarity']])

С резюме айтишников

In [None]:
best_model_path = "/kaggle/working/checkpoints/1090"
best_model = SentenceTransformer(best_model_path)

test_data = [
    InputExample(texts=[job_desc, resume_desc], label=float(match))
    for job_desc, resume_desc, match in zip(df_t['Job Description'], df_t['Resume Description'], df_t['Match'])
]
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=2, pin_memory=True)

test_evaluator = create_evaluator(df_t, evaluation.BinaryClassificationEvaluator, "test_evaluation", batch_size=32)
test_metrics = evaluator(model)
print("Test Metrics:", test_metrics)

user_resume = input('Введите своё резюме')

new_data_embeddings = model.encode(df_t['Job Description'].tolist(), df_t['Resume Description'].tolist())
new_data['Similarity'] = model.encode([user_resume] * len(df_t), df_t['Job Description'])
top_10_new_data = new_data.nlargest(10, 'Similarity')
print("Top-10 Vacancies for New Data:")
print(top_10_new_data[['Job Description', 'Similarity']])

**TRASH**

In [None]:
# !pip install langchain

In [None]:
# with open('/kaggle/input/faiss-db/vector_db/db_jobs/index.pkl', 'rb') as f:
#     job_descriptions = pickle.load(f)
    
# with open('/kaggle/input/faiss-db/vector_db/db_resume/index.pkl', 'rb') as f:
#     resume = pickle.load(f)

In [None]:
# import pickle
# import langchain
# from langchain.vectorstores import FAISS
# from langchain.embeddings.openai import OpenAIEmbeddings

# os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

# # with open('/kaggle/input/faiss-db/vector_db/db_jobs/index.pkl', 'rb') as f:
# #     job_descriptions = pickle.load(f)
# embeddings = OpenAIEmbeddings()
# new_db = FAISS.load_local("/kaggle/input/faiss-db/vector_db/db_jobs/index.faiss", embeddings)
# docs = new_db.similarity_search(query)
# docs[0]

In [None]:
best_model = SentenceTransformer('/kaggle/working/checkpoints/best_model')

In [None]:
# my_res = """"""

In [None]:
# loaded_model = SentenceTransformer("fine_tuned_model")

In [None]:
# job_description_to_match = my_res
# job_description_embedding = model.encode(job_description_to_match, convert_to_tensor=True)
# resume_embeddings = model.encode(val_df['Job Description'].tolist(), convert_to_tensor=True)

# similarities = util.pytorch_cos_sim(job_description_embedding, resume_embeddings)
# similarities = similarities.cpu().numpy()

# top_matches_indices = np.argsort(similarities[0])[::-1][:5]
# top_matches_df = val_df.iloc[top_matches_indices][['Job Description', 'Match']]

# print("\nTop Matches for Resume:")
# print(top_matches_df)

In [None]:
top_matches_df.iloc[3, :]['Job Description']

**Tests**

In [None]:
# from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
# from torch.utils.data import Dataset, DataLoader
# from sklearn.model_selection import train_test_split
# from torch.utils.tensorboard import SummaryWriter

# SEED = 42
# num_classes = 2

# train_texts, val_texts, train_labels, val_labels = train_test_split(
#     df[['Job Description', 'Resume Description']].astype(str).values.tolist(),
#     df['Match'].tolist(),
#     test_size=0.2,
#     random_state=SEED
# )

# tokenizer = AutoTokenizer.from_pretrained("ai-forever/sbert_large_nlu_ru")
# model = AutoModel.from_pretrained("ai-forever/sbert_large_nlu_ru")

# def mean_pooling(model_output, attention_mask):
#     token_embeddings = model_output['last_hidden_state']
#     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
#     sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
#     sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
#     return sum_embeddings / sum_mask

# class DownstreamTaskModel(torch.nn.Module):
#     def __init__(self, config, num_classes):
#         super(DownstreamTaskModel, self).__init__()
#         self.config = config
#         self.linear = torch.nn.Linear(config.hidden_size, num_classes)

#     def forward(self, input_ids, attention_mask):
#         model_output = model(input_ids, attention_mask=attention_mask)
#         sentence_embeddings = mean_pooling(model_output, attention_mask)
#         logits = self.linear(sentence_embeddings)
#         return logits

# class DownstreamTaskDataset(Dataset):
#     def __init__(self, texts, labels, tokenizer, max_length=128):
#         self.texts = texts
#         self.labels = labels
#         self.tokenizer = tokenizer
#         self.max_length = max_length

#         if len(self.texts) != len(self.labels):
#             raise ValueError("Length of texts and labels must be the same.")

#     def __len__(self):
#         return len(self.texts)

#     def __getitem__(self, idx):
#         encoding = self.tokenizer(
#             self.texts[idx],
#             return_tensors='pt',
#             truncation=True,
#             max_length=self.max_length,
#             padding='max_length'
#         )

#         label_ids = torch.tensor(self.labels[idx], dtype=torch.long)

#         return {
#             'input_ids': encoding['input_ids'].flatten(),
#             'attention_mask': encoding['attention_mask'].flatten(),
#             'label_ids': label_ids
#         }

# train_dataset = DownstreamTaskDataset(train_texts, train_labels, tokenizer)
# val_dataset = DownstreamTaskDataset(val_texts, val_labels, tokenizer)

# train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# log_dir = './logs'
# os.makedirs(log_dir, exist_ok=True)
# writer = SummaryWriter(log_dir)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# downstream_model = DownstreamTaskModel(model.config, num_classes)
# downstream_model.to(device)

# optimizer = torch.optim.AdamW(downstream_model.parameters(), lr=5e-5)
# total_steps = len(train_loader) * 3
# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# criterion = torch.nn.CrossEntropyLoss()

# for epoch in range(3):
#     downstream_model.train()
#     total_loss = 0.0

#     for step, batch in enumerate(train_loader):
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         labels = batch['label_ids'].to(device)

#         optimizer.zero_grad()

#         logits = downstream_model(input_ids, attention_mask=attention_mask)
#         loss = criterion(logits, labels)

#         total_loss += loss.item()

#         loss.backward()
#         optimizer.step()
#         scheduler.step()

#     avg_loss = total_loss / len(train_loader)
#     print(f"Epoch {epoch + 1}, Average Loss: {avg_loss}")

# downstream_model.save_pretrained("fine_tuned_model")

# writer.close()

# test_texts = df_t['Job Description'].astype(str).tolist() + df_t['Resume Description'].astype(str).tolist()
# test_dataset = DownstreamTaskDataset(test_texts, None, tokenizer)
# test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# downstream_model.eval()
# all_embeddings = []

# with torch.no_grad():
#     for batch in test_loader:
#         inputs = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)

#         logits = downstream_model(inputs, attention_mask=attention_mask)
#         embeddings = mean_pooling({'last_hidden_state': logits}, attention_mask)
#         all_embeddings.append(embeddings)

# all_embeddings = torch.cat(all_embeddings, dim=0)
# job_desc_embeddings, resume_embeddings = torch.chunk(all_embeddings, 2, dim=0)
# similarity_scores = cosine_similarity(job_desc_embeddings, resume_embeddings)

In [None]:
# import pandas as pd
# import torch
# from torch.utils.data import DataLoader
# from sklearn.model_selection import train_test_split
# from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
# from transformers import DataCollatorForLanguageModeling
# from transformers import BertForSequenceClassification, BertConfig

# num_labels = 2
# model_name = "sismetanin/sbert-ru-sentiment-rusentiment"

# config = BertConfig.from_pretrained(model_name, num_labels=num_labels)
# new_classifier = torch.nn.Linear(config.hidden_size, num_labels)
# model = BertForSequenceClassification(config=config)
# model.classifier = new_classifier

# # model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels, ignore_mismatched_sizes=True, hidden_dropout_prob=0.1)
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# df_train, df_val = train_test_split(df, test_size=0.2, random_state=42)

# class JobMatchingDataset(torch.utils.data.Dataset):
#     def __init__(self, tokenizer, df, max_length=512):
#         self.tokenizer = tokenizer
#         self.df = df
#         self.max_length = max_length

#     def __len__(self):
#         return len(self.df)

#     def __getitem__(self, idx):
#         job_desc = self.df.iloc[idx]['Job Description']
#         resume_desc = self.df.iloc[idx]['Resume Description']
#         label = int(self.df.iloc[idx]['Match'])

#         encoding = self.tokenizer(
#             job_desc,
#             resume_desc,
#             truncation=True,
#             padding=True,
#             max_length=self.max_length,
#             return_tensors='pt'
#         )

#         return {
#             'input_ids': encoding['input_ids'].flatten(),
#             'attention_mask': encoding['attention_mask'].flatten(),
#             'labels': torch.tensor(label, dtype=torch.long)
#         }

# class JobMatchingTestDataset(torch.utils.data.Dataset):
#     def __init__(self, tokenizer, df, max_length=512):
#         self.tokenizer = tokenizer
#         self.df = df
#         self.max_length = max_length

#     def __len__(self):
#         return len(self.df)

#     def __getitem__(self, idx):
#         job_desc = self.df.iloc[idx]['Job Description']
#         resume_desc = self.df.iloc[idx]['Resume Description']

#         max_len_job_desc = len(self.tokenizer.encode(job_desc, max_length=None))
#         max_len_resume_desc = len(self.tokenizer.encode(resume_desc, max_length=None))
#         max_length = max(max_len_job_desc + max_len_resume_desc + self.max_length)

#         encoding = self.tokenizer(
#             job_desc,
#             resume_desc,
#             truncation=True,
#             max_length=max_length,
#             return_tensors='pt'
#         )

#         return {
#             'input_ids': encoding['input_ids'].flatten(),
#             'attention_mask': encoding['attention_mask'].flatten(),
#         }

# train_dataset = JobMatchingDataset(tokenizer, df_train)
# val_dataset = JobMatchingDataset(tokenizer, df_val)
# test_dataset = JobMatchingTestDataset(tokenizer, df_t)

# training_args = TrainingArguments(
#     output_dir="./job_matching_model",
#     per_device_train_batch_size=2,
#     gradient_accumulation_steps=4,
#     per_device_eval_batch_size=8,
#     num_train_epochs=3,
#     logging_dir="./logs",
#     logging_steps=100,
#     save_total_limit=2,
#     evaluation_strategy="steps",
#     eval_steps=200,
# )

# train_dataloader = DataLoader(train_dataset, batch_size=training_args.per_device_train_batch_size, shuffle=True)
# eval_dataloader = DataLoader(val_dataset, batch_size=training_args.per_device_eval_batch_size)

# print(len(train_dataloader.dataset), len(eval_dataloader.dataset))
# print(training_args.per_device_train_batch_size, training_args.per_device_eval_batch_size)

# if hasattr(training_args, "gradient_accumulation_steps"):
#     print("Gradient Accumulation Steps:", training_args.gradient_accumulation_steps)

# data_collator = DataCollatorForLanguageModeling(
#     tokenizer=tokenizer,
#     mlm=False,
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
#     data_collator=data_collator,
# )

# trainer.train()
# results = trainer.evaluate()

# model.save_pretrained("./job_matching_model")
# tokenizer.save_pretrained("./job_matching_model")
# predictions = []

# for i in range(len(test_dataset)):
#     inputs = test_dataset[i]
#     max_length = inputs['input_ids'].size(1)
#     inputs = {k: torch.unsqueeze(v, 0) for k, v in inputs.items()}
#     outputs = model(**inputs)
#     predicted_label = torch.sigmoid(outputs.logits).squeeze().item()
#     predictions.append(predicted_label)
# df_t['Predicted_Match'] = predictions

In [None]:
# import numpy as np
# import pandas as pd
# import torch
# from sentence_transformers import SentenceTransformer
# from sklearn.preprocessing import normalize
# from sklearn.metrics.pairwise import cosine_similarity

# class JobMatchingTestDataset(torch.utils.data.Dataset):
#     def __init__(self, tokenizer, df, max_length=512):
#         self.tokenizer = tokenizer
#         self.df = df
#         self.max_length = max_length

#     def __len__(self):
#         return len(self.df)

#     def __getitem__(self, idx):
#         job_desc = self.df.iloc[idx]['Job Description']
#         resume_desc = self.df.iloc[idx]['Resume Description']

#         encoding = self.tokenizer(
#             job_desc,
#             resume_desc,
#             truncation=True,
#             padding='max_length',
#             max_length=self.max_length,
#             return_tensors='pt'
#         )

#         return {
#             'input_ids': encoding['input_ids'].flatten(),
#             'attention_mask': encoding['attention_mask'].flatten(),
#         }

# test_dataset = JobMatchingTestDataset(tokenizer, df_t)
# model_name = "sberbank-ai/sbert_large_nlu_ru"
# sentence_transformer_model = SentenceTransformer(model_name)

# max_length = 512
# job_tokenized = tokenizer(df['Job Description'].tolist(), padding=True, truncation=True, max_length=max_length, return_tensors='pt')
# resume_tokenized = tokenizer(df['Resume Description'].tolist(), padding=True, truncation=True, max_length=max_length, return_tensors='pt')
# test_job_tokenized = tokenizer(df_t['Job Description'].tolist(), padding=True, truncation=True, max_length=max_length, return_tensors='pt')
# test_resume_tokenized = tokenizer(df_t['Resume Description'].tolist(), padding=True, truncation=True, max_length=max_length, return_tensors='pt')

# job_embeddings = sentence_transformer_model.encode(job_tokenized['input_ids'].tolist(), convert_to_tensor=True)
# resume_embeddings = sentence_transformer_model.encode(resume_tokenized['input_ids'].tolist(), convert_to_tensor=True)
# test_job_embeddings = sentence_transformer_model.encode(test_job_tokenized['input_ids'].tolist(), convert_to_tensor=True)
# test_resume_embeddings = sentence_transformer_model.encode(test_resume_tokenized['input_ids'].tolist(), convert_to_tensor=True)

# job_embeddings = job_embeddings.cpu().numpy()
# resume_embeddings = resume_embeddings.cpu().numpy()
# test_job_embeddings = test_job_embeddings.cpu().numpy()
# test_resume_embeddings = test_resume_embeddings.cpu().numpy()

# print(job_embeddings.shape, resume_embeddings.shape)
# print(type(job_embeddings), type(resume_embeddings))
# print(job_embeddings.shape, resume_embeddings.T.shape)

# job_embeddings = normalize(job_embeddings)
# resume_embeddings = normalize(resume_embeddings)

# print("Normalized Job Embeddings:", job_embeddings)
# print("Normalized Resume Embeddings:", resume_embeddings)

# job_resume_similarities = cosine_similarity(job_embeddings, resume_embeddings)
# test_similarities = cosine_similarity(test_job_embeddings, test_resume_embeddings)

# def find_top_matches(similarities, k=5):
#     top_matches = []
#     for i in range(len(similarities)):
#         top_k_indices = similarities[i].argsort()[-k:][::-1]
#         top_k_matches = df.iloc[top_k_indices][['Resume Description', 'Match']]
#         top_matches.append(top_k_matches)
#     return top_matches

# top_matches_test = find_top_matches(test_similarities, k=5)
# print("\nTop Matches for Test Data:")
# for i, matches in enumerate(top_matches_test):
#     print(f"Job Description {i+1}:\n{matches}")

In [None]:
# for i, matches in enumerate(top_matches_test):
#     if i == 2:
#         break
#     print(f"Job Description {i+1}:\n{matches}")

In [None]:
# print("Job Embeddings Shape:", job_embeddings.shape)
# print("Resume Embeddings Shape:", resume_embeddings.shape)

# for i in range(5):
#     print(f"\nPair {i + 1} - Job Description:\n{df.iloc[i]['Job Description']}")
#     print(f"Pair {i + 1} - Resume Description:\n{df.iloc[i]['Resume Description']}")
#     print(f"Pair {i + 1} - Job Embeddings:\n{job_embeddings[i]}")
#     print(f"Pair {i + 1} - Resume Embeddings:\n{resume_embeddings[i]}")

# print("\nNormalized Job Embeddings:")
# print(job_embeddings)
# print("\nNormalized Resume Embeddings:")
# print(resume_embeddings)

# job_resume_similarities = cosine_similarity(job_embeddings, resume_embeddings)
# test_similarities = cosine_similarity(test_job_embeddings, test_resume_embeddings)

# def find_top_matches(similarities, k=5):
#     top_matches = []
#     for i in range(len(similarities)):
#         top_k_indices = similarities[i].argsort()[-k:][::-1]
#         top_k_matches = df.iloc[top_k_indices][['Resume Description', 'Match']]
#         top_matches.append(top_k_matches)
#     return top_matches

# top_matches_test = find_top_matches(test_similarities, k=5)
# print("\nTop Matches for Test Data:")
# for i, matches in enumerate(top_matches_test):
#     print(f"\nJob Description {i + 1}:\n{matches}")

In [None]:
# print("Job Embeddings Shape:", job_embeddings.shape)
# print("Resume Embeddings Shape:", resume_embeddings.shape)

# for i in range(5):
#     print(f"\nPair {i + 1} - Job Description:\n{df.iloc[i]['Job Description']}")
#     print(f"Pair {i + 1} - Resume Description:\n{df.iloc[i]['Resume Description']}")
#     print(f"Pair {i + 1} - Job Embeddings:\n{job_embeddings[i]}")
#     print(f"Pair {i + 1} - Resume Embeddings:\n{resume_embeddings[i]}")

# print("\nNormalized Job Embeddings:")
# print(job_embeddings)
# print("\nNormalized Resume Embeddings:")
# print(resume_embeddings)