In [None]:
# Mount Google Drive at /content/drive; the data and model paths used later in
# this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install -U sentence-transformers datasets

In [None]:
import re
import random
import pandas as pd
from itertools import combinations
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer,SentenceTransformerTrainingArguments,losses, InputExample, evaluation, SentenceTransformerModelCardData
from torch.utils.data import DataLoader
from sentence_transformers.training_args import BatchSamplers
from datasets import Dataset
import torch
from sentence_transformers import  util

In [None]:
import numpy as np

# Seed Python's and NumPy's global generators with one shared value so the
# random.shuffle calls further down draw from a fixed sequence.
SEED = 42
for _seed_rng in (random.seed, np.random.seed):
    _seed_rng(SEED)

# **1. Load and Prepare Data**
## Load ESCO Similar Title Sets (English, Spanish, German)
## Create Multilingual Job Title Pairs

In [None]:
def clean_text(text, lang='en'):
    """Normalise a job title before it is used to build training pairs.

    The title is stripped and lower-cased, commas are replaced by the word
    "and", every character outside the language's allowed set is removed, and
    runs of whitespace are collapsed to a single space.

    Args:
        text: Raw title string.
        lang: 'es' additionally keeps Spanish accented letters, 'de' keeps
            German umlauts and ß; any other value keeps only ASCII letters,
            digits and whitespace.

    Returns:
        The cleaned title. It may be an empty string when no character
        survives the filtering.
    """
    # Extra letters (beyond a-z0-9) that each language is allowed to keep.
    extra_letters = {'es': 'áéíóúüñ', 'de': 'äöüß'}.get(lang, '')

    text = text.strip().lower()
    text = text.replace(',', ' and ')
    text = re.sub(rf'[^a-z0-9{extra_letters}\s]', '', text)

    # Replacing "," with " and " and deleting punctuation both leave doubled or
    # dangling whitespace; collapse it so equivalent titles compare equal.
    return ' '.join(text.split())

In [None]:
str_to_list = lambda x: x.strip().split("\n") if isinstance(x, str) else []

def create_unique_pairs_from_df(df, lang='en'):
    """Build every unique unordered pair of cleaned titles within each row.

    Args:
        df: DataFrame whose 'similar_titles' column holds, per row, an
            iterable of title strings.
        lang: Language code forwarded to clean_text.

    Returns:
        A sorted list of (a, b) tuples with a < b. Sorting makes the result
        independent of set iteration order, which for strings changes between
        interpreter sessions, so a seeded shuffle of the list is repeatable.
    """
    pairs = set()
    for titles in df['similar_titles']:
        cleaned = (clean_text(title, lang) for title in titles)
        # Drop titles that cleaned down to nothing, de-duplicate, and sort so
        # that combinations() emits each pair in canonical (a < b) order.
        unique_titles = sorted({title for title in cleaned if title})
        pairs.update(combinations(unique_titles, 2))
    return sorted(pairs)


def _read_title_sets(csv_path):
    """Read one similar-title CSV, parsing 'similar_titles' with str_to_list."""
    return pd.read_csv(csv_path, converters={'similar_titles': str_to_list})


# Load the per-language similar-title sets from Drive.
english_df = _read_title_sets(
    '/content/drive/MyDrive/NLP_Talent_clef_2025/esco_similar_titles/similar_title_sets_english.csv'
)
spanish_df = _read_title_sets(
    '/content/drive/MyDrive/NLP_Talent_clef_2025/esco_similar_titles/similar_title_sets_spanish.csv'
)
german_df = _read_title_sets(
    '/content/drive/MyDrive/NLP_Talent_clef_2025/esco_similar_titles/similar_title_sets_german.csv'
)

# Turn each frame into a list of unique title pairs, cleaned per language.
english_pairs = create_unique_pairs_from_df(english_df, 'en')
spanish_pairs = create_unique_pairs_from_df(spanish_df, 'es')
german_pairs = create_unique_pairs_from_df(german_df, 'de')

# Shuffle each list in place, in English -> Spanish -> German order.
for _language_pairs in (english_pairs, spanish_pairs, german_pairs):
    random.shuffle(_language_pairs)


In [None]:
# Report how many unique pairs each language contributed, one per line.
print(
    f"English pairs: {len(english_pairs)}",
    f"Spanish pairs: {len(spanish_pairs)}",
    f"German pairs:  {len(german_pairs)}",
    sep="\n",
)

In [None]:
# Pool the pairs of all three languages into one list, then shuffle it in place.
all_pairs = [*english_pairs, *spanish_pairs, *german_pairs]
random.shuffle(all_pairs)

In [None]:
# NOTE(review): all_pairs is already shuffled in the cell that builds it; this
# second in-place shuffle reorders it again every time the cell is re-run.
random.shuffle(all_pairs)

In [None]:
# Total number of pooled pairs across the three languages.
len(all_pairs)

In [None]:
# Show the first five pooled pairs as a sanity check.
print(all_pairs[:5])

# **2. Prepare Training Data**
## Convert Pairs into SentenceTransformer Format

In [None]:
# Wrap every pair in an InputExample, then split the two sides of each example
# into the "text1" / "text2" columns of a Dataset.
train_examples = [
    InputExample(texts=[first, second]) for first, second in all_pairs
]

first_texts, second_texts = [], []
for example in train_examples:
    first_texts.append(example.texts[0])
    second_texts.append(example.texts[1])

train_dataset = Dataset.from_dict({"text1": first_texts, "text2": second_texts})

print(f"Train pairs: {len(train_dataset)}")

In [None]:
# Display the first six rows of the training dataset.
train_dataset[:6]

In [None]:

# Persist the training dataset to Drive.
save_path = '/content/drive/MyDrive/NLP_Talent_clef_2025/train_dataset'
train_dataset.save_to_disk(save_path)


# **3. Model Training**
## Train Model: paraphrase-multilingual-mpnet-base-v2
## Save Fine-tuned Model

In [None]:
# Load the base checkpoint to fine-tune. The device is hard-coded to 'cuda',
# so this cell requires a GPU runtime.
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2', device='cuda')

In [None]:
# Training objective, bound to the model loaded above.
loss = losses.CachedMultipleNegativesRankingLoss(model)

In [None]:
args = SentenceTransformerTrainingArguments(
    # Where checkpoints are written and how the run is labelled.
    output_dir="/content/NLP_1",
    run_name="multilingual_mpnet_128_cmnrl",
    # Checkpoint and log once per epoch.
    save_strategy="epoch",
    logging_strategy="epoch",
    # Optimisation settings: one epoch, batch size 128, lr 2e-5,
    # warmup_ratio 0.1, fp16 enabled.
    num_train_epochs=1,
    per_device_train_batch_size=128,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=True,
    # Batch composition.
    batch_sampler=BatchSamplers.NO_DUPLICATES,
)

In [None]:
# Wire the model, arguments, dataset and loss into a trainer and run training.
# No evaluation dataset or evaluator is passed here.
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=loss,
)
trainer.train()


In [None]:
# Save the model to Drive after training.
model_path = "/content/drive/MyDrive/NLP_Talent_clef_2025/model"
model.save_pretrained(model_path)

# **4. Evaluation**

## **Compute and save Similarity Scores**

In [None]:
# Paths of the English validation queries and corpus files.
# NOTE(review): the next cell rebinds these same two names to DataFrames, so
# this cell must be re-run before that one can be run again.
queries = "/content/drive/MyDrive/NLP_Talent_clef_2025/validation_dataset/english/queries"
corpus_elements = "/content/drive/MyDrive/NLP_Talent_clef_2025/validation_dataset/english/corpus_elements"

In [None]:
# `queries` and `corpus_elements` hold file paths the first time this cell runs
# and are rebound to the parsed tab-separated tables here. The isinstance
# guards keep a second run of the cell from handing a DataFrame to read_csv.
if isinstance(queries, str):
    queries = pd.read_csv(queries, sep="\t")
if isinstance(corpus_elements, str):
    corpus_elements = pd.read_csv(corpus_elements, sep="\t")

In [None]:
# Pull the id and title columns of each table out as plain Python lists.
queries_ids, queries_texts = (
    queries[column].tolist() for column in ('q_id', 'jobtitle')
)
corpus_ids, corpus_texts = (
    corpus_elements[column].tolist() for column in ('c_id', 'jobtitle')
)

In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU, and
# reload the model from the Drive directory it was saved to.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer("/content/drive/MyDrive/NLP_Talent_clef_2025/model", device=device)

In [None]:
# Embed the query titles; the result is kept as a tensor.
query_embeddings = model.encode(queries_texts, convert_to_tensor=True)

In [None]:
# Embed the corpus titles; the result is kept as a tensor.
corpus_embeddings = model.encode(corpus_texts, convert_to_tensor=True)

In [None]:
# Cosine similarity of every query against every corpus title, moved to the
# CPU as a NumPy array (indexed below as [query index, corpus index]).
similarities = util.cos_sim(query_embeddings, corpus_embeddings).cpu().numpy()

In [None]:
# One record per (query, corpus title) combination, in query-major order:
# row i of `similarities` belongs to query i, column j to corpus title j.
matches = [
    {
        "q_id": query_id,
        "query_title": query_title,
        "c_id": corpus_id,
        "corpus_title": corpus_title,
        "similarity": score,
    }
    for query_id, query_title, score_row in zip(queries_ids, queries_texts, similarities)
    for corpus_id, corpus_title, score in zip(corpus_ids, corpus_texts, score_row)
]

In [None]:
# Tabulate the records and order them by query id, best similarity first.
matches_df = pd.DataFrame(matches).sort_values(
    by=["q_id", "similarity"],
    ascending=[True, False],
)

In [None]:
# Print the sorted match table.
print(matches_df)

In [None]:
# Write the English match table next to the saved model, without the index.
matches_df.to_csv("/content/drive/MyDrive/NLP_Talent_clef_2025/model/en.csv", index=False)

Similarly, compute the similarity scores for the Spanish and German validation sets.

## **Calculate Metrics (mAP, MRR, Precision@k)**

In [None]:
!pip install ranx

In [None]:
import pandas as pd
from ranx import Qrels, Run, evaluate

In [None]:
# Reload the English similarity table written by the scoring section.
df = pd.read_csv('/content/drive/MyDrive/NLP_Talent_clef_2025/model/en.csv')

In [None]:
# Build one TREC run line per row: "<q_id> Q0 <c_id> <rank> <score> <run tag>".
# The rank restarts at 1 for every query and follows descending similarity;
# tied scores keep their order of appearance (method='first'). The row order of
# `df` itself is left untouched.
ranks = (
    df.groupby('q_id')['similarity']
    .rank(method='first', ascending=False)
    .astype(int)
)

result_list = [
    f"{q_id} Q0 {c_id} {rank} {score:.4f} SCaLAR_systemA"
    for q_id, c_id, rank, score in zip(df['q_id'], df['c_id'], ranks, df['similarity'])
]

print(result_list[:5])

In [None]:
# Write the run lines to the .trec file, newline-separated (no trailing newline).
with open("/content/drive/MyDrive/NLP_Talent_clef_2025/model/en.trec", "w", encoding="utf-8") as f:
    f.write("\n".join(result_list))

In [None]:
# Relevance judgements for the English validation set.
qrels_file = "/content/drive/MyDrive/NLP_Talent_clef_2025/validation_dataset/english/qrels.tsv"

In [None]:
# The run file written above.
prediction_file = "/content/drive/MyDrive/NLP_Talent_clef_2025/model/en.trec"

In [None]:
# Number of run lines (one per query/corpus combination in the score table).
print(len(result_list))

In [None]:
# Read the header-less, tab-separated qrels; ids are kept as strings and the
# relevance label as an integer.
qrels_df = pd.read_csv(qrels_file, sep="\t", header=None,
                           names=["q_id", "iter", "c_id", "rel"],
                           dtype={"q_id": str, "c_id": str, "rel":int})

In [None]:
# Preview the first qrels rows.
qrels_df.head()

In [None]:
# Convert the qrels frame into a ranx Qrels object, using "rel" as the score.
qrels_result= Qrels.from_df(qrels_df, q_id_col="q_id", doc_id_col="c_id", score_col="rel")

In [None]:
# Read the whitespace-separated run file back in; ids are kept as strings.
prediction_df = pd.read_csv(prediction_file, sep=r"\s+", header = None, names=["q_id", "iter", "c_id", "rank", "score", "run"], dtype={"q_id":str, "c_id":str})

In [None]:
# Preview the first run rows.
prediction_df.head()

In [None]:
# Make sure both id columns are strings before handing the frame to ranx.
prediction_df = prediction_df.assign(
    q_id=prediction_df["q_id"].astype(str),
    c_id=prediction_df["c_id"].astype(str),
)

In [None]:
# Convert the run frame into a ranx Run object, using the "score" column.
prediction_result= Run.from_df(prediction_df, q_id_col="q_id", doc_id_col="c_id", score_col="score")

In [None]:
# Metric names passed to ranx's evaluate().
metrics = ["map", "mrr", "ndcg", "precision@5", "precision@10", "precision@100"]

In [None]:
# Score the run against the qrels for every metric listed in `metrics`.
results = evaluate(qrels_result, prediction_result , metrics)

Similarly, compute the metrics for the Spanish and German validation sets.

# **5. Results**
## Results for paraphrase-multilingual-mpnet-base-v2

**English**

In [None]:
# Print each metric with four decimal places.
for metric, score in results.items():
        print(f"{metric}: {score:.4f}")

**Spanish**

In [None]:
# NOTE(review): this prints the same `results` object as the English cell; no
# visible cell recomputes it for Spanish, so re-run the evaluation with the
# Spanish files first.
for metric, score in results.items():
        print(f"{metric}: {score:.4f}")

**German**

In [None]:
# NOTE(review): this prints the same `results` object as the English cell; no
# visible cell recomputes it for German, so re-run the evaluation with the
# German files first.
for metric, score in results.items():
        print(f"{metric}: {score:.4f}")

# Repeat Training and Evaluation for paraphrase-multilingual-MiniLM-L12-v2