In [None]:
import pandas as pd
import random
import numpy as np
from transformers import XLMRobertaModel, XLMRobertaTokenizer, AutoConfig, AutoModel, AutoTokenizer
from sklearn.metrics import roc_auc_score, roc_curve,accuracy_score, precision_score, recall_score, f1_score, average_precision_score, precision_recall_curve
import sentencepiece
import torch
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, precision_recall_curve


# Compute device shared by the encoders below: CUDA when available, otherwise CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def plot_curve(x, y, label, title, xlabel, ylabel):
  """Draw a single labelled line plot in its own 6x5 figure and display it.

  Args:
    x: Values for the horizontal axis.
    y: Values for the vertical axis.
    label: Legend entry for the plotted line.
    title: Figure title.
    xlabel: Caption of the horizontal axis.
    ylabel: Caption of the vertical axis.
  """
  _, ax = plt.subplots(figsize=(6, 5))
  ax.plot(x, y, label=label)
  ax.set_title(title)
  ax.set_xlabel(xlabel)
  ax.set_ylabel(ylabel)
  ax.legend()
  ax.grid()
  plt.show()


def plot_evaluation_curves(similarities, y_true):
  """Show the ROC curve, the precision-recall curve and per-class score histograms.

  Args:
    similarities: Array of similarity scores, one per sentence pair. It is
      indexed with a boolean mask, so it must support NumPy-style masking.
    y_true: Array of binary labels (1 = parallel, 0 = non-parallel) aligned
      with ``similarities``.
  """
  # ROC curve, annotated with its area under the curve.
  false_pos_rate, true_pos_rate, _ = roc_curve(y_true, similarities)
  roc_area = roc_auc_score(y_true, similarities)
  plot_curve(
    false_pos_rate,
    true_pos_rate,
    label=f"ROC AUC = {roc_area:.4f}",
    title="ROC Curve",
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
  )

  # Precision-recall curve, annotated with the average precision.
  prec_values, rec_values, _ = precision_recall_curve(y_true, similarities)
  avg_precision = average_precision_score(y_true, similarities)
  plot_curve(
    rec_values,
    prec_values,
    label=f"PR AUC = {avg_precision:.4f}",
    title="Precision-Recall (PR) Curve",
    xlabel="Recall",
    ylabel="Precision",
  )

  # Overlaid, density-normalised histograms of the scores of each class.
  plt.figure(figsize=(6, 5))
  class_legends = ((1, "Parallel (label=1)"), (0, "Non-Parallel (label=0)"))
  for class_value, legend_text in class_legends:
    plt.hist(similarities[y_true == class_value], bins=50, alpha=0.6, label=legend_text, density=True)
  plt.title("Similarity Score Distribution")
  plt.xlabel("Similarity")
  plt.ylabel("Density")
  plt.legend()
  plt.grid()
  plt.show()


def find_best_f1_threshold(y_true, y_score):
  """Return the decision threshold that maximises the F1 score.

  Args:
    y_true: Binary ground-truth labels.
    y_score: Scores aligned with ``y_true``; higher means more likely positive.

  Returns:
    The element of the thresholds reported by ``precision_recall_curve``
    whose precision/recall pair yields the highest F1.
  """
  precision, recall, thresholds = precision_recall_curve(y_true, y_score)
  # precision and recall carry one extra trailing point (precision=1, recall=0)
  # that has no matching threshold. Its F1 is 0, so dropping it never changes
  # the winner, but it guarantees the argmax index is valid for `thresholds`.
  precision, recall = precision[:-1], recall[:-1]
  # The small epsilon avoids 0/0 where precision and recall are both zero.
  f1_scores = 2 * precision * recall / (precision + recall + 1e-8)
  return thresholds[np.argmax(f1_scores)]

def find_best_youden_threshold(y_true,similarities):
  """Return the ROC threshold that maximises Youden's J statistic (TPR - FPR).

  Args:
    y_true: Binary ground-truth labels.
    similarities: Scores aligned with ``y_true``.

  Returns:
    The entry of the ``roc_curve`` thresholds at which ``tpr - fpr`` peaks.
  """
  false_pos_rate, true_pos_rate, cutoffs = roc_curve(y_true, similarities)
  j_statistic = true_pos_rate - false_pos_rate
  return cutoffs[np.argmax(j_statistic)]

def evaluation(similarities, y_true, thresholds, threshold_name):
  """Print classification metrics for one similarity cut-off.

  Pairs whose similarity is at or above the cut-off are predicted as
  positive (1); all others as negative (0).

  Args:
    similarities: Array of similarity scores, one per pair.
    y_true: Binary ground-truth labels aligned with ``similarities``.
    thresholds: The single numeric cut-off to evaluate (despite the plural
      name, it is formatted and compared as one number).
    threshold_name: Human-readable name of the cut-off, used in the header.
  """
  predicted = (similarities >= thresholds).astype(int)

  # Threshold-dependent metrics first, then the two ranking metrics, which
  # depend only on the raw scores.
  report = [
    ("Accuracy       : ", accuracy_score(y_true, predicted)),
    ("Precision      : ", precision_score(y_true, predicted)),
    ("Recall         : ", recall_score(y_true, predicted)),
    ("F1 Score       : ", f1_score(y_true, predicted)),
    ("ROC AUC        : ", roc_auc_score(y_true, similarities)),
    ("PR AUC         : ", average_precision_score(y_true, similarities)),
  ]

  print(f"\n Best {threshold_name} : {thresholds:.4f}")
  for prefix, value in report:
    print(f"{prefix}{value:.4f}")

def calculate_thresholds(data, similarities):
  """Select the F1-optimal and Youden-J-optimal cut-offs, report both, then plot.

  Args:
    data: Table with a ``'label'`` column holding the ground-truth labels.
    similarities: Array of similarity scores aligned with the rows of ``data``.
  """
  labels = data['label'].values

  # Both cut-offs are computed up front, before any metrics are printed.
  candidates = [
    ('F1 threshold', find_best_f1_threshold(labels, similarities)),
    ('Youden J threshold', find_best_youden_threshold(labels, similarities)),
  ]
  for threshold_name, cutoff in candidates:
    evaluation(similarities, labels, cutoff, threshold_name)

  plot_evaluation_curves(similarities, labels)


def mean_pooling(model_output, attention_mask):
  """Average the token embeddings of each sequence, ignoring masked positions.

  Args:
    model_output: Object exposing ``last_hidden_state``, the per-token
      embeddings (pooled over dimension 1).
    attention_mask: Mask with one entry per token; positions with value 0
      contribute nothing to the average.

  Returns:
    One pooled vector per sequence: the masked sum of token embeddings
    divided by the number of unmasked tokens.
  """
  hidden_states = model_output.last_hidden_state
  # Broadcast the per-token mask across the embedding dimension.
  mask = attention_mask.unsqueeze(-1).expand(hidden_states.size())
  masked_total = (hidden_states * mask).sum(dim=1)
  token_counts = mask.sum(dim=1)
  # The epsilon keeps a fully masked sequence from dividing by zero.
  return masked_total / (token_counts + 1e-8)

def encode(sentences, model_name, batch_size=32):
  """Embed sentences with a Hugging Face model using masked mean pooling.

  The tokenizer and model are loaded from ``model_name`` on every call.
  Inputs are truncated to 128 tokens, and each pooled embedding is
  L2-normalised.

  Args:
    sentences: Sliceable sequence of input strings.
    model_name: Identifier passed unchanged to ``from_pretrained``.
    batch_size: Number of sentences encoded per forward pass.

  Returns:
    NumPy array with one unit-length embedding row per sentence.
  """
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModel.from_pretrained(model_name).to(device)
  model.eval()

  batch_starts = range(0, len(sentences), batch_size)
  chunks = []
  # Inference only: no gradients are needed.
  with torch.no_grad():
    for start in tqdm(batch_starts, desc=f'Encoding {model_name}'):
      encoded = tokenizer(
        sentences[start:start + batch_size],
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=128,
      ).to(device)
      pooled = mean_pooling(model(**encoded), encoded['attention_mask'])
      unit_vectors = torch.nn.functional.normalize(pooled, p=2, dim=1)
      # Move each batch to the CPU so results do not accumulate on the device.
      chunks.append(unit_vectors.cpu())
  return torch.cat(chunks, dim=0).numpy()

def to_xlmr_sentence_embeddings(input_file, model_name, output_path):
  """Score hsb/de sentence pairs with a transformer encoder and evaluate them.

  Reads a tab-separated file, embeds its ``'hsb'`` and ``'de'`` columns with
  ``encode``, writes the table back out with an added ``'similarity'``
  column, then reports thresholds and plots.

  Args:
    input_file: Path of the tab-separated input file.
    model_name: Model identifier forwarded to ``encode``.
    output_path: Path of the tab-separated file to write.
  """
  table = pd.read_csv(input_file, sep='\t')
  hsb_vectors = encode(table['hsb'].tolist(), model_name, batch_size=32)
  de_vectors = encode(table['de'].tolist(), model_name, batch_size=32)

  # encode() returns L2-normalised rows, so the row-wise dot product is the
  # cosine similarity of each pair.
  scores = (hsb_vectors * de_vectors).sum(axis=1)
  table['similarity'] = scores
  table.to_csv(output_path, sep='\t', index=False)
  calculate_thresholds(table, scores)


def to_labse_sentence_embeddings(data,output_path):
  """Score hsb/de sentence pairs with LaBSE and evaluate them.

  Note that ``data`` is modified in place: a ``'similarity'`` column is added
  before the table is written out.

  Args:
    data: Table with ``'hsb'`` and ``'de'`` text columns; it must also carry
      the ``'label'`` column read by ``calculate_thresholds``.
    output_path: Path of the tab-separated file to write.
  """
  labse = SentenceTransformer('sentence-transformers/LaBSE')
  encode_options = dict(normalize_embeddings=True, show_progress_bar=True)
  hsb_vectors = labse.encode(data['hsb'].tolist(), **encode_options)
  de_vectors = labse.encode(data['de'].tolist(), **encode_options)

  # Embeddings are normalised, so the row-wise dot product is the cosine
  # similarity of each pair.
  scores = (hsb_vectors * de_vectors).sum(axis=1)
  data['similarity'] = scores
  data.to_csv(output_path, sep='\t', index=False)
  calculate_thresholds(data, scores)




In [None]:

def main():
    """Run the similarity pipeline for the model selected by ``model_name``.

    ``'xlmr'`` and ``'glot500'`` use the transformer mean-pooling path;
    ``'labse'`` uses the SentenceTransformer path. Results are written to
    ``output_<model_name>.tsv``.

    Raises:
        ValueError: If ``model_name`` is not one of the supported values.
    """
    model_name = 'labse'  # 'xlmr' or 'glot500'
    input_file_path = '/content/drive/MyDrive/Colab Notebooks/train_data.tsv'
    output_file_path = f'output_{model_name}.tsv'

    print(f"Model to use: {model_name}")

    if model_name in ['xlmr', 'glot500']:
        # NOTE(review): model_name is forwarded unchanged to
        # AutoTokenizer/AutoModel.from_pretrained via encode(). Confirm that
        # 'xlmr' / 'glot500' resolve to a local directory or a hub id.
        # The keyword must be `output_path`, matching the function signature;
        # the previous `output_file=` raised TypeError.
        to_xlmr_sentence_embeddings(input_file_path, model_name, output_path=output_file_path)
    elif model_name == 'labse':
        train = pd.read_csv(input_file_path, sep='\t')
        to_labse_sentence_embeddings(train, output_path=output_file_path)
    else:
        # Fail loudly instead of silently doing nothing for an unknown name.
        raise ValueError(f"Unsupported model_name: {model_name!r}")


# Run the pipeline only when this file is executed directly, not when it is imported.
if __name__ == '__main__':
    main()
