In [2]:
import torch
import ijson
import numpy as np
import pandas as pd

def read_data(path, country):
    corpus = []
    with open(path, 'r', encoding='utf-8') as f:
            parser = ijson.parse(f)
            aux = {"country": "", "label": "", "text": ""}
            for prefix, event, value in parser:
                prefix = ".".join(prefix.split(".")[1:])
                if (prefix, event) == ("label", "string"):
                    aux["label"] = value
                elif (prefix, event) == ("text", "string"):
                    aux["text"] = value
                elif (prefix, event) == ("country", "string"):
                    aux["country"] = value

                if aux["country"] == country and aux["text"] != "" and aux["label"] != "":
                    corpus.append(aux)
                    aux = {"country": "", "label": "", "text": ""}
    return corpus

In [3]:
path = "data/ML-dataset/train-queries.json"
data_it = pd.DataFrame(read_data(path, "ITA"))
data_fr = pd.DataFrame(read_data(path, "FRA"))

In [6]:
len(data_it["label"].unique())

711

In [3]:
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SentenceTransformer("../models/paraphrase-multilingual-mpnet-base-v2").to(device)

  return self.fget.__get__(instance, owner)()


In [4]:
print("Started encoding")
data_it["encoded_it"] = data_it["text"].apply(model.encode)
print("Started encoding")
data_fr["encoded_fr"] = data_fr["text"].apply(model.encode)

Started encoding
Started encoding


In [6]:
lab_it = np.asarray(data_it["label"].tolist())
lab_fr = np.asarray(data_fr["label"].tolist())

aggregated_data = []

for lab in lab_it:
    min_label = np.min([data_it[data_it["label"] == lab].shape[0],  data_fr[data_fr["label"] == lab].shape[0]])

    if min_label == 0:
        continue
    else:
        aggregated_data.append([data_it[data_it["label"] == lab].iloc[:min_label], data_fr[data_fr["label"] == lab].iloc[:min_label]])


In [11]:
from tqdm import tqdm

aligned_texts_dir = []
limits = (0,1000000)
for dir in tqdm(aggregated_data[limits[0]:limits[1]]):
    array_it = np.asarray(dir[0]["encoded_it"].tolist())
    array_fr = np.asarray(dir[1]["encoded_fr"].tolist())

    scores = cosine_similarity(array_it, array_fr)
    aligned_texts = []
    for idx, score in enumerate(scores):
        top_k = np.argsort(score)[::-1][0]
        aligned_texts.append([idx, top_k])
    aligned_texts_dir.append(aligned_texts)

100%|██████████| 10307/10307 [00:17<00:00, 586.24it/s]


In [18]:
sum(len(aligned) for aligned in aligned_texts_dir)

613788

In [13]:
df = pd.DataFrame()

for dir, aligned in tqdm(zip(aggregated_data[limits[0]:limits[1]], aligned_texts_dir)):
    br_it = dir[0].iloc[np.asarray(aligned)[:,0]].drop("country", axis = 1)
    br_fr = dir[1].iloc[np.asarray(aligned)[:,1]].drop("country", axis = 1)

    br_it.columns = ["label", "text_it", "encoded_it"]
    br_fr.columns = ["label", "text_fr", "encoded_fr"]

    aux_df = br_it.copy(deep = True)
    aux_df["text_fr"] = br_fr["text_fr"].tolist()
    aux_df["encoded_fr"] = br_fr["encoded_fr"].tolist()

    df = pd.concat([df, aux_df])

df

0it [00:00, ?it/s]

10307it [01:13, 140.74it/s]


Unnamed: 0,label,text_it,encoded_it,text_fr,encoded_fr
0,32016L2341,1. Gli articoli 52 e 53 non ostano ad alcuna d...,"[-0.01917797, 0.01133264, -0.008022693, 0.0453...",1. Les articles 52 et 53 ne font obstacle à au...,"[-0.044572506, 0.050717562, -0.009113831, 0.02..."
56,32016L2341,Gli Stati membri garantiscono che gli EPAP sia...,"[-0.15083398, -0.057951342, -0.009174005, -0.0...",Les États membres veillent à ce que les IRP so...,"[-0.13886526, -0.07500404, -0.009932162, -0.02..."
101,32016L2341,Gli Stati membri assicurano che vi sia una sep...,"[-0.114227384, 0.0053351, -0.009329527, 0.0703...",Les États membres veillent à ce qu'il existe u...,"[-0.09838905, 0.017340802, -0.009662485, 0.107..."
520,32016L2341,"1. Entro 13 gennaio 2023, la Commissione esegu...","[-0.038859554, -0.17914939, -0.012377142, -0.0...","1. Au plus tard le 13 janvier 2023, la Commiss...","[-0.041650075, -0.18207741, -0.011810097, -0.0..."
553,32016L2341,1. Gli Stati membri consentono agli EPAP regis...,"[-0.07931274, -0.01991996, -0.009524323, 0.021...",1. Les États membres autorisent les IRP enregi...,"[-0.06276633, -0.06766292, -0.008952431, 0.028..."
...,...,...,...,...,...
9449,32016L1629,1. Le unità navali munite di un certificato de...,"[-0.0065439087, 0.043382887, -0.00880788, -0.1...",1. Les bâtiments munis d'un certificat valide ...,"[-0.081004485, 0.1373705, -0.009739099, -0.123..."
9597,32016L1629,Ai fini della presente direttiva s'intende per...,"[0.037924215, -0.15280026, -0.016648697, -0.12...","Aux fins de la présente directive, on entend p...","[0.054068886, -0.18845516, -0.018823588, -0.13..."
9745,32016L1629,La presente direttiva stabilisce:\na)\ni requi...,"[-0.0077311955, -0.10552563, -0.011854096, -0....",La présente directive établit:\na)\nles prescr...,"[-0.0702611, -0.11325889, -0.013084402, -0.095..."
10017,32016L1629,1. La presente direttiva si applica alle segue...,"[0.0022214877, -0.22605956, -0.012241536, -0.0...",1. La présente directive s'applique aux bâtime...,"[-0.09247355, -0.1562672, -0.012801878, -0.072..."


In [14]:
def lca_mapping(train_sl, train_tl):
    lca = torch.matmul(torch.linalg.pinv(train_tl.T),train_sl.T).to(device)
    linear_mapping = lambda x: torch.matmul(lca.T, x).to(device)
    return linear_mapping

mapp = lca_mapping(torch.as_tensor(np.asarray(df["encoded_it"].iloc[:int(0.5*len(df))].tolist())), torch.as_tensor(np.asarray(df["encoded_fr"].iloc[:int(0.5*len(df))].tolist())))

RuntimeError: [enforce fail at alloc_cpu.cpp:83] err == 0. DefaultCPUAllocator: can't allocate memory: you tried to allocate 376735708944 bytes. Error code 12 (Cannot allocate memory)