In [46]:
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer, util, InputExample, models, losses, SimilarityFunction

from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import pandas as pd
import math

In [4]:
train_df = pd.read_json("Data\HebNLI_train.jsonl", lines=True)
test_df = pd.read_json("Data\HebNLI_test.jsonl", lines=True)
val_df = pd.read_json("Data\HebNLI_val.jsonl", lines=True)

In [5]:
# Prepare data
LABEL_MAP = {
    'entailment': 0, 
    'neutral': 1, 
    'contradiction': 2
}

def prepare_data(df) -> tuple:
    df = df[df.original_label != '-']
    df["original_label"] = df["original_label"].apply(lambda x: LABEL_MAP[x.lower()],)

    return df.translation1, df.translation2, df.original_label

def loadSts():
    df = pd.read_csv("Data\heb_sts_test.csv")
    return EmbeddingSimilarityEvaluator(
        sentences1=df.sentece1,
        sentences2=df.sentece2,
        scores=df.score,
        main_similiarity=SimilarityFunction.COSINE,
        name="heb_sts_test",
    )

train_premises, train_hypotheses, train_labels = prepare_data(train_df)
validation_premises, validation_hypotheses, validation_labels = prepare_data(val_df)
test_premises, test_hypotheses, test_labels = prepare_data(test_df)

# Load model_map
model_map = {
    'AlephBERT': models.Transformer("onlplab/alephbert-base"),
    'mBERT': models.Transformer("bert-base-multilingual-cased"),
    'DictaBERT': models.Transformer("dicta-il/dictabert"),
}

# Prepare training data
train_samples = [
    InputExample(texts=[premise, hypothesis], label=label)
    for premise, hypothesis, label in zip(train_premises, train_hypotheses, train_labels)
]

validation_samples = [
    InputExample(texts=[premise, hypothesis], label=label)
    for premise, hypothesis, label in zip(validation_premises, validation_hypotheses, validation_labels)
]

train_batch_size = 16
num_epochs = 1
model_save_path = "output/training_nli_hebrew"

# Create DataLoader
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
validation_dataloader = DataLoader(validation_samples, shuffle=False, batch_size=train_batch_size)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["original_label"] = df["original_label"].apply(lambda x: LABEL_MAP[x.lower()],)
Some weights of BertModel were not initialized from the model checkpoint at onlplab/alephbert-base and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertModel were not initialized from the model checkpoint at dicta-il/dictabert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
model = SentenceTransformer("bert-base-multilingual-cased")
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=3
)

<torch.utils.data.dataloader.DataLoader at 0x27fbdc30850>

In [22]:
model = SentenceTransformer("bert-base-multilingual-cased")
embeddings = model.encode(train_samples[0].texts)
print("Embedding shape: ",embeddings.shape)
similarities = model.similarity(embeddings, embeddings)
print("Similarities:\n", similarities)

No sentence-transformers model found with name bert-base-multilingual-cased. Creating a new one with mean pooling.


Embedding shape:  (2, 768)
Similarities:
 tensor([[1.0000, 0.6714],
        [0.6714, 1.0000]])


In [24]:
ent = next(s for s in train_samples if s.label == 0)
contra = next(s for s in train_samples if s.label == 2)

In [25]:
embeddings = model.encode(ent.texts)
print("Embedding shape: ",embeddings.shape)
similarities = model.similarity(embeddings, embeddings)
print("Similarities:\n", similarities)

Embedding shape:  (2, 768)
Similarities:
 tensor([[1.0000, 0.7468],
        [0.7468, 1.0000]])


In [26]:
embeddings = model.encode(contra.texts)
print("Embedding shape: ",embeddings.shape)
similarities = model.similarity(embeddings, embeddings)
print("Similarities:\n", similarities)

Embedding shape:  (2, 768)
Similarities:
 tensor([[1.0000, 0.7582],
        [0.7582, 1.0000]])


In [None]:
loss = losses.Cos