In [1]:
# !git clone https://github.com/Nminsker/NLP-Project.git

In [2]:
!pip install datasets
!pip install sentence-transformers

Using cached nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)
Using cached nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl (176.2 MB)
Using cached nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)
Downloading nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl (21.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, nvidia-cusparse-cu12, nvidia-cudnn-cu12, nvidia-cusolver-cu12, sentence-transformers
Successfully installed nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-8.9.2.26 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 n

In [45]:
from datasets import load_dataset, Dataset
from torch.utils.data import DataLoader
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer, util, InputExample, models, losses, SimilarityFunction
import torch
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments
import pandas as pd
import math
import logging
from google.colab import drive
import os
import warnings

warnings.filterwarnings("ignore")

In [5]:
# Mount Google Drive at /content/drive; DATA_PATH and MODEL_DIR both live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


# Data Load & Preparation

## Constants

In [15]:
# Integer id for each NLI label string. prepare_data lower-cases the raw label
# before looking it up here, and the resulting id is stored both in the
# DataFrame "label" column and as InputExample.label.
LABEL_MAP = {
    'entailment': 0,
    'neutral': 1,
    'contradiction': 2
}

# Google Drive folder from which prepare_data reads the HebNLI_*.jsonl files.
DATA_PATH = "/content/drive/MyDrive/HebNLI"
# Google Drive folder under which each model is given its own output directory.
MODEL_DIR = "/content/drive/MyDrive/SBERT_Models"

# Batch size shared by the train / validation / test DataLoaders.
BATCH_SIZE = 16

## Get the Data

In [9]:
def prepare_data(fileName) -> tuple:
    """Load one HebNLI JSON-lines split and turn it into model inputs.

    Reads ``{DATA_PATH}/{fileName}``, drops every row that contains a missing
    value, drops rows whose ``original_label`` is ``'-'``, and maps the
    remaining labels to integers through ``LABEL_MAP`` (case-insensitively).

    Args:
        fileName: Name of the JSON-lines file, relative to ``DATA_PATH``.

    Returns:
        A ``(df, samples)`` tuple. ``df`` is a DataFrame with the columns
        ``translation1``, ``translation2`` and ``label``; ``samples`` is a list
        holding one ``InputExample`` per row of ``df``, in the same order.

    Raises:
        KeyError: If a lower-cased label is not a key of ``LABEL_MAP``.
    """
    raw_df = pd.read_json(f"{DATA_PATH}/{fileName}", lines=True)

    # Work on an explicit copy of the filtered rows so that adding the "label"
    # column below is a plain write and not a chained assignment on a slice.
    complete_df = raw_df.dropna()
    labelled_df = complete_df[complete_df.original_label != '-'].copy()

    labelled_df["label"] = labelled_df["original_label"].apply(lambda x: LABEL_MAP[x.lower()])
    df = labelled_df[["translation1", "translation2", "label"]]

    # Walk the three columns in lockstep instead of building a Series per row.
    samples = [
        InputExample(texts=[first, second], label=label)
        for first, second, label in zip(df["translation1"], df["translation2"], df["label"])
    ]
    return df, samples

In [12]:
# Load the three HebNLI splits; each call returns the cleaned DataFrame and the matching InputExample list.
train_df, train_samples = prepare_data("HebNLI_train.jsonl")
val_df, val_samples = prepare_data("HebNLI_val.jsonl")
test_df, test_samples = prepare_data("HebNLI_test.jsonl")

In [14]:
# Batch the InputExample lists; only the training loader shuffles its samples.
# NOTE(review): in the cells visible here only trainDataloader is consumed (validation uses val_samples directly) — confirm the other two are needed.
trainDataloader = DataLoader(train_samples, shuffle=True, batch_size=BATCH_SIZE)
validationDataloader = DataLoader(val_samples, shuffle=False, batch_size=BATCH_SIZE)
testDataloader = DataLoader(test_samples, shuffle=False, batch_size=BATCH_SIZE)

In [28]:
class SBERT:
    """Build (or reload), fine-tune and evaluate a SentenceTransformer model.

    Attributes:
        modelName: Short display name, used in messages.
        baseModelName: Hugging Face model id used when no saved model exists.
        outputPath: Directory the model is saved to and reloaded from.
        device: ``torch.device`` chosen by ``setDevice``.
        model: The ``SentenceTransformer`` created by ``setOrGetModel``.
    """

    def __init__(self, modelName, baseModelName, outputPath):
        """Store the names/paths, pick a device and create or load the model."""
        self.modelName = modelName  # short name
        self.baseModelName = baseModelName  # model name as of huggingface
        self.outputPath = outputPath  # path to save the results / models

        self.setDevice()
        self.setOrGetModel()

    def setDevice(self):
        """Set ``self.device`` from the system capabilities and seed torch with 42."""
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        torch.manual_seed(42)
        # Compare the device *type*: a torch.device object compared against the
        # plain string "cuda" is not a reliable equality check.
        if self.device.type == "cuda":
            torch.cuda.manual_seed_all(42)

    def _findSavedModel(self):
        """Return the directory of a previously saved model, or None.

        ``train`` saves into ``outputPath``, so that directory is checked first.
        ``outputPath/modelName`` is still accepted as a fallback because it is
        the location this class looked at originally.
        """
        candidates = (self.outputPath, f"{self.outputPath}/{self.modelName}")
        for candidate in candidates:
            if os.path.exists(f"{candidate}/config.json"):
                return candidate
        return None

    def setOrGetModel(self):
        """Load the saved model if one exists, otherwise build it from ``baseModelName``.

        A new model is a Transformer module followed by a Pooling module sized
        to the transformer's word-embedding dimension. Either way the model is
        placed on ``self.device``.
        """
        savedPath = self._findSavedModel()
        deviceName = str(self.device)
        if savedPath is None:
            # logging.info records are only shown once the logging level has been
            # lowered to INFO (e.g. logging.basicConfig(level=logging.INFO)),
            # hence the additional print.
            logging.info(f"Couldn't find existing model: {self.modelName}, creating it.")
            print(f"Couldn't find existing model: {self.modelName}, creating it.")
            self._wordEmbeddingModel = models.Transformer(self.baseModelName)
            self._poolingModel = models.Pooling(self._wordEmbeddingModel.get_word_embedding_dimension())
            self.model = SentenceTransformer(
                modules=[self._wordEmbeddingModel, self._poolingModel],
                device=deviceName,
            )
        else:
            logging.info(f"Found existing model: {self.modelName}, loading it.")
            print(f"Found existing model: {self.modelName}, loading it.")
            self.model = SentenceTransformer(savedPath, device=deviceName)

    def train(self, trainDataloader, devExamples, numEpochs=1, learningRate=1e-5, patience=2):
        """Fine-tune ``self.model`` with early stopping on the validation score.

        Runs up to ``numEpochs`` one-epoch ``fit`` calls. After each epoch the
        model is scored on ``devExamples``; it is saved to ``outputPath`` when the
        score improves by more than 0.001, and training stops once ``patience``
        consecutive epochs bring no such improvement.

        Args:
            trainDataloader: DataLoader yielding the training InputExamples.
            devExamples: InputExamples used to build the validation evaluator.
            numEpochs: Maximum number of epochs.
            learningRate: Optimizer learning rate.
            patience: Number of consecutive non-improving epochs tolerated.
        """
        print(f"\nTraining model: {self.modelName}")

        # Loss function
        # NOTE(review): the labels built through LABEL_MAP in this notebook are the
        # class ids 0/1/2, whereas a cosine similarity never exceeds 1 — confirm
        # that these ids are the intended targets for a cosine-similarity loss.
        train_loss = losses.CosineSimilarityLoss(self.model)

        # Evaluator
        evaluator = EmbeddingSimilarityEvaluator.from_input_examples(devExamples, name='sts-dev')

        # Configure the training
        warmupSteps = math.ceil(len(trainDataloader) * numEpochs * 0.1)  # 10% of train data for warm-up
        logging.info("Warmup-steps: {}".format(warmupSteps))

        # Early stopping parameters
        best_score = -float('inf')
        epochs_no_improve = 0

        # Train the model
        print(f"Starting training for {self.modelName}...")

        for epoch in range(numEpochs):
            # NOTE(review): every iteration is a separate one-epoch fit call that is
            # handed the same warmup_steps again — confirm this is intended when
            # numEpochs > 1.
            self.model.fit(
                train_objectives=[(trainDataloader, train_loss)],
                evaluator=evaluator,
                epochs=1,
                evaluation_steps=1000,
                warmup_steps=warmupSteps,
                output_path=self.outputPath,
                optimizer_params={'lr': learningRate},
                show_progress_bar=True
            )

            # Evaluate on validation set
            evaluation_results = evaluator(self.model)
            score = evaluation_results['sts-dev_spearman_cosine']
            print(f"Epoch {epoch+1}: Validation Score: {score}")

            # Check for early stopping
            if score - best_score > 0.001:
                best_score = score
                epochs_no_improve = 0
                self.model.save(self.outputPath)  # Save the best model
            else:
                epochs_no_improve += 1
                # Use the argument: no ``patience`` attribute is ever set on self.
                if epochs_no_improve >= patience:
                    print(f"Early stopping at epoch {epoch+1}")
                    break

        print(f"Training completed for {self.modelName}. Evaluating on test set...")

    def clear_memory(self):
        """Run the garbage collector and, when CUDA is available, empty its cache."""
        import gc  # local import: gc is not among the notebook's top-level imports

        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def evaluate(self, base_model_name=None, output_path=None, test_examples=None):
        """Score a saved model on test examples with an embedding-similarity evaluator.

        Args:
            base_model_name: Name used in the printed messages; defaults to
                ``self.modelName``.
            output_path: Directory the model is loaded from and the evaluator
                writes to; defaults to ``self.outputPath``.
            test_examples: InputExamples to evaluate on; defaults to
                ``self.test_examples`` when that attribute has been set.

        Returns:
            Whatever the evaluator returns for the loaded model.

        Raises:
            ValueError: If no test examples are available.
        """
        if base_model_name is None:
            base_model_name = self.modelName
        if output_path is None:
            output_path = self.outputPath
        if test_examples is None:
            test_examples = getattr(self, "test_examples", None)
        # Fail before the (expensive) model load when there is nothing to score.
        if not test_examples:
            raise ValueError(
                "No test examples: pass test_examples=... or set self.test_examples first."
            )

        print(f"\nEvaluating model: {base_model_name}")

        # Load the SBERT model
        model = SentenceTransformer(output_path)

        # Evaluate on the test set
        test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_examples, name='sts-test')
        result = test_evaluator(model, output_path=output_path)

        print(f"Evaluation result for {base_model_name}: {result}")

        # Clear memory after evaluation
        del model
        self.clear_memory()

        return result

In [29]:
# AlephBERT: build the sentence encoder from onlplab/alephbert-base (or reload a saved one), then fine-tune it for one epoch.
alephBert = SBERT(
    modelName="AlephBERT",
    baseModelName="onlplab/alephbert-base",
    outputPath=f"{MODEL_DIR}/AlephBERT"
)

# Train on the HebNLI training loader; val_samples drive the validation score.
alephBert.train(
    trainDataloader=trainDataloader,
    devExamples=val_samples,
    numEpochs=1,
    learningRate=1e-5,
    patience=2
)

Some weights of BertModel were not initialized from the model checkpoint at onlplab/alephbert-base and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training model: AlephBERT
Starting training for AlephBERT...


Step,Training Loss,Validation Loss,Sts-dev Pearson Cosine,Sts-dev Spearman Cosine,Sts-dev Pearson Manhattan,Sts-dev Spearman Manhattan,Sts-dev Pearson Euclidean,Sts-dev Spearman Euclidean,Sts-dev Pearson Dot,Sts-dev Spearman Dot,Sts-dev Pearson Max,Sts-dev Spearman Max
1000,0.673,No log,-0.135165,-0.134151,-0.154158,-0.135242,-0.153012,-0.133921,-0.056084,-0.048805,-0.056084,-0.048805
2000,0.6694,No log,-0.070648,-0.090917,-0.091604,-0.089566,-0.093052,-0.090542,-0.050601,-0.052542,-0.050601,-0.052542
3000,0.6527,No log,-0.073882,-0.060002,-0.074597,-0.062804,-0.074438,-0.062109,-0.042972,-0.046211,-0.042972,-0.046211
4000,0.6669,No log,-0.070982,-0.060096,-0.074626,-0.062728,-0.074439,-0.062281,-0.042963,-0.046318,-0.042963,-0.046318
5000,0.663,No log,-0.070019,-0.060131,-0.074678,-0.062509,-0.074456,-0.062645,-0.042914,-0.04638,-0.042914,-0.04638
6000,0.6725,No log,-0.069713,-0.060564,-0.074709,-0.062505,-0.074472,-0.062853,-0.042903,-0.046446,-0.042903,-0.046446
7000,0.6816,No log,-0.072032,-0.060171,-0.075052,-0.062644,-0.074571,-0.062704,-0.042885,-0.046974,-0.042885,-0.046974
8000,0.6766,No log,-0.072609,-0.063853,-0.076611,-0.063296,-0.075484,-0.062939,-0.043166,-0.048778,-0.043166,-0.048778
9000,0.6652,No log,-0.063898,-0.058795,-0.076914,-0.063896,-0.075381,-0.062875,-0.043085,-0.048335,-0.043085,-0.048335
10000,0.667,No log,-0.073809,-0.067976,-0.080089,-0.06613,-0.077932,-0.064461,-0.043774,-0.050199,-0.043774,-0.050199


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Epoch 1: Validation Score: 0.13198303334972783
Training completed for AlephBERT. Evaluating on test set...


In [46]:
# mBERT: same setup as the other models, starting from bert-base-multilingual-cased.
mBert = SBERT(
    modelName="mBERT",
    baseModelName="bert-base-multilingual-cased",
    outputPath=f"{MODEL_DIR}/mBERT"
)

# Train on the HebNLI training loader; val_samples drive the validation score.
mBert.train(
    trainDataloader=trainDataloader,
    devExamples=val_samples,
    numEpochs=1,
    learningRate=1e-5,
    patience=2
)


Training model: mBERT
Starting training for mBERT...


Step,Training Loss,Validation Loss,Sts-dev Pearson Cosine,Sts-dev Spearman Cosine,Sts-dev Pearson Manhattan,Sts-dev Spearman Manhattan,Sts-dev Pearson Euclidean,Sts-dev Spearman Euclidean,Sts-dev Pearson Dot,Sts-dev Spearman Dot,Sts-dev Pearson Max,Sts-dev Spearman Max
1000,0.668,No log,-0.033862,-0.131417,-0.101382,-0.129793,-0.10069,-0.128432,-0.029054,-0.033183,-0.029054,-0.033183
2000,0.6619,No log,-0.030324,-0.100818,-0.065588,-0.101624,-0.064737,-0.100395,-0.005807,-0.011424,-0.005807,-0.011424
3000,0.6666,No log,0.098267,0.007554,0.083598,0.006039,0.083458,0.006331,0.104102,0.065433,0.104102,0.065433
4000,0.6543,No log,0.102042,0.082683,0.092333,0.071218,0.091872,0.070566,0.137395,0.122538,0.137395,0.122538
5000,0.6643,No log,0.156784,0.087887,0.14141,0.084205,0.140237,0.083506,0.161137,0.14906,0.161137,0.14906
6000,0.6568,No log,0.16703,0.090506,0.143268,0.086483,0.142076,0.085999,0.160472,0.156546,0.16703,0.156546
7000,0.6535,No log,0.167577,0.080003,0.154993,0.078243,0.153812,0.077718,0.162904,0.135425,0.167577,0.135425
8000,0.6456,No log,0.191905,0.125154,0.172428,0.121879,0.171119,0.121358,0.183079,0.171929,0.191905,0.171929
9000,0.657,No log,0.206211,0.070482,0.187634,0.066271,0.187777,0.065769,0.21198,0.188247,0.21198,0.188247
10000,0.6545,No log,0.191022,0.1082,0.168716,0.103964,0.168455,0.103531,0.191131,0.174209,0.191131,0.174209


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Epoch 1: Validation Score: 0.11384535231083545
Training completed for mBERT. Evaluating on test set...


In [48]:
# DictaBERT: same setup as the other models, starting from dicta-il/dictabert.
dictaBert = SBERT(
    modelName="DictaBERT",
    baseModelName="dicta-il/dictabert",
    outputPath=f"{MODEL_DIR}/DictaBERT"
)

# Train on the HebNLI training loader; val_samples drive the validation score.
dictaBert.train(
    trainDataloader=trainDataloader,
    devExamples=val_samples,
    numEpochs=1,
    learningRate=1e-5,
    patience=2
)

config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at dicta-il/dictabert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.50M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.58M [00:00<?, ?B/s]


Training model: DictaBERT
Starting training for DictaBERT...


Step,Training Loss,Validation Loss,Sts-dev Pearson Cosine,Sts-dev Spearman Cosine,Sts-dev Pearson Manhattan,Sts-dev Spearman Manhattan,Sts-dev Pearson Euclidean,Sts-dev Spearman Euclidean,Sts-dev Pearson Dot,Sts-dev Spearman Dot,Sts-dev Pearson Max,Sts-dev Spearman Max
1000,0.6727,No log,-0.112297,-0.179064,-0.157579,-0.180247,-0.155797,-0.179218,-0.027381,-0.060562,-0.027381,-0.060562
2000,0.6695,No log,-0.103311,-0.117482,-0.127669,-0.121156,-0.121937,-0.117001,-0.043379,-0.058073,-0.043379,-0.058073
3000,0.6526,No log,-0.050557,-0.078293,-0.073539,-0.080685,-0.072033,-0.078253,-0.029551,-0.037711,-0.029551,-0.037711


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Sts-dev Pearson Cosine,Sts-dev Spearman Cosine,Sts-dev Pearson Manhattan,Sts-dev Spearman Manhattan,Sts-dev Pearson Euclidean,Sts-dev Spearman Euclidean,Sts-dev Pearson Dot,Sts-dev Spearman Dot,Sts-dev Pearson Max,Sts-dev Spearman Max
1000,0.6727,No log,-0.112297,-0.179064,-0.157579,-0.180247,-0.155797,-0.179218,-0.027381,-0.060562,-0.027381,-0.060562
2000,0.6695,No log,-0.103311,-0.117482,-0.127669,-0.121156,-0.121937,-0.117001,-0.043379,-0.058073,-0.043379,-0.058073
3000,0.6526,No log,-0.050557,-0.078293,-0.073539,-0.080685,-0.072033,-0.078253,-0.029551,-0.037711,-0.029551,-0.037711
4000,0.6671,No log,-0.067593,-0.080206,-0.089496,-0.084232,-0.084814,-0.080216,0.052161,0.049163,0.052161,0.049163
5000,0.6628,No log,-0.041854,-0.061368,-0.065284,-0.065228,-0.061166,-0.06134,0.078612,0.076484,0.078612,0.076484
6000,0.6725,No log,0.00976,-0.016444,-0.010141,-0.019216,-0.007589,-0.016495,0.090164,0.095731,0.090164,0.095731
7000,0.6816,No log,-0.036799,-0.094047,-0.068354,-0.095893,-0.066099,-0.093964,-0.051349,-0.062074,-0.036799,-0.062074
8000,0.6724,No log,0.091178,0.064158,0.085154,0.063695,0.084318,0.063021,0.100486,0.099841,0.100486,0.099841
9000,0.6486,No log,0.142593,0.129556,0.131897,0.128216,0.1321,0.129093,0.148028,0.137409,0.148028,0.137409
10000,0.6494,No log,0.157871,0.116267,0.14045,0.115315,0.141333,0.11589,0.163544,0.132078,0.163544,0.132078


Epoch 1: Validation Score: 0.09299865473072819
Training completed for DictaBERT. Evaluating on test set...
