# Constants

In [None]:
# Topic to classify: rows whose `topic` equals this value form the positive class.
TOPIC = 'תורה'
# Path of the cleaned dataset (JSON) that is loaded into `good_df`.
CLEANED_DATA_PATH = '../../Data/Cleaned Data/good_df.json'
# Local BEREL directory; both its `vocab.txt` and the model itself are loaded from here.
BEREL_BASE_PATH = '../../Models/Saved_Models/Transformers/BEREL_base'

# Setup

## Groundwork Installations

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the cleaned dataset; create_single_topic_df reads its 'text', 'topic'
# and 'pm_ref' columns.
good_df = pd.read_json(CLEANED_DATA_PATH)

In [None]:
# Display the loaded DataFrame as a quick visual check.
good_df

In [None]:
! pip install transformers datasets

## Setup Topic Data

In [None]:
def create_single_topic_df(good_df, topic, random_state=613):
    """Build a balanced binary-classification frame for a single topic.

    Rows whose ``topic`` equals *topic* are labelled 1. The same number of
    rows is then sampled as negatives (label 0) from rows whose ``text`` never
    appears among the positives, so a text tagged with several topics cannot
    end up in both classes.

    Args:
        good_df: DataFrame with at least ``text``, ``topic`` and ``pm_ref``
            columns. It is not modified.
        topic: Topic value that defines the positive class.
        random_state: Seed for the negative sampling.

    Returns:
        A new DataFrame holding the positives followed by the sampled
        negatives, with ``pm_ref`` and ``topic`` removed and an integer
        ``label`` column added.
    """
    labeled = good_df.copy()
    labeled['label'] = np.where(labeled['topic'] == topic, 1, 0)
    positive = labeled[labeled['label'] == 1]

    # Exclude every row sharing its text with a positive example.
    candidates = labeled[~labeled['text'].isin(positive['text'])]

    # Never request more negatives than exist; sample() would raise otherwise.
    n_negative = min(len(positive), len(candidates))
    negative = candidates.sample(n_negative, random_state=random_state)

    combined = pd.concat([positive, negative], axis=0)
    return combined.drop(['pm_ref', 'topic'], axis=1)

In [None]:
from datasets import Dataset
from sklearn.model_selection import train_test_split
# Build the balanced dataset for TOPIC and hold out 20% of it for evaluation.
single_topic_df = create_single_topic_df(good_df, TOPIC)
train_df, eval_df = train_test_split(single_topic_df, test_size=0.2, random_state=613)
# preserve_index=False: the shuffled pandas index carries no information and
# would otherwise be stored as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
eval_dataset = Dataset.from_pandas(eval_df, preserve_index=False)

In [None]:
# Number of training examples (the 80% side of the split).
len(train_dataset)

In [None]:
# Number of held-out evaluation examples (the 20% side of the split).
len(eval_dataset)

# AlephBert

In [None]:
from transformers import BertTokenizerFast

# model_max_length is pinned to 512, as for the BEREL and HeBERT tokenizers,
# so that padding="max_length" always has a concrete target length.
alephbert_tokenizer = BertTokenizerFast.from_pretrained('onlplab/alephbert-base', model_max_length=512)

def tokenize_function(examples):
    """Tokenize the 'text' entries of a batch with the AlephBERT tokenizer.

    Every sequence is padded or truncated to the tokenizer's maximum length.
    """
    return alephbert_tokenizer(examples['text'], padding="max_length", truncation=True)

# Tokenize both splits in batches.
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_eval = eval_dataset.map(tokenize_function, batched=True)

In [None]:
from transformers import AutoModelForSequenceClassification

# AlephBERT with a sequence-classification head; num_labels=2 matches the
# binary 0/1 `label` column.
model = AutoModelForSequenceClassification.from_pretrained('onlplab/alephbert-base', num_labels=2)

In [None]:
# As the last expression of the cell this displays the model architecture.
# NOTE(review): it also switches the module to eval mode; Trainer.train()
# presumably re-enables training mode -- confirm.
model.eval()

In [None]:
from transformers import TrainingArguments

# Default training arguments. NOTE: this value is replaced a few cells later,
# where `training_args` is re-created with evaluation_strategy="epoch".
training_args = TrainingArguments(output_dir="test_trainer")

In [None]:
import numpy as np
import evaluate

# Accuracy metric object; compute_metrics calls its compute() method.
metric = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: A (logits, labels) pair.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max class predictions
        compared against the reference labels.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted, references=gold)

In [None]:
from transformers import TrainingArguments, Trainer

# Run evaluation once per epoch. This object, and therefore the same
# "test_trainer" output directory, is reused by all three trainers
# (AlephBERT, BEREL and HeBERT).
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

In [None]:
# Trainer for AlephBERT: fine-tunes `model` on the AlephBERT-tokenized train
# split and scores the eval split with compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune AlephBERT on the training split.
trainer.train()

In [None]:
# Final evaluation of the fine-tuned AlephBERT model on the held-out split.
trainer.evaluate()

In [None]:
# Save the fine-tuned AlephBERT model under the current working directory.
# The Colab section at the end zips /content/alephBERT_model, i.e. it assumes
# the working directory is /content.
trainer.save_model("./alephBERT_model")

# BEREL

In [None]:
from rabtokenizer import RabbinicTokenizer
from transformers import BertTokenizer, BertForMaskedLM
import os

# Build a BertTokenizer from BEREL's local vocab.txt (max length 512) and wrap
# it in RabbinicTokenizer from the local `rabtokenizer` module.
berel_tokenizer = RabbinicTokenizer(BertTokenizer.from_pretrained(os.path.join(BEREL_BASE_PATH, 'vocab.txt'), model_max_length=512))

In [None]:
def tokenize_function(examples):
    """Tokenize the 'text' entries of a batch with the BEREL tokenizer.

    Every sequence is padded or truncated to the tokenizer's maximum length.
    """
    batch_texts = examples['text']
    return berel_tokenizer(batch_texts, padding='max_length', truncation=True)

# Re-tokenize both splits for BEREL, train first and then eval; this replaces
# the previously tokenized datasets held in the same variables.
tokenized_train, tokenized_eval = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, eval_dataset)
)

In [None]:
# Load BEREL from the local directory with a 2-class classification head.
# Relies on AutoModelForSequenceClassification imported in the AlephBert
# section; rebinding `model` replaces the AlephBERT model.
model = AutoModelForSequenceClassification.from_pretrained(BEREL_BASE_PATH, num_labels=2)

In [None]:
# As the last expression of the cell this displays the BEREL architecture.
# NOTE(review): it also switches the module to eval mode; Trainer.train()
# presumably re-enables training mode -- confirm.
model.eval()

In [None]:
# Trainer for BEREL. Reuses `training_args` and `compute_metrics` from the
# AlephBert section together with the BEREL-tokenized splits.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune BEREL on the training split.
trainer.train()

In [None]:
# Final evaluation of the fine-tuned BEREL model on the held-out split.
trainer.evaluate()

In [None]:
# Save the fine-tuned BEREL model under the current working directory.
# The Colab section at the end zips /content/BEREL_model, i.e. it assumes the
# working directory is /content.
trainer.save_model("./BEREL_model")

# HeBERT

In [None]:
from transformers import AutoTokenizer
# HeBERT tokenizer, with the maximum sequence length pinned to 512 tokens.
heBERT_tokenizer = AutoTokenizer.from_pretrained("avichr/heBERT", model_max_length=512)

In [None]:
def tokenize_function(examples):
    """Tokenize the 'text' entries of a batch with the HeBERT tokenizer.

    Every sequence is padded or truncated to the tokenizer's maximum length.
    """
    batch_texts = examples['text']
    return heBERT_tokenizer(batch_texts, padding='max_length', truncation=True)

# Re-tokenize both splits for HeBERT, train first and then eval; this replaces
# the previously tokenized datasets held in the same variables.
tokenized_train, tokenized_eval = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, eval_dataset)
)

In [None]:
# Load HeBERT with a 2-class classification head. Relies on
# AutoModelForSequenceClassification imported in the AlephBert section;
# rebinding `model` replaces the BEREL model.
model = AutoModelForSequenceClassification.from_pretrained('avichr/heBERT', num_labels=2)

In [None]:
# As the last expression of the cell this displays the HeBERT architecture.
# NOTE(review): it also switches the module to eval mode; Trainer.train()
# presumably re-enables training mode -- confirm.
model.eval()

In [None]:
# Trainer for HeBERT. Reuses `training_args` and `compute_metrics` from the
# AlephBert section together with the HeBERT-tokenized splits.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune HeBERT on the training split.
trainer.train()

In [None]:
# Final evaluation of the fine-tuned HeBERT model on the held-out split.
trainer.evaluate()

In [None]:
# Save the fine-tuned HeBERT model under the current working directory.
# The Colab section at the end zips /content/heBERT_model, i.e. it assumes the
# working directory is /content.
trainer.save_model("./heBERT_model")

# FOR COLAB -- Download Saved Models from Session Storage

If you are running this notebook in Google Colab, the models saved above to session storage will be too big to download directly, but they can be copied into your Google Drive (mount your Drive in the session first). Below is one way to do that, saving a single zipped file containing all 3 models to your Drive:

In [None]:
# Bundle the three saved model directories into a single archive. The absolute
# /content paths assume the models were saved with /content as the working
# directory.
!zip -r /content/all_models.zip /content/BEREL_model /content/alephBERT_model /content/heBERT_model

In [None]:
import shutil
# Directory in your mounted Google Drive that should receive the archive.
# Replace this default with your own destination; the Drive must already be
# mounted in the session when this cell runs.
DRIVE_DESTINATION = '/content/drive/MyDrive'
# NOTE: the copy is written without a ".zip" extension although it is a zip
# archive; add the extension after downloading if your tools need it.
shutil.copyfile(
    '/content/all_models.zip',
    f'{DRIVE_DESTINATION}/{TOPIC}_saved_models')