In [1]:
!pip install datasets

[0m

In [2]:
!pip install transformers

[0m

In [3]:
import torch
from transformers import AutoTokenizer, AutoModel, LineByLineTextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, AdamW
import pandas as pd
import numpy as np
import os



In [4]:
# Build one list of case texts per CSV file found directly under `base_path`.
# Only the first column of every CSV row is used.
base_path = "**location_of_training_parsed_dataset"
files = sorted(os.path.join(base_path, f) for f in os.listdir(base_path))
case_files = []
for file in files:
  rows = pd.read_csv(file).values
  cases = []
  for row in rows:
    text = row[0]
    # Non-string cells (e.g. NaN for an empty CSV field) carry no case text.
    if not isinstance(text, str):
      continue
    # Keep the text from the first "\n(z0" marker onward; rows that do not
    # contain the marker are skipped. These are explicit checks instead of a
    # bare `except:`, which would also hide unrelated errors and Ctrl-C.
    start = text.find("\n(z0")
    if start == -1:
      continue
    cases.append(text[start:])
  case_files.append(cases)

# Shallow copy: the outer list is new, the per-file lists are shared.
train_case_files = case_files[:]

In [5]:
# Sanity check: number of loaded files, and number of cases kept from the first one.
n_files, n_cases_first_file = len(train_case_files), len(train_case_files[0])
n_files, n_cases_first_file

(9000, 81)

In [6]:
# Draw a fixed, reproducible subset of 2250 case files (sampling without replacement).
np.random.seed(0)
candidate_indices = np.arange(len(train_case_files))
chosen_indices = np.random.choice(candidate_indices, size=2250, replace=False)
warm_ratio_files = [train_case_files[idx] for idx in chosen_indices]
# Report the subset size and the number of cases in its first file.
print(len(warm_ratio_files), len(warm_ratio_files[0]))

2250 38


In [7]:
# Flatten the sampled files into one list of case texts, preserving order.
selected_lines = [text for case in warm_ratio_files for text in case]
len(selected_lines)

51681

In [8]:
# Hub id of the checkpoint whose tokenizer is loaded here.
model_name = "nlpaueb/legal-bert-small-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/989 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

In [9]:
# Dump the selected case texts to a plain-text corpus file, separated by a
# blank line. The encoding is pinned to UTF-8 so the file content does not
# depend on the platform's default locale encoding (the dataset loader that
# consumes this file reads it back as UTF-8).
with open("train_data_pretrain.txt", "w", encoding="utf-8") as f:
    f.write("\n\n".join(selected_lines))

In [10]:
# Collator used for masked-language-model batches (mlm=True), with a
# masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    mlm_probability=0.15,
    mlm=True,
    tokenizer=tokenizer,
)

# Tokenised training examples built from the corpus file written earlier in
# this notebook, with block_size=128.
dataset = LineByLineTextDataset(
    file_path="train_data_pretrain.txt",
    block_size=128,
    tokenizer=tokenizer,
)



In [11]:
# Where checkpoints go and how many are kept.
checkpoint_settings = dict(
    output_dir="./pretrained_model",
    overwrite_output_dir=True,
    save_steps=100,
    save_total_limit=2,
)

# Optimisation schedule. Adjust the batch size to fit your GPU memory.
optimisation_settings = dict(
    num_train_epochs=10,
    per_device_train_batch_size=8,
    learning_rate=1e-4,
    warmup_ratio=0.1,
)

training_args = TrainingArguments(
    prediction_loss_only=True,
    **checkpoint_settings,
    **optimisation_settings,
)

In [16]:
from transformers import BertForMaskedLM

In [17]:
# Load the masked-LM model from the same checkpoint id that was used for the
# tokenizer (`model_name`), so the two cannot silently drift apart.
model = BertForMaskedLM.from_pretrained(model_name)

Some weights of the model checkpoint at nlpaueb/legal-bert-small-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
# Wire the model, training configuration, dataset and MLM collator together.
trainer_inputs = {
    "model": model,
    "args": training_args,
    "train_dataset": dataset,
    "data_collator": data_collator,
}
trainer = Trainer(**trainer_inputs)

In [None]:
# Run the training loop with the `trainer` configured earlier in this notebook.
# The cell output below is the training-loss log reported during the run.
trainer.train()

Step,Training Loss
500,7.1069
1000,6.0488
1500,4.7996
2000,3.7565
2500,3.0635
3000,2.688
3500,2.3456
4000,2.0699
4500,1.8632
5000,1.8066


In [None]:
# Create the export directory. Unlike `!mkdir`, this is portable across
# shells/OSes and does not fail when the notebook is re-run and the
# directory already exists.
os.makedirs("pretrained_model_final", exist_ok=True)

In [None]:
# Persist the trained model to the export directory.
trainer.save_model("./pretrained_model_final")
# The Trainer in this notebook was built without a tokenizer, so the tokenizer
# files are saved explicitly to make the exported directory self-contained.
# The trailing semicolon suppresses the returned file-path tuple in the cell output.
tokenizer.save_pretrained("./pretrained_model_final");