In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch

In [3]:
# Load the labelled citation dataset and preview its first five rows.
# The preview shows an `Unnamed: 0` column, which suggests the CSV was written
# with its index; it is kept as-is so downstream column counts are unchanged.
CITATIONS_CSV_PATH = 'Final_Citations_Labels_Smote.csv'
final_citation_df = pd.read_csv(CITATIONS_CSV_PATH)
final_citation_df.head()

Unnamed: 0,text,startPosition,endPosition,normCite,citeType,altCite,pinCiteStr,pageRangeStr,nodeId,section,sectionAndSubSection,isShortCite,chunk_id,context,original_label
0,1 USC 1,3479,3486,1 usc 1,USC,,,,0,1 USC 1,1 USC 1,False,0.0,"Division A—Military Construction, Veterans Aff...",Definition
1,or direction,188589,188601,or dir ection,,,,,0,,,False,9.0,"16353(b)). <paragraph display-inline=""no-displ...",Definition
2,42 U.S.C.,245062,245071,42 usc,USC,,,,0,42 U.S.C.,42 U.S.C.,False,4.0,Domestic Food Programs Food and Nutrition Serv...,Authority
3,19 USC 2434,110102,110113,19 usc 2434,USC,,,,0,19 USC 2434,19 USC 2434,False,16.0,"4655)— <clause display-inline=""no-display-inli...",Amending
4,2 FAM 154,343562,343571,[2] 1 fam 154,UK,,,,0,,,False,,(d) None of the funds appropriated or otherwis...,Authority


In [4]:
# Encode the string classes in `original_label` as integer ids in a new
# `label` column. The fitted encoder is kept so ids can be mapped back to
# class names later via `label_encoder.classes_`.
original_labels = final_citation_df["original_label"]
label_encoder = LabelEncoder().fit(original_labels)
final_citation_df["label"] = label_encoder.transform(original_labels)

In [5]:
# Build the model input as "<citation text> [SEP] <surrounding context>".
# Missing values on either side are replaced with the literal string "None"
# so the concatenation never produces NaN.
citation_text = final_citation_df["text"].fillna("None")
citation_context = final_citation_df["context"].fillna("None")
final_citation_df["input_text"] = citation_text + " [SEP] " + citation_context

In [6]:
# Hold out 30% of the rows for evaluation. The split is stratified on the
# encoded label and uses a fixed seed so it is reproducible.
# NOTE(review): the input file name suggests the data was SMOTE-oversampled
# before this split; if so, synthetic rows derived from training examples may
# end up in the test set and inflate the scores -- confirm.
train_df, test_df = train_test_split(
    final_citation_df,
    test_size=0.3,
    random_state=5934,
    stratify=final_citation_df["label"],
)

In [7]:
# Per-class row counts for each split (computed but not displayed here).
train_dist, test_dist = (
    split["label"].value_counts() for split in (train_df, test_df)
)

In [8]:
# Display (rows, columns) for the train and test frames.
tuple(split.shape for split in (train_df, test_df))

((30639, 17), (13131, 17))

In [10]:
# Keep only the two columns the model needs: the input string and its label id.
train_subset = train_df.loc[:, ["input_text", "label"]]
type(train_subset)

pandas.core.frame.DataFrame

In [11]:
# Keep only the two columns the model needs: the input string and its label id.
test_subset = test_df.loc[:, ["input_text", "label"]]
type(test_subset)

pandas.core.frame.DataFrame

In [12]:
# Display (rows, columns) for the two-column train and test subsets.
tuple(subset.shape for subset in (train_subset, test_subset))

((30639, 2), (13131, 2))

In [14]:
# Load the LegalBERT tokenizer and a sequence-classification head on top of
# the pretrained encoder.
model_name = "nlpaueb/legal-bert-base-uncased"

# Size the classification head from the fitted label encoder instead of a
# hard-coded constant, so it always matches the number of classes in the data.
num_labels = len(label_encoder.classes_)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Use the GPU when one is available; fall back to CPU rather than crashing on
# machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [15]:
def _to_hf_dataset(frame):
    """Convert a two-column pandas frame into a Hugging Face ``Dataset``.

    The ``input_text`` column is renamed to ``text``. The pandas index is
    dropped (``preserve_index=False``): after the train/test split the frames
    carry a shuffled, non-default index, which ``Dataset.from_pandas`` would
    otherwise store as an extra ``__index_level_0__`` column.
    """
    renamed = frame.rename(columns={"input_text": "text"})
    return Dataset.from_pandas(renamed, preserve_index=False)


train_dataset = _to_hf_dataset(train_subset)
test_dataset = _to_hf_dataset(test_subset)

In [16]:
def tokenize_function(example):
    """Tokenize the ``text`` field of ``example`` with the module-level tokenizer.

    Every sequence is truncated to at most 512 tokens and padded up to exactly
    that length. Returns whatever the tokenizer call returns.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(example["text"], **tokenizer_options)

In [17]:
# Tokenize both splits in batches, then drop the raw `text` column so only the
# tokenizer outputs and the label remain.
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_train = tokenized_train.remove_columns(["text"])

tokenized_test = test_dataset.map(tokenize_function, batched=True)
tokenized_test = tokenized_test.remove_columns(["text"])

# Have both datasets return PyTorch tensors when indexed.
for tokenized_split in (tokenized_train, tokenized_test):
    tokenized_split.set_format("torch")

Map:   0%|          | 0/30639 [00:00<?, ? examples/s]

Map:   0%|          | 0/13131 [00:00<?, ? examples/s]

In [21]:
# Training configuration for fine-tuning.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./legalbert_results",
    logging_dir="./legalbert_logs",
    # Length of training and batch sizes per device.
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch, kept on the same cadence so the
    # best checkpoint can be restored when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # NOTE(review): selecting on "accuracy" only works if the Trainer's
    # evaluation reports an accuracy metric -- confirm one is computed.
    metric_for_best_model="accuracy",
)



In [22]:
def compute_metrics(eval_pred):
    """Return classification accuracy for one evaluation pass.

    Parameters
    ----------
    eval_pred
        Object exposing ``predictions`` (model logits, or a tuple whose first
        element is the logits) and ``label_ids`` (integer class ids).

    Returns
    -------
    dict
        ``{"accuracy": <float in [0, 1]>}``. The Trainer reports it as
        ``eval_accuracy``, which is the metric the training arguments use to
        pick the best checkpoint.
    """
    import numpy as np  # local import keeps this cell self-contained

    logits = eval_pred.predictions
    if isinstance(logits, tuple):
        # Some models return extra outputs alongside the logits.
        logits = logits[0]
    predicted_ids = np.argmax(logits, axis=-1)
    accuracy = float(np.mean(predicted_ids == eval_pred.label_ids))
    return {"accuracy": accuracy}


# Wire the model, data and arguments together. `compute_metrics` is required
# here: the training arguments select the best checkpoint by "accuracy", and
# without it evaluation only reports the loss, so the best-model lookup fails.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [None]:
# Run fine-tuning; the returned training summary is displayed as the cell output.
training_output = trainer.train()
training_output

Epoch,Training Loss,Validation Loss
