In [None]:
!pip install transformers datasets
!pip install torch

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import Dataset, DatasetDict
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd

## Load RoBERTa model

In [None]:
# Single source of truth for the Hub checkpoint so the tokenizer and the
# encoder can never drift apart.
NLI_CHECKPOINT = 'ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli'

tokenizer = AutoTokenizer.from_pretrained(NLI_CHECKPOINT)
# Loaded through AutoModel; downstream code reads `last_hidden_state` from its output.
roberta_model = AutoModel.from_pretrained(NLI_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/703 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

## Define the CNN model

In [None]:
class CNNKeywordModel(nn.Module):
    """Convolutional feature extractor over sequences of token ids.

    Token ids are embedded, convolved with one ``Conv2d`` per filter size (each
    kernel covers ``fs`` consecutive tokens and the full embedding width),
    passed through ReLU, max-pooled over the sequence axis, concatenated and
    projected to ``output_dim`` by a linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of every token embedding.
        num_filters: Number of output channels of each convolution.
        filter_sizes: Kernel heights in tokens; one convolution per entry.
        output_dim: Size of the returned feature vector (also stored as
            ``self.output_dim`` for consumers that size their own layers).
    """

    def __init__(self, vocab_size, embedding_dim, num_filters, filter_sizes, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (fs, embedding_dim)) for fs in filter_sizes
        ])
        self.output_dim = output_dim
        self.fc = nn.Linear(num_filters * len(filter_sizes), output_dim)

    def forward(self, x):
        """Encode a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids with shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, output_dim)``.
        """
        embedded = self.embedding(x)  # (batch, seq_len, embedding_dim)

        # A convolution cannot run when its kernel is taller than the input.
        # Zero-pad the end of short sequences up to the largest kernel height so
        # very short inputs produce features instead of raising.
        min_len = max((conv.kernel_size[0] for conv in self.convs), default=0)
        shortfall = min_len - embedded.size(1)
        if shortfall > 0:
            # F.pad pads from the last dim backwards: (emb_left, emb_right, seq_left, seq_right).
            embedded = F.pad(embedded, (0, 0, 0, shortfall))

        embedded = embedded.unsqueeze(1)  # add the single input channel
        # Each kernel spans the whole embedding width, so dim 3 collapses to size 1.
        conv_results = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        # Global max-pool over the sequence axis: one value per filter.
        pooled_results = [F.max_pool1d(cr, cr.size(2)).squeeze(2) for cr in conv_results]
        cat = torch.cat(pooled_results, 1)
        return self.fc(cat)

## Combined Model

In [None]:
class CombinedModel(nn.Module):
    """Classifier that fuses a transformer encoder with a CNN feature extractor.

    The hidden state at sequence position 0 of the encoder output is
    concatenated with the CNN feature vector and mapped to class logits by a
    single linear layer.

    Args:
        roberta_model: Encoder module; must expose ``config.hidden_size`` and
            return an object with ``last_hidden_state`` when called with
            ``input_ids`` and ``attention_mask``.
        cnn_model: Module exposing ``output_dim`` and mapping ``cnn_input`` to a
            ``(batch, output_dim)`` tensor.
        hidden_dim: Accepted for interface compatibility; not used by this class.
        num_classes: Number of output logits.
    """

    def __init__(self, roberta_model, cnn_model, hidden_dim, num_classes):
        super().__init__()
        self.roberta_model = roberta_model
        self.cnn_model = cnn_model
        fused_width = cnn_model.output_dim + roberta_model.config.hidden_size
        self.fc = nn.Linear(fused_width, num_classes)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids=None, attention_mask=None, cnn_input=None, labels=None):
        """Compute class logits, plus the cross-entropy loss when labels are given.

        Returns:
            ``logits`` when ``labels`` is None, otherwise the tuple
            ``(loss, logits)``.
        """
        encoder_out = self.roberta_model(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at sequence position 0 serves as the sequence summary.
        first_token_state = encoder_out.last_hidden_state[:, 0, :]
        keyword_features = self.cnn_model(cnn_input)

        fused = torch.cat((first_token_state, keyword_features), dim=1)
        logits = self.fc(fused)

        if labels is None:
            return logits
        return self.loss_fn(logits, labels), logits

## Load dataset

### Load data from Huggingface

In [None]:
from datasets import load_dataset, Dataset, DatasetDict

label_mapping = {'Entailed': 0, 'Neutral': 1, 'Contradict': 2}


def _load_legal_lens_rows(legal_type: str, keep_matching: bool) -> pd.DataFrame:
    """Load LegalLensNLI rows selected by their ``legal_act`` value.

    Args:
        legal_type: The ``legal_act`` value to select on.
        keep_matching: When True keep rows whose ``legal_act`` equals
            ``legal_type``; when False keep every other row.

    Returns:
        DataFrame of the selected rows with ``label`` encoded via ``label_mapping``.

    Raises:
        ValueError: If a selected row carries a label missing from ``label_mapping``.
    """
    nli_dataset = load_dataset("darrow-ai/LegalLensNLI")
    selected = nli_dataset["train"].filter(
        lambda example: (example["legal_act"] == legal_type) == keep_matching
    )
    rows_df = selected.to_pandas()

    encoded = rows_df['label'].map(label_mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        # Fail with an actionable message instead of an opaque NaN-to-int cast error.
        unknown = sorted(set(rows_df.loc[unmapped, 'label'].astype(str)))
        raise ValueError(
            f"Unrecognised label value(s) {unknown}; expected one of {sorted(label_mapping)}"
        )
    rows_df['label'] = encoded.astype(int)
    return rows_df


def get_train_nli_data(legal_type: str) -> pd.DataFrame:
    """Return every LegalLensNLI row whose ``legal_act`` differs from ``legal_type``."""
    return _load_legal_lens_rows(legal_type, keep_matching=False)


def get_test_nli_data(legal_type: str) -> pd.DataFrame:
    """Return the LegalLensNLI rows whose ``legal_act`` equals ``legal_type``."""
    return _load_legal_lens_rows(legal_type, keep_matching=True)

# Leave-one-legal-act-out split: rows whose `legal_act` equals `legal_type` are
# held out for evaluation, every other row is used for training.
legal_type = "wage"
train_df = get_train_nli_data(legal_type)
test_df = get_test_nli_data(legal_type)

# Wrap both frames as Hugging Face datasets; the held-out rows are stored under
# the 'validation' key.
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
datasets = DatasetDict({'train': train_dataset, 'validation': test_dataset})


Downloading readme:   0%|          | 0.00/3.52k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/380k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/312 [00:00<?, ? examples/s]

Filter:   0%|          | 0/312 [00:00<?, ? examples/s]

Filter:   0%|          | 0/312 [00:00<?, ? examples/s]

### Tokenize data and build combined model inputs

In [None]:
def tokenize_for_sequence_model(examples):
    """Tokenize a batch of premise/hypothesis pairs for the transformer branch.

    Premises and hypotheses are handed to the tokenizer as two separate
    arguments, with truncation and padding enabled.

    Args:
        examples: Batch mapping with 'premise' and 'hypothesis' columns.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    premises = examples['premise']
    hypotheses = examples['hypothesis']
    return tokenizer(premises, hypotheses, truncation=True, padding=True)

def tokenize_for_cnn(examples):
    """Tokenize a batch for the CNN branch.

    Each premise is joined to its hypothesis with a single space and the
    resulting strings are tokenized as one text per example.

    Args:
        examples: Batch mapping with 'premise' and 'hypothesis' columns.

    Returns:
        Dict holding only the 'input_ids' and 'attention_mask' entries of the
        tokenizer output.
    """
    joined_texts = [
        " ".join(pair) for pair in zip(examples['premise'], examples['hypothesis'])
    ]
    encoded = tokenizer(joined_texts, padding=True, truncation=True, return_tensors="pt")
    return {key: encoded[key] for key in ('input_ids', 'attention_mask')}


def _input_ids_as_lists(split):
    """Return every entry of the split's `input_ids` column as a plain Python list."""
    return [ids.tolist() for ids in split['input_ids']]


# Transformer branch: pair-encoded inputs; the raw text columns are dropped afterwards.
# NOTE(review): `padding=True` inside a batched map pads per map batch, so rows
# presumably share one length only while a split fits in a single map batch —
# confirm before scaling the dataset up.
tokenized_datasets = datasets.map(tokenize_for_sequence_model, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['premise', 'hypothesis'])
tokenized_datasets.set_format('torch')

# CNN branch: premise and hypothesis tokenized as one concatenated text.
cnn_tokenized_datasets = datasets.map(tokenize_for_cnn, batched=True)
cnn_tokenized_datasets.set_format('torch')

# Training split: attach the CNN token ids as an extra `cnn_input` column.
sequence_inputs = tokenized_datasets["train"]
cnn_inputs = cnn_tokenized_datasets["train"]
cnn_input_lists = _input_ids_as_lists(cnn_inputs)
combined_dataset_train = sequence_inputs.add_column('cnn_input', cnn_input_lists)

# Validation split: same procedure.
sequence_inputs_val = tokenized_datasets["validation"]
cnn_inputs_val = cnn_tokenized_datasets["validation"]
cnn_input_lists_val = _input_ids_as_lists(cnn_inputs_val)
combined_dataset_val = sequence_inputs_val.add_column('cnn_input', cnn_input_lists_val)

Map:   0%|          | 0/299 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

Map:   0%|          | 0/299 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

## Train

In [None]:
def compute_metrics(p):
    """Compute accuracy and macro-averaged precision/recall/F1 for an evaluation pass.

    Args:
        p: Object exposing ``predictions`` (class scores; the arg-max over the
            last axis is taken as the predicted class) and ``label_ids``
            (reference class ids).

    Returns:
        Dict with the keys 'accuracy', 'macro_f1', 'macro_precision' and
        'macro_recall'.
    """
    preds = p.predictions.argmax(-1)
    labels = p.label_ids
    # zero_division=0 scores a class with no predicted (or no true) samples as 0,
    # exactly like the default, but without emitting an undefined-metric warning
    # on every evaluation of a small validation set.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "macro_f1": f1,
        "macro_precision": precision,
        "macro_recall": recall
    }

In [None]:
# Hyper-parameters of the CNN branch.
vocab_size = tokenizer.vocab_size   # embedding table covers the tokenizer vocabulary
embedding_dim = 100
num_filters = 100
filter_sizes = [2, 3, 4]            # n-gram widths, one convolution each
output_dim = 50

cnn_model = CNNKeywordModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    num_filters=num_filters,
    filter_sizes=filter_sizes,
    output_dim=output_dim,
)

# num_classes=3 matches the three entries of `label_mapping`.
# `hidden_dim` is part of the constructor signature but CombinedModel does not read it.
combined_model = CombinedModel(
    roberta_model=roberta_model,
    cnn_model=cnn_model,
    hidden_dim=256,
    num_classes=3,
)

In [None]:
# Evaluate, checkpoint and log once per epoch, then restore the checkpoint with
# the best validation macro-F1.
# - `eval_steps` / `save_steps` were dropped: they only apply to the 'steps'
#   strategies and were silently ignored with 'epoch'.
# - `logging_strategy='epoch'` makes the training loss appear for every epoch
#   instead of "No log" until the default step-based logging interval is reached.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='macro_f1',
    greater_is_better=True,
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=20,
    weight_decay=0.01,
    save_total_limit=2,
    warmup_steps=50,
)

# NOTE(review): `combined_dataset_val` is built from the held-out `legal_type`
# rows, so best-checkpoint selection uses the same data the reported metrics
# are computed on.
trainer = Trainer(
    model=combined_model,
    args=training_args,
    train_dataset=combined_dataset_train,
    eval_dataset=combined_dataset_val,
    compute_metrics=compute_metrics,
)

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Macro F1,Macro Precision,Macro Recall
1,No log,0.777663,0.615385,0.452381,0.383333,0.555556
2,No log,1.306032,0.769231,0.719048,0.888889,0.694444
3,No log,1.240154,0.846154,0.805556,0.866667,0.833333
4,No log,0.728826,0.846154,0.815629,0.84127,0.805556
5,No log,0.76146,0.923077,0.904762,0.916667,0.916667
6,No log,1.155567,0.846154,0.838095,0.916667,0.805556
7,0.288300,1.203833,0.846154,0.815629,0.84127,0.805556
8,0.288300,0.999322,0.923077,0.904762,0.916667,0.916667
9,0.288300,1.638051,0.846154,0.838095,0.916667,0.805556
10,0.288300,1.584453,0.846154,0.838095,0.916667,0.805556


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=1500, training_loss=0.09745880721819898, metrics={'train_runtime': 4175.6765, 'train_samples_per_second': 1.432, 'train_steps_per_second': 0.359, 'total_flos': 0.0, 'train_loss': 0.09745880721819898, 'epoch': 20.0})

## Save model

In [None]:
!pip install transformers huggingface_hub



In [None]:
import os

from huggingface_hub import login

# Never paste a write token into the notebook source: it would be saved and
# shared with the file. Read it from the HF_TOKEN environment variable instead;
# when the variable is unset, `token=None` makes `login` prompt for it.
login(token=os.environ.get("HF_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import os
import torch

from huggingface_hub import HfApi

# Define your model directory and Hugging Face repo names
hf_repo_name = "nimamegh/roberta_cnn_legal"
save_directory = './saved_model'
os.makedirs(save_directory, exist_ok=True)

# Save the RoBERTa model and tokenizer locally. `roberta_model` is the same
# object that CombinedModel holds as `self.roberta_model`.
roberta_model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

# Push the RoBERTa model and tokenizer to Hugging Face Hub
roberta_model.push_to_hub(hf_repo_name)
tokenizer.push_to_hub(hf_repo_name)

# Additional artifacts, keyed by the file name used both locally and in the repo.
# NOTE: `training_args.bin` is a pickled Python object, not a tensor state dict;
# only load it from sources you trust.
extra_artifacts = {
    'cnn_model.pth': cnn_model.state_dict(),
    'combined_model.pth': combined_model.state_dict(),
    'training_args.bin': training_args,
}

# Write every artifact into the local save directory.
for filename, payload in extra_artifacts.items():
    torch.save(payload, os.path.join(save_directory, filename))

# Upload the artifacts next to the pushed model files.
api = HfApi()
for filename in extra_artifacts:
    api.upload_file(
        path_or_fileobj=os.path.join(save_directory, filename),
        path_in_repo=filename,
        repo_id=hf_repo_name,
    )



README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

cnn_model.pth:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

combined_model.pth:   0%|          | 0.00/1.44G [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/nimamegh/roberta_cnn_legal/commit/85fb5df32dd41523e8ac4ab81c5f7709e6f5da0d', commit_message='Upload training_args.bin with huggingface_hub', commit_description='', oid='85fb5df32dd41523e8ac4ab81c5f7709e6f5da0d', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Bundle the saved artifacts into one archive for download.
# NOTE(review): the path is absolute, while the files were written to the
# relative './saved_model'; the two coincide only when the working directory
# is /content — confirm outside Colab.
!zip -r nli_model.zip /content/saved_model