In [1]:
!pip install transformers datasets requests

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K

In [2]:
import io
import os

import requests
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

hf_repo_name = "nimamegh/roberta_cnn_legal"

# Credentials must not live in the notebook source: the read token is taken from
# the HF_TOKEN environment variable (e.g. set it with `%env HF_TOKEN=...` in a
# cell that is not saved, or export it before starting the kernel).
# NOTE(review): the token that used to be hardcoded here has been exposed in the
# notebook history and should be revoked in the Hugging Face account settings.
hf_read_token = os.environ.get("HF_TOKEN")
if not hf_read_token:
    raise RuntimeError(
        "Set the HF_TOKEN environment variable to a Hugging Face read token "
        "before running this notebook."
    )

# Encoder and tokenizer are both pulled from the same Hub repository.
roberta_model = AutoModel.from_pretrained(hf_repo_name, use_auth_token=hf_read_token)
tokenizer = AutoTokenizer.from_pretrained(hf_repo_name, use_auth_token=hf_read_token)

class CNNKeywordModel(nn.Module):
    """Multi-width convolutional encoder over token-id sequences.

    Token ids are embedded, passed through one ``Conv2d`` branch per entry of
    ``filter_sizes`` (each kernel spans the full embedding width), max-pooled
    over the sequence axis, concatenated and projected to ``output_dim``.
    """

    def __init__(self, vocab_size, embedding_dim, num_filters, filter_sizes, output_dim):
        """Build the embedding table, the convolution branches and the projection.

        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Width of each embedding vector (also the kernel width).
            num_filters: Output channels of every convolution branch.
            filter_sizes: Iterable of kernel heights, one branch per entry.
            output_dim: Size of the final projected feature vector.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        branches = []
        for kernel_height in filter_sizes:
            branches.append(nn.Conv2d(1, num_filters, (kernel_height, embedding_dim)))
        self.convs = nn.ModuleList(branches)

        # Kept as an attribute so that other modules can read the feature width.
        self.output_dim = output_dim
        self.fc = nn.Linear(len(filter_sizes) * num_filters, output_dim)

    def forward(self, x):
        """Map a (batch, seq_len) tensor of token ids to (batch, output_dim) features."""
        # Insert a singleton channel axis so Conv2d sees (batch, 1, seq_len, embedding_dim).
        embedded = self.embedding(x).unsqueeze(1)

        pooled_results = []
        for conv in self.convs:
            # The kernel covers the whole embedding width, leaving a size-1 last axis to drop.
            feature_map = F.relu(conv(embedded)).squeeze(3)
            # Pooling window equals the remaining length: one max value per filter.
            pooled = F.max_pool1d(feature_map, feature_map.size(2)).squeeze(2)
            pooled_results.append(pooled)

        return self.fc(torch.cat(pooled_results, 1))

# CNN branch sized to the tokenizer's vocabulary; three kernel heights (2, 3, 4)
# with 100 filters each, projected down to a 50-dimensional feature vector.
cnn_model = CNNKeywordModel(
    vocab_size=tokenizer.vocab_size,
    embedding_dim=100,
    num_filters=100,
    filter_sizes=[2, 3, 4],
    output_dim=50,
)

def load_state_dict_from_huggingface(hf_repo_name, filename, token, timeout=60):
    """Download a serialized PyTorch object from a Hugging Face repo and load it on CPU.

    Args:
        hf_repo_name: Repository id interpolated into the ``resolve/main`` URL.
        filename: Name of the file inside the repository.
        token: Access token sent as a bearer credential. When falsy, no
            ``Authorization`` header is sent instead of a meaningless
            ``Bearer None`` value.
        timeout: Seconds handed to ``requests.get`` so a stalled connection
            fails instead of blocking the notebook forever.

    Returns:
        The object ``torch.load`` deserializes from the downloaded bytes,
        with tensors mapped to the CPU.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    url = f"https://huggingface.co/{hf_repo_name}/resolve/main/{filename}"
    headers = {"Authorization": f"Bearer {token}"} if token else {}
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    # SECURITY: torch.load is pickle-based, so the downloaded payload can execute
    # arbitrary code while being deserialized. Only use this with repositories
    # you trust.
    return torch.load(io.BytesIO(response.content), map_location=torch.device('cpu'))

# Fetch the saved CNN weights from the Hub repository and restore them into the
# freshly constructed CNN branch.
state_dict = load_state_dict_from_huggingface(
    hf_repo_name,
    "cnn_model.pth",
    hf_read_token,
)
cnn_model.load_state_dict(state_dict)
class CombinedModel(nn.Module):
    """Classifier that fuses a transformer encoder with a CNN feature extractor.

    The encoder's hidden state at sequence position 0 is concatenated with the
    CNN output and passed through a single linear layer to produce class logits.
    """

    def __init__(self, roberta_model, cnn_model, hidden_dim, num_classes):
        """Store both branches and create the classification head.

        Args:
            roberta_model: Encoder exposing ``config.hidden_size`` and returning
                an object with ``last_hidden_state``.
            cnn_model: Module exposing ``output_dim`` and mapping ``cnn_input``
                to a feature tensor.
            hidden_dim: Accepted for interface compatibility; not used here.
            num_classes: Number of output logits.
        """
        super().__init__()
        self.roberta_model = roberta_model
        self.cnn_model = cnn_model

        fused_width = cnn_model.output_dim + roberta_model.config.hidden_size
        self.fc = nn.Linear(fused_width, num_classes)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids=None, attention_mask=None, cnn_input=None, labels=None):
        """Return ``logits``, or ``(loss, logits)`` when ``labels`` is provided."""
        encoder_out = self.roberta_model(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state of the first sequence position for every batch element.
        first_token_state = encoder_out.last_hidden_state[:, 0, :]

        keyword_features = self.cnn_model(cnn_input)
        logits = self.fc(torch.cat([first_token_state, keyword_features], dim=1))

        if labels is None:
            return logits
        return self.loss_fn(logits, labels), logits

# Assemble the two-branch classifier with three output classes, then restore
# its fine-tuned weights from the Hub repository.
combined_model = CombinedModel(
    roberta_model=roberta_model,
    cnn_model=cnn_model,
    hidden_dim=256,
    num_classes=3,
)
combined_state_dict = load_state_dict_from_huggingface(
    hf_repo_name,
    "combined_model.pth",
    hf_read_token,
)
combined_model.load_state_dict(combined_state_dict)

# The serialized training arguments are fetched through the same helper.
training_args = load_state_dict_from_huggingface(
    hf_repo_name,
    "training_args.bin",
    hf_read_token,
)

print("Model loaded from Hugging Face")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/914 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

Model loaded from Hugging Face


In [4]:
import os
import pandas as pd
from datasets import Dataset, DatasetDict
import torch
from transformers import Trainer, TrainingArguments, AutoTokenizer
import numpy as np
import torch.nn as nn
# Locations of the Excel test set and of its CSV counterpart.
excel_file_path = "./testset_NLI_LegalLens.xlsx"
test_csv = "./testset_NLI_LegalLens.csv"

# Convert the Excel sheet to CSV only when the CSV has not been produced yet.
if not os.path.exists(test_csv):
    pd.read_excel(excel_file_path).to_csv(test_csv, index=None, header=True)

# All later cells work from the CSV version.
df = pd.read_csv(test_csv)

print(df)

          id                                            premise  \
0   11468879  DEFENDANT agreed to a settlement in a class ac...   
1   12135604  DEFENDANT has agreed to a $865,000 class actio...   
2   12332938  A settlement has been reached in a class actio...   
3   13798813  DEFENDANT has agreed to a $21.875M settlement ...   
4    1576896  DEFENDANT has agreed to pay $2 million to sett...   
..       ...                                                ...   
79  91590520  DEFENDANT has agreed to a $500,000 class actio...   
80  95213221  A class action lawsuit has been settled for $3...   
81  96398889  DEFENDANT has agreed to a $985,000 class actio...   
82  98604253  DEFENDANT has agreed to a $75 million settleme...   
83   9964838  DEFENDANT has agreed to a $800,000 settlement ...   

                                           hypothesis  
0    I've been with DEFENDANT for a while now, and...  
1    Despite the data breach at DEFENDANT in Septe...  
2    I've used the point of

In [5]:
# Re-read the test CSV and keep only the two text columns used for inference
# (adjust the column names here if the file uses different headers).
text_columns = ['premise', 'hypothesis']
df = pd.read_csv(test_csv)[text_columns]

# Wrap the frame as the single 'validation' split of a DatasetDict.
test_dataset = Dataset.from_pandas(df)
datasets = DatasetDict({'validation': test_dataset})

In [6]:
def tokenize_for_sequence_model(examples):
    """Tokenize premise/hypothesis pairs with the notebook-level ``tokenizer``.

    Args:
        examples: Mapping with ``'premise'`` and ``'hypothesis'`` entries.

    Returns:
        Whatever ``tokenizer`` returns for the pair, with truncation and
        padding enabled.
    """
    premises = examples['premise']
    hypotheses = examples['hypothesis']
    return tokenizer(premises, hypotheses, truncation=True, padding=True)

def tokenize_for_cnn(examples):
    """Tokenize each premise joined to its hypothesis by a single space.

    Args:
        examples: Mapping with ``'premise'`` and ``'hypothesis'`` sequences of
            strings; pairs are formed positionally.

    Returns:
        Dict holding only the ``'input_ids'`` and ``'attention_mask'`` entries
        of the tokenizer output (requested as PyTorch tensors).
    """
    combined_texts = []
    for premise, hypothesis in zip(examples['premise'], examples['hypothesis']):
        combined_texts.append(premise + " " + hypothesis)

    encoding = tokenizer(combined_texts, padding=True, truncation=True, return_tensors="pt")

    wanted_keys = ('input_ids', 'attention_mask')
    return {key: encoding[key] for key in wanted_keys}

# Transformer-branch view: tokenize the pairs, drop the raw text columns and
# have the dataset return torch tensors.
tokenized_datasets = (
    datasets
    .map(tokenize_for_sequence_model, batched=True)
    .remove_columns(['premise', 'hypothesis'])
)
tokenized_datasets.set_format('torch')

# CNN-branch view: tokenize the space-joined text, also returned as tensors.
cnn_tokenized_datasets = datasets.map(tokenize_for_cnn, batched=True)
cnn_tokenized_datasets.set_format('torch')

sequence_inputs_val = tokenized_datasets["validation"]
cnn_inputs_val = cnn_tokenized_datasets["validation"]

# Attach the CNN token ids (converted to plain lists) as an extra 'cnn_input'
# column next to the transformer inputs.
cnn_input_lists_val = [
    token_ids.tolist()
    for token_ids in cnn_inputs_val['input_ids']
]
combined_dataset_val = sequence_inputs_val.add_column('cnn_input', cnn_input_lists_val)

Map:   0%|          | 0/84 [00:00<?, ? examples/s]

Map:   0%|          | 0/84 [00:00<?, ? examples/s]

In [7]:
import torch.nn.functional as F
# Inference-only Trainer configuration: small evaluation batches, results
# directory required by TrainingArguments.
training_args = TrainingArguments(
    output_dir='./results',
    per_device_eval_batch_size=2
)

trainer = Trainer(
    model=combined_model,
    args=training_args,
)

# Run the combined model over the validation split and take the arg-max class.
predictions = trainer.predict(combined_dataset_val)
label_mapping = {'Entailed': 0, 'Neutral': 1, 'Contradict': 2}
preds = np.argmax(predictions.predictions, axis=1)

# Translate class indices back to their label names.
inverse_label_mapping = {v: k for k, v in label_mapping.items()}
preds_labels = [inverse_label_mapping[pred] for pred in preds]

# Fail loudly if the model did not return exactly one prediction per input row.
if len(preds_labels) != len(df):
    raise ValueError(
        f"Got {len(preds_labels)} predictions for {len(df)} input rows"
    )

# Build new frames instead of adding a column to `df` through an alias and
# renaming in place: `df` is a column subset of the CSV frame, so mutating it
# can trigger pandas' SettingWithCopyWarning and silently changes the frame
# that earlier cells created.
original_val_df = df.assign(predicted_label=preds_labels)

result_df = (
    original_val_df[['premise', 'hypothesis', 'predicted_label']]
    .rename(columns={'predicted_label': 'label'})
)
result_df.to_csv('predictions_NLILens.csv', index=False)

print("Predictions saved to predictions_NLILens.csv")



Predictions saved to predictions_NLILens.csv


In [8]:
# Check the format of the predictions file
def check_nli_format(predictions_file_path, test_file_path):
    """
    Validate the layout of an NLI predictions CSV against the test-set CSV.

    The predictions file must be readable as CSV, contain the columns
    ``premise``, ``hypothesis`` and ``label`` (extra columns are tolerated),
    and have exactly as many rows as the test file.

    Returns a ``(bool, str)`` pair: whether the check passed, and a message
    describing the outcome or the first problem found.
    """
    # Both files have to load before anything can be compared; the
    # predictions file is tried first.
    try:
        predictions = pd.read_csv(predictions_file_path)
    except Exception as e:
        return False, f"Error reading predictions CSV file: {e}"

    try:
        reference = pd.read_csv(test_file_path)
    except Exception as e:
        return False, f"Error reading test CSV file: {e}"

    # Every required column must be present in the predictions file.
    expected_columns = ['premise', 'hypothesis', 'label']
    pred_columns = list(predictions.columns)
    if not set(expected_columns).issubset(pred_columns):
        return False, f"Incorrect columns. Expected: {expected_columns}, Found: {pred_columns}"

    # One prediction per test example.
    expected_nli_num_rows = len(reference)
    predictions_nli_num_rows = len(predictions)
    if predictions_nli_num_rows == expected_nli_num_rows:
        return True, "NLI prediction file format is correct."

    return False, f"Incorrect number of predictions. Expected: {expected_nli_num_rows}, Found: {predictions_nli_num_rows}"

# Validate the freshly written predictions file against the test-set CSV and
# report the outcome.
format_correct, message = check_nli_format(
    predictions_file_path='predictions_NLILens.csv',
    test_file_path=test_csv,
)
print(message)


NLI prediction file format is correct.
