# Train

In [None]:
!pip install spacy transformers torch spacy_transformers datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests<3.0.0,>=2.13.0 (from spacy)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 

## Create config file for spaCy

In [None]:
# Base spaCy training config for a two-component pipeline (transformer + ner).
# The [paths] entries are null here; the train/dev corpus paths are supplied on the
# command line (--paths.train / --paths.dev) by the `spacy train` cell further down.
# Everything inside the triple-quoted string is written to disk verbatim, so notes
# about the config are kept in these Python comments rather than inside the string.
config = """
[paths]
train = null
dev = null
vectors = null

[system]
gpu_allocator = "pytorch"

[nlp]
lang = "en"
pipeline = ["transformer","ner"]
batch_size = 16

[components]

[components.transformer]
factory = "transformer"

[components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v3"
name = "microsoft/deberta-v3-base"
tokenizer_config = {"use_fast": true}

[components.transformer.model.get_spans]
@span_getters = "spacy-transformers.strided_spans.v1"
window = 128
stride = 96

[components.ner]
factory = "ner"

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = false
nO = null

[components.ner.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0

[components.ner.model.tok2vec.pooling]
@layers = "reduce_mean.v1"

[corpora]

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0

[training]
accumulate_gradient = 3
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"

[training.optimizer]
@optimizers = "Adam.v1"

[training.optimizer.learn_rate]
@schedules = "warmup_linear.v1"
warmup_steps = 250
total_steps = 20000
initial_rate = 5e-5

[training.batcher]
@batchers = "spacy.batch_by_padded.v1"
discard_oversize = true
size = 2000
buffer = 256

[initialize]
vectors = ${paths.vectors}

"""
# Write the partial config to ./config.cfg; the `spacy init fill-config` cell below
# reads this file and overwrites it in place with the auto-filled version.
with open('./config.cfg', 'w') as f:
    f.write(config)

## Load data

### Load data from Hugging Face

In [None]:
from datasets import load_dataset
import pandas as pd
import json

# Load the 'train' split of the darrow-ai/LegalLensNER dataset from the Hugging Face Hub.
dataset = load_dataset("darrow-ai/LegalLensNER", split='train')

# Materialise the split as a DataFrame so rows can be iterated with pandas.
df_hf = pd.DataFrame(dataset)
print(len(df_hf))
# NOTE(review): this is not the last expression of the cell, so the preview is not
# displayed; move it to its own cell if the preview is wanted.
df_hf.head()

def transform_to_spacy_format(df):
    """Convert token/tag rows into spaCy-style ``(text, {"entities": [...]})`` tuples.

    Each row's tokens are joined with single spaces to form the text. Every token
    whose tag is not ``'O'`` becomes its own ``(start, end, tag)`` character span,
    labelled with the tag exactly as it appears in the row.

    Args:
        df: DataFrame with ``tokens`` and ``ner_tags`` columns. Each cell is either
            the string form of a Python list (e.g. ``"['a', 'b']"``) or an
            already-parsed sequence.

    Returns:
        A list with one ``(text, {"entities": [(start, end, label), ...]})`` tuple
        per row, in row order.

    Raises:
        ValueError, SyntaxError: if a string cell is not a valid Python literal.
    """
    import ast  # local import: keeps the function independent of cell-level imports

    def _as_list(cell):
        # literal_eval only accepts Python literals, so a crafted cell in a
        # downloaded dataset cannot execute code the way eval() would.
        if isinstance(cell, str):
            return list(ast.literal_eval(cell))
        return list(cell)

    spacy_data = []
    for _, row in df.iterrows():
        tokens = _as_list(row['tokens'])
        ner_tags = _as_list(row['ner_tags'])

        text = ' '.join(tokens)
        ents = []
        offset = 0  # character position where the current token starts in `text`

        for token, tag in zip(tokens, ner_tags):
            token_end = offset + len(token)
            if tag != 'O':
                ents.append((offset, token_end, tag))
            offset = token_end + 1  # step over the single joining space

        spacy_data.append((text, {"entities": ents}))

    return spacy_data

# Convert the training rows to spaCy-style (text, {"entities": [...]}) annotations.
spacy_data = transform_to_spacy_format(df_hf)
output_json_path = './train_annotations.json'
# Persist as UTF-8 JSON; ensure_ascii=False keeps non-ASCII characters unescaped.
with open(output_json_path, 'w', encoding='utf-8') as f:
    json.dump(spacy_data, f, ensure_ascii=False, indent=4)

# Last expression of the cell: echoes the path that was just written.
output_json_path


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/802k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/685k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/710 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/617 [00:00<?, ? examples/s]

710


'./train_annotations.json'

In [None]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
import json

# Reload the annotations written by the previous cell. The file was saved as UTF-8
# with ensure_ascii=False, so it is read back with the same explicit encoding, and
# the context manager closes the handle instead of leaking it.
with open('./train_annotations.json', 'r', encoding='utf-8') as f:
    cv_data = json.load(f)

len(cv_data)

cv_data[0]

def get_spacy_doc(file, data):
    """Build a spaCy ``DocBin`` from ``(text, {"entities": [...]})`` annotations.

    Each text is tokenized with a blank English pipeline. Entities are added in the
    order given; an entity that overlaps an already accepted one is skipped, and an
    entity that does not align to token boundaries is logged and skipped.

    Args:
        file: Writable text handle used as a log for entities and documents that
            had to be skipped.
        data: Iterable of ``(text, annot)`` pairs, where ``annot['entities']`` is a
            list of ``(start, end, label)`` character offsets into ``text``.

    Returns:
        A ``DocBin`` holding one doc per text whose entities could be set.
    """
    nlp = spacy.blank('en')
    db = DocBin()

    for text, annot in tqdm(data):
        doc = nlp.make_doc(text)

        ents = []
        # Character offsets covered by accepted entities. A set keeps the overlap
        # test linear instead of rescanning a growing list for every character.
        taken = set()

        for start, end, label in annot['entities']:
            span_chars = range(start, end)
            if not taken.isdisjoint(span_chars):
                continue  # overlaps an entity that was already accepted

            try:
                span = doc.char_span(start, end, label=label, alignment_mode='strict')
            except Exception as exc:
                # Best-effort: skip the entity, but leave a trace in the log.
                file.write(str([start, end]) + "    char_span failed: " + str(exc) + "\n")
                continue

            if span is None:
                err_data = str([start, end]) + "    " + str(text) + "\n"
                file.write(err_data)
                continue

            # Reserve the offsets only once the span is accepted, so a rejected
            # span cannot block a later, valid one.
            taken.update(span_chars)
            ents.append(span)

        try:
            doc.ents = ents
            db.add(doc)
        except Exception as exc:
            # Best-effort: drop the document, but record that it was dropped.
            file.write("skipped document (" + str(exc) + "):    " + str(text) + "\n")

    return db

from sklearn.model_selection import train_test_split

# Hold out 20% of the annotated documents as the dev set used during training.
# A fixed random_state makes the split, and therefore the reported scores,
# reproducible across kernel restarts.
train, test = train_test_split(cv_data, test_size=0.2, random_state=42)

len(train), len(test)

# get_spacy_doc logs entities it cannot align to this file. The context manager
# closes it even if the conversion raises, and UTF-8 is explicit because the logged
# text may contain non-ASCII characters.
with open('./train_file.txt', 'w', encoding='utf-8') as file:
    db = get_spacy_doc(file, train)
    db.to_disk('./train_data.spacy')

    db = get_spacy_doc(file, test)
    db.to_disk('./test_data.spacy')


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
100%|██████████| 568/568 [00:00<00:00, 1450.09it/s]
100%|██████████| 142/142 [00:00<00:00, 1126.31it/s]


In [None]:
!python -m spacy init fill-config ./config.cfg ./config.cfg

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
!python -m spacy train ./config.cfg  --output ./output  --paths.train ./train_data.spacy  --paths.dev ./test_data.spacy --gpu-id 0

[38;5;2m✔ Created output directory: output[0m
[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
tokenizer_config.json: 100% 52.0/52.0 [00:00<00:00, 381kB/s]
config.json: 100% 579/579 [00:00<00:00, 4.51MB/s]
spm.model: 100% 2.46M/2.46M [00:00<00:00, 45.6MB/s]
  _torch_pytree._register_pytree_node(
pytorch_model.bin: 100% 371M/371M [00:01<00:00, 204MB/s]
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  -------------  --------  ------  ------  ------  ------
  0       0        3132.84    838.77    0.00    0.00    0.00    0.00
  8     200       99585.36  82393.59   85.23   78.30   93.51    0.85

[31mAborted.[0m


# Save

## Local

In [None]:
import zipfile
import os
from IPython.display import FileLink

def zip_dir(directory = "./output/model-best", file_name = 'output.zip'):
    """Zip every file under ``directory`` into ``<directory>/<file_name>``.

    The working directory is left untouched. Archive members are stored with paths
    relative to ``directory``, and the archive itself is excluded from its own
    contents.

    Args:
        directory: Folder whose files (including sub-folders) are archived.
        file_name: Name of the zip file created inside ``directory``.

    Returns:
        An ``IPython.display.FileLink`` pointing at the created archive.
    """
    archive_path = os.path.join(directory, file_name)
    archive_abs = os.path.abspath(archive_path)

    # The context manager closes the archive, which is what writes the zip's
    # central directory; an unclosed ZipFile leaves an unreadable file behind.
    with zipfile.ZipFile(archive_path, mode='w') as zip_ref:
        for folder, _, files in os.walk(directory):
            for file in files:
                file_path = os.path.join(folder, file)
                if os.path.abspath(file_path) == archive_abs:
                    continue  # never add the archive to itself
                zip_ref.write(file_path, arcname=os.path.relpath(file_path, directory))

    return FileLink(archive_path)

zip_dir()

In [None]:
!zip -r ./file.zip ./output/model-best

  adding: content/output/model-best/ (stored 0%)
  adding: content/output/model-best/transformer/ (stored 0%)
  adding: content/output/model-best/transformer/model (deflated 23%)
  adding: content/output/model-best/transformer/cfg (stored 0%)
  adding: content/output/model-best/.ipynb_checkpoints/ (stored 0%)
  adding: content/output/model-best/meta.json (deflated 67%)
  adding: content/output/model-best/vocab/ (stored 0%)
  adding: content/output/model-best/vocab/strings.json (deflated 74%)
  adding: content/output/model-best/vocab/lookups.bin (stored 0%)
  adding: content/output/model-best/vocab/vectors.cfg (stored 0%)
  adding: content/output/model-best/vocab/key2row (stored 0%)
  adding: content/output/model-best/vocab/vectors (deflated 45%)
  adding: content/output/model-best/ner/ (stored 0%)
  adding: content/output/model-best/ner/moves (deflated 78%)
  adding: content/output/model-best/ner/model (deflated 8%)
  adding: content/output/model-best/ner/cfg (deflated 33%)
  adding: c

## Hugging Face

In [None]:
!python -m spacy package /content/output/model-best /content/hf --build wheel

[38;5;4mℹ Building package artifacts: wheel[0m
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[38;5;2m✔ Including 1 package requirement(s) from meta and config[0m
spacy-transformers>=1.3.5,<1.4.0
[38;5;2m✔ Loaded meta.json from file[0m
/content/output/model-best/meta.json
[38;5;2m✔ Generated README.md from meta.json[0m
[38;5;2m✔ Successfully created package directory 'en_pipeline-0.0.0'[0m
/content/hf/en_pipeline-0.0.0
[1m* Creating isolated environment: venv+pip...[0m
[1m* Installing packages in isolated environment:[0m
  - setuptools >= 40.8.0
[1m* Getting build dependencies for wheel...[0m
running egg_info
creating en_pipeline.egg-info
writing en_pipeline.egg-info/PKG-INFO
writing dependency_links to en_pipeline.egg-info/dependency_links.txt
writing entry points to en_pipeline.egg-info/entry_points.txt
writing requirements to en_pipeline.egg-info/requires.txt
writing top-level names to en_pipeline.

In [None]:
!pip install spacy-huggingface-hub

Collecting spacy-huggingface-hub
  Downloading spacy_huggingface_hub-0.0.10-py3-none-any.whl.metadata (5.5 kB)
Collecting typer<0.8.0,>=0.3.0 (from spacy-huggingface-hub)
  Downloading typer-0.7.0-py3-none-any.whl.metadata (17 kB)
Downloading spacy_huggingface_hub-0.0.10-py3-none-any.whl (8.3 kB)
Downloading typer-0.7.0-py3-none-any.whl (38 kB)
Installing collected packages: typer, spacy-huggingface-hub
  Attempting uninstall: typer
    Found existing installation: typer 0.12.3
    Uninstalling typer-0.12.3:
      Successfully uninstalled typer-0.12.3
Successfully installed spacy-huggingface-hub-0.0.10 typer-0.7.0


In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write)

In [None]:
%cd /content/hf/en_pipeline-0.0.0/dist/

/content/hf/en_pipeline-0.0.0/dist


In [None]:
!python -m spacy huggingface-hub push en_pipeline-0.0.0-py3-none-any.whl

[38;5;4mℹ Publishing to repository 'nimamegh/en_pipeline'[0m
result /tmp/tmp74hsiez5/en_pipeline/meta.json en_pipeline/meta.json
result /tmp/tmp74hsiez5/en_pipeline/en_pipeline-0.0.0/README.md en_pipeline/en_pipeline-0.0.0/README.md
result /tmp/tmp74hsiez5/en_pipeline/en_pipeline-0.0.0/config.cfg en_pipeline/en_pipeline-0.0.0/config.cfg
result /tmp/tmp74hsiez5/en_pipeline/en_pipeline-0.0.0/meta.json en_pipeline/en_pipeline-0.0.0/meta.json
result /tmp/tmp74hsiez5/en_pipeline/en_pipeline-0.0.0/tokenizer en_pipeline/en_pipeline-0.0.0/tokenizer
result /tmp/tmp74hsiez5/en_pipeline/en_pipeline-0.0.0/ner/cfg en_pipeline/en_pipeline-0.0.0/ner/cfg
result /tmp/tmp74hsiez5/en_pipeline/en_pipeline-0.0.0/ner/model en_pipeline/en_pipeline-0.0.0/ner/model
result /tmp/tmp74hsiez5/en_pipeline/en_pipeline-0.0.0/ner/moves en_pipeline/en_pipeline-0.0.0/ner/moves
result /tmp/tmp74hsiez5/en_pipeline/en_pipeline-0.0.0/transformer/cfg en_pipeline/en_pipeline-0.0.0/transformer/cfg
result /tmp/tmp74hsiez5/en_

# Load pretrained model

In [None]:
!pip install spacy transformers spacy_transformers spacy-huggingface-hub datasets

Collecting spacy_transformers
  Downloading spacy_transformers-1.3.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting spacy-huggingface-hub
  Downloading spacy_huggingface_hub-0.0.10-py3-none-any.whl.metadata (5.5 kB)
Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting transformers
  Downloading transformers-4.36.2-py3-none-any.whl.metadata (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.8/126.8 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-alignments<1.0.0,>=0.7.2 (from spacy_transformers)
  Downloading spacy_alignments-0.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.7 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting typer<1.0.0,>=0.3.0 (from spacy)
  Downloading typer-0.7.0-py3-none-any.whl.metadata (1

In [None]:
import os
import subprocess
from getpass import getpass

import spacy

# The access token must never be written into the notebook: it is read from the
# HF_TOKEN environment variable, falling back to an interactive prompt.
# NOTE(review): any token that was ever pasted into this cell is exposed to everyone
# who can read the notebook and should be revoked in the Hugging Face settings.
model_path = "./ner_model"
if not os.path.isdir(model_path):  # skip the clone when the cell is re-run
    hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
    clone_url = f"https://nimamegh:{hf_token}@huggingface.co/nimamegh/ner_model"
    # No check=True on purpose: CalledProcessError would echo the token-bearing URL
    # in the traceback that gets saved with the notebook.
    clone = subprocess.run(["git", "clone", clone_url, model_path])
    del hf_token, clone_url
    if clone.returncode != 0:
        raise RuntimeError("git clone of nimamegh/ner_model failed; see the git output above")
    # Drop the credential from the stored remote so it does not linger in .git/config.
    subprocess.run(
        ["git", "-C", model_path, "remote", "set-url", "origin",
         "https://huggingface.co/nimamegh/ner_model"]
    )
nlp = spacy.load(model_path)



# Evaluate

## Load test data from csv file

In [None]:
from datasets import load_dataset
import pandas as pd
import json

# Path of the test CSV; later cells reuse it through `test_path`.
test_path = './NER_test.csv'
df_hf = pd.read_csv(test_path)
print(len(df_hf))
# NOTE(review): this is not the last expression of the cell, so the preview is not
# displayed; move it to its own cell if the preview is wanted.
df_hf.head()

def transform_to_spacy_format(df):
    """Convert token/tag rows into spaCy-style ``(text, {"entities": [...]})`` tuples.

    Each row's tokens are joined with single spaces to form the text. Every token
    whose tag is not ``'O'`` becomes its own ``(start, end, tag)`` character span,
    labelled with the tag exactly as it appears in the row.

    Args:
        df: DataFrame with ``tokens`` and ``ner_tags`` columns. Each cell is either
            the string form of a Python list (e.g. ``"['a', 'b']"``) or an
            already-parsed sequence.

    Returns:
        A list with one ``(text, {"entities": [(start, end, label), ...]})`` tuple
        per row, in row order.

    Raises:
        ValueError, SyntaxError: if a string cell is not a valid Python literal.
    """
    import ast  # local import: keeps the function independent of cell-level imports

    def _as_list(cell):
        # literal_eval only accepts Python literals, so a crafted cell in the CSV
        # cannot execute code the way eval() would.
        if isinstance(cell, str):
            return list(ast.literal_eval(cell))
        return list(cell)

    spacy_data = []
    for _, row in df.iterrows():
        tokens = _as_list(row['tokens'])
        ner_tags = _as_list(row['ner_tags'])
        text = ' '.join(tokens)
        ents = []
        offset = 0  # character position where the current token starts in `text`

        for token, tag in zip(tokens, ner_tags):
            token_end = offset + len(token)
            if tag != 'O':
                ents.append((offset, token_end, tag))
            offset = token_end + 1  # step over the single joining space

        spacy_data.append((text, {"entities": ents}))

    return spacy_data

# Convert the test rows to spaCy-style (text, {"entities": [...]}) annotations.
spacy_data = transform_to_spacy_format(df_hf)

# Persist as UTF-8 JSON; ensure_ascii=False keeps non-ASCII characters unescaped.
output_json_path = './test_annotations.json'
with open(output_json_path, 'w', encoding='utf-8') as f:
    json.dump(spacy_data, f, ensure_ascii=False, indent=4)

# Last expression of the cell: echoes the path that was just written.
output_json_path


617


'./test_annotations.json'

In [None]:
import spacy
from spacy.training import Example
import json
from sklearn.metrics import classification_report
from spacy.tokens import Doc

# Register a custom `id` attribute on Doc; force=True overwrites an existing
# registration, so the cell can be run more than once.
Doc.set_extension("id", default=None, force=True)

# The annotations were written as UTF-8 with ensure_ascii=False, so read them back
# with the same explicit encoding rather than the platform default.
with open('test_annotations.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

# Row ids from the test CSV, in file order; they key the prediction rows below.
id_df = pd.read_csv(test_path)
ids = id_df['id'].tolist()

def create_examples(data, nlp):
    """Turn ``(text, {"entities": [...]})`` pairs into spaCy ``Example`` objects.

    Args:
        data: Iterable of ``(text, annot)`` pairs; ``annot['entities']`` holds
            ``(start, end, label)`` character offsets.
        nlp: Pipeline whose tokenizer is used to create each doc.

    Returns:
        A list with one ``Example`` per input pair, in input order.
    """
    def _to_example(text, annot):
        doc = nlp.make_doc(text)
        offsets = annot['entities']
        # char_span yields None for offsets it cannot map to a span; drop those.
        candidates = (doc.char_span(begin, stop, label=tag) for begin, stop, tag in offsets)
        doc.ents = [candidate for candidate in candidates if candidate is not None]
        gold = {"entities": [(begin, stop, tag) for begin, stop, tag in offsets]}
        return Example.from_dict(doc, gold)

    return [_to_example(text, annot) for text, annot in data]

examples = create_examples(test_data, nlp)


true_labels = []
predicted_labels = []
data_for_csv = []

for idx, example in enumerate(examples):
    doc = example.reference
    pred_doc = nlp(doc.text)
    # NOTE(review): these are spaCy tokens, which may not match the token list in
    # the test CSV one-to-one — confirm this is what the submission format expects.
    tokens = [token.text for token in doc]
    predicted_ner_tags = []
    for token in doc:
        true_label = token.ent_type_ if token.ent_iob_ != "O" else "O"
        # Indexing by position assumes nlp(text) tokenizes exactly like the
        # reference doc built from the same text — TODO confirm.
        pred_token = pred_doc[token.i]
        pred_label = pred_token.ent_type_ if pred_token.ent_iob_ != "O" else "O"

        true_labels.append(true_label)
        predicted_labels.append(pred_label)
        # Collect the per-token predictions so the CSV's ner_tags column is
        # populated rather than left as an empty list.
        predicted_ner_tags.append(pred_label)

    data_for_csv.append({
        "id": ids[idx],
        "tokens": tokens,
        "ner_tags": predicted_ner_tags
    })

df = pd.DataFrame(data_for_csv)

# Save to CSV
df.to_csv('./predictions_NERLens.csv', index=False)
# Sorted so the report lists labels in the same order on every run.
all_labels = sorted(set(true_labels + predicted_labels))

report = classification_report(true_labels, predicted_labels, labels=all_labels, output_dict=True)

print(json.dumps(report, indent=2))


{
  "B-VIOLATED BY": {
    "precision": 0.922077922077922,
    "recall": 0.9466666666666667,
    "f1-score": 0.9342105263157895,
    "support": 75.0
  },
  "I-LAW": {
    "precision": 0.817351598173516,
    "recall": 0.93717277486911,
    "f1-score": 0.8731707317073171,
    "support": 191.0
  },
  "I-VIOLATION": {
    "precision": 0.9109398064621945,
    "recall": 0.7836884436291802,
    "f1-score": 0.8425364077669902,
    "support": 7087.0
  },
  "B-VIOLATION": {
    "precision": 0.7970540098199672,
    "recall": 0.7469325153374233,
    "f1-score": 0.7711797307996833,
    "support": 652.0
  },
  "O": {
    "precision": 0.955605575744427,
    "recall": 0.9823529411764705,
    "f1-score": 0.9687946772242384,
    "support": 35870.0
  },
  "B-VIOLATED ON": {
    "precision": 0.7794117647058824,
    "recall": 0.6973684210526315,
    "f1-score": 0.7361111111111112,
    "support": 76.0
  },
  "I-VIOLATED BY": {
    "precision": 0.9513888888888888,
    "recall": 0.9513888888888888,
    "f1-sc

In [None]:
# Function to check the format of the prediction file
def check_ner_format(predictions_file_path, test_file_path):
    """Validate the layout of an NER predictions CSV against the test CSV.

    The predictions file must be readable, contain the columns ``id``, ``tokens``
    and ``ner_tags``, and have exactly as many rows as the test file.

    Args:
        predictions_file_path: Path of the predictions CSV to validate.
        test_file_path: Path of the test CSV that defines the expected row count.

    Returns:
        ``(ok, message)``: ``ok`` is True when every check passes, and ``message``
        describes the first failed check or confirms the format.
    """
    # Read predictions first, then the test file, stopping at the first failure.
    frames = {}
    for label, path in (("predictions", predictions_file_path), ("test", test_file_path)):
        try:
            frames[label] = pd.read_csv(path)
        except Exception as e:
            return False, f"Error reading {label} CSV file: {e}"

    # Every required column must be present; extra columns are tolerated.
    required = ['id', 'tokens', 'ner_tags']
    found = list(frames["predictions"].columns)
    if any(column not in found for column in required):
        return False, f"Incorrect columns. Expected: {required}, Found: {found}"

    # One prediction row per test row.
    expected_rows = len(frames["test"])
    actual_rows = len(frames["predictions"])
    if actual_rows != expected_rows:
        return False, f"Incorrect number of predictions. Expected: {expected_rows}, Found: {actual_rows}"

    return True, "NER prediction file format is correct."

# Check the format of the predictions file
is_correct_format, message = check_ner_format('./predictions_NERLens.csv', test_path)
print(message)

NER prediction file format is correct.
