 **Setup Environment**

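 Install the system packages and Python libraries needed for training (spaCy, spacy-transformers, PyTorch), then check that the GPU and CUDA toolkit are visible.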


In [None]:

# System prerequisite: refresh the apt package index and install the venv module.
!apt update
!apt install -y python3-venv

# spaCy (pinned) with transformer support, plus PyTorch from the CUDA 11.3 wheel index.
!pip install spacy==3.7.2
!pip install spacy-transformers
!pip install torch torchvision torchaudio -f https://download.pytorch.org/whl/cu113/torch_stable.html

# Pretrained English transformer pipeline.
!python -m spacy download en_core_web_trf

# The version specifier must be quoted: unquoted, the shell parses ">" as an
# output redirect, writes pip's output to a file named "=1.25.0", and installs
# numpy without any version constraint.
!pip install "numpy>=1.25.0" pandas-stubs==2.0.3.230814

# Report the visible GPU/driver and the CUDA compiler version.
!nvidia-smi
!nvcc --version

In [None]:
# Print the CUDA compiler version and the cuDNN version macros
# (the CUDNN_MAJOR line plus the two lines after it) from the system header.
!nvcc --version
!cat /usr/include/cudnn_version.h | grep CUDNN_MAJOR -A 2

# !pip install torch
import torch
# Prints True when PyTorch can see a usable CUDA device.
print(torch.cuda.is_available())

In [None]:
# Same pinned spaCy install as in the setup cell, followed by a printout of
# the installed spaCy version and environment details.
!pip install spacy==3.7.2
!python -m spacy info

In [None]:
# Transformer integration for spaCy and the Hugging Face transformers library.
# NOTE(review): neither package is version-pinned, so re-runs may pull different releases.
!pip install spacy-transformers
!pip install transformers

In [None]:
# import spacy
# from spacy.tokens import DocBin
# from tqdm import tqdm
# import os
# import json
# import random

# # Load English blank model
# nlp = spacy.blank("en")

# # Function to convert data to .spacy format
# def convert_to_spacy_format(data):
#     db = DocBin()
#     for text, annot in tqdm(data):
#         doc = nlp.make_doc(text)
#         ents = []
#         for start, end, label in annot["entities"]:
#             span = doc.char_span(start, end, label=label, alignment_mode="contract")
#             if span is None:
#                 print("Skipping entity")
#             else:
#                 ents.append(span)
#         doc.ents = ents
#         db.add(doc)
#     return db

# # Load all JSON files from the "train" directory
# train_dir = "/experience_extraction/train"
# all_data = []
# for filename in os.listdir(train_dir):
#     if filename.endswith(".json"):
#         with open(os.path.join(train_dir, filename)) as f:
#             data = json.load(f)
#             all_data.extend(data["annotations"])

**Data Preprocessing**

In [None]:
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin
import json
import os

def convert_to_spacy(json_spec):
    """Convert a JSON annotation file into a binary ``.spacy`` file.

    The JSON document must contain an ``"annotations"`` list whose items are
    ``[text, {"entities": [[start, end, label], ...]}]`` pairs, with ``start``
    and ``end`` given as character offsets into ``text``. Falsy items and
    texts without any entities are skipped. An entity whose offsets cannot be
    mapped onto token boundaries is reported with "Skipping entity" and
    dropped, while the rest of the text is kept.

    The result is written next to the input file, with the final extension
    replaced by ``.spacy`` (``data/set.v2.json`` -> ``data/set.v2.spacy``).

    Args:
        json_spec: Path of the JSON annotation file to convert.
    """
    nlp = spacy.blank("en")  # blank English pipeline, used only to tokenize
    db = DocBin()  # container that serializes the annotated docs
    out_dir, file_name = os.path.split(json_spec)

    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(json_spec, encoding="utf-8") as json_f:
        data = json.load(json_f)

    samples = [item for item in data["annotations"] if item]
    for text, annot in tqdm(samples):
        entities = annot["entities"]
        if not entities:
            # Unlabelled texts are left out of the output; skip them before
            # paying for tokenization.
            continue

        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in entities:
            # "contract" shrinks the span to the tokens fully inside the
            # character range; None means no whole token fits.
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                print("Skipping entity")
            else:
                ents.append(span)
        doc.ents = ents
        db.add(doc)

    # splitext strips only the last extension, so dotted file names keep
    # their full stem instead of being cut at the first ".".
    stem = os.path.splitext(file_name)[0]
    db.to_disk(os.path.join(out_dir, f"{stem}.spacy"))


In [None]:
# Writes annotated_dataset.spacy into the same directory as the input JSON.
# NOTE(review): the training cell also reads
# ./training_data/validation/annotated_dataset.spacy, but no cell in this
# notebook converts a validation JSON — confirm that file is produced elsewhere.
convert_to_spacy("/experience_extraction/training_data/training/annotated_dataset.json")

In [None]:
# # Shuffle the data
# random.shuffle(all_data)

# # Split data into training and validation sets (80% training, 20% validation)
# split_idx = int(0.8 * len(all_data))
# train_data = all_data[:split_idx]
# val_data = all_data[split_idx:]

In [None]:
# train_db = convert_to_spacy_format(train_data)
# val_db = convert_to_spacy_format(val_data)


In [None]:
# # Save .spacy format files
# train_db.to_disk("/experience_extraction/training_data/training/training_data.spacy")
# val_db.to_disk("/experience_extraction/training_data/validation/validation_data.spacy")

**Training configuration**

In [None]:
# Expand the partial base_config.cfg into a complete config.cfg.
# NOTE(review): base_config.cfg is not created by this notebook; it must
# already exist in the working directory.
!python -m spacy init fill-config base_config.cfg config.cfg

In [None]:
# !python -m spacy init config config.cfg --lang en --pipeline ner --optimize accuracy --force

In [None]:
# !python -m spacy download en_core_web_lg

In [None]:
# !pip install spacy==3.7.2
# !pip install spacy-transformers
# Same en_core_web_trf download as in the setup cell.
!python -m spacy download en_core_web_trf

In [None]:
# !pip install cupy

 **Train the model**


In [None]:
# Train with config.cfg on GPU 0, reading the training and dev corpora from
# the given .spacy files and writing the output into the current directory.
# NOTE(review): the dev path must point at an existing .spacy file; this
# notebook only shows the training JSON being converted.
!python -m spacy train config.cfg --output ./ --paths.train ./training_data/training/annotated_dataset.spacy --paths.dev ./training_data/validation/annotated_dataset.spacy --gpu-id 0

**Evaluating the model**

In [None]:
import spacy
# Load the pipeline stored in ./model-best (presumably written by the training
# run above, which uses --output ./).
nlp_ner = spacy.load("./model-best")

In [None]:
# Run the trained pipeline on a sample text; paste the text to analyse
# between the triple quotes (as written, the input is only blank lines).
doc = nlp_ner('''

''')

In [None]:
# Render the entities predicted for `doc` inline in the notebook output.
spacy.displacy.render(doc, style="ent", jupyter=True)

In [None]:
# Collect the text of every EXPERIENCE entity so the extracted values remain
# available in `exp` after this cell, instead of only being printed.
exp = [ent.text for ent in doc.ents if ent.label_ == "EXPERIENCE"]
for experience_text in exp:
    print(experience_text)

# Show every predicted entity for reference (nothing is printed when the
# model found none).
if doc.ents:
    print(doc.ents)

In [None]:
import spacy
from spacy.training import offsets_to_biluo_tags
from spacy.tokens import DocBin
from spacy.training.example import Example

# Load the trained pipeline that is being evaluated.
nlp = spacy.load("./model-best")

# NOTE(review): the training cell passes validation/annotated_dataset.spacy as
# the dev corpus, while this cell loads validation/validation_data.spacy —
# confirm which file is the intended evaluation set.
eval_data = DocBin().from_disk("./training_data/validation/validation_data.spacy")

docs = list(eval_data.get_docs(nlp.vocab))

examples = []
for gold_doc in docs:
    # offsets_to_biluo_tags expects *character* offsets; token indices
    # (ent.start / ent.end) would be misread as character positions and
    # produce spurious "-" (misaligned) tags.
    ents = [(ent.start_char, ent.end_char, ent.label_) for ent in gold_doc.ents]
    biluo_tags = offsets_to_biluo_tags(gold_doc, ents)
    if '-' in biluo_tags:
        print(f"Misaligned entities in text: {gold_doc.text}")
        print(f"Entities: {ents}")
        print(f"BILUO Tags: {biluo_tags}")
        continue

    # The prediction side is a fresh, unannotated doc built from the raw text;
    # the annotated gold doc is only the reference. Feeding the gold doc in as
    # the prediction input would hand the model the entities it is being
    # scored on.
    examples.append(Example(nlp.make_doc(gold_doc.text), gold_doc))

# Language.evaluate returns a dict of scores; the name `scorer` is kept
# because the cells below read it.
scorer = nlp.evaluate(examples)

def safe_print_metric(scorer, metric_name, metric_display_name):
    """Print one metric from a scores mapping, or "N/A" when it is unavailable.

    Args:
        scorer: Mapping of metric names to numeric values (or None).
        metric_name: Key to look up in ``scorer``.
        metric_display_name: Human-readable label printed before the value.
    """
    metric = scorer.get(metric_name)
    # A missing key and an explicit None are both reported as unavailable;
    # numeric values (including 0) are shown with two decimals.
    rendered = "N/A" if metric is None else f"{metric:.2f}"
    print(f"{metric_display_name}: {rendered}")

# Entity-level scores from the evaluation result.
print("Evaluation Metrics:")
safe_print_metric(scorer, 'ents_f', "Entities F1-score (ENTS_F)")
safe_print_metric(scorer, 'ents_p', "Entities Precision (ENTS_P)")
safe_print_metric(scorer, 'ents_r', "Entities Recall (ENTS_R)")

print("Available keys in scorer:", scorer.keys())

# spaCy v3 names these scores tag_acc / dep_uas / dep_las; the previous keys
# (tags_acc, uas, las) never exist in the result and always printed N/A.
# The values are still N/A when the pipeline has no tagger or parser.
safe_print_metric(scorer, 'token_acc', "Token Accuracy (Token Acc)")
safe_print_metric(scorer, 'tag_acc', "Tags Accuracy (Tags Acc)")
safe_print_metric(scorer, 'dep_uas', "Unlabeled Attachment Score (UAS)")
safe_print_metric(scorer, 'dep_las', "Labeled Attachment Score (LAS)")
