In [1]:
%pip install nltk -q

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
from nltk.tokenize import regexp_tokenize
from nltk.corpus import stopwords
 
stopwords = stopwords.words("english")

from xml.etree import ElementTree as ET

In [2]:
import spacy
path_data = "Data/"
!python -m spacy download en_core_web_sm
lemmatizer = spacy.load("en_core_web_sm")

!bash downloads.bash $path_data $model_path

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting en-core-web-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Updated Git hooks.
Git LFS initialized.
The data were already unpacked


Special case: if the label does not contain "P" at the start (due to possible errors - or just missing in the file)

In [4]:
import re
import string

def get_text_from_xml(xml_file: str, data_file: str) -> pd.DataFrame:
    """
    Obtain a dataset of articles from a given XML file path
    """
    parsed_dir = ET.parse(path_data + xml_file)
    parsed_dir = parsed_dir.getroot()

    data = {"ID": [], "Text": []}
    for child_node in parsed_dir:
        id = child_node.attrib["id"]
        if "P" != id[0] and id[0] != "A":
            id = "P" + id
        data["ID"].append(id)
        data["Text"].append(child_node.text)

    data = pd.DataFrame(data)
    transformer_data = data.copy()

    !mkdir "Processed/"
    
    transformer_data["Text"] = transformer_data["Text"].apply(data_preprocessing_transformer)
    transformer_data.to_pickle("Processed/" + "transformer_" + data_file)

    data["Text"] = data["Text"].apply(data_preprocessing)
    data.to_pickle("Processed/" + data_file)

    return data

def data_preprocessing_transformer(data: str) -> list:
    """
    Minimal preprocessing for transformers. This includes:
    * lowercasing
    * replacement of all whitespace characters with " "
    * and removing any surplus " " characters
    * tokenization based on single " "
    """
    data = data.lower()
    data = re.sub(r"\s+", " ", data)
    data = data.strip()
    return data.split(" ")

def data_preprocessing(data: str) -> list:
    """
    Main preprocessing pipeline. Tokenize a given sentence such that:
    * all tokens are lower-cased
    * compound nouns are kept (e.g. hand-arm)
    * the tokens are lemmatized
    * any stop words, isolated punctuation, and tokens with numeric characters are removed
    """
    data = data.lower()
    # !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ from string.punctuation
    # keep compund nouns
    data = regexp_tokenize(data, r"[\w]+(?:[!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~][\w]+[()*[\]{}]?){1,}|[\w]+|(?:[([{]\w+[)\]}])+")

    # lemmatize
    aux_data = data
    data = " ".join(data)
    lemmatized_data = lemmatizer(data)
    data = [token.lemma_ for token in lemmatized_data]

    # remove all punctuation
    data = [word for word in data if len(word) > 1]

    # some of the compound-nouns are separated by the lemmatizer, as such it is necessary to readd those tokens 
    # back as they were, but still in their lemmatized forms
    aux = []
    for aux_token in aux_data:
        found = False
        for token in lemmatized_data:
            if token.text == aux_token:
                aux.append(token.lemma_)
                found = True
            if found:
                break
        if not found:
            aux.append(aux_token)
    data = aux

    # remove stop words, any remaining punctuation, and tokens containing digits
    data = [word for word in data if word not in stopwords and word not in string.punctuation and not re.search(r"[0-9]+", word)]

    return data

directive_data = get_text_from_xml("DIR_EN_32002L0044.xml", "directive_data.pickle")
provision_data = get_text_from_xml("NIM_EN.xml", "provision_data.pickle")

mkdir: cannot create directory ‘Processed/’: File exists
mkdir: cannot create directory ‘Processed/’: File exists


In [5]:
text_transformer = pd.read_pickle("Processed/transformer_directive_data.pickle")["Text"]
text_processed = pd.read_pickle("Processed/directive_data.pickle")["Text"]

print(" ".join(text_processed.iloc[0]))
print(" ".join(text_transformer.iloc[0]))

directive individual directive within meaning article directive lay minimum requirement protection worker risk health safety arise likely arise exposure mechanical vibration
this directive, which is the 16th individual directive within the meaning of article 16(1) of directive 89/391/eec, lays down minimum requirements for the protection of workers from risks to their health and safety arising or likely to arise from exposure to mechanical vibration.


In [6]:
" ".join(pd.read_pickle("Processed/transformer_provision_data.pickle")["Text"].iloc[15])

'without prejudice to sections 9 and 10 of the act, where employees are exposed to risk from mechanical vibration, an employer shall provide those employees or their safety representative (or both) with suitable and sufficient information, instruction and training, including - the technical and organisational measures taken in order to comply with these regulations, the exposure limit values and the exposure action values, the results of the risk assessment and measurement of the mechanical vibration carried out in accordance with regulation 5 and the potential injury arising from the work equipment in use, why and how to detect and report signs of injury, the circumstances in which health surveillance is made available to employees and its purpose, in accordance with regulation 8, and safe working practices to minimise exposure to mechanical vibration.'