# Preprocessing

After having done $2$ assignments, I have deemed it fit to factor out preprocessing into its own notebook, instead of having it attached to the first model so as to not hinder its execution or add overhead.

The data required to be used by the models shall be stored under `<arbitrary path>`.

## Data Loading

Loading the `.parquet` files for training. I am unsure which language to try first..

In [1]:
import os
import re
import nltk
import spacy
import polars as pl

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [2]:
# Select which language
# we want to train for
lang = "italian"

lang_abbr = {"italian": "it",
             "french": "fr",}

# Prepare the path
training_data_path = os.path.join("..", "data", "input_lang", lang, "training.parquet")

In [3]:
raw_parquet_data = pl.read_parquet(training_data_path)

raw_parquet_data.head()

text
str
"""- Grazie, amico. ###>- Thanks,…"
"""Dillo. ###>Say it."""
"""Trifosfato di sodio (tripolifo…"
"""Invero è avido per amore delle…"
"""ALLEGATO I ###>ANNEX I"""


In [4]:
split_data = raw_parquet_data.select(
    raw_parquet_data["text"]
    .str.split(" ###>")
    .list.to_struct(n_field_strategy = "max_width",
                    fields = [lang_abbr[lang], "english"])
    .alias("splitten")
).unnest("splitten")

split_data.head()

it,english
str,str
"""- Grazie, amico.""","""- Thanks, buddy."""
"""Dillo.""","""Say it."""
"""Trifosfato di sodio (tripolifo…","""Sodium triphosphate (sodium tr…"
"""Invero è avido per amore delle…","""Surely, he is ardent in his lo…"
"""ALLEGATO I""","""ANNEX I"""


## Lower Casing and Regex
A cruical part of regex is to lowercase the letters so that we are working with the least possible amount of characters in a language's given alphabet.

As well as removing most punctuation.

In [5]:
# Defining path to install NLTK libraries in
NLTK_LIB_PATH = os.path.join("..", "venv_nlp", "Lib", "nltk_data")

# Defining download function
def download_libs():
    libraries = {
        os.path.join("corpora", "stopwords"): "stopwords",
        os.path.join("corpora","wordnet"): "wordnet",
        os.path.join("tokenizers", "punkt"): "punkt"
    }

    for _, package in libraries.items():
        try:
            nltk.data.find(package)
            print(f"{package.capitalize()} data exists.")
        except LookupError:
            print(f"Downloading {package}...")

            nltk.download(package, download_dir=NLTK_LIB_PATH)
        except Exception as e:
            print(f"Unexpected error checking {package}: {e}")

In [6]:
try:
    os.makedirs(NLTK_LIB_PATH, exist_ok=True)
    print(f"Using NLTK data directory: {NLTK_LIB_PATH}")
    download_libs()
except PermissionError:
    print(f"Permission denied: Unable to create or write to directory '{NLTK_LIB_PATH}'")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Using NLTK data directory: ..\venv_nlp\Lib\nltk_data
Downloading stopwords...
Downloading wordnet...
Downloading punkt...


[nltk_data] Downloading package stopwords to
[nltk_data]     ..\venv_nlp\Lib\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     ..\venv_nlp\Lib\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to ..\venv_nlp\Lib\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
def regex_text(text: str) -> str:
    '''
    Cleans any passed string.
    Includes accented characters
    '''
    text = text.lower()

    regex_patterns = [
        (r"https?://\S+", ""),    # Remove http or https links
        (r"\S+@\S+\.\S+", " "),   # Remove email addresses
        (r"[^\w\sÀ-ÿ'’]", " "),   # Remove special characters, except accented ones
        (r"\s+", " "),            # Replace multiple spaces with a single space
        (r"^\s+|\s+$", "")        # Strip leading and trailing spaces
    ]

    for pattern, replacement in regex_patterns:
        text = re.sub(pattern, replacement, text)
    return text

In [8]:
stopwords = set(nltk.corpus.stopwords.words("english"))
stopwords.update(nltk.corpus.stopwords.words(f"{lang}"))

In [9]:
def stopword_removal(text: str, stopwords_set: set) -> str:
    words = text.split()
    return " ".join([word for word in words if word not in stopwords_set])


In [11]:
spacy.cli.download("en_core_web_sm")
spacy.cli.download(f"{lang_abbr[lang]}_core_news_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('it_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [12]:
# Loading language model
lang_model_en = spacy.load("en_core_web_sm")
lang_model_other = spacy.load(f"{lang_abbr[lang]}_core_news_sm")

# Lemmatiser
def lemma(tokens, language_model):
    doc = language_model(tokens)
    return [token.lemma_ for token in doc]