In [1]:
import unicodedata

import pandas as pd
from commons import DATASET_CLEAN_LOCATION, DATASET_CLEAN_UNDERSAMPLING_LOCATION, DATASET_LOCATION

# Data Cleaning

We import the dataset used in the data exploration phase and filter it to include only sentences in Italian and English.


In [2]:
# Load the raw dataset from the path configured in `commons.DATASET_LOCATION`.
df = pd.read_csv(DATASET_LOCATION)

In [3]:
# Keep only the Italian and English rows; the copy lets later cells add columns
# to `df_iteng` without touching (or warning about) the original `df`.
df_iteng = df.loc[df["Language"].isin(["Italian", "English"])].copy()

The function `clean_text_keep_accents` is used within the main cleaning pipeline to filter out unwanted Unicode characters that the primary cleaning function might miss.

Specifically, it removes all special characters except letters (including accented characters), numbers, and whitespace. It keeps a character when its Unicode category starts with "L" (letters) or "N" (numbers), or when it is whitespace, and it additionally drops any character whose Unicode name starts with "MODIFIER" (modifier letters). Punctuation and symbols are removed because they fall outside the "L" and "N" categories. This ensures that important accented characters remain intact for languages like Italian, while unwanted symbols and special characters are removed.


In [4]:
def clean_text_keep_accents(text: str) -> str:
    """
    Clean a string by removing special characters while preserving accents.

    The text is first normalized to Unicode NFC so that an accented letter
    written as a base letter followed by a combining mark (e.g. "e" + U+0301)
    is composed into a single letter code point. Without this step the
    combining mark, whose Unicode category is "Mn", would be filtered out and
    the accent silently lost.

    After normalization a character is kept only when it is whitespace or its
    Unicode category starts with "L" (letter) or "N" (number), and its Unicode
    name does not start with "MODIFIER". Everything else (punctuation,
    symbols, combining marks that could not be composed, ...) is removed.

    Args:
        text (str): The input text string to be cleaned.

    Returns:
        str: The cleaned text with accents preserved and special characters removed.

    """
    # Compose base letter + combining accent pairs before filtering by category.
    composed = unicodedata.normalize("NFC", text)
    return "".join(
        c for c in composed
        if (unicodedata.category(c).startswith(("L", "N")) or c.isspace())
        # Unnamed characters (e.g. control whitespace) fall back to "" and are kept.
        and not unicodedata.name(c, "").startswith("MODIFIER")
    )

The function `clean_text` performs a series of preprocessing steps to clean and normalize the input text:

- Converts the entire text to lowercase to ensure uniformity.
- Replaces apostrophes with spaces so that elided forms and contractions are split into separate tokens (e.g., "l'uomo" becomes the two tokens "l" and "uomo").
- Removes all digits from the text.
- Keeps only alphanumeric characters and whitespace, removing other punctuation.
- Cleans remaining Unicode characters by calling `clean_text_keep_accents`, which preserves accented characters while removing unwanted symbols.
- Normalizes whitespace by collapsing multiple spaces into a single space, ensuring consistent token separation.


In [5]:
# Apostrophe-like characters that mark elisions/contractions. Each one is mapped
# to a space so that "l'uomo" and "l\u2019uomo" are both split into "l" and "uomo".
_APOSTROPHES_TO_SPACE = str.maketrans({
    "'": " ",
    "\u2019": " ",  # right single quotation mark (typographic apostrophe)
    "\u2018": " ",  # left single quotation mark
    "\u02bc": " ",  # modifier letter apostrophe
})


def clean_text(text: str) -> str:
    """
    Clean and normalize a text string by lowercasing, removing digits, and stripping punctuation.

    This function performs several text preprocessing steps:
    - Converts all characters to lowercase.
    - Normalizes the text to Unicode NFC, so accents written as separate
      combining marks are composed with their base letter instead of being
      discarded by the alphanumeric filter below.
    - Replaces apostrophes (ASCII and typographic variants) with spaces.
    - Removes all digits.
    - Keeps only alphanumeric characters and whitespace.
    - Cleans remaining Unicode characters while preserving accents (via `clean_text_keep_accents`).
    - Normalizes whitespace to ensure consistent spacing.

    Args:
        text (str): The input text string to be cleaned.

    Returns:
        str: The cleaned and normalized text.

    """
    # Lowercase first, then compose: lowercasing can itself yield decomposed sequences.
    text = unicodedata.normalize("NFC", text.lower())
    text = text.translate(_APOSTROPHES_TO_SPACE)
    # Single pass: drop digits, then keep only alphanumeric characters and whitespace.
    text = "".join(
        c for c in text
        if not c.isdigit() and (c.isalnum() or c.isspace())
    )
    text = clean_text_keep_accents(text)
    # split()/join collapses any run of whitespace and trims both ends.
    return " ".join(text.split())

We apply the cleaning function to the dataset and save the cleaned version for the training phase.


In [6]:
# Run the cleaning pipeline on every sentence and keep the result in a separate
# column, leaving the original "Text" column untouched.
df_iteng["Text_clean"] = df_iteng["Text"].map(clean_text)

In [7]:
# Keep only the cleaned sentence and its label, exposing the cleaned sentence
# under the column name "Text".
df_clean = (
    df_iteng
    .loc[:, ["Text_clean", "Language"]]
    .rename(columns={"Text_clean": "Text"})
)

In [8]:
# Write the cleaned dataset to the path configured in `commons.DATASET_CLEAN_LOCATION`;
# index=False keeps the DataFrame index out of the CSV.
df_clean.to_csv(DATASET_CLEAN_LOCATION, index=False)

We also create a second dataset using undersampling, because there are twice as many English sentences as Italian ones.

However, instead of random undersampling, we take half of the sentences from each of the four quartiles, where the quartiles are defined based on the word count of the sentences. This approach helps maintain the distribution of sentence lengths in the undersampled data.

The steps are as follows:
- Separate the English sentences and divide them into four quartiles based on their word counts.
- From each quartile, randomly sample 50% of the sentences.
- Combine the undersampled English sentences with all the Italian sentences.
- Save this balanced dataset for the training phase.


In [None]:
# Undersampling parameters, named so they are easy to find and tune.
ENGLISH_SAMPLE_FRACTION = 0.5  # share of English sentences kept from each length bin
N_LENGTH_BINS = 4              # quartiles of the word-count distribution
UNDERSAMPLING_SEED = 1999      # fixed seed so the sample is reproducible

# Word count of each cleaned sentence (whitespace-separated tokens).
df_iteng["Word_count"] = df_iteng["Text_clean"].map(lambda text: len(str(text).split()))

english_df = df_iteng[df_iteng["Language"] == "English"].copy()
# Assign each English sentence to a word-count quantile bin (integer labels).
# duplicates="drop" prevents a ValueError when several quantile edges coincide,
# which can happen because word counts are discrete; in that case fewer than
# N_LENGTH_BINS bins are produced.
english_df["quartile"] = pd.qcut(
    english_df["Word_count"],
    q=N_LENGTH_BINS,
    labels=False,
    duplicates="drop",
)

# Sample the same fraction from every bin that actually exists, re-using the
# same seed for each bin, so the sentence-length distribution is preserved.
undersampled_parts = [
    group.sample(frac=ENGLISH_SAMPLE_FRACTION, random_state=UNDERSAMPLING_SEED)
    for _, group in english_df.groupby("quartile")
]
english_undersampled = pd.concat(undersampled_parts)

# All Italian sentences are kept as they are.
italian_df = df_iteng[df_iteng["Language"] == "Italian"]
df_clean_undersampling = pd.concat([english_undersampled, italian_df]).reset_index(drop=True)

In [10]:
# Drop the helper columns and expose the cleaned sentence under the column name "Text".
# NOTE: this rebinds `df_clean_undersampling`, so the cell cannot be re-run on its
# own output ("Text_clean" no longer exists after the rename).
df_clean_undersampling = (
    df_clean_undersampling
    .loc[:, ["Text_clean", "Language"]]
    .rename(columns={"Text_clean": "Text"})
)

In [11]:
# Write the undersampled dataset to the path configured in
# `commons.DATASET_CLEAN_UNDERSAMPLING_LOCATION`; index=False keeps the index out of the CSV.
df_clean_undersampling.to_csv(DATASET_CLEAN_UNDERSAMPLING_LOCATION, index=False)