In [1]:
import unicodedata

import pandas as pd
from commons import DATASET_CLEAN_LOCATION, DATASET_CLEAN_UNDERSAMPLING_LOCATION, DATASET_LOCATION

In [2]:
# Load the source dataset; DATASET_LOCATION is imported from `commons`.
df = pd.read_csv(DATASET_LOCATION)

In [3]:
# Keep only the Italian and English rows; copy so later column assignments
# do not write into a view of `df`.
df_iteng = df[df["Language"].isin(["Italian", "English"])].copy()

In [4]:
# First lowercase everything, then replace each apostrophe with a space
# (otherwise elided forms such as "l'uomo" would collapse into the single token "luomo").
# Next remove the digits, then every non-alphanumeric character except whitespace.
# Finally collapse repeated whitespace so that exactly one space separates the words.


def clean_text_keep_accents(text: str) -> str:
    """
    Strip symbols and punctuation from a string while keeping accented letters.

    A character survives only when both conditions hold:
    - its Unicode general category is a letter (``L*``) or a number (``N*``),
      or it is whitespace;
    - its Unicode name does not begin with ``MODIFIER``.

    Precomposed accented letters (e.g. ``è``) are letters and are therefore
    kept. Combining marks belong to the ``M*`` categories, so a decomposed
    accent is dropped while its base letter is kept.

    Args:
        text (str): The string to filter.

    Returns:
        str: The surviving characters of ``text``, in their original order.

    """
    kept_chars = []
    for char in text:
        # Modifier letters are classified as letters by Unicode but are
        # treated as special characters here, so reject them up front.
        # Characters without a Unicode name fall back to "" and pass this test.
        if unicodedata.name(char, "").startswith("MODIFIER"):
            continue
        # Keep letters, numbers and whitespace; everything else is discarded.
        if char.isspace() or unicodedata.category(char)[0] in ("L", "N"):
            kept_chars.append(char)
    return "".join(kept_chars)

In [5]:
# Characters commonly used as an apostrophe, each mapped to a space:
# U+0027 ('), U+2019 (right single quote), U+2018 (left single quote),
# U+02BC (modifier letter apostrophe), U+0060 (grave accent), U+00B4 (acute accent).
_APOSTROPHES_TO_SPACE = str.maketrans(
    dict.fromkeys("'\u2019\u2018\u02bc\u0060\u00b4", " ")
)


def clean_text(text: str) -> str:
    """
    Clean and normalize a text string by lowercasing, removing digits, and stripping punctuation.

    The preprocessing steps are applied in this order:
    - Convert all characters to lowercase.
    - Replace apostrophes with spaces, so that elided forms split into two
      tokens ("l'uomo" -> "l uomo") instead of merging ("luomo"). The ASCII
      apostrophe and its common typographic stand-ins (U+2019, U+2018,
      U+02BC, U+0060, U+00B4) are all handled; previously only the ASCII
      apostrophe was, so text with curly apostrophes still produced merged
      tokens.
    - Remove all digits and every character that is neither alphanumeric
      nor whitespace.
    - Filter the remaining Unicode characters while preserving accents
      (via `clean_text_keep_accents`).
    - Collapse runs of whitespace into single spaces and trim both ends.

    Args:
        text (str): The input text string to be cleaned.

    Returns:
        str: The cleaned and normalized text.

    """
    text = text.lower().translate(_APOSTROPHES_TO_SPACE)
    # Single pass equivalent to "drop digits, then keep alphanumerics and
    # whitespace": no whitespace character is a digit.
    text = "".join(
        c for c in text
        if not c.isdigit() and (c.isalnum() or c.isspace())
    )
    # Unicode clean-up (drops modifier letters and anything that is not a
    # letter, number or whitespace).
    text = clean_text_keep_accents(text)
    return " ".join(text.split())

In [6]:
# Run the cleaning function over every text and store the result in a new column.
df_iteng["Text_clean"] = df_iteng["Text"].map(clean_text)

In [7]:
# Export frame: cleaned text plus its language label, with the cleaned text
# exposed under the column name "Text".
df_clean = df_iteng[["Text_clean", "Language"]].rename(columns={"Text_clean": "Text"})

In [8]:
# Write the cleaned dataset to DATASET_CLEAN_LOCATION without the index column.
df_clean.to_csv(DATASET_CLEAN_LOCATION, index=False)

In [9]:
## Undersampling of the English rows, stratified by word-count quartile
df_iteng["Word_count"] = df_iteng["Text_clean"].map(lambda cleaned: len(str(cleaned).split()))
english_df = df_iteng.loc[df_iteng["Language"] == "English"].copy()
# labels=False makes qcut return the integer bin index (0-3) instead of intervals.
english_df["quartile"] = pd.qcut(english_df["Word_count"], q=4, labels=False)

# Draw half of every quartile, re-seeding each draw with the same fixed seed
# so the sample is reproducible.
undersampled_parts = [
    english_df.loc[english_df["quartile"] == quartile].sample(frac=0.5, random_state=1999)
    for quartile in range(4)
]
english_undersampled = pd.concat(undersampled_parts)
# The Italian rows are kept in full.
italian_df = df_iteng.loc[df_iteng["Language"] == "Italian"]
df_clean_undersampling = pd.concat([english_undersampled, italian_df]).reset_index(drop=True)

In [10]:
# Keep only the cleaned text and its label, exposing the cleaned text under
# the column name "Text".
df_clean_undersampling = df_clean_undersampling[["Text_clean", "Language"]].rename(
    columns={"Text_clean": "Text"}
)

In [11]:
# Write the undersampled dataset to DATASET_CLEAN_UNDERSAMPLING_LOCATION without the index column.
df_clean_undersampling.to_csv(DATASET_CLEAN_UNDERSAMPLING_LOCATION, index=False)