In [1]:
! pip install mlflow language_tool_python ndg-httpsclient pyopenssl pyasn1 mlflow
! pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple lingowiz==2.2.1

Collecting mlflow
  Downloading mlflow-2.18.0-py3-none-any.whl.metadata (29 kB)
Collecting language_tool_python
  Downloading language_tool_python-2.8.1-py3-none-any.whl.metadata (12 kB)
Collecting ndg-httpsclient
  Downloading ndg_httpsclient-0.5.1-py3-none-any.whl.metadata (6.2 kB)
Collecting mlflow-skinny==2.18.0 (from mlflow)
  Downloading mlflow_skinny-2.18.0-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.18.0->mlflow)
  Downloading databricks_sdk-0.38.0-py3-none-any.whl.metadata (38 kB)
Collecting Mako (from alembic!=1.10.0

In [11]:
"""
Module: translation_training_pipeline.py

This module provides a complete pipeline for fine-tuning machine translation models
using Hugging Face's `transformers` library and managing training experiments with MLflow.
It supports data preprocessing, tokenization, model training, and logging results to Hugging Face
and MLflow.

### Features:
1. **Data Splitting**:
    - Splits data into training and evaluation sets.

2. **Model Training**:
    - Fine-tunes a MarianMT model on custom datasets.
    - Supports freezing encoder and decoder layers for efficient fine-tuning.

3. **Tokenization**:
    - Tokenizes input and target sentences for training and evaluation.

4. **MLflow Integration**:
    - Logs training hyperparameters, metrics, and models.
    - Automatically creates or updates MLflow experiments.

5. **Hugging Face Integration**:
    - Saves and uploads trained models to the Hugging Face Model Hub.

6. **Callbacks**:
    - Includes a callback to clear GPU memory after each epoch.

### Dependencies:
- `transformers`: For Hugging Face models and training utilities.
- `mlflow`: For experiment tracking and logging.
- `pandas`: For data manipulation.
- `datasets`: For handling datasets in Hugging Face format.
- `torch`: For GPU/CPU compatibility during training.
- `tqdm`: For progress visualization.
- `requests`: For making API calls to external services.

### Example Usage:
```python
import pandas as pd

from translation_training_pipeline import training_pipeline

df = pd.read_csv("translation_data.csv")
training_pipeline(
    df=df,
    src="English",
    base_model="Helsinki-NLP/opus-mt-en-ar",
    steps=1000,
    batch_size=16,
    learning_rate=5e-5,
    epochs=3,
    warmup=100,
    trg_language="Arabic",
    layer=1
)
```
"""
# Importing libraries
import os
from sklearn.model_selection import train_test_split
from datasets import Dataset
from tqdm import tqdm
# Hugging Face libraries for fine-tuning
from transformers import (MarianTokenizer,
                          MarianMTModel,
                          Trainer,
                          TrainingArguments,
                          TrainerCallback)
from dotenv import load_dotenv
import mlflow
from datetime import datetime
from mlflow.data.pandas_dataset import PandasDataset
from huggingface_hub import HfApi
import requests
import torch
import warnings

# Module-level configuration; these statements run once when the cell/module is executed.

# Suppress all UserWarnings
warnings.filterwarnings("ignore", category=UserWarning)
# Sets WANDB_MODE for this process (presumably to stop the Trainer reporting to
# Weights & Biases -- confirm against the wandb documentation).
os.environ["WANDB_MODE"] = "disabled"
# Load variables from a local .env file (if any) into the environment.
load_dotenv()
# Hugging Face access token used later for the model upload; None when HF_TOKEN is unset.
token = os.getenv('HF_TOKEN')
# Register tqdm's pandas integration (progress_apply).
tqdm.pandas()
# Turn off MLflow autologging; logging is done explicitly in log_params().
mlflow.autolog(disable=True)
mlflow.transformers.autolog(disable=True)
mlflow.pytorch.autolog(disable=True)


def split_test(df, test_size=0.2, random_state=None):
    """
    Partition a DataFrame into a training part and a held-out test part.

    Thin wrapper around scikit-learn's ``train_test_split``.

    Args:
        df (pd.DataFrame): Data to partition.
        test_size (float, optional): Share of rows placed in the test part. Defaults to 0.2.
        random_state (int, optional): Seed forwarded to ``train_test_split``.
            Defaults to None (no fixed seed).

    Returns:
        tuple: ``(train_part, test_part)``.
    """
    parts = train_test_split(df,
                             test_size=test_size,
                             random_state=random_state)
    train_part, test_part = parts
    return train_part, test_part


class EmptyCacheCallback(TrainerCallback):
    """
    Trainer callback that empties the CUDA cache each time an epoch finishes.
    """
    def on_epoch_end(self, args, state, control, **kwargs):
        """
        Announce the finished epoch and call ``torch.cuda.empty_cache()``.

        Args:
            args (TrainingArguments): Training arguments (not used here).
            state (TrainerState): Trainer state; ``state.epoch`` is printed.
            control (TrainerControl): Trainer control object (not used here).
        """
        message = f"Clearing cache after epoch {state.epoch}..."
        print(message)
        torch.cuda.empty_cache()


def train(model,
          tokenized_datasets_train,
          tokenized_datasets_eval,
          batch_size,
          lr,
          epochs,
          warmup,
          tokenizer):
    """
    Fine-tunes a MarianMT model on tokenized datasets with the Hugging Face Trainer.

    Evaluation and logging are both performed once per epoch, outputs are written
    to the local ``temp`` directory, and an ``EmptyCacheCallback`` is attached.

    Args:
        model (MarianMTModel): The MarianMT model to fine-tune.
        tokenized_datasets_train (Dataset): Tokenized training dataset.
        tokenized_datasets_eval (Dataset): Tokenized evaluation dataset.
        batch_size (int): Per-device batch size for training and evaluation.
        lr (float): Learning rate for optimization.
        epochs (int): Number of training epochs.
        warmup (int): Number of warmup steps for learning rate scheduling.
        tokenizer (MarianTokenizer): Tokenizer for the MarianMT model.

    Returns:
        Trainer: The Hugging Face Trainer object after ``train()`` has completed
        (training logs are available via ``trainer.state.log_history``).
    """

    print("")
    print("Initializaing Training Arguments ...")

    # Mixed precision is only usable on a CUDA device; requesting it on a
    # CPU-only host makes TrainingArguments raise, so enable it conditionally.
    use_fp16 = torch.cuda.is_available()

    # Adjust training arguments for small dataset
    training_args = TrainingArguments(
        output_dir="temp",
        evaluation_strategy="epoch",  # Evaluate at the end of each epoch
        logging_strategy="epoch",
        per_device_train_batch_size=batch_size,  # Batch size for training
        per_device_eval_batch_size=batch_size,   # Batch size for evaluation
        learning_rate=lr,                        # Learning rate for fine-tuning
        num_train_epochs=epochs,                 # Number of epochs
        warmup_steps=warmup,
        fp16=use_fp16,                           # Use mixed precision if on GPU
    )

    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=tokenized_datasets_train,
        eval_dataset=tokenized_datasets_eval,
        callbacks=[EmptyCacheCallback()]
    )

    # Train the model
    print("Training ...")
    trainer.train()

    # Return the trainer
    return trainer


# Tokenization function for both the source and the target column.
# Turns raw sentences into fixed-length (512) sequences of token ids.
def tokenize_function(data, tokenizer, src, trg):
    """
    Tokenizes source and target texts for training and evaluation.

    Both sides are truncated/padded to 512 tokens. Target ids are stored under
    ``labels`` with every padding position replaced by -100, the index ignored
    by the cross-entropy loss, so padding does not contribute to the loss.

    Args:
        data (Dataset): The dataset (or batch) containing source and target texts.
        tokenizer (MarianTokenizer): Tokenizer for the MarianMT model.
        src (str): Name of the source language column in the dataset.
        trg (str): Name of the target language column in the dataset.

    Returns:
        dict: Tokenized source sequences, plus a ``labels`` entry holding the
        tokenized target sequences.
    """

    # Tokenize the source text
    source = tokenizer(data[src], truncation=True, padding="max_length", max_length=512)

    # Tokenize the target text. `text_target` makes the tokenizer encode with
    # its target-side vocabulary (Marian keeps separate source/target models).
    targets = tokenizer(text_target=data[trg],
                        truncation=True,
                        padding="max_length",
                        max_length=512)

    pad_id = tokenizer.pad_token_id
    label_ids = targets["input_ids"]

    # Mask padding in the labels; handle both batched (list of lists) and
    # single-example (flat list) inputs.
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        labels = [[-100 if tok == pad_id else tok for tok in seq]
                  for seq in label_ids]
    else:
        labels = [-100 if tok == pad_id else tok for tok in label_ids]

    # Set the 'labels' field to the masked target ids
    source["labels"] = labels

    return source


def split_eval(dataset, eval_size=0.2, random_state=42):
    """
    Carve an evaluation subset out of a dataset.

    Thin wrapper around scikit-learn's ``train_test_split`` with a fixed
    default seed.

    Args:
        dataset: Data to partition (anything ``train_test_split`` accepts).
        eval_size (float, optional): Share of rows placed in the evaluation subset.
            Defaults to 0.2.
        random_state (int, optional): Seed forwarded to ``train_test_split``.
            Defaults to 42.

    Returns:
        tuple: ``(train_subset, eval_subset)``.
    """
    pieces = train_test_split(dataset,
                              test_size=eval_size,
                              random_state=random_state)
    train_subset, eval_subset = pieces
    return train_subset, eval_subset


def initialize(data_train,
               data_eval,
               base_model,
               src,
               trg,
               layer=0):
    """
    Initializes the tokenizer, model, and tokenized datasets for training.

    Loads the pretrained tokenizer and MarianMT model, freezes every decoder
    layer except the last two, and tokenizes both datasets in batches.

    Args:
        data_train (Dataset): Training dataset.
        data_eval (Dataset): Evaluation dataset.
        base_model (str): Pretrained MarianMT model name or path.
        src (str): Source language column in the dataset.
        trg (str): Target language column in the dataset.
        layer (int, optional): Currently unused; kept so existing callers that
            pass it keep working. Defaults to 0.

    Returns:
        tuple: A tuple containing:
            - Dataset: Tokenized training dataset.
            - Dataset: Tokenized evaluation dataset.
            - MarianMTModel: Initialized MarianMT model.
            - MarianTokenizer: Initialized tokenizer.
    """
    # Load the tokenizer from the pre-trained MarianMT model
    print("")
    print("Intializing Tokenizer ...")
    tokenizer = MarianTokenizer.from_pretrained(base_model)

    print("Initializing Model ...")
    model = MarianMTModel.from_pretrained(base_model)

    # NOTE: the shared embedding and the encoder layers are left trainable;
    # only decoder layers are frozen below.

    # Freeze all decoder layers except the last two, which stay trainable.
    # The loop variable is deliberately not called `layer`, so the `layer`
    # argument is not overwritten.
    print("Freezing last layers of decoder ...")
    for decoder_layer in model.model.decoder.layers[:-2]:
        for param in decoder_layer.parameters():
            param.requires_grad = False

    # Apply tokenization in batches (batched=True) for efficiency;
    # the same keyword arguments are used for both splits.
    tokenize_kwargs = {"tokenizer": tokenizer,
                       "src": src,
                       "trg": trg}
    tokenized_datasets_train = data_train.map(tokenize_function,
                                              batched=True,
                                              fn_kwargs=tokenize_kwargs)
    tokenized_datasets_eval = data_eval.map(tokenize_function,
                                            batched=True,
                                            fn_kwargs=tokenize_kwargs)
    print("Initialization Complete")

    # Return the tokenized datasets, the model,
    # and the tokenizer for further use
    return (tokenized_datasets_train,
            tokenized_datasets_eval,
            model,
            tokenizer)


def log_params(base_model,
               batch_size,
               learning_rate,
               epochs,
               warmup_steps,
               experiment_name,
               df,
               log_history):
    """
    Logs training hyperparameters, the training data, and the final training
    log entries to MLflow, then notifies the external model-logging endpoint.

    Args:
        base_model (str): Base MarianMT model used for training.
        batch_size (int): Batch size for training and evaluation.
        learning_rate (float): Learning rate used during training.
        epochs (int): Number of training epochs.
        warmup_steps (int): Number of warmup steps for learning rate scheduling.
        experiment_name (str): Name of the MLflow experiment; also sent to the
            endpoint as ``model_name``.
        df (pd.DataFrame): Training dataset, logged as an MLflow input.
        log_history (list): Trainer log history; the key/value pairs of its
            last two entries are logged as MLflow params.

    Returns:
        None

    Raises:
        requests.RequestException: If the endpoint cannot be reached or does
            not answer within the timeout.
    """
    formatted_date_time = datetime.now().strftime("%Y-%m-%d_%H:%M")

    # Create or set the experiment
    mlflow.set_experiment(experiment_name)
    run_name = experiment_name+"_"+str(formatted_date_time)
    with mlflow.start_run(run_name=run_name,
                          nested=True):

        print("Logging Hyperparameters")
        mlflow.log_param("epochs", epochs)
        mlflow.log_param("base_model", base_model)
        mlflow.log_param("batch_size", batch_size)
        mlflow.log_param("learning_rate", learning_rate)
        mlflow.log_param("warmup_steps", warmup_steps)

        print("Logging Data")
        dataset: PandasDataset = mlflow.data.from_pandas(df,
                                                         source="dataset_train")
        mlflow.log_input(dataset, context="train")

        print("Logging Model Metrics and Params")
        for params in log_history[-2:]:
            for key, value in params.items():
                mlflow.log_param(key, value)

        print("Logging Model...")
        body = {
            "model_name": experiment_name,
            "run_name": run_name
        }
        # NOTE(review): verify=False disables TLS certificate verification for
        # this request; kept as in the original -- confirm it is still needed.
        # The timeout (connect, read) stops an unresponsive endpoint from
        # hanging the pipeline forever; the read limit is generous because the
        # server-side work may be slow (assumption -- tune as needed).
        response = requests.post("https://ideal-amoeba-specially.ngrok-free.app/mlflow",
                                 json=body,
                                 verify=False,
                                 timeout=(10, 600))
        if response.ok:
            print("Logging Complete")
        else:
            # Surface HTTP-level failures instead of silently ignoring them.
            print(f"Model logging request failed with HTTP status {response.status_code}")


def training_pipeline(df,
                      src,
                      base_model,
                      steps,
                      batch_size,
                      learning_rate,
                      epochs,
                      warmup,
                      special_model=None,
                      trg_language="Arabic",
                      layer=0):

    """
    Executes the end-to-end training pipeline for fine-tuning a MarianMT model.

    Steps: split the data (90% train / 10% eval), tokenize and load the model,
    train, save model and tokenizer locally, upload the folder to the Hugging
    Face Hub, and log the run to MLflow.

    Args:
        df (pd.DataFrame): Input dataset for training and evaluation.
        src (str): Source language column name in the dataset.
        base_model (str): Pretrained MarianMT model name or path.
        steps (int): Accepted for interface compatibility; not used by the
            current training, initialization, or logging helpers.
        batch_size (int): Batch size for training and evaluation.
        learning_rate (float): Learning rate for fine-tuning.
        epochs (int): Number of training epochs.
        warmup (int): Number of warmup steps for learning rate scheduling.
        special_model (str, optional): Accepted for interface compatibility;
            not used. Defaults to None.
        trg_language (str, optional): Target language column name; also used
            in the output/repository name. Defaults to "Arabic".
        layer (int, optional): Forwarded to ``initialize``. Defaults to 0.

    Returns:
        None
    """
    # Name of the local output folder, the Hub repository and the MLflow
    # experiment. Built from the target language instead of a hard-coded
    # "Arabic" (identical for the default target).
    output_path = f"translator_{src}_{trg_language}_spec"
    print("Data Split train - eval")
    df_train, df_eval = split_eval(df, 0.1)
    data_train = Dataset.from_pandas(df_train)
    data_eval = Dataset.from_pandas(df_eval)
    print("")
    print(f"Length train data {len(df_train)} \nLength eval data {len(df_eval)}")

    # Initializing model, tokenizer, data.
    # Arguments are passed by keyword so they cannot drift out of step with
    # the helper signatures (the previous positional calls passed extra
    # arguments and raised TypeError).
    (tokenized_datasets_train,
     tokenized_datasets_eval,
     model,
     tokenizer) = initialize(data_train=data_train,
                             data_eval=data_eval,
                             base_model=base_model,
                             src=src,
                             trg=trg_language,
                             layer=layer)

    print("")
    trainer = train(
        model=model,
        tokenized_datasets_train=tokenized_datasets_train,
        tokenized_datasets_eval=tokenized_datasets_eval,
        batch_size=batch_size,
        lr=learning_rate,
        epochs=epochs,
        warmup=warmup,
        tokenizer=tokenizer,
    )

    print("Training Complete")

    log_history = trainer.state.log_history

    trainer.model.save_pretrained(output_path,
                                  safe_serialization=False)
    tokenizer.save_pretrained(output_path)

    # Upload to hugging face
    print("Uploading model to hugging face ...")
    api = HfApi(token=token)
    api.upload_folder(folder_path=output_path,
                      repo_id=f"patrick844/{output_path}",
                      token=token)

    # Logging
    experiment_name = output_path
    log_params(
        base_model=base_model,
        batch_size=batch_size,
        learning_rate=learning_rate,
        epochs=epochs,
        warmup_steps=warmup,
        experiment_name=experiment_name,
        df=df_train,
        log_history=log_history)


In [12]:

"""
Module Name: data_prep.py

This module provides functionality for preprocessing text data in various languages,
formatting dataframes, and preparing datasets for training machine translation models like MarianMT.

### Features:
- Language-specific preprocessing functions for:
  - English
  - French
  - Italian
  - Russian
  - Turkish
  - Spanish
  - Greek
  - Romanian
- General utility functions for:
  - Removing whitespace
  - Expanding abbreviations
  - Normalizing text (e.g., punctuation, accents)
- Automatic language detection using FastText.
- Integration with MLflow for tracking data inputs.
- Processes source and target language columns for fine-tuning MarianMT.

### Main Classes and Functions:
- **Text Preprocessing Functions**:
    - `preprocess_english(text)`: Handles English-specific preprocessing.
    - `preprocess_french(text)`: Handles French-specific preprocessing.
    - `preprocess_italian(text)`: Handles Italian-specific preprocessing.
    - `preprocess_russian(text)`: Handles Russian-specific preprocessing.
    - `preprocess_turkish(text)`: Handles Turkish-specific preprocessing.
    - `preprocess_spanish(text)`: Handles Spanish-specific preprocessing.
    - `preprocess_greek(text)`: Handles Greek-specific preprocessing.
    - `preprocess_romania(text)`: Handles Romanian-specific preprocessing.
- **Utility Functions**:
    - `get_language_code(language_name)`: Retrieves ISO 639-3 language codes.
    - `detect_language(row, src, model)`: Detects source and target languages in a DataFrame row.
    - `rm_whitespace(text)`: Removes unnecessary whitespace from text.
    - `handle_english_contractions(text)`: Expands English contractions.
    - `expand_abbreviations(text)`: Expands abbreviations using a dictionary.
    - `format_table(filepath, source, output_file)`: Main function for processing a CSV file, normalizing text, and preparing data for training.

### Dependencies:
- `pandas`: For handling tabular data.
- `tqdm`: For progress bars during processing.
- `camel_tools`: For Arabic text normalization and diacritic removal.
- `langcodes`: For handling language code lookups.
- `huggingface_hub`: For downloading FastText models.
- `fasttext`: For language detection.
- `mlflow`: For tracking input data for machine learning pipelines.

### Example Usage:
```python
from data_prep import format_table

# Preprocess a CSV file for fine-tuning MarianMT
format_table(
    filepath="input_data.csv",
    source="English",
    output_file="processed_data.csv"
)
```
"""

import re
import unicodedata
import pandas as pd
from tqdm import tqdm
from camel_tools.utils.dediac import dediac_ar
from camel_tools.utils.normalize import normalize_unicode as n_unicode
import langcodes
from huggingface_hub import hf_hub_download
import fasttext
import mlflow
from mlflow.data import from_pandas
from mlflow.data.pandas_dataset import PandasDataset
from lingowiz.utils import abbreviation_dict

# Register tqdm's pandas integration so Series/DataFrame objects expose
# `progress_apply` (used by format_table below).
tqdm.pandas()


def get_language_code(language_name: str) -> str:
    """
    Look up the three-letter (alpha-3) code for a language name via langcodes.

    Args:
        language_name (str): The name of the language (e.g., "French").

    Returns:
        str: The alpha-3 code reported by ``langcodes``, or None when the
        lookup raises ``LookupError`` (the error is swallowed, not propagated).

    Example:
        get_language_code("French")  # Returns 'fra'
    """
    try:
        code = langcodes.find(language_name).to_alpha3()
    except LookupError:
        # Unknown language name or no alpha-3 code available.
        code = None
    return code


def _clean_for_language_id(text):
    """Prepare one text for language identification: drop capitalised ASCII
    words, NFC-normalise, lowercase, and flatten all whitespace to single spaces."""
    text = re.sub(r'\b[A-Z][a-z]*\b', '', text)
    text = unicodedata.normalize('NFC', text)
    # fastText's predict() raises ValueError on input containing "\n";
    # split/join turns every whitespace run (newlines included) into one space.
    return " ".join(text.lower().split())


def detect_language(row, src, model):

    """Automatic language detection for the source and the 'Arabic' column.

    Args:
        row: Single row holding the source text under `src` and the target
            text under "Arabic".
        src: Source column name.
        model: Language-identification model exposing `predict(text)`; the
            first predicted label is expected to look like
            `__label__<code>_<script>`.

    Returns:
        tuple:
            - str: Code of source language
            - str: Code of target language

        `(None, None)` is returned when detection fails for any reason.

    Notes:
        No exception is propagated. KeyError (missing column), AttributeError
        (model without `predict`), ValueError and any other Exception are
        printed and reported as `(None, None)`.
    """
    try:
        # Preprocess the source and the Arabic column the same way
        text_input = _clean_for_language_id(row[src])
        text_target = _clean_for_language_id(row["Arabic"])

        # Detect the language for both columns and keep only the language
        # part of the label (text before the first "_" after the prefix).
        code_input = model.predict(text_input)[0][0]
        code_input = code_input.replace("__label__", "").split("_")[0]

        code_target = model.predict(text_target)[0][0]
        code_target = code_target.replace("__label__", "").split("_")[0]

        # Return both language codes (source, target)
        return code_input, code_target

    except Exception as e:
        # Same messages as before: a dedicated prefix for the three expected
        # error families, a generic one for everything else.
        for known_type in (KeyError, AttributeError, ValueError):
            if isinstance(e, known_type):
                print(f"{known_type.__name__}: {e}")
                break
        else:
            print(f"Unexpected error: {e}")
        return None, None


def update_rows_2(row, src, model):

    """Prefix the source and 'Arabic' texts of a row with `>>code<<` language tags.

    Args:
        row (DataFrame): Single row, source and target language
        src (str): Source column
        model (model): Model used for language detection

    Returns:
        The same row object: tagged in place when both language codes were
        detected, untouched otherwise.
    """
    detected_src, detected_trg = detect_language(row, src, model)

    # Map the "arb" target label to "ara"
    detected_trg = "ara" if detected_trg == "arb" else detected_trg

    # Leave the row as it is unless both codes are usable
    if not (detected_src and detected_trg):
        return row

    # Prepend the MarianMT-style language tags
    row[src] = f">>{detected_src}<< " + row[src]
    row["Arabic"] = f">>{detected_trg}<< " + row["Arabic"]
    return row


def rm_whitespace(text) -> str:
    """Collapse whitespace runs to single spaces and trim both ends.

    Shared helper used by the language-specific preprocessors.

    Args:
        text (str): Input text

    Returns:
        str: Text whose whitespace-separated tokens are joined by one space
    """
    tokens = text.split()
    return " ".join(tokens)


def lowercase_text(text):
    """Return a lower-cased copy of the text.

    Args:
        text (str): Input text

    Returns:
        str: Result of ``text.lower()``
    """
    lowered = text.lower()
    return lowered


def preprocess_english(text):
    """Run the English preprocessing chain.

    Applies, in order: ``process_medical_data``,
    ``handle_english_contractions`` and ``rm_whitespace``.

    Args:
        text (str): Input text in English

    Returns:
        str: The text after all three steps.
    """
    # English-specific steps, applied left to right
    steps = (process_medical_data,
             handle_english_contractions,
             rm_whitespace)
    for step in steps:
        text = step(text)
    return text


def handle_english_contractions(text):
    """
    Expand a fixed set of English contractions.

    Matching is case-insensitive and limited to whole words, and the case of
    the first letter is carried over to the expansion ("Can't" -> "Cannot",
    "i'm" -> "i am"). This keeps the function effective when the text has
    already been lower-cased, as happens in ``preprocess_english``.

    Args:
        text (str): Input text in English

    Returns:
        str: Text with the known contractions expanded
    """

    # Normalizing English Contractions
    contractions = {"I'm": "I am",
                    "you're": "you are",
                    "isn't": "is not",
                    "can't": "cannot"}
    # Lower-cased lookup table so any casing of a contraction resolves.
    lookup = {short.lower(): full for short, full in contractions.items()}
    pattern = re.compile(r"\b(" + "|".join(re.escape(short) for short in lookup) + r")\b",
                         re.IGNORECASE)

    def _expand(match):
        found = match.group(0)
        expanded = lookup[found.lower()]
        # Mirror the case of the first character of what was matched.
        first = expanded[0].lower() if found[0].islower() else expanded[0].upper()
        return first + expanded[1:]

    return pattern.sub(_expand, text)


# French-specific Preprocessing
def preprocess_french(text):
    """Run the French preprocessing chain.

    Applies, in order: ``lowercase_text``, ``normalize_french_accents``,
    ``normalize_french_punctuation`` and ``rm_whitespace``.

    Args:
        text (str): Input text in French

    Returns:
        str: The text after all four steps.
    """
    steps = (lowercase_text,
             normalize_french_accents,
             normalize_french_punctuation,
             rm_whitespace)
    for step in steps:
        text = step(text)
    return text


def normalize_french_accents(text):
    """Standardize French accents via Unicode NFC normalization.

    Args:
        text (str): Input text in French

    Returns:
        str: The NFC-normalized text
    """
    normalized = unicodedata.normalize('NFC', text)
    return normalized


def normalize_french_punctuation(text):
    """Remove the space that precedes ':', ';', '?' and '!'.

    Args:
        text (str): Input text in French

    Returns:
        str: Text where " :", " ;", " ?" and " !" have lost their leading space
    """
    for mark in (':', ';', '?', '!'):
        text = text.replace(' ' + mark, mark)
    return text


def preprocess_italian(text):
    """Run the Italian preprocessing chain.

    Applies, in order: ``lowercase_text``, ``normalize_italian_punctuation``
    and ``rm_whitespace``.

    Args:
        text (str): Input text in Italian

    Returns:
        str: The text after all three steps.
    """
    # Italian-specific steps, applied left to right
    for step in (lowercase_text, normalize_italian_punctuation, rm_whitespace):
        text = step(text)
    return text


def normalize_italian_punctuation(text):
    """Replace the typographic apostrophe with the plain ASCII one.

    Args:
        text (str): Input text in Italian

    Returns:
        str: Text in which every "’" has become "'"
    """
    curly_apostrophe = "’"
    straight_apostrophe = "'"
    return text.replace(curly_apostrophe, straight_apostrophe)


def preprocess_russian(text):
    """
    Preprocess Russian text; the only step is whitespace cleanup.

    Args:
        text (str): Input text in Russian.

    Returns:
        str: The result of ``rm_whitespace(text)``.
    """
    # Russian-specific Preprocessing: whitespace cleanup only
    return rm_whitespace(text)


def preprocess_turkish(text):
    """
    Run the Turkish preprocessing chain.

    Applies, in order: ``lowercase_turkish``,
    ``normalize_turkish_punctuation`` and ``rm_whitespace``.

    Args:
        text (str): Input text in Turkish.

    Returns:
        str: The text after all three steps.
    """
    # Turkish-specific steps, applied left to right
    for step in (lowercase_turkish, normalize_turkish_punctuation, rm_whitespace):
        text = step(text)
    return text


def lowercase_turkish(text):
    """
    Lower-case Turkish text with explicit handling of the two capital I letters.

    'I' is mapped to dotless 'ı' and 'İ' to 'i' before the generic
    ``str.lower()`` is applied.

    Args:
        text (str): The input text in Turkish.

    Returns:
        str: The lower-cased text.
    """
    dotless_done = text.replace('I', 'ı')
    dotted_done = dotless_done.replace('İ', 'i')
    return dotted_done.lower()


def normalize_turkish_punctuation(text):
    """Replace the typographic apostrophe with the plain ASCII one.

    Args:
        text (str): Input text

    Returns:
        str: Text in which every "’" has become "'"
    """
    curly_apostrophe = "’"
    straight_apostrophe = "'"
    return text.replace(curly_apostrophe, straight_apostrophe)


def preprocess_spanish(text):
    """
    Run the Spanish preprocessing chain.

    Applies, in order: ``lowercase_text``, ``normalize_spanish_accents``
    and ``rm_whitespace``.

    Args:
        text (str): Input text in Spanish.

    Returns:
        str: The text after all three steps.
    """
    # Spanish-specific steps, applied left to right
    for step in (lowercase_text, normalize_spanish_accents, rm_whitespace):
        text = step(text)
    return text


def normalize_spanish_accents(text):
    """Standardize Spanish accents via Unicode NFC normalization.

    Args:
        text (str): Input text in Spanish

    Returns:
        str: The NFC-normalized text
    """
    normalized = unicodedata.normalize('NFC', text)
    return normalized


def preprocess_greek(text):
    """
    Run the Greek preprocessing chain.

    Applies, in order: ``lowercase_text``, ``normalize_greek_accents``
    and ``rm_whitespace``.

    Args:
        text (str): Input text in Greek.

    Returns:
        str: The text after all three steps.
    """
    # Greek-specific steps, applied left to right
    for step in (lowercase_text, normalize_greek_accents, rm_whitespace):
        text = step(text)
    return text


def preprocess_romania(text):

    """
    Preprocess Romanian language text by lowercasing, removing punctuation,
    stripping diacritics (and any other non-ASCII characters), and finally
    cleaning up whitespace.

    Args:
        text (str): The input text in Romanian.

    Returns:
        str: The preprocessed text: lower-case ASCII with single spaces and
        no leading/trailing whitespace.
    """

    # Step 1: Lowercasing
    text = text.lower()

    # Step 2: Punctuation removal (everything that is neither a word
    # character nor whitespace)
    text = re.sub(r'[^\w\s]', '', text)

    # Step 3: Diacritic removal. NFKD splits letters from their combining
    # marks; encoding to ASCII with 'ignore' then drops the marks together
    # with any character that has no ASCII form.
    text = unicodedata.normalize('NFKD', text)
    text = text.encode('ascii', 'ignore')
    text = text.decode('utf-8')

    # Step 4: Whitespace cleanup. Done last because step 3 can delete whole
    # characters and leave double or leading/trailing spaces behind.
    text = re.sub(r'\s+', ' ', text).strip()
    return text


def normalize_greek_accents(text):
    """
    Unify Greek accent representations via Unicode NFC normalization.

    Args:
        text (str): The input Greek text.

    Returns:
        str: The NFC-normalized text.
    """
    normalized = unicodedata.normalize('NFC', text)
    return normalized


def process_medical_data(data):
    """
    Normalize a medical text string.

    The characters ".", "=", "-" and "_" are each replaced by a space, the
    text is lower-cased, abbreviations are expanded with
    ``expand_abbreviations`` and a space is inserted wherever a letter is
    directly followed by a digit.

    Args:
        data (str): The input medical data text.

    Returns:
        str: The processed text.
    """
    # Turn separator symbols into spaces
    for symbol in (".", "=", "-", "_"):
        data = data.replace(symbol, " ")

    lowered = data.lower()
    expanded = expand_abbreviations(lowered)
    return add_space_between_letters_and_numbers(expanded)


def add_space_between_letters_and_numbers(text):
    """
    Insert a space wherever an ASCII letter is immediately followed by a digit.

    Only the letter-then-digit direction is handled; a digit followed by a
    letter is left unchanged.

    Args:
        text (str): The input text containing letters and numbers.

    Returns:
        str: The text with a space between each letter/digit pair.
    """
    letter_then_digit = re.compile(r'([a-zA-Z])(\d)')
    return letter_then_digit.sub(r'\1 \2', text)


def expand_abbreviations(text):
    """
    Expand abbreviations in the text using the module-level ``abbreviation_dict``
    (imported from ``lingowiz.utils``), whose keys are abbreviations and whose
    values are their expansions.

    Only whole-word occurrences are replaced, and matching is case-sensitive.
    Longer abbreviations are tried before shorter ones so that a short key
    cannot pre-empt a longer key that starts with it.

    Args:
        text (str): The input text containing abbreviations.

    Returns:
        str: The text with abbreviations expanded; returned unchanged when the
        dictionary is empty.
    """
    # An empty alternation would match the empty string at every word
    # boundary and then fail on the dictionary lookup.
    if not abbreviation_dict:
        return text

    # Longest keys first so multi-word/longer abbreviations win over prefixes.
    ordered_keys = sorted(abbreviation_dict, key=len, reverse=True)
    pattern = re.compile(r'\b(' +
                         '|'.join(re.escape(key) for key in ordered_keys) +
                         r')\b')
    expanded_text = pattern.sub(lambda match: abbreviation_dict[match.group()], text)
    return expanded_text


def format_table(filepath: str, source: str, output_file: str) -> None:
    """
    Preprocess a parallel ``source``/Arabic CSV for MarianMT fine-tuning.

    Steps, in order:
    - Load ``filepath`` and check that the ``source`` and 'Arabic' columns exist.
    - Drop every row that has a missing value in any column.
    - Arabic column: ``n_unicode`` (normalization), ``dediac_ar`` (diacritic
      removal), ``rm_whitespace``.
    - Source column: the language-specific ``preprocess_*`` function, then
      ``rm_whitespace``. Unrecognised languages use ``preprocess_english``.
    - Download the fastText language-identification model and pass every row
      through ``update_rows_2``.
    - Keep only rows whose Arabic cell contains '>>ara<<' and whose source cell
      contains '>>{code}<<', where ``code`` comes from ``get_language_code``.
    - Write the result to ``output_file`` and log it as an MLflow input with
      context "training".

    Args:
        filepath (str): Path to the input CSV file.
        source (str): Name of the source language column (e.g., 'French');
            three-letter codes such as 'fra' are also recognised when choosing
            the preprocessor.
        output_file (str): Path to the processed CSV file

    Raises:
        FileNotFoundError: If a FileNotFoundError occurs while processing; it
            is re-raised with a message naming ``filepath``.
        KeyError: If the ``source`` column or the 'Arabic' column is not
            present in the CSV. Any other KeyError raised inside the pipeline
            is wrapped with the same "Missing necessary column" prefix.
        Exception: Any other error is printed and re-raised unchanged.

    Example:
        format_table("data_fr.csv", "French", "processed_data.csv")
    """

    try:
        # Step 1: Load the data from the CSV file
        print("Loading data from CSV...")
        df = pd.read_csv(filepath)
        if source not in df.columns or "Arabic" not in df.columns:
            error = f"Columns '{source}' and/or 'Arabic' not found in the CSV."
            raise KeyError(error)
        print(f"Loaded {len(df)} rows.")

        # Step 2: Drop any rows with missing values
        print("Dropping rows with missing values...")
        df = df.dropna()
        print(f"{len(df)} rows remaining after dropping missing values.")
        print("")  # Print an empty line for clarity

        # Step 3: Apply normalization to the Arabic column
        print("Applying normalization to the Arabic column...")
        df_arabic = df["Arabic"]
        df_arabic = df_arabic.progress_apply(n_unicode)
        print("Normalization applied.")
        print("")

        # Step 4: Apply diacritic removal to the Arabic column
        print("Removing diacritics from the Arabic column...")
        df_arabic = df_arabic.progress_apply(dediac_ar)
        print("Diacritics removed.")
        print("")

        # Step 5: Remove extra whitespaces from the Arabic column
        print("Removing extra whitespaces from the Arabic column...")
        df_arabic = df_arabic.progress_apply(rm_whitespace)
        print("Extra whitespaces removed from Arabic.")
        print("")

        df["Arabic"] = df_arabic

        # Step 6: Preprocess the source language based on its type
        print(f"Applying preprocessing to the source language: {source}...")

        # Pick the preprocessor once; the language does not change per row,
        # so there is no need to re-test it for every cell.
        language = source.lower()
        if language in ('english', 'eng'):
            preprocess_source = preprocess_english
        elif language in ('french', 'fra'):
            preprocess_source = preprocess_french
        elif language in ('italian', 'ita'):
            preprocess_source = preprocess_italian
        elif language in ('russian', 'rus'):
            preprocess_source = preprocess_russian
        elif language in ('turkish', 'tur'):
            preprocess_source = preprocess_turkish
        elif language in ('spanish', 'spa'):
            preprocess_source = preprocess_spanish
        elif language in ('greek', 'ell'):
            preprocess_source = preprocess_greek
        else:
            # Unrecognised languages fall back to the English preprocessing.
            preprocess_source = preprocess_english

        df[source] = df[source].progress_apply(preprocess_source)

        print(f"Preprocessing for {source} applied.")
        print("")

        # Step 7: Remove extra whitespaces from the source column
        print("Removing extra whitespaces from the source column...")
        df[source] = df[source].progress_apply(rm_whitespace)
        print(f"Extra whitespaces removed from {source}.")
        print("")

        # Fetch the fastText language-identification model from the Hugging Face Hub
        repo_id = "facebook/fasttext-language-identification"
        model_path = hf_hub_download(repo_id=repo_id,
                                     filename="model.bin")
        fasttext_model = fasttext.load_model(model_path)
        print("FastText model loaded.")

        # Step 8: Update rows for MarianMT fine-tuning by adding language codes
        # NOTE(review): update_rows_2 is defined elsewhere; presumably it adds
        # the '>>xxx<<' tags that Step 10 filters on — confirm.
        print("Updating rows for MarianMT fine-tuning...")
        df = df.swifter.apply(lambda row: update_rows_2(row,
                                                        source,
                                                        fasttext_model),
                              axis=1)
        print("Rows updated for MarianMT fine-tuning.")
        print("")

        # Step 9: Get the language code for the source language
        code = get_language_code(source)

        # Step 10: Apply mask for filtering rows
        print("Applying mask for specific Arabic and source string slices...")
        mask_arabic = df["Arabic"].str.contains(r">>ara<<")
        mask_source = df[source].str.contains(f">>{code}<<")
        df = df[mask_arabic & mask_source]
        print(f"{len(df)} rows remaining after applying the mask.")
        print("")

        # Step 11: Save the processed DataFrame to a new CSV file
        print("Saving the processed data to a CSV file...")
        df.to_csv(output_file, index=False)
        # Record the processed dataset on the active MLflow run.
        dataset: PandasDataset = from_pandas(df,
                                             source=output_file)
        mlflow.log_input(dataset, context="training")
        print(f"Data successfully saved to '{output_file}'.")
        print("Data processing complete.")

    except FileNotFoundError as exc:
        # Re-raise the exception with additional context, preserving the original traceback
        raise FileNotFoundError(f"The file {filepath} does not exist.") from exc
    except KeyError as exc:
        # Re-raise the KeyError with context about the missing column
        raise KeyError(f"Missing necessary column: {exc}") from exc
    except Exception as exc:
        # Log and re-raise the unexpected exception; a bare raise keeps the
        # original traceback untouched.
        print(f"An error occurred: {exc}")
        raise


In [13]:
import os

import mlflow

# The tracking server is reached through an ngrok host name. The commented-out
# cell further down points at a different ngrok host, so this address is not
# stable: allow it to be overridden through MLFLOW_TRACKING_URI and keep the
# last known address only as the default.
MLFLOW_TRACKING_URI = os.environ.get(
    "MLFLOW_TRACKING_URI", "https://652d-35-203-165-151.ngrok-free.app/"
)
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [14]:

# # Initializing Language
# source_lang, source_code = "French","fr"


# # Converting data from tmx to csv
# parse_tmx("data_tmx/insurance_translation.tmx","data_csv/insurance_translation.csv","French","Arabic","fr","ar")

# # Reading data
# df_2 = pd.read_csv("data_csv/insurance_translation.csv")
# df_2.to_csv("data_csv/dataset.csv")

# # Format table
# format_table("data_csv/dataset.csv","French","process/data_fr_spe.csv")
# df_2 = pd.read_csv("process/data_fr_spe.csv")
# src = "French"

# # Base Model to train
# base_model = "Helsinki-NLP/opus-mt-fr-ar"

# # Initializing training parameters
# steps,batch_size,learning_rate,epochs, warmup = 11000,16,5e-5, 5, 100

# # Initializing mlflow
# mlflow.environment_variables.MLFLOW_TRACKING_INSECURE_TLS = "true"
# mlflow.environment_variables.MLFLOW_TRACKING_SERVER_CERT_PATH  = ""
# mlflow.set_tracking_uri("https://be80-34-138-225-3.ngrok-free.app/")

# # Running the training pipeline
# training_pipeline(df_2,src,base_model,steps,batch_size,learning_rate,epochs, warmup,base_model,"Arabic",3)

CSV file created successfully!


In [16]:
%env CUDA_LAUNCH_BLOCKING=1

from lingowiz.converter import decompress_gz,parse_tmx
from lingowiz.data_prep import format_table
from lingowiz.train import training_pipeline
import pandas as pd
import mlflow
import torch
import os
os.environ["WANDB_MODE"] = "disabled"

torch.cuda.empty_cache()

# Defining Source Language
source_lang, source_code = "English","en"

# Parsing TMX files to CSV
parse_tmx("data_tmx/insurance_translation.tmx","data_csv/insurance_translation.csv","English","Arabic","en","ar")

# Reading CSV files (and merging)
df_2 = pd.read_csv("data_csv/insurance_translation.csv")

# Data preprocessing
format_table("data_csv/insurance_translation.csv","English","process/data_en_spe.csv")
df_2 = pd.read_csv("process/data_en_spe.csv")

# SHuffle files ( more random samples)
df_2 = df_2.sample(frac=1).reset_index(drop=True)

# Initializing Variables
t_type="special"
src = "English"
base_model = "Helsinki-NLP/opus-mt-en-ar"
steps,batch_size,learning_rate,epochs, warmup = 10000,16,5e-4, 5, 1000
mlflow.environment_variables.MLFLOW_TRACKING_INSECURE_TLS = "true"

# Running training pipeline
training_pipeline(df_2,t_type,src,base_model,steps,batch_size,learning_rate,epochs, warmup,base_model,"Arabic",2)

env: CUDA_LAUNCH_BLOCKING=1
CSV file created successfully!
Loading data from CSV...
Loaded 1021 rows.
Dropping rows with missing values...
1021 rows remaining after dropping missing values.

Applying normalization to the Arabic column...


Pandas Apply:   0%|          | 0/1021 [00:00<?, ?it/s]

Normalization applied.

Removing diacritics from the Arabic column...


Pandas Apply:   0%|          | 0/1021 [00:00<?, ?it/s]

Diacritics removed.

Removing extra whitespaces from the Arabic column...


Pandas Apply:   0%|          | 0/1021 [00:00<?, ?it/s]

Extra whitespaces removed from Arabic.

Applying preprocessing to the source language: English...


Pandas Apply:   0%|          | 0/1021 [00:00<?, ?it/s]

Preprocessing for English applied.

Removing extra whitespaces from the source column...


Pandas Apply:   0%|          | 0/1021 [00:00<?, ?it/s]

Extra whitespaces removed from English.

FastText model loaded.
Updating rows for MarianMT fine-tuning...


Pandas Apply:   0%|          | 0/1021 [00:00<?, ?it/s]

Rows updated for MarianMT fine-tuning.

Applying mask for specific Arabic and source string slices...
1021 rows remaining after applying the mask.

Saving the processed data to a CSV file...
Data successfully saved to 'process/data_en_spe.csv'.
Data processing complete.
Data Split train - eval

Length train data 918 
Length eval data 103

Intializing Tokenizer ...
Initializing Model ...
Freezing last layers of decoder ...


Map:   0%|          | 0/918 [00:00<?, ? examples/s]

Map:   0%|          | 0/103 [00:00<?, ? examples/s]

Initialization Complete


Initializaing Training Arguments ...
Training ...


Epoch,Training Loss,Validation Loss
1,3.6391,0.321173
2,0.1917,0.080177
3,0.0708,0.042889
4,0.0408,0.033628
5,0.0268,0.02794


Clearing cache after epoch 1.0...
Clearing cache after epoch 2.0...
Clearing cache after epoch 3.0...
Clearing cache after epoch 4.0...


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62801]], 'forced_eos_token_id': 0}


Clearing cache after epoch 5.0...


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62801]], 'forced_eos_token_id': 0}


Training Complete


2024/11/28 08:18:27 ERROR mlflow.utils.async_logging.async_logging_queue: Run Id d6fee50084bc4326b098449c6abae779: Failed to log run data: Exception: API request to https://652d-35-203-165-151.ngrok-free.app/api/2.0/mlflow/runs/log-batch failed with exception HTTPSConnectionPool(host='652d-35-203-165-151.ngrok-free.app', port=443): Max retries exceeded with url: /api/2.0/mlflow/runs/log-batch (Caused by ResponseError('too many 500 error responses'))


Uploading model to hugging face ...


pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

Logging Hyperparameters
Logging Data
Logging Model Metrics and Params
Logging Model...




Logging Complete
🏃 View run translator_English_Arabic_spec_2024-11-28_08:18 at: https://652d-35-203-165-151.ngrok-free.app/#/experiments/556171306495575882/runs/f68b6d389aac447caaf197be7c8d87a5
🧪 View experiment at: https://652d-35-203-165-151.ngrok-free.app/#/experiments/556171306495575882
