# **1. Data Understanding & Preparation**

In [1]:
#installing required packages
! pip3 install sacrebleu
! pip3 install sentencepiece




In [2]:
#import libraries
import pandas as pd
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
import sacrebleu
import re

# Set device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# load dataset

''' upload the data file in the files path using upload button in the colab
also modify the file name if needed '''

data_path = "Dataset.csv"
df = pd.read_csv(data_path)
print(df.head())


                                    input_text  \
0  I CoLoUr 🎨 the centre of my favourite book.   
1          He is travelling ✈️ to the THEATRE.   
2                 I have a flat near the lift.   
3                I have a flat near the lift.    
4    The PROGRAMME 🗓️ will start at 6 O'CLOCK.   

                               target_text  
0  I color the center of my favorite book.  
1          He is traveling to the theater.  
2   I have an apartment near the elevator.  
3   I have an apartment near the elevator.  
4     The program will start at 6 o'clock.  


In [4]:

# clean the dataset
def clean_text(text):
    """
    Clean the input text by stripping spaces and removing emojis/extra characters.
    Convert to lowercase to standardize.
    """
    try:
        text = re.sub(r"\s+", " ", text.strip())
        text = re.sub(r"[^\w\s']", "", text)  # Retain words, spaces, and apostrophes
        return text[0] + text[1:].lower()
    except Exception as e:
        raise ValueError(f"Error cleaning text: {str(e)}")

# Clean the text in both columns
df["input_text"] = df["input_text"].apply(clean_text)
df["target_text"] = df["target_text"].apply(clean_text)

df

Unnamed: 0,input_text,target_text
0,I colour the centre of my favourite book,I color the center of my favorite book
1,He is travelling to the theatre,He is traveling to the theater
2,I have a flat near the lift,I have an apartment near the elevator
3,I have a flat near the lift,I have an apartment near the elevator
4,The programme will start at 6 o'clock,The program will start at 6 o'clock
...,...,...
91,The theatre's performance was breathtaking,The theater's performance was breathtaking
92,Her behaviour has been commendable,Her behavior has been commendable
93,The cheque was never received,The check was never received
94,The aeroplane took off on time,The airplane took off on time


# **2. Model Selection**

In [5]:
""" Model: T5 Transformer (EnglishVoice/t5-base-uk-to-us-english) from hugging face
- T5 (Text-to-Text Transfer Transformer) is a powerful model designed for text generation tasks.
- This pre-trained model is fine-tuned specifically for UK-to-US English conversion.
- It is efficient and effective at handling text-based transformations.
- Light model, can run on CPU as well.
"""

# Loading the pre-trained T5 model for UK-to-US conversion
model_name = "EnglishVoice/t5-base-uk-to-us-english"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = model.to(device)

# Setting the maximum length for tokenization
max_input_length = 64
max_target_length = 64

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


# **3. Model Training & Evaluation**


In [6]:
# create lists to save predictions and target
predictions = []
references = []

# Loop over each row in the DataFrame to generate predictions
for idx, row in df.iterrows():
    input_text = row["input_text"]
    target_text = row["target_text"]

    # prepare the prompt expected by the model
    prompt = "UK to US: " + input_text

    # Tokenizing the input prompt along with input text
    encoding = tokenizer.encode_plus(
        prompt,
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    # Generate the output text
    beam_outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        num_beams=5,
        max_length=max_target_length,
        early_stopping=True
    )

    # Decoding the output tokens to a string and clean it
    prediction = tokenizer.decode(beam_outputs[0], skip_special_tokens=True)

    predictions.append(prediction)
    references.append(target_text)

    # Printing few sample conversions
    if idx < 10:
        print(f"Sample {idx}:")
        print(f"  Input:      {input_text}")
        print(f"  Target:     {target_text}")
        print(f"  Prediction: {prediction}")
        print()


Sample 0:
  Input:      I colour  the centre of my favourite book
  Target:     I color the center of my favorite book
  Prediction: I color the center of my favorite book

Sample 1:
  Input:      He is travelling  to the theatre
  Target:     He is traveling to the theater
  Prediction: He is traveling to the theater

Sample 2:
  Input:      I have a flat near the lift
  Target:     I have an apartment near the elevator
  Prediction: I have an apartment near the elevator

Sample 3:
  Input:      I have a flat near the lift
  Target:     I have an apartment near the elevator
  Prediction: I have an apartment near the elevator

Sample 4:
  Input:      The programme  will start at 6 o'clock
  Target:     The program will start at 6 o'clock
  Prediction: The program will start at 6 o'clock

Sample 5:
  Input:      He has a cheque  for payment
  Target:     He has a check for payment
  Prediction: He has a check for payment

Sample 6:
  Input:      She wears jewellery  on occasions
  Targe

In [7]:

# Evaluate predictions using sacrebleu
bleu = sacrebleu.corpus_bleu(predictions, [references])
print(f"\nOverall BLEU Score: {bleu.score:.2f}")

# Exact-match accuracy after cleaning predictions and references
exact_matches = sum([1 for pred, ref in zip(predictions, references) if clean_text(pred) == clean_text(ref)])
exact_match_accuracy = exact_matches / len(references) * 100
print(f"Exact Match Accuracy: {exact_match_accuracy:.2f}%")



Overall BLEU Score: 96.14
Exact Match Accuracy: 94.79%


# **4. Deployment & Inference**


In [8]:
def convert_text_uk_to_us(text):
    """Function to convert UK English text to US English using the trained model."""

    # cleaning and preprocessing the input text
    cleaned_text = clean_text(text)

    # prepare the prompt expected by the model
    prompt = "UK to US: " + input_text

    # Tokenizing the input prompt along with input text
    encoding = tokenizer.encode_plus(
        prompt,
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    # Generate the output text
    beam_outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        num_beams=5,
        max_length=max_target_length,
        early_stopping=True
    )

    return tokenizer.decode(beam_outputs[0], skip_special_tokens=True)

# inference input text, replace this with the test input
input_text = "I'm going to my flat from the theatre, let's meet for the football in the evening."
output_text = convert_text_uk_to_us(input_text)

print(f"\nExample Inference:")
print(f"  Input:     {input_text}")
print(f"  Converted: {output_text}")  #expected output  I'm going to my apartment from the theater let's meet for the soccer in the evening


Example Inference:
  Input:     I'm going to my flat from the theatre, let's meet for the football in the evening.
  Converted: I'm going to my apartment from the theater, let's meet for the soccer in the evening.
