In [1]:
# Install necessary libraries
!pip install transformers
#!pip install -U transformers
#!pip install sentencepiece

import os
import urllib.request
import zipfile
import tensorflow as tf
from transformers import MarianTokenizer, MarianMTModel

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m88.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m63.8 MB/s[0m eta [36m0:00:0

In [2]:
# Define constants
model_name = "Helsinki-NLP/opus-mt-es-en"
dataset_url = "http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
max_length = 128
batch_size = 32
num_epochs = 5
learning_rate = 1e-4

In [3]:
# Download and extract data
dataset_dir = "spa-eng"
file_name = "spa-eng.zip"

In [4]:
if not os.path.exists(dataset_dir):
    os.makedirs(dataset_dir)

urllib.request.urlretrieve(dataset_url, os.path.join(dataset_dir, file_name))
with zipfile.ZipFile(os.path.join(dataset_dir, file_name), 'r') as zip_ref:
    zip_ref.extractall(dataset_dir)


In [5]:
# Load data
file_path = os.path.join("spa-eng", "spa-eng", "spa.txt")
lines = open(file_path, encoding='utf-8').read().strip().split('\n')
train_data = [(line.split('\t')[0], line.split('\t')[1]) for line in lines]
source_texts, target_texts = zip(*train_data)

In [6]:
# Initialize tokenizer and model
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

ImportError: ignored

In [7]:
# Preprocess data
input_encodings = tokenizer(source_texts, padding='max_length', truncation=True, max_length=max_length, return_tensors='tf')
target_encodings = tokenizer(target_texts, padding='max_length', truncation=True, max_length=max_length, return_tensors='tf')

inputs = {key: tf.convert_to_tensor(input_encodings[key]) for key in input_encodings}
targets = tf.convert_to_tensor(target_encodings['input_ids'])

NameError: ignored

In [None]:
# Build train dataset
train_dataset = tf.data.Dataset.from_tensor_slices((inputs, targets))
train_dataset = train_dataset.shuffle(len(inputs)).batch(batch_size).prefetch(buffer_size=tf.data.AUTOTUNE)


In [None]:
# Define training step
def train_step(inputs, targets):
    with tf.GradientTape() as tape:
        logits = model(**inputs, decoder_input_ids=targets).logits  # Pass inputs as **kwargs
        loss = loss_fn(targets, logits)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

In [None]:
# Training parameters
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

In [None]:
# Training loop
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    total_loss = 0
    total_batches = 0
    for batch, (inputs, targets) in enumerate(train_dataset):
        loss = train_step(inputs, targets)  # Use the regular train step
        total_loss += loss.numpy().item()
        total_batches = batch + 1
        if batch % 100 == 0:
            print(f"Batch {batch}, Loss: {loss.numpy():.4f}")
    print(f"Epoch Loss: {total_loss / total_batches:.4f}")

In [None]:
# Translate a source text to the target language
def translate_text(input_text, tokenizer, model, max_length=128):
    input_encoding = tokenizer(input_text, padding='max_length', truncation=True, max_length=max_length, return_tensors='tf')
    input_ids = tf.convert_to_tensor(input_encoding['input_ids'])
    translation_ids = model.generate(input_ids)
    translation_text = tokenizer.batch_decode(translation_ids, skip_special_tokens=True)[0]
    return translation_text

# Example source text for translation
source_text = "Hello, how are you?"

# Perform translation
translated_text = translate_text(source_text, tokenizer, model)

# Print the translated text
print(f"Source Text: {source_text}")
print(f"Translated Text: {translated_text}")
