In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
from datasets import Dataset

In [3]:
df = pd.read_csv('en-sv_translations.csv')

In [4]:
df.dropna()

Unnamed: 0,english,swedish
0,Previously on The Hot Zone: Anthrax.,I tidigare avsnitt...
1,Director Mueller just assigned us a major case...,Byråchef Mueller gav oss just ett stort fall.
2,Investigation''s officially been dubbed Ameri...,Utredningen har fått namnet Amerithrax.
3,Whoever sent these letters got their Anthrax ...,Brevskickaren fick sin mjältbrand från ett ame...
4,We wouldn''t be here if we didn''t have eviden...,Vi hade inte varit här om inte bevisen pekat p...
...,...,...
43533706,"You are already almost 15 minutes late. Oh, my...","-Gå nu, du är nästan en kvart sen."
43533707,By the powers vested in me by the state of Sou...,I kraft av mitt ämbete i staten South Carolina...
43533708,Who invited you? - I'm-- - Beat it.,Vem bjöd in dig?
43533709,Okay.,! Stick!


In [5]:
df = df.dropna(subset=["english", "swedish"])  # Drop missing values
df["english"] = df["english"].astype(str)
df["swedish"] = df["swedish"].astype(str)
dataset = Dataset.from_pandas(df)


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 43533688 entries, 0 to 43533710
Data columns (total 2 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   english  object
 1   swedish  object
dtypes: object(2)
memory usage: 996.4+ MB


In [7]:
small_df = df.sample(frac=0.00221, random_state=42) 
df = small_df
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 96209 entries, 39927259 to 24099229
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   english  96209 non-null  object
 1   swedish  96209 non-null  object
dtypes: object(2)
memory usage: 2.2+ MB


In [8]:
pip install transformers datasets tokenizers


Note: you may need to restart the kernel to use updated packages.


In [9]:
!pip install -U transformers



In [10]:
!pip uninstall transformers -y
!pip install transformers

Found existing installation: transformers 4.51.3
Uninstalling transformers-4.51.3:
  Successfully uninstalled transformers-4.51.3
Collecting transformers
  Using cached transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Using cached transformers-4.51.3-py3-none-any.whl (10.4 MB)
Installing collected packages: transformers
Successfully installed transformers-4.51.3


In [11]:
import transformers
print(transformers.__version__)

4.51.3


In [12]:
import transformers
print(transformers.__version__)
print(transformers.__file__)


4.51.3
C:\Users\Emma\anaconda3\Lib\site-packages\transformers\__init__.py


In [13]:
from transformers import Seq2SeqTrainingArguments
print(Seq2SeqTrainingArguments.__init__.__code__.co_varnames)


('self', 'output_dir', 'overwrite_output_dir', 'do_train', 'do_eval', 'do_predict', 'eval_strategy', 'prediction_loss_only', 'per_device_train_batch_size', 'per_device_eval_batch_size', 'per_gpu_train_batch_size', 'per_gpu_eval_batch_size', 'gradient_accumulation_steps', 'eval_accumulation_steps', 'eval_delay', 'torch_empty_cache_steps', 'learning_rate', 'weight_decay', 'adam_beta1', 'adam_beta2', 'adam_epsilon', 'max_grad_norm', 'num_train_epochs', 'max_steps', 'lr_scheduler_type', 'lr_scheduler_kwargs', 'warmup_ratio', 'warmup_steps', 'log_level', 'log_level_replica', 'log_on_each_node', 'logging_dir', 'logging_strategy', 'logging_first_step', 'logging_steps', 'logging_nan_inf_filter', 'save_strategy', 'save_steps', 'save_total_limit', 'save_safetensors', 'save_on_each_node', 'save_only_model', 'restore_callback_states_from_checkpoint', 'no_cuda', 'use_cpu', 'use_mps_device', 'seed', 'data_seed', 'jit_mode_eval', 'use_ipex', 'bf16', 'fp16', 'fp16_opt_level', 'half_precision_backend',

In [14]:
from transformers import Seq2SeqTrainingArguments
help(Seq2SeqTrainingArguments)


Help on class Seq2SeqTrainingArguments in module transformers.training_args_seq2seq:

class Seq2SeqTrainingArguments(transformers.training_args.TrainingArguments)
 |
 |  TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop
 |  itself**.
 |
 |  Using [`HfArgumentParser`] we can turn this class into
 |  [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
 |  command line.
 |
 |  Parameters:
 |      output_dir (`str`, *optional*, defaults to `"trainer_output"`):
 |          The output directory where the model predictions and checkpoints will be written.
 |      overwrite_output_dir (`bool`, *optional*, defaults to `False`):
 |          If `True`, overwrite the content of the output directory. Use this to continue training if `output_dir`
 |          points to a checkpoint directory.
 |      do_train (`bool`, *optional*, defaults to `False`):
 |          Whether to run t

In [15]:
!pip install transformers[torch]




In [None]:
import os
import pandas as pd
from datasets import Dataset
from transformers import (
    PreTrainedTokenizerFast,
    T5Config,
    T5ForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
)

# ===== 2. Train a Tokenizer from Scratch =====
from tokenizers import ByteLevelBPETokenizer

# Write all text to a file
with open("training_texts.txt", "w", encoding="utf-8") as f:
    for line in df["english"].fillna("").astype(str).tolist() + df["swedish"].fillna("").astype(str).tolist(): 
        f.write(line.strip() + "\n")


# Train tokenizer
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files="training_texts.txt", vocab_size=32000, min_frequency=2)

# Save tokenizer
os.makedirs("tokenizer", exist_ok=True)
tokenizer.save_model("tokenizer")

import json


config = {
    "add_prefix_space": True,
    "unk_token": "<unk>",
    "bos_token": "<s>",
    "eos_token": "</s>",
    "pad_token": "<pad>",
    "mask_token": "<mask>"
}

with open("tokenizer/tokenizer_config.json", "w") as f:
    json.dump(config, f)



# Load tokenizer into Hugging Face
from transformers import GPT2TokenizerFast

hf_tokenizer = GPT2TokenizerFast.from_pretrained(
    "tokenizer",
    pad_token="<pad>",
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>"
)


from transformers import T5Config, T5ForConditionalGeneration

# ===== 3. Initialize T5 Model from Scratch =====
config = T5Config(
    vocab_size=hf_tokenizer.vocab_size,
    d_model=512,
    d_ff=2048,
    num_layers=6,
    num_heads=8,
    dropout_rate=0.1,
    eos_token_id=hf_tokenizer.eos_token_id,
    pad_token_id=hf_tokenizer.pad_token_id,
    decoder_start_token_id=hf_tokenizer.bos_token_id,
)

model = T5ForConditionalGeneration(config)

model.resize_token_embeddings(len(hf_tokenizer))

# ===== 4. Tokenize the Dataset =====
def preprocess_function(examples):
    english_texts = [str(text) for text in examples["english"]]
    swedish_texts = [str(text) for text in examples["swedish"]]

    # Ensure they are lists of strings
    assert isinstance(english_texts, list), "english_texts must be a list"
    assert isinstance(swedish_texts, list), "swedish_texts must be a list"

    model_inputs = hf_tokenizer(
        english_texts,
        padding="max_length",
        truncation=True,
        max_length=128,
    )

    with hf_tokenizer.as_target_tokenizer():
        labels = hf_tokenizer(
            swedish_texts,
            padding="max_length",
            truncation=True,
            max_length=128,
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs



from datasets import Dataset
dataset = Dataset.from_pandas(df)

tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Tokenize both inputs (English) and targets (Swedish)
inputs = hf_tokenizer(df['english'].tolist(), padding='max_length', truncation=True, max_length=128, return_tensors='pt')
targets = hf_tokenizer(df['swedish'].tolist(), padding='max_length', truncation=True, max_length=128, return_tensors='pt')

print("Max token ID in input:", inputs['input_ids'].max())
print("Max token ID in output:", targets['input_ids'].max())



split = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = split["train"]
val_dataset = split["test"]

# ===== 5. Training Setup =====
training_args = Seq2SeqTrainingArguments(
    output_dir="./scratch_translation_model",
    # evaluation_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    predict_with_generate=True,
    logging_dir="./logs",
)

data_collator = DataCollatorForSeq2Seq(tokenizer=hf_tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=hf_tokenizer,
    data_collator=data_collator,
)

# ===== 6. Train the Model =====
trainer.train()

# ===== 7. Save the Model =====
model.save_pretrained("my_translation_model")
hf_tokenizer.save_pretrained("my_translation_model")





The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Map:   0%|          | 0/96209 [00:00<?, ? examples/s]



Max token ID in input: tensor(32003)
Max token ID in output: tensor(32003)


  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss


In [None]:
from transformers import T5ForConditionalGeneration, PreTrainedTokenizerFast

model = T5ForConditionalGeneration.from_pretrained("my_translation_model")
tokenizer = PreTrainedTokenizerFast.from_pretrained("my_translation_model")

input_text = "We wouldn't be here if we didn't have evidence."
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

output = model.generate(**inputs, max_length=50)
print(tokenizer.decode(output[0], skip_special_tokens=True))
