In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the parallel corpus; the rendered frame shows `english` and `swedish`
# text columns next to an unnamed leading column.
df = pd.read_csv(filepath_or_buffer='en-sv_translations.csv')

In [7]:
# Display the rows that contain no missing values.
# The result is only shown, not assigned, so `df` itself is left unchanged and
# later cells still see any rows that contain NaN.
# NOTE(review): if the goal was to clean the data, the result needs to be
# assigned to a name — confirm the intent.
df.dropna()

Unnamed: 0,english,swedish
0,Previously on The Hot Zone: Anthrax.,I tidigare avsnitt...
1,Director Mueller just assigned us a major case...,Byråchef Mueller gav oss just ett stort fall.
2,Investigation''s officially been dubbed Ameri...,Utredningen har fått namnet Amerithrax.
3,Whoever sent these letters got their Anthrax ...,Brevskickaren fick sin mjältbrand från ett ame...
4,We wouldn''t be here if we didn''t have eviden...,Vi hade inte varit här om inte bevisen pekat p...
...,...,...
43533706,"You are already almost 15 minutes late. Oh, my...","-Gå nu, du är nästan en kvart sen."
43533707,By the powers vested in me by the state of Sou...,I kraft av mitt ämbete i staten South Carolina...
43533708,Who invited you? - I'm-- - Beat it.,Vem bjöd in dig?
43533709,Okay.,! Stick!


In [3]:
pip install transformers datasets tokenizers


Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
import pandas as pd
from datasets import Dataset
from transformers import (
    PreTrainedTokenizerFast,
    T5Config,
    T5ForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
)

# ===== 2. Train a Tokenizer from Scratch =====
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import TemplateProcessing

# Reserved during training so these tokens get ids inside the base vocabulary
# instead of being appended afterwards as out-of-vocabulary "added tokens".
SPECIAL_TOKENS = ["<pad>", "<s>", "</s>", "<unk>"]

# Write all text to a file, one sentence per line.
# The two columns are streamed one after the other rather than concatenated
# into a single Python list, which would hold every sentence in memory twice.
# Missing and empty sentences are skipped; they carry no training signal.
with open("training_texts.txt", "w", encoding="utf-8") as f:
    for column in ("english", "swedish"):
        for line in df[column].dropna().astype(str):
            text = line.strip()
            if text:
                f.write(text + "\n")


# Train tokenizer
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files="training_texts.txt",
    vocab_size=32000,
    min_frequency=2,
    special_tokens=SPECIAL_TOKENS,
)

# Save tokenizer
os.makedirs("tokenizer", exist_ok=True)
# save_model() only writes vocab.json and merges.txt ...
tokenizer.save_model("tokenizer")
# ... while PreTrainedTokenizerFast(tokenizer_file=...) needs the single-file
# serialization produced by save().
tokenizer.save("tokenizer/tokenizer.json")

# Load tokenizer into Hugging Face
hf_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer/tokenizer.json",
    pad_token="<pad>",
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    # T5 takes no token_type_ids, so do not emit them when encoding.
    model_input_names=["input_ids", "attention_mask"],
)

# Append </s> to every encoded sequence so the target side ends with an
# explicit end-of-sequence token the model can learn to emit. The setting is
# part of the backend tokenizer and is therefore kept by save_pretrained().
hf_tokenizer.backend_tokenizer.post_processor = TemplateProcessing(
    single="$A </s>",
    special_tokens=[("</s>", hf_tokenizer.eos_token_id)],
)

# ===== 3. Initialize T5 Model from Scratch =====
config = T5Config(
    # len() counts added tokens as well. `.vocab_size` reports only the base
    # vocabulary, so sizing the embeddings from it can leave special-token ids
    # outside the embedding table.
    vocab_size=len(hf_tokenizer),
    d_model=512,
    d_ff=2048,
    num_layers=6,
    num_heads=8,
    dropout_rate=0.1,
    eos_token_id=hf_tokenizer.eos_token_id,
    pad_token_id=hf_tokenizer.pad_token_id,
    # Decoding starts from <s>.
    decoder_start_token_id=hf_tokenizer.bos_token_id,
)

# Randomly initialised model built from the config (no pretrained weights).
model = T5ForConditionalGeneration(config)

# ===== 4. Tokenize the Dataset =====
def preprocess_function(examples, max_length=128, ignore_index=-100):
    """Tokenize English sources and Swedish targets for seq2seq training.

    Args:
        examples: Mapping with "english" and "swedish" entries; either a batch
            (lists of strings, as passed by ``map(..., batched=True)``) or a
            single example (plain strings).
        max_length: Length every sequence is truncated/padded to.
        ignore_index: Value written over padded label positions so that they
            do not contribute to the loss.

    Returns:
        The tokenizer output for the English text with an added "labels"
        entry holding the Swedish token ids, padding replaced by
        ``ignore_index``.
    """
    inputs = hf_tokenizer(
        examples["english"], truncation=True, padding="max_length", max_length=max_length
    )
    targets = hf_tokenizer(
        examples["swedish"], truncation=True, padding="max_length", max_length=max_length
    )

    pad_id = hf_tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # Padding in the labels would otherwise be trained on like real tokens.
        return [ignore_index if token_id == pad_id else token_id for token_id in token_ids]

    label_ids = targets["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id list per example.
        inputs["labels"] = [_mask_padding(token_ids) for token_ids in label_ids]
    else:
        # Single example: a flat id list.
        inputs["labels"] = _mask_padding(label_ids)
    return inputs

# Build the Hugging Face dataset from the DataFrame so this cell runs on a
# fresh kernel (`dataset` was not defined anywhere before this point).
# Rows with a missing sentence are dropped first because the tokenizer only
# accepts strings, and the values are coerced to str for the same reason.
dataset = Dataset.from_pandas(
    df[["english", "swedish"]].dropna().astype(str),
    preserve_index=False,
)

tokenized_dataset = dataset.map(preprocess_function, batched=True)
# Fixed seed so the train/validation split is identical on every run.
split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split["train"]
val_dataset = split["test"]

# ===== 5. Training Setup =====
import inspect

# The evaluation-schedule argument is called `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones. The package
# is installed without a version pin, so use whichever name the installed
# release accepts.
_training_arg_names = inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
eval_strategy_arg = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./scratch_translation_model",
    save_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,  # keep only the two most recent checkpoints
    predict_with_generate=True,  # evaluate with generate() instead of teacher forcing
    logging_dir="./logs",
    **{eval_strategy_arg: "epoch"},
)

import inspect

# Batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=hf_tokenizer, model=model)

# Newer transformers releases take the tokenizer as `processing_class`; older
# ones only know `tokenizer`. Pick the name the installed release accepts.
_trainer_arg_names = inspect.signature(Seq2SeqTrainer.__init__).parameters
tokenizer_arg = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    **{tokenizer_arg: hf_tokenizer},
)

# ===== 6. Train the Model =====
trainer.train()

# ===== 7. Save the Model =====
# Model and tokenizer are written to the same directory, model first.
export_dir = "my_translation_model"
for artifact in (model, hf_tokenizer):
    artifact.save_pretrained(export_dir)


In [None]:
from transformers import T5ForConditionalGeneration, PreTrainedTokenizerFast

# Reload the model and tokenizer from the directory they were saved to.
model = T5ForConditionalGeneration.from_pretrained("my_translation_model")
tokenizer = PreTrainedTokenizerFast.from_pretrained("my_translation_model")

input_text = "We wouldn't be here if we didn't have evidence."
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

# Pass only the tensors generate() accepts for this model. A generic fast
# tokenizer may also return `token_type_ids`, which T5 rejects as an unused
# keyword argument when the whole encoding is unpacked with **inputs.
output = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=50,
)
print(tokenizer.decode(output[0], skip_special_tokens=True))
