In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('en-sv_translations.csv')

In [7]:
df.dropna()

Unnamed: 0,english,swedish
0,Previously on The Hot Zone: Anthrax.,I tidigare avsnitt...
1,Director Mueller just assigned us a major case...,Byråchef Mueller gav oss just ett stort fall.
2,Investigation''s officially been dubbed Ameri...,Utredningen har fått namnet Amerithrax.
3,Whoever sent these letters got their Anthrax ...,Brevskickaren fick sin mjältbrand från ett ame...
4,We wouldn''t be here if we didn''t have eviden...,Vi hade inte varit här om inte bevisen pekat p...
...,...,...
43533706,"You are already almost 15 minutes late. Oh, my...","-Gå nu, du är nästan en kvart sen."
43533707,By the powers vested in me by the state of Sou...,I kraft av mitt ämbete i staten South Carolina...
43533708,Who invited you? - I'm-- - Beat it.,Vem bjöd in dig?
43533709,Okay.,! Stick!


In [31]:
df = df.dropna(subset=["english", "swedish"])  # Drop missing values
df["english"] = df["english"].astype(str)
df["swedish"] = df["swedish"].astype(str)
dataset = Dataset.from_pandas(df)


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 43533688 entries, 0 to 43533710
Data columns (total 2 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   english  object
 1   swedish  object
dtypes: object(2)
memory usage: 996.4+ MB


In [38]:
small_df = df.sample(frac=0.01, random_state=42) 
df = small_df
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 435337 entries, 39927259 to 28789177
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   english  435337 non-null  object
 1   swedish  435337 non-null  object
dtypes: object(2)
memory usage: 10.0+ MB


In [3]:
pip install transformers datasets tokenizers


Note: you may need to restart the kernel to use updated packages.


In [42]:
!pip install -U transformers



In [48]:
import transformers
print(transformers.__version__)

4.51.3


In [50]:
import os
import pandas as pd
from datasets import Dataset

from transformers import Seq2SeqTrainingArguments

from transformers import (
    PreTrainedTokenizerFast,
    T5Config,
    T5ForConditionalGeneration,
    Seq2SeqTrainer,
    #Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
)

# ===== 2. Train a Tokenizer from Scratch =====
from tokenizers import ByteLevelBPETokenizer

# Write all text to a file
with open("training_texts.txt", "w", encoding="utf-8") as f:
    for line in df["english"].fillna("").astype(str).tolist() + df["swedish"].fillna("").astype(str).tolist(): 
        f.write(line.strip() + "\n")


# Train tokenizer
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files="training_texts.txt", vocab_size=32000, min_frequency=2)

# Save tokenizer
os.makedirs("tokenizer", exist_ok=True)
tokenizer.save_model("tokenizer")

import json


config = {
    "add_prefix_space": True,
    "unk_token": "<unk>",
    "bos_token": "<s>",
    "eos_token": "</s>",
    "pad_token": "<pad>",
    "mask_token": "<mask>"
}

with open("tokenizer/tokenizer_config.json", "w") as f:
    json.dump(config, f)



# Load tokenizer into Hugging Face
from transformers import GPT2TokenizerFast

hf_tokenizer = GPT2TokenizerFast.from_pretrained(
    "tokenizer",
    pad_token="<pad>",
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>"
)


from transformers import T5Config, T5ForConditionalGeneration

# ===== 3. Initialize T5 Model from Scratch =====
config = T5Config(
    vocab_size=hf_tokenizer.vocab_size,
    d_model=512,
    d_ff=2048,
    num_layers=6,
    num_heads=8,
    dropout_rate=0.1,
    eos_token_id=hf_tokenizer.eos_token_id,
    pad_token_id=hf_tokenizer.pad_token_id,
    decoder_start_token_id=hf_tokenizer.bos_token_id,
)

model = T5ForConditionalGeneration(config)





# ===== 4. Tokenize the Dataset =====
def preprocess_function(examples):
    english_texts = [str(text) for text in examples["english"]]
    swedish_texts = [str(text) for text in examples["swedish"]]

    # Ensure they are lists of strings
    assert isinstance(english_texts, list), "english_texts must be a list"
    assert isinstance(swedish_texts, list), "swedish_texts must be a list"

    model_inputs = hf_tokenizer(
        english_texts,
        padding="max_length",
        truncation=True,
        max_length=128,
    )

    with hf_tokenizer.as_target_tokenizer():
        labels = hf_tokenizer(
            swedish_texts,
            padding="max_length",
            truncation=True,
            max_length=128,
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs



from datasets import Dataset
dataset = Dataset.from_pandas(df)

tokenized_dataset = dataset.map(preprocess_function, batched=True)
split = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = split["train"]
val_dataset = split["test"]

# ===== 5. Training Setup =====
training_args = Seq2SeqTrainingArguments(
    output_dir="./scratch_translation_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    predict_with_generate=True,
    logging_dir="./logs",
)

data_collator = DataCollatorForSeq2Seq(tokenizer=hf_tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=hf_tokenizer,
    data_collator=data_collator,
)

# ===== 6. Train the Model =====
trainer.train()

# ===== 7. Save the Model =====
model.save_pretrained("my_translation_model")
hf_tokenizer.save_pretrained("my_translation_model")


Map:   0%|          | 0/435337 [00:00<?, ? examples/s]



TypeError: Seq2SeqTrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [None]:
from transformers import T5ForConditionalGeneration, PreTrainedTokenizerFast

model = T5ForConditionalGeneration.from_pretrained("my_translation_model")
tokenizer = PreTrainedTokenizerFast.from_pretrained("my_translation_model")

input_text = "We wouldn't be here if we didn't have evidence."
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

output = model.generate(**inputs, max_length=50)
print(tokenizer.decode(output[0], skip_special_tokens=True))
