In [1]:
import pickle 
from pickle import dump

def load_docum(filename):
    """Read a whole UTF-8 text file and return its contents as one string.

    Args:
        filename: Path of the text file to read.

    Returns:
        The decoded contents of the file.
    """
    # The context manager closes the handle even when read() raises
    # (for instance on a decoding error); the manual open/close pair did not.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

In [2]:
def to_sentences(doc):
    """Split a document into lines after trimming surrounding whitespace.

    Args:
        doc: The full document text.

    Returns:
        A list with one entry per newline-separated line.
    """
    trimmed = doc.strip()
    return trimmed.split('\n')

In [3]:
def sentence_lenghts(sentences):
    """Return the smallest and largest whitespace-token count among sentences.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A ``(min_words, max_words)`` tuple.
    """
    word_counts = list(map(lambda sentence: len(sentence.split()), sentences))
    shortest = min(word_counts)
    longest = max(word_counts)
    return shortest, longest

In [4]:
import re
import string 
import unicodedata
def clean_lines(lines):
    """Normalise each line to lowercase, ASCII-only, purely alphabetic words.

    For every line: decompose accented characters (NFD) and drop anything
    that is not ASCII, split on whitespace, lowercase, strip punctuation and
    non-printable characters, then discard tokens that are not fully
    alphabetic (which also drops tokens left empty).

    Args:
        lines: Iterable of raw sentence strings.

    Returns:
        A list of cleaned sentences, one per input line, words joined by a
        single space.
    """
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    strip_punct = str.maketrans('', '', string.punctuation)
    cleaned = []
    for raw in lines:
        # NFD splits "è" into "e" plus a combining accent; the ASCII encode
        # with 'ignore' then drops the accent and keeps the base letter.
        ascii_bytes = unicodedata.normalize('NFD', raw).encode('ascii', 'ignore')
        ascii_text = ascii_bytes.decode('UTF-8')
        kept = []
        for token in ascii_text.split():
            token = token.lower().translate(strip_punct)
            token = non_printable.sub('', token)
            if token.isalpha():
                kept.append(token)
        cleaned.append(' '.join(kept))
    return cleaned
        

In [5]:
# Load the English file, split it into sentences and report basic statistics.
filename = 'europarl-v7.it-en.en'
doc = load_docum(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lenghts(sentences)
print('English data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))
# Normalise every sentence and persist the cleaned list for the later cells.
cleanf = clean_lines(sentences)
filename = 'English.pkl'
# `with` guarantees the pickle file is flushed and closed even if dump() fails.
with open(filename, 'wb') as outfile:
    pickle.dump(cleanf, outfile)
print(filename, 'saved')


English data: sentences=1909115, min=0, max=668
English.pkl saved


In [7]:
# Load the Italian file, split it into sentences and report basic statistics.
filename = 'europarl-v7.it-en.it'
doc = load_docum(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lenghts(sentences)
print('Italian data: sentences=%d, min=%d, max=%d' % (len(sentences),
minlen, maxlen))
# Normalise every sentence and persist the cleaned list for the later cells.
cleanf = clean_lines(sentences)
filename = 'Italian.pkl'
# `with` guarantees the pickle file is flushed and closed even if dump() fails.
with open(filename, 'wb') as outfile:
    pickle.dump(cleanf, outfile)
print(filename, 'saved')


Italian data: sentences=1909115, min=0, max=558
Italian.pkl saved


In [8]:
import warnings
# Silence all Python warnings from this point on.
# NOTE(review): this also hides deprecation and pandas copy warnings raised
# by later cells — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [9]:
from pickle import load
from pickle import dump
from collections import Counter 
def load_clean_sentences(filename):
    """Load a pickled object (here: a list of cleaned sentences) from disk.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The unpickled object.
    """
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # this notebook (or another trusted source) produced.
    # The context manager closes the handle, which the bare open() never did.
    with open(filename, 'rb') as handle:
        return load(handle)
def save_clean_sentences(sentences, filename):
    """Pickle ``sentences`` to ``filename`` and print a confirmation line.

    Args:
        sentences: The object to serialise (here: a list of sentences).
        filename: Destination path; an existing file is overwritten.
    """
    # Closing the file explicitly ensures the data is flushed to disk before
    # the confirmation is printed; the bare open() relied on garbage collection.
    with open(filename, 'wb') as handle:
        dump(sentences, handle)
    print('Saved: %s'%filename)
    

In [10]:
def to_vocab(lines):
    """Count how often each whitespace-separated token occurs across lines.

    Args:
        lines: Iterable of sentence strings.

    Returns:
        A ``Counter`` mapping token -> number of occurrences.
    """
    return Counter(token for sentence in lines for token in sentence.split())

In [11]:
def trim_vocab(vocab, min_occurrence):
    """Keep only the tokens that occur at least ``min_occurrence`` times.

    Args:
        vocab: Mapping of token -> count.
        min_occurrence: Inclusive lower bound on the count.

    Returns:
        A set of the retained tokens.
    """
    return {token for token, count in vocab.items() if count >= min_occurrence}

In [12]:
def update_dataset(lines, vocab):
    """Replace every token that is not in ``vocab`` with the marker 'unkn'.

    Args:
        lines: Iterable of sentence strings.
        vocab: Container of known tokens (membership is tested with ``in``).

    Returns:
        A list of sentences with out-of-vocabulary tokens replaced, tokens
        re-joined by single spaces.
    """
    return [
        ' '.join(word if word in vocab else 'unkn' for word in sentence.split())
        for sentence in lines
    ]


In [13]:
# Load the cleaned English sentences pickled by the earlier cleaning cell.
filename = 'English.pkl'
lines1 = load_clean_sentences(filename)
# Count every token, then keep only tokens seen at least 5 times.
vocab = to_vocab(lines1)
print('English Vocabulary: %d' % len(vocab))
vocab = trim_vocab(vocab, 5)
print('New English Vocabulary: %d' % len(vocab))
# Replace out-of-vocabulary tokens with 'unkn' and save the rewritten
# sentences (despite its name, the file holds sentences, not the vocabulary).
lines1 = update_dataset(lines1, vocab)
filename = 'english_vocab.pkl'
save_clean_sentences(lines1, filename)
# Preview the first ten processed sentences.
for i in range(10):
    print("line",i,":",lines1[i])

English Vocabulary: 104655
New English Vocabulary: 41564
Saved: english_vocab.pkl
line 0 : resumption of the session
line 1 : i declare resumed the session of the european parliament adjourned on friday december and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period
line 2 : although as you will have seen the dreaded millennium bug failed to materialise still the people in a number of countries suffered a series of natural disasters that truly were dreadful
line 3 : you have requested a debate on this subject in the course of the next few days during this partsession
line 4 : in the meantime i should like to observe a minute s silence as a number of members have requested on behalf of all the victims concerned particularly those of the terrible storms in the various countries of the european union
line 5 : please rise then for this minute s silence
line 6 : the house rose and observed a minute s silence
line 7 : madam president o

In [14]:
# Load the cleaned Italian sentences pickled by the earlier cleaning cell.
filename = 'Italian.pkl'
lines = load_clean_sentences(filename)
# Count every token, then keep only tokens seen at least 5 times.
vocab = to_vocab(lines)
print('Italian Vocabulary: %d' % len(vocab))
vocab = trim_vocab(vocab, 5)
print('New Italian Vocabulary: %d' % len(vocab))
# Replace out-of-vocabulary tokens with 'unkn' and save the rewritten
# sentences (despite its name, the file holds sentences, not the vocabulary).
lines = update_dataset(lines, vocab)
filename = 'italian_vocab.pkl'
save_clean_sentences(lines, filename)
# Preview the first ten processed sentences.
for i in range(10):
    print("line",i,":",lines[i])

Italian Vocabulary: 171079
New Italian Vocabulary: 67378
Saved: italian_vocab.pkl
line 0 : ripresa della sessione
line 1 : dichiaro ripresa la sessione del parlamento europeo interrotta venerdi dicembre e rinnovo a tutti i miei migliori auguri nella speranza che abbiate trascorso delle buone vacanze
line 2 : come avrete avuto modo di constatare il grande baco del millennio non si e materializzato invece i cittadini di alcuni nostri paesi sono stati colpiti da catastrofi naturali di proporzioni davvero terribili
line 3 : avete chiesto che si tenesse una discussione su tale tema nei prossimi giorni nel corso della presente tornata
line 4 : nel frattempo e mio desiderio come del resto mi e stato chiesto da alcuni colleghi osservare un minuto di silenzio in memoria di tutte le vittime delle tempeste che si sono abbattute sui diversi paesi dell unione europea
line 5 : vi invito pertanto ad alzarvi in piedi per osservare appunto un minuto di silenzio
line 6 : il parlamento osserva un minuto 

In [15]:
import pandas as pd

In [16]:
# Pair each English sentence with the Italian sentence at the same position
# and keep only the first 5000 pairs.
df = pd.DataFrame({'eng': lines1, 'it': lines})
# .copy() detaches the subset from the full frame, so later in-place edits
# act on an independent object rather than on a slice (writing to a slice
# raises SettingWithCopyWarning and is unreliable under copy-on-write).
df = df[:5000].copy()
df

Unnamed: 0,eng,it
0,resumption of the session,ripresa della sessione
1,i declare resumed the session of the european ...,dichiaro ripresa la sessione del parlamento eu...
2,although as you will have seen the dreaded mil...,come avrete avuto modo di constatare il grande...
3,you have requested a debate on this subject in...,avete chiesto che si tenesse una discussione s...
4,in the meantime i should like to observe a min...,nel frattempo e mio desiderio come del resto m...
...,...,...
4995,madame president the people of austria have sp...,signora presidente il popolo austriaco si e es...
4996,i believe they gave him of the vote so there i...,credo che egli abbia ottenuto il percento dei ...
4997,your parliament may i suggest madame president...,il suo parlamento signora presidente dovrebbe ...
4998,then and only then might you consider if human...,allora e soltanto allora sara possibile decide...


In [17]:
# Cap both columns at 128 characters per sentence.
# Vectorised .str slicing replaces the per-row chained assignment
# (df['eng'][i] = ...), which raises SettingWithCopyWarning and has no effect
# under pandas copy-on-write; it also removes the hard-coded row count, so the
# cell works for frames with fewer than 5000 rows.
df = df.assign(eng=df['eng'].str[:128], it=df['it'].str[:128])
        

In [18]:
# Wrap each sentence pair as {'translation': {'eng': ..., 'it': ...}}, the
# record layout that preprocess_function reads.
# Iterating the column values directly avoids chained label lookups
# (df['eng'][i]) and the hard-coded range(5000), which raised KeyError when
# the frame held fewer rows.
mc = [
    {'translation': {'eng': eng_text, 'it': it_text}}
    for eng_text, it_text in zip(df['eng'], df['it'])
]
ds = pd.DataFrame(mc)

In [19]:
# Display the single-column frame of {'eng': ..., 'it': ...} records.
ds

Unnamed: 0,translation
0,"{'eng': 'resumption of the session', 'it': 'ri..."
1,{'eng': 'i declare resumed the session of the ...
2,{'eng': 'although as you will have seen the dr...
3,{'eng': 'you have requested a debate on this s...
4,{'eng': 'in the meantime i should like to obse...
...,...
4995,{'eng': 'madame president the people of austri...
4996,{'eng': 'i believe they gave him of the vote s...
4997,{'eng': 'your parliament may i suggest madame ...
4998,{'eng': 'then and only then might you consider...


In [20]:
# Create the Hugging Face dataset and a reproducible 90/10 train/test split.
import datasets
from datasets import Dataset

dataset_2 = Dataset.from_pandas(ds)
# A fixed seed makes the shuffle-and-split identical across kernel restarts,
# so evaluation scores stay comparable between runs.
split_2 = dataset_2.train_test_split(test_size=0.1, seed=42)
# Index the splits by name instead of relying on the order of .values().
train_dataset_2 = split_2["train"]
validation_dataset_2 = split_2["test"]
dss = datasets.DatasetDict({"train": train_dataset_2, "test": validation_dataset_2})

In [21]:
from transformers import AutoTokenizer 
# Tokenizer for the same checkpoint name that is loaded as the model below.
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-tc-big-en-it")
# Text prepended to every source sentence in preprocess_function.
# NOTE(review): task prefixes like this are a T5 convention; presumably this
# OPUS-MT checkpoint does not expect one — confirm and consider prefix = "".
prefix = "translate English to Italian: "
# Token limits applied (with truncation) to source and target sequences.
max_input_length = 128
max_target_length = 128
# Keys of the per-example "translation" dict.
source_lang = "eng"
target_lang = "it"

def preprocess_function(examples):
    """Tokenize a batch of translation pairs into model inputs and labels.

    Args:
        examples: Batch with a "translation" column; each entry is a dict
            keyed by ``source_lang`` and ``target_lang``.

    Returns:
        The tokenized source batch with an added "labels" entry holding the
        token ids of the tokenized targets.
    """
    inputs = [prefix + ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # `text_target=` tokenizes in target-language mode; it replaces the
    # deprecated `with tokenizer.as_target_tokenizer():` context manager.
    # A separate call keeps the independent max_target_length limit.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [22]:
# Tokenize every example of both splits, passing batches to preprocess_function.
tokenized_datasets = dss.map(preprocess_function, batched=True)
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
# Load the pretrained English->Italian checkpoint that is fine-tuned below.
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-tc-big-en-it")

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

2024-07-05 19:46:36.930793: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-05 19:46:36.933675: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-05 19:46:36.942895: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-05 19:46:36.960716: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-05 19:46:36.960741: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-05 19:46:36.972692: I tensorflow/core/platform/cpu_feature_guard.cc:

In [30]:
# Per-device batch size shared by training and evaluation.
batch_size = 3

# Fine-tuning configuration: evaluate once per epoch and generate full
# translations during evaluation so that compute_metrics can score them.
args = Seq2SeqTrainingArguments(
    output_dir="./results",
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy = "epoch",
    learning_rate=0.00001,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.001,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    # Mixed precision; requires a GPU.
    fp16=True,
    # Disable external logging integrations explicitly. Previously this relied
    # on the WANDB_DISABLED environment variable, which is deprecated and is
    # only set in a later cell, so a fresh Restart & Run All did not see it.
    report_to="none",
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [31]:
from transformers import DataCollatorForSeq2Seq
# Collator handed to the trainer for assembling batches from tokenized
# examples; built from the tokenizer and the model being fine-tuned.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
import numpy as np

def post_text(preds, labels):
    """Strip whitespace from decoded texts and wrap each label in a list.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        ``(preds, labels)`` where ``preds`` is a list of stripped strings and
        ``labels`` is a list of one-element lists of stripped strings.
    """
    stripped_preds = []
    for pred in preds:
        stripped_preds.append(pred.strip())

    wrapped_labels = []
    for label in labels:
        # One reference per prediction, each in its own list.
        wrapped_labels.append([label.strip()])

    return stripped_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Score generated translations: BLEU plus mean generated length.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays. ``preds``
            may be a tuple, in which case its first element is used.

    Returns:
        Dict with "bleu" (the metric's "score") and "gen_len" (mean number of
        non-pad tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is not a valid token id, so it must be replaced by the pad id
    # before decoding. The original did this for labels only; predictions are
    # presumably padded with -100 as well in some library versions, which
    # makes batch_decode fail, so they get the same treatment.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = post_text(decoded_preds, decoded_labels)

    # `metric` is a notebook-level global assigned in the cell that starts training.
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Generated length = number of non-pad tokens in each prediction.
    dict_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(dict_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [32]:
# Wire the model, training arguments, tokenized splits, collator, tokenizer
# and metric function into a Seq2SeqTrainer.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [33]:
import os
# Turn off Weights & Biases logging through the environment.
# NOTE(review): this cell runs after the training-arguments and trainer cells;
# presumably the variable is read when those objects are built, so on a fresh
# kernel it may be set too late — confirm, and note the library output above
# already reports this variable as deprecated.
os.environ['WANDB_DISABLED'] = 'true'

In [34]:
from datasets import load_dataset, load_metric
# Global metric object read by compute_metrics during evaluation.
# NOTE(review): load_metric is presumably deprecated in recent `datasets`
# releases in favour of the separate `evaluate` package — confirm against the
# installed version before upgrading.
metric = load_metric("sacrebleu")
# Run fine-tuning; evaluation happens once per epoch as configured in `args`.
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,1.9657,1.969931,22.4787,19.472
2,1.5669,1.979759,22.2226,19.474
3,1.3563,1.998489,22.6706,19.344
4,1.2645,2.013925,22.4827,19.388


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[54421]], 'forced_eos_token_id': 43017}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[54421]], 'forced_eos_token_id': 43017}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[54421]], 'forced_eos_token_id': 43017}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[54421]], 'forced_eos_token_id': 43017}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[54421]], 'forced_eos_token_id': 43017}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[54421]], 'forced_eos_token_id': 43017}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[54421]], 'forced_eos_token_id': 43017}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[54421]]

TrainOutput(global_step=6000, training_loss=1.5139954528808595, metrics={'train_runtime': 1207.8699, 'train_samples_per_second': 14.902, 'train_steps_per_second': 4.967, 'total_flos': 593387194023936.0, 'train_loss': 1.5139954528808595, 'epoch': 4.0})

In [35]:
# Serialise the fine-tuned model. The context manager flushes and closes the
# file deterministically; the bare open() left the handle to the garbage collector.
# NOTE(review): pickling a whole model object presumably ties the file to the
# exact library versions; model.save_pretrained(...) would be the portable
# alternative. Pickle is kept because the next cell reads 'model_mt.pkl'.
with open('model_mt.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [36]:
import pandas as pd

# Reload the pickled model with pickle itself; pandas' read_pickle is meant
# for pandas objects and only obscures what is being loaded here.
# SECURITY: unpickling executes arbitrary code — only load trusted files.
with open(r'model_mt.pkl', 'rb') as model_file:
    model_3 = pickle.load(model_file)

In [37]:
import pickle
# Persist the tokenized train/test splits.
with open('eng_it_data.pkl', 'wb') as pickle_file:
    pickle.dump(tokenized_datasets, pickle_file)

In [38]:
# Persist the train/test splits as they were before tokenization.
with open('eng_it_data_nettokenazira.pkl', 'wb') as pickle_file:
    pickle.dump(dss, pickle_file)