In [1]:
from transformers import MT5ForConditionalGeneration
import pandas as pd 
import transformers

model = MT5ForConditionalGeneration.from_pretrained('google/mt5-base')
# Define your training loop, loss function, optimizer, etc., for fine-tuning

training_df = pd.read_csv('C:/Users/kaczm/OneDrive/Pulpit/Abbr_env/training_df.csv')


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import MT5Tokenizer
import json

def create_expected_df(data_path):

    df = pd.read_csv(data_path, sep='\t', header=None, names=['expanded_abbreviation', 'base_abbreviation'])

    return df
   

def create_in_df(data_path_in, data_path_out):

    df = pd.read_csv(data_path_in, sep='\t', header=None, names=['abbreviation', 'context'])

    out_df = create_expected_df(data_path_out)

    df = pd.merge(df, out_df, left_index=True, right_index=True)

    tokenizer = MT5Tokenizer.from_pretrained('google/mt5-large')  

    def tokenize_data(context, expanded_abbreviation):
        tokenized_input = tokenizer.encode(context, return_tensors='pt', max_length=512, truncation=True)
        tokenized_target = tokenizer.encode(expanded_abbreviation, return_tensors='pt', max_length=512, truncation=True)

        # Convert tensors to lists and then to JSON strings
        json_input = json.dumps(tokenized_input.tolist()[0])  # [0] to remove batch dimension
        json_target = json.dumps(tokenized_target.tolist()[0])
        return json_input, json_target

     
    df['tokenized_input'], df['tokenized_target'] = zip(*df.apply(lambda x: tokenize_data(x['context'], x['expanded_abbreviation']), axis=1))

    return df

In [33]:
tokenizer = MT5Tokenizer.from_pretrained('google/mt5-large')

def tokenize_and_update(row):
    
    tokenized_input = tokenizer.encode(row['context'], return_tensors='pt', max_length=512, truncation=True)
    tokenized_target = tokenizer.encode(row['expanded_abbreviation'], return_tensors='pt', max_length=512, truncation=True)

    # Convert tensors to lists and then to JSON strings
    row['tokenized_input'] = json.dumps(tokenized_input.tolist()[0])  # [0] to remove batch dimension
    row['tokenized_target'] = json.dumps(tokenized_target.tolist()[0])
    return row

# Apply the function to each row
training_df = training_df.apply(tokenize_and_update, axis=1)



In [34]:
from torch.utils.data import Dataset
import ast
import torch


class AbbreviationDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        item = self.dataframe.iloc[idx]
        input_ids = torch.tensor(json.loads(item['tokenized_input']))
        labels = torch.tensor(json.loads(item['tokenized_target']))
        return {'input_ids': input_ids, 'labels': labels}


In [3]:
import sentencepiece

val_dataset = create_in_df('C:/Users/kaczm/OneDrive/Pulpit/Abbr_env/in_validation.tsv', 'C:/Users/kaczm/OneDrive/Pulpit/Abbr_env/expected_validation.tsv')

test_dataset_original = pd.concat([
    create_in_df("C:/Users/kaczm/OneDrive/Pulpit/Abbr_env/in_test_a.tsv", "C:/Users/kaczm/OneDrive/Pulpit/Abbr_env/expected_test_a.tsv"),
    create_in_df("C:/Users/kaczm/OneDrive/Pulpit/Abbr_env/in_test_b.tsv", "C:/Users/kaczm/OneDrive/Pulpit/Abbr_env/expected_test_b.tsv")
])



You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [37]:
train_dataset = AbbreviationDataset(training_df)
val_dataset = AbbreviationDataset(val_dataset)  
test_dataset = AbbreviationDataset(test_dataset_original)  

In [7]:
print(training_df['tokenized_input'].iloc[0])
print(training_df['tokenized_target'].iloc[0])

[259, 17093, 18740, 259, 107669, 259, 110416, 259, 266, 259, 144390, 259, 268, 65348, 276, 259, 263, 210001, 268, 104990, 3407, 78718, 650, 259, 268, 485, 5417, 14068, 160987, 1042, 55114, 669, 3105, 3598, 400, 2946, 265, 716, 455, 4430, 259, 202425, 59353, 11525, 9394, 259, 266, 259, 337, 89793, 25177, 38260, 38893, 414, 10207, 662, 259, 72512, 262, 1]
[259, 197642, 1]


In [16]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    # Separate the input_ids and labels
    input_ids = [item['input_ids'] for item in batch]
    labels = [item['labels'] for item in batch]

    # Pad the sequences to the maximum length in the batch
    input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=tokenizer.pad_token_id)

    return {'input_ids': input_ids_padded, 'labels': labels_padded}


train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)


In [9]:
import torch
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup



# Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 10
total_steps = len(train_loader) * num_epochs
warmup_steps = int(total_steps * 0.1)

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

# Variables for tracking progress
best_val_loss = float('inf')
early_stopping_counter = 0
early_stopping_limit = 3  # Stop if no improvement after 3 consecutive epochs

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs} - Training loss: {avg_train_loss}")

    # Validation step
    model.eval()
    total_val_loss = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs.loss

            total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(val_loader)
    print(f"Epoch {epoch+1}/{num_epochs} - Validation loss: {avg_val_loss}")

    # Save the best model
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        model.save_pretrained('path_to_save_best_model')
        early_stopping_counter = 0
    else:
        early_stopping_counter += 1

    if early_stopping_counter >= early_stopping_limit:
        print("Early stopping triggered")
        break

# Load the best model for further use or inference
model = MT5ForConditionalGeneration.from_pretrained('path_to_save_best_model')



Epoch 1/10 - Training loss: 29.221603108875787
Epoch 1/10 - Validation loss: 22.359008026123046
Epoch 2/10 - Training loss: 19.09498912896683
Epoch 2/10 - Validation loss: 13.458402919769288
Epoch 3/10 - Training loss: 11.145802764750238
Epoch 3/10 - Validation loss: 6.284316539764404
Epoch 4/10 - Training loss: 6.357855243469352
Epoch 4/10 - Validation loss: 3.951567530632019
Epoch 5/10 - Training loss: 4.854924235770952
Epoch 5/10 - Validation loss: 3.319741439819336
Epoch 6/10 - Training loss: 4.221307989376695
Epoch 6/10 - Validation loss: 3.0302732467651365
Epoch 7/10 - Training loss: 3.870023554830409
Epoch 7/10 - Validation loss: 2.8029598951339723
Epoch 8/10 - Training loss: 3.7244830291662643
Epoch 8/10 - Validation loss: 2.677043151855469
Epoch 9/10 - Training loss: 3.549687568821124
Epoch 9/10 - Validation loss: 2.6325679779052735
Epoch 10/10 - Training loss: 3.583093061375974
Epoch 10/10 - Validation loss: 2.591477608680725


In [14]:
model_path = 'C:/Users/kaczm/OneDrive/Pulpit/Abbr_env/path_to_save_best_model'
model = MT5ForConditionalGeneration.from_pretrained(model_path)
model.eval()  # Set the model to evaluation mode

MT5ForConditionalGeneration(
  (shared): Embedding(250112, 768)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 768)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
         

In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


MT5ForConditionalGeneration(
  (shared): Embedding(250112, 768)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 768)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
         

In [17]:
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

In [20]:
# Initialize variables to track the number of correct predictions and total predictions
correct_predictions = 0
total_predictions = 0

# Evaluate the model
for batch in test_loader:
    input_ids = batch['input_ids'].to(device)
    labels = batch['labels'].to(device)

    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, attention_mask=input_ids.ne(0))

    # Decode outputs and labels to text
    output_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    label_texts = [tokenizer.decode(label, skip_special_tokens=True) for label in labels]

    # Compare output_texts with label_texts and calculate accuracy
    for output_text, label_text in zip(output_texts, label_texts):
        if output_text == label_text:
            correct_predictions += 1
        total_predictions += 1

# Calculate and print the overall accuracy
accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
print(f'Accuracy of expanded abbreviations (Af): {accuracy:.4f}')




Accuracy of expanded abbreviations (Af): 0.0000


In [39]:
# Generate predictions and decode them to text
predicted_expansions = []
for batch in test_loader:
    input_ids = batch['input_ids'].to(device)

    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, attention_mask=input_ids.ne(0))

    predicted_batch = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    predicted_expansions.extend(predicted_batch)

# Add the predictions to the original test DataFrame
test_dataset_original['predicted_expanded_abbreviation'] = predicted_expansions



In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AdamW
from torch.utils.data import Dataset, DataLoader


torch.cuda.empty_cache()


# Function to move batches to a device
def to_device(batch, device):
    return {k: v.to(device) for k, v in batch.items()}

# Load and preprocess data
def preprocess_data(df):
    df['input_text'] = "expand abbreviation: " + df['abbreviation'] + " in context: " + df['context']
    df['target_text'] = df['expanded_abbreviation']  
    return df

# Load the CSV files
training_df = pd.read_csv('C:/Users/kaczm/OneDrive/Pulpit/Abbr_env_v2/training_df.csv')
test_df = pd.read_csv('C:/Users/kaczm/OneDrive/Pulpit/Abbr_env_v2/test_df.csv')

# Preprocess the data
training_df = preprocess_data(training_df)
test_df = preprocess_data(test_df)


# Define a custom dataset
class AbbreviationDataset(Dataset):
    def __init__(self, tokenizer, data, max_length=512):
        self.tokenizer = tokenizer
        self.inputs = data['input_text']
        self.targets = data['target_text']
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_text = self.inputs[idx]
        target_text = self.targets[idx]

        # Tokenize input and target texts
        input_tokens = self.tokenizer.encode_plus(input_text, max_length=self.max_length, truncation=True, padding='max_length', return_tensors='pt')
        target_tokens = self.tokenizer.encode_plus(target_text, max_length=self.max_length, truncation=True, padding='max_length', return_tensors='pt')

        # Combine input and target tokens into one dictionary
        return {**input_tokens, 'labels': target_tokens['input_ids']}

# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")  
model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    print('CUDA being used') 
model.to(device)

# Create Dataset and DataLoader for training and test data
train_dataset = AbbreviationDataset(tokenizer, training_df)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

test_dataset = AbbreviationDataset(tokenizer, test_df)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Training loop
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        # Move batch to the appropriate device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Unpack the input and labels from the batch
        input_ids = batch['input_ids'].squeeze(1)
        labels = batch['labels'].squeeze(1)

        # Forward pass
        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

    # Evaluate on test data
    model.eval()
    with torch.no_grad():
        total_loss = 0
        for batch in test_loader:
            # Move batch to the appropriate device
            batch = to_device(batch, device)

            # Unpack the input and labels from the batch
            input_ids = batch['input_ids'].squeeze(1)
            labels = batch['labels'].squeeze(1)

            # Forward pass
            outputs = model(input_ids=input_ids, labels=labels)
            total_loss += outputs.loss.item()

        avg_loss = total_loss / len(test_loader)
        print(f"Test Loss after Epoch {epoch+1}: {avg_loss}")


  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


CUDA being used




Epoch 1/5, Loss: 0.2966515123844147
Test Loss after Epoch 1: 0.14322816643591482
Epoch 2/5, Loss: 0.16756057739257812
Test Loss after Epoch 2: 0.098079183089165
Epoch 3/5, Loss: 0.4180033206939697
Test Loss after Epoch 3: 0.07183858133001511
Epoch 4/5, Loss: 0.08358603715896606
Test Loss after Epoch 4: 0.058316460531281464
Epoch 5/5, Loss: 0.03675549104809761
Test Loss after Epoch 5: 0.05246976653271605


In [2]:
model_save_path = 'final_model'
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")

Model saved to final_model


In [4]:
# Load the trained model (if not already in memory)
model.load_state_dict(torch.load('final_model'))
model.eval()

# Variables to store predictions and actual values
predictions = []
actuals = []

# Iterate over test data
for batch in test_loader:
    # Move batch to the appropriate device
    batch = to_device(batch, device)

    # Forward pass without gradient calculation
    with torch.no_grad():
        input_ids = batch['input_ids'].squeeze(1)
        labels = batch['labels'].squeeze(1)

        outputs = model.generate(input_ids=input_ids)
        
        # Convert outputs and labels to lists for evaluation
        pred_list = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outputs]
        labels_list = [tokenizer.decode(ids, skip_special_tokens=True) for ids in labels]

        predictions.extend(pred_list)
        actuals.extend(labels_list)

# Now 'predictions' and 'actuals' contain the model's predictions and the true labels, respectively.
# You can use these lists to calculate accuracy or other metrics.




In [5]:
predictions

['<extra_id_0> Guillaume.',
 '<extra_id_0>.',
 '<extra_id_0> s. 16). s. 16',
 '<extra_id_0> Marius Lindvik - 129',
 '<extra_id_0> służy pielęgnacji i ',
 '<extra_id_0> -',
 '<extra_id_0> o możliwości PO i PO',
 '<extra_id_0>. ) o. o. o.',
 '<extra_id_0> ) o.o.',
 '<extra_id_0> drogi o krawędzi drogi o',
 '<extra_id_0> - g. g. g. g. g.',
 '<extra_id_0> ul.. <extra_id_14> ul..',
 '<extra_id_0> o.',
 '<extra_id_0> utrzymania.',
 '<extra_id_0> o nazwiska o nazwisku o nazwisku o ',
 '<extra_id_0> (in context: )',
 '<extra_id_0> -',
 '<extra_id_0> zmian. s.',
 '<extra_id_0>.',
 '<extra_id_0> - undefined:',
 '<extra_id_0>?',
 '<extra_id_0> o. o. o.',
 '<extra_id_0> -',
 '<extra_id_0> i.',
 '<extra_id_0> -',
 '<extra_id_0> WZ -',
 '<extra_id_0> usługi usługi usługi usługi ',
 '<extra_id_0> znaczenia znaczenia obecności serii.',
 '<extra_id_0>.',
 '<extra_id_0> o nazwisku o nazwisku o nazwisku o ',
 '<extra_id_0>.',
 '<extra_id_0> u. u. u. u. u. u.',
 '<extra_id_0> - n.',
 '<extra_id_0> -',
 '<

In [6]:
actuals

['Gil.',
 'wewnętrzny',
 'sobota',
 'metrów',
 'age.',
 'godzina',
 'byłego',
 'ojciec',
 'F. spółka',
 'metrów',
 'godzina',
 'piątek telefon',
 'ojciec',
 'godzina',
 'WiR.',
 'środa',
 'sobota',
 'punktów',
 'wiersz; werset',
 'cesarsko-królewskiej',
 'Nic. Zero.',
 'ojciec',
 'asysta',
 'byłego',
 'jeziora',
 'pani',
 'litr',
 'metrów',
 'językiem',
 'województwie',
 'gać.',
 'ulica',
 'niedziela',
 'metra',
 'imienia biskupa',
 'pracy',
 'pan',
 'pani',
 'pana',
 'wieku',
 'bok.',
 'Kapitan',
 'bo.',
 'sobota',
 'pani',
 'sobota',
 'punktów',
 'godzina',
 'log.',
 'mieście',
 'ojca',
 'matka',
 'wieku',
 'sobota',
 'bramka',
 'była',
 'Cow.',
 'stronach',
 'miasta',
 'pic.',
 'miasto',
 'wieczny',
 'niedziela',
 'dogrywka',
 'pan',
 'wieku',
 'ojciec',
 'a.',
 'koń.',
 'metrów',
 'były',
 'ojciec',
 'imienia',
 'godzinę',
 'baryłkę',
 'były',
 'były',
 'ojców',
 'grupie C.',
 'sobota',
 'waga',
 'łez.',
 'firmą',
 'dziennie',
 'sekundy',
 'sobota',
 'niedziela',
 'reżyseria',
 'to

In [2]:
torch.cuda.empty_cache()