In [30]:
import pandas as pd

training_df = pd.read_csv('C:/Users/kaczm/OneDrive/Pulpit/Abbr_env_v2/training_df_v2.csv')

test_df = pd.read_csv('C:/Users/kaczm/OneDrive/Pulpit/Abbr_env_v2/test_df.csv')

In [31]:
def extract_ngrams_around_mask(context, n=4):
    words = context.split()
    try:
        mask_index = words.index('<mask>')
        
        # Calculate start and end indices for n-grams
        half_n = n // 2
        start = max(mask_index - half_n, 0)
        end = min(mask_index + half_n + 1, len(words))

        ngrams = words[start:end]

        # Remove '<mask>' from the n-grams
        ngrams = [word for word in ngrams if word != '<mask>']

        return ' '.join(ngrams)
    except ValueError:
        # Return an empty string or some default value if '<mask>' is not found
        return ''


In [32]:
from collections import Counter

def get_all_ngrams(contexts, n=4):
    ngram_list = []
    for context in contexts:
        ngrams = extract_ngrams_around_mask(context, n)
        if ngrams:
            ngram_list.extend(ngrams.split())
    return ngram_list

# Get a list of all n-grams
all_ngrams = get_all_ngrams(training_df['context'], n=4)

# Count frequencies
ngram_frequencies = Counter(all_ngrams)

# Determine a frequency threshold
frequency_threshold = 5 

In [33]:
training_df['ngrams'] = training_df['context'].apply(lambda x: extract_ngrams_around_mask(x, n=4))


In [34]:
def extract_frequent_ngrams_around_mask(context, ngram_frequencies, n=4, threshold=5):
    words = context.split()
    try:
        mask_index = words.index('<mask>')
        
        # Calculate start and end indices for n-grams
        half_n = n // 2
        start = max(mask_index - half_n, 0)
        end = min(mask_index + half_n + 1, len(words))

        # Extract n-grams and filter based on frequency
        ngrams = [word for word in words[start:end] if word != '<mask>' and ngram_frequencies[word] >= threshold]

        return ' '.join(ngrams)
    except ValueError:
        return ''

# Apply the function to the DataFrame
training_df['frequent_ngrams'] = training_df['context'].apply(lambda x: extract_frequent_ngrams_around_mask(x, ngram_frequencies, n=4, threshold=frequency_threshold))


# Ok having taken a look I guess ngrams are not the best approach here

In [35]:
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("allegro/plT5-small")
model = AutoModel.from_pretrained("allegro/plT5-small")

In [36]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
from torch.utils.data import Dataset, DataLoader

In [43]:
from torch.utils.data import Dataset
import torch

class AbbreviationDataset(Dataset):
    def __init__(self, tokenizer, contexts, abbreviations, targets, clusters, max_len):
        self.tokenizer = tokenizer
        self.contexts = contexts
        self.abbreviations = abbreviations
        self.targets = targets
        self.clusters = clusters
        self.max_len = max_len

    def __len__(self):
        return len(self.contexts)

    def __getitem__(self, idx):
        context = self.contexts[idx]
        abbreviation = self.abbreviations[idx]
        target = self.targets[idx]
        cluster = self.clusters[idx]

        input_text = f"context: {context} abbreviation: {abbreviation} cluster: {cluster}"
        target_text = target

        # Tokenize input and target texts
        input_encoding = self.tokenizer(
            input_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )
        target_encoding = self.tokenizer(
            target_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

        # Prepare decoder_input_ids
        # Remove the call to .flatten() to keep the tensor 2-dimensional
        decoder_input_ids = self.shift_tokens_right(target_encoding['input_ids'])

        return {
            'input_ids': input_encoding['input_ids'].squeeze(),  # Squeeze to remove extra dimension
            'attention_mask': input_encoding['attention_mask'].squeeze(),  # Same as above
            'decoder_input_ids': decoder_input_ids.squeeze()  # Squeeze to remove extra dimension
        }

    def shift_tokens_right(self, input_ids):
        # Shift input ids one token to the right
        shifted_input_ids = input_ids.new_zeros(input_ids.shape)
        shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
        shifted_input_ids[:, 0] = self.tokenizer.pad_token_id
        return shifted_input_ids



In [44]:
contexts = training_df['context']
abbreviations = training_df['abbreviation']
targets =  training_df['base_abbreviation'] 
clusters = training_df['cluster'] 


max_len = 512

dataset = AbbreviationDataset(tokenizer, contexts, abbreviations, targets, clusters, max_len)
data_loader = DataLoader(dataset, batch_size=32)

In [49]:
num_epochs = 3
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)


for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in data_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['decoder_input_ids']  # These are the shifted right target sequences

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, 
                        attention_mask=attention_mask, 
                        labels=labels)

        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {total_loss / len(data_loader)}")



TypeError: T5Model.forward() got an unexpected keyword argument 'labels'

In [None]:
model_save_path = 'plt5_model.pth'

# After training loop
torch.save(model.state_dict(), model_save_path)