In [2]:
import pandas as pd

training_df = pd.read_csv('C:/Users/kaczm/OneDrive/Pulpit/Abbr_env_v2/training_df_v2.csv')

In [4]:
import torch
from transformers import BertTokenizer, BertForMaskedLM


# Step 1: Load the BERT Model
tokenizer = BertTokenizer.from_pretrained("dkleczek/bert-base-polish-cased-v1")
model = BertForMaskedLM.from_pretrained("dkleczek/bert-base-polish-cased-v1")


  from .autonotebook import tqdm as notebook_tqdm
pytorch_model.bin: 100%|██████████| 531M/531M [00:16<00:00, 32.9MB/s] 


In [7]:
# Create Dictionaries

base_abb_dict = {}

for ab in training_df['abbreviation'].unique():
    # Get all unique base_abbreviations for this abbreviation
    base_abbs = training_df[training_df['abbreviation'] == ab]['base_abbreviation'].unique()
    
    # Get all unique values from potential_expansions arrays for this abbreviation
    potential_expansions = training_df[training_df['abbreviation'] == ab]['potential_expansions']
    unique_expansions = set().union(*potential_expansions)  # Flatten and get unique values
    
    # Merge base abbreviations and expansions
    base_abb_dict[ab] = set(base_abbs).union(unique_expansions)


base_abb_dict


{'l.': {' ',
  "'",
  ',',
  '.',
  'L',
  '[',
  ']',
  'a',
  'i',
  'l',
  'l.',
  'liczba',
  'linia',
  'litr',
  'lokal',
  'r',
  'rok',
  't'},
 's.': {' ',
  "'",
  ',',
  '.',
  'S',
  '[',
  ']',
  'a',
  'd',
  'do spraw',
  'e',
  'i',
  'k',
  'n',
  'o',
  'r',
  's',
  's.',
  'sala',
  'samobójcza',
  'sektor',
  'sekunda',
  'seria',
  'siedziba',
  'silnik',
  'siostra',
  'sobota',
  'societe',
  'sprawa',
  'stopień',
  'strona',
  'strony',
  'syn',
  'sędzia',
  'sędzia; sędziować',
  't',
  'u',
  'y'},
 'Sen.': {' ',
  "'",
  ',',
  '.',
  '1',
  '2',
  '3',
  ':',
  'S',
  'Sen.',
  '[',
  ']',
  'e',
  'f',
  'i',
  'm',
  'n',
  'o',
  'r',
  's',
  'senator'},
 'b.': {' ',
  "'",
  ',',
  '.',
  'B',
  '[',
  ']',
  'a',
  'b',
  'b.',
  'bardziej; bardzo',
  'bardzo',
  'baryłka',
  'bieżący',
  'blues',
  'boisko',
  'brat',
  'budżet',
  'być',
  'były',
  'błogosławiony',
  'c',
  'd',
  'e',
  'i',
  'o',
  'r',
  't',
  'y',
  'z',
  'ą',
  'ł',
  'ż'

In [10]:
import string

def is_punctuation(s):
    return any(c in string.punctuation for c in s)

# Clean the base_abb_dict
cleaned_base_abb_dict = {}

for ab, expansions in base_abb_dict.items():
    cleaned_expansions = {exp for exp in expansions if isinstance(exp, str) and len(exp) > 1 and not is_punctuation(exp)}
    if cleaned_expansions:  # Only add to the cleaned dictionary if there are valid expansions left
        cleaned_base_abb_dict[ab] = cleaned_expansions

# Display the cleaned dictionary
print(cleaned_base_abb_dict)


{'l.': {'liczba', 'lokal', 'linia', 'rok', 'litr'}, 's.': {'stopień', 'sędzia', 'silnik', 'sektor', 'do spraw', 'siedziba', 'samobójcza', 'sala', 'sobota', 'seria', 'strony', 'siostra', 'societe', 'strona', 'sekunda', 'sprawa', 'syn'}, 'Sen.': {'senator'}, 'b.': {'brat', 'były', 'baryłka', 'budżet', 'być', 'boisko', 'bardzo', 'blues', 'bieżący', 'błogosławiony'}, 'm.': {'minuta', 'metr', 'miasto', 'mecz', 'miejscowość', 'mikrogram', 'mężczyzna', 'masa', 'mieszkanie', 'matka', 'miesiąc', 'miejsce', 'most'}, 'w.': {'węzeł', 'kwartał', 'waga', 'wysoki', 'według', 'wersja', 'w sprawie', 'wymieniony', 'wtorek', 'werset', 'wewnętrzny', 'wysokość', 'warsztat', 'wiek', 'wybory', 'województwo', 'wieczny', 'wieś'}, 'o.': {'około', 'ojciec', 'ograniczony', 'oddział'}, 'p.': {'paulownia', 'półrocze', 'przeciwko', 'pan', 'praca', 'poziom', 'page', 'pomoc', 'pokój', 'piątek', 'pułk', 'papież', 'prawny', 'państwo', 'pani', 'przeciw', 'punkt', 'patrzeć', 'procent', 'plac', 'piłka', 'przypis', 'piętro'

In [16]:
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader
import torch


class AbbreviationDataset(Dataset):
    def __init__(self, contexts, abbreviations, labels, tokenizer, max_len):
        self.contexts = contexts
        self.abbreviations = abbreviations
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.contexts)

    def __getitem__(self, item):
        # Combine context and abbreviation
        context_abbr = f"{self.contexts[item]} [SEP] {self.abbreviations[item]}"
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            context_abbr,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )

        return {
            'context_abbr_text': context_abbr,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [18]:
from sklearn.preprocessing import LabelEncoder

# Assuming `base_abbreviation` is a column in your DataFrame
label_encoder = LabelEncoder()
numerical_labels = label_encoder.fit_transform(training_df['base_abbreviation'])



contexts = training_df['context']
abbreviations = training_df['abbreviation']
labels = numerical_labels 
dataset = AbbreviationDataset(contexts, abbreviations, labels, tokenizer, max_len=512)
data_loader = DataLoader(dataset, batch_size=32)

from transformers import BertForSequenceClassification

# Load the pre-trained BERT model
model = BertForSequenceClassification.from_pretrained(
    "dkleczek/bert-base-polish-cased-v1",
    num_labels=len(set(labels))  # number of unique labels
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dkleczek/bert-base-polish-cased-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
from transformers import AdamW
from torch.nn import CrossEntropyLoss

optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = CrossEntropyLoss()

epochs = 3
for epoch in range(epochs):
    model.train()
    for batch in data_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    

In [21]:
model.save_pretrained('model_bert_uncased_v1')


In [23]:
def predict(context, abbreviation, model, tokenizer, label_encoder):
    # Combine context and abbreviation
    context_abbr = f"{context} [SEP] {abbreviation}"

    # Tokenize input
    inputs = tokenizer.encode_plus(
        context_abbr,
        add_special_tokens=True,
        max_length=512,
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )

    # Predict
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        prediction = torch.argmax(outputs.logits, dim=1)

    # Decode the prediction
    predicted_label = label_encoder.inverse_transform(prediction.numpy())[0]

    return predicted_label


prediction = predict("wszystkie kwalifikacje do pełnienia tego urzędu – dodaje Niesiołowski.  Paweł Kowal, <mask> wiceminister spraw zagranicznych ze stowarzyszonej z PiS Polski Razem...", "b.", model, tokenizer, label_encoder)
print("Predicted Base Abbreviation:", prediction)

Predicted Base Abbreviation: były
