In [None]:
def get_expansion(abbreviation):
    abbreviation_lookup = {
        "R&D": "research and development",
        "CPU": "central processing unit",
        "RSVP": "répondez s'il vous plaît",
        "VIP": "very important person",
        "WHO": "World Health Organization",
        "OKRs": "objectives and key results",
        "GPS": "global positioning system",
        "FAQ": "frequently asked questions",
        "ATM": "automated teller machine",
        "USB": "universal serial bus",
        "NDA": "non-disclosure agreement",
        "AI": "artificial intelligence",
        "CCTV": "closed-circuit television",
        "ETA": "estimated time of arrival",
        "PDF": "portable document format",
        "HR": "human resources",
        "IPO": "initial public offering",
        "RAM": "random-access memory",
        "ROM": "read-only memory",
        "CTO": "chief technology officer",
        "ASAP": "as soon as possible",
        "Wi-Fi": "wireless fidelity",
        "SaaS": "software as a service",
        "LAN": "local area network",
        "IT": "information technology",
        "MRI": "magnetic resonance imaging",
        "ICU": "intensive care unit",
        "FY": "fiscal year",
        "BYOD": "bring your own device",
        "POS": "point of sale",
        "NGO": "non-governmental organization",
        "HTTP": "hypertext transfer protocol",
        "HTML": "hypertext markup language",
        "URL": "uniform resource locator",
        "QA": "quality assurance",
        "SARS": "severe acute respiratory syndrome",
        "CSV": "comma-separated values",
        "SQL": "structured query language",
        "API": "application programming interface",
        "DNS": "domain name system",
        "IP": "internet protocol",
        "Bluetooth": "short-range wireless communication"
    }
    return abbreviation_lookup.get(abbreviation, "Unknown abbreviation")
 

In [27]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, AdamW
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn
from tqdm import tqdm
import json

In [28]:
# Function to predict abbreviation expansion
def predict_abbreviation_expansion(sentence: str, abbreviation: str, model_name: str = 'bert-base-uncased') -> str:
    """
    Predict the expansion of an abbreviation in a given sentence.

    Args:
        sentence (str): The input sentence containing the abbreviation to expand.
        abbreviation (str): The abbreviation in the sentence to expand.
        model_name (str): The name of the pretrained model to use (default: 'bert-base-uncased').

    Returns:
        str: The predicted expansion for the abbreviation.
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForMaskedLM.from_pretrained(model_name)
        
        # Replace abbreviation with [MASK]
        masked_sentence = sentence.replace(abbreviation, "[MASK]")
        
        # Tokenize Input
        inputs = tokenizer(masked_sentence, return_tensors="pt")
        
        # Get Predictions
        with torch.no_grad():
            outputs = model(**inputs)
        predictions = outputs.logits
        
        # Find [MASK] token and its predicted token
        masked_index = (inputs['input_ids'] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
        predicted_token_id = predictions[0, masked_index].argmax(dim=1)
        predicted_token = tokenizer.decode(predicted_token_id)
        
        return predicted_token.strip()
    except Exception as e:
        return f"Error during prediction: {str(e)}"


In [29]:
# Dataset class for fine-tuning
class AbbreviationDataset(Dataset):
    def __init__(self, sentences, abbreviations, expansions, tokenizer, max_len=128):
        self.sentences = sentences
        self.abbreviations = abbreviations
        self.expansions = expansions
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx].replace(self.abbreviations[idx], "[MASK]")
        inputs = self.tokenizer(sentence, return_tensors="pt", max_length=self.max_len, padding="max_length", truncation=True)
        label = self.tokenizer(self.expansions[idx], return_tensors="pt", max_length=self.max_len, padding="max_length", truncation=True)
        inputs['labels'] = label['input_ids'].squeeze()
        return {key: val.squeeze() for key, val in inputs.items()}

In [30]:
# Function to fine-tune the model
def fine_tune_model(sentences, abbreviations, expansions, model_name='bert-base-uncased', epochs=3, batch_size=16, lr=5e-5):
    """
    Fine-tune the BERT model for abbreviation expansion.

    Args:
        sentences (list): List of input sentences.
        abbreviations (list): List of abbreviations in the sentences.
        expansions (list): List of correct expansions for the abbreviations.
        model_name (str): Pretrained model name (default: 'bert-base-uncased').
        epochs (int): Number of fine-tuning epochs (default: 3).
        batch_size (int): Batch size for training (default: 16).
        lr (float): Learning rate (default: 5e-5).

    Returns:
        AutoModelForMaskedLM: The fine-tuned model.
    """
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMaskedLM.from_pretrained(model_name)
    model.train()

    # Prepare dataset and dataloader
    dataset = AbbreviationDataset(sentences, abbreviations, expansions, tokenizer)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Optimizer and loss function
    optimizer = AdamW(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()

    # Fine-tuning loop
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    for epoch in range(epochs):
        epoch_loss = 0
        loop = tqdm(dataloader, desc=f"Epoch {epoch + 1}/{epochs}")
        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            loop.set_postfix(loss=loss.item())

        print(f"Epoch {epoch + 1} Loss: {epoch_loss / len(dataloader)}")

    return model

In [31]:
if __name__ == "__main__":

    # Load the dataset
    with open('abbreviation_dataset.json', 'r') as file:
        dataset = json.load(file)

    # Prepare sentences, abbreviations, and expansions
    sentences = [item['sentence'] for item in dataset]
    abbreviations = [item['abbreviation'] for item in dataset]
    expansions = [item['expansion'] for item in dataset]

    print("Sentences:", sentences)
    print("Abbreviations:", abbreviations)
    print("Expansions:", expansions)

    model = 'bert-base-uncased'

    # Fine-tune the model
    fine_tuned_model = fine_tune_model(sentences, abbreviations, expansions, model)

    # Save the model and tokenizer
    fine_tuned_model.save_pretrained("fine_tuned_abbreviation_model")
    tokenizer = AutoTokenizer.from_pretrained(model)
    tokenizer.save_pretrained("fine_tuned_abbreviation_model")


def predict_with_fine_tuned_model(sentence: str, abbreviation: str, model_path: str = "fine_tuned_abbreviation_model"):
    """
    Use the fine-tuned model and tokenizer to predict abbreviation expansion.

    Args:
        sentence (str): The sentence containing the abbreviation.
        abbreviation (str): The abbreviation to expand.
        model_path (str): Path to the directory containing the fine-tuned model and tokenizer.

    Returns:
        str: The predicted expansion for the abbreviation.
    """
    try:
        # Load the fine-tuned model and tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForMaskedLM.from_pretrained(model_path)

        # Replace the abbreviation with [MASK]
        masked_sentence = sentence.replace(abbreviation, "[MASK]")

        # Tokenize Input
        inputs = tokenizer(masked_sentence, return_tensors="pt")

        # Get Predictions
        with torch.no_grad():
            outputs = model(**inputs)
        predictions = outputs.logits

        # Find the [MASK] token and its predicted token
        masked_index = (inputs['input_ids'] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
        predicted_token_id = predictions[0, masked_index].argmax(dim=1)
        predicted_token = tokenizer.decode(predicted_token_id)

        return predicted_token.strip()
    except Exception as e:
        return f"Error during prediction: {str(e)}"


# Example predictions
test_sentence = "The VIP section was reserved for special guests."
test_abbreviation = "VIP"
predicted_expansion = predict_with_fine_tuned_model(test_sentence, test_abbreviation)
print(f"Predicted Expansion for '{test_abbreviation}': {predicted_expansion}")

test_sentence_2 = "The WHO released new guidelines on pandemic control."
test_abbreviation_2 = "WHO"
predicted_expansion_2 = predict_with_fine_tuned_model(test_sentence_2, test_abbreviation_2)
print(f"Predicted Expansion for '{test_abbreviation_2}': {predicted_expansion_2}")


Sentences: ['The R&D team delivered their findings to the board.', "The CPU's performance was evaluated in the lab.", 'The RSVP deadline for the event is tomorrow.', 'The VIP section was reserved for special guests.', 'The WHO released new guidelines on pandemic control.', 'The CEO presented the OKRs to the team.', 'The GPS system was accurate in navigating the route.', 'The FAQ section provides answers to common questions.', 'The ATM was out of cash this morning.', 'The USB drive contained all the project files.', 'The WHO is working on global health challenges.', 'The NDA prevents employees from sharing sensitive information.', 'The new AI model outperformed previous ones.', 'The CCTV footage was reviewed by security personnel.', 'The ETA for the delivery is tomorrow evening.', 'The PDF document was emailed to the client.', 'The HR team organized a training session.', 'The IPO of the company attracted many investors.', 'The RAM on the laptop was upgraded for better performance.', "Th

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Epoch 1/3: 100%|██████████| 3/3 [00:14<00:00,  4.83s/it, loss=11.9]


Epoch 1 Loss: 14.654485066731771


Epoch 2/3: 100%|██████████| 3/3 [00:13<00:00,  4.45s/it, loss=8.23]


Epoch 2 Loss: 9.288009961446127


Epoch 3/3: 100%|██████████| 3/3 [00:13<00:00,  4.46s/it, loss=6.32]


Epoch 3 Loss: 6.742023468017578
Predicted Expansion for 'VIP': [PAD]
Predicted Expansion for 'WHO': [PAD]
