In [1]:
import torch
import torch.nn as nn

import transformers

from transformers import pipeline, AutoTokenizer, AutoModel
from datasets import load_dataset

import random

In [2]:
import pandas as pd
import numpy as np

## DistilBERT for Masked Language Modelling

In [3]:
# Checkpoint identifier; the same name is used below to load both the tokenizer and the model.
MODEL_TYPE = 'distilbert-base-uncased'
# Tokenizer matching the checkpoint named by MODEL_TYPE.
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_TYPE)

In [4]:
# Load the MODEL_TYPE checkpoint through the masked-LM auto class; its `.logits`
# output (one score per vocabulary entry and token position) is read further down.
model = transformers.AutoModelForMaskedLM.from_pretrained(MODEL_TYPE)

In [5]:
# Prompt with a single [MASK] placeholder that the model is asked to fill in.
text = "A thesis should be [MASK]."

In [6]:
# Encode the prompt as PyTorch tensors; the result holds `input_ids` and `attention_mask`.
inputs = tokenizer(text, return_tensors="pt")

In [7]:
inputs

{'input_ids': tensor([[ 101, 1037, 9459, 2323, 2022,  103, 1012,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

In [8]:
# torch.where on the 2-D boolean comparison returns (row_indices, column_indices);
# element [1] keeps the column indices, i.e. the position(s) of [MASK] within the sequence.
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]

In [9]:
mask_token_index

tensor([5])

In [10]:
tokenizer.mask_token_id

103

In [11]:
torch.where(inputs["input_ids"] == tokenizer.mask_token_id)

(tensor([0]), tensor([5]))

In [12]:
# Inference-only forward pass: autograd is disabled so no computation graph is
# built for the logits, which are only read (never back-propagated) below.
with torch.no_grad():
    logits = model(**inputs).logits

In [13]:
logits.shape

torch.Size([1, 8, 30522])

In [14]:
# Vocabulary scores at the masked position(s) of the first (and only) sequence.
# Indexing with the 1-D index tensor keeps a leading axis, so the result is 2-D
# (one row per [MASK] position), which is why topk below uses dim=1.
mask_token_logits = logits[0, mask_token_index, :]

In [15]:
# Ids of the 3 highest-scoring vocabulary entries for the first [MASK] position.
top_3_tokens = torch.topk(mask_token_logits, 3, dim=1).indices[0].tolist()

# Print the prompt once per candidate, with the mask token string replaced by the decoded token.
for token in top_3_tokens:
    print(text.replace(tokenizer.mask_token, tokenizer.decode([token])))

A thesis should be considered.
A thesis should be defended.
A thesis should be written.


## DistilBERT for Contextual Embeddings

In [16]:
def generate_similarities(gendered_text, target_text, tokenizer=None, model=None):
    """Tabulate cosine similarities between mean-pooled sentence embeddings.

    Each sentence is tokenized, passed through the model, and represented by the
    mean of `last_hidden_state` over the token axis (dim 1). Every gendered
    sentence is then compared with every target sentence.

    Args:
        gendered_text: Iterable of sentences; each becomes one column of the table.
        target_text: List of sentences; each becomes one row of the table.
        tokenizer: Optional callable invoked as ``tokenizer(sentence, return_tensors='pt')``
            returning a mapping that is unpacked into ``model(**...)``. When omitted,
            the notebook-level ``tokenizer`` global is used.
        model: Optional callable whose output exposes ``last_hidden_state``. When
            omitted, the notebook-level ``model`` global is used.

    Returns:
        pandas.DataFrame indexed by 'Target_Texts', with two-level columns
        ('Gendered_Texts', <gendered sentence>) holding the similarities as floats.
    """
    # The parameters shadow the notebook globals of the same name, so the fallback
    # goes through globals(); the lookup still happens at call time, as before.
    if tokenizer is None:
        tokenizer = globals()['tokenizer']
    if model is None:
        model = globals()['model']

    def _embed(sentence):
        # One sentence -> mean of the final hidden states over the token axis.
        encoded_input = tokenizer(sentence, return_tensors='pt')
        # Inference only: skip building an autograd graph.
        with torch.no_grad():
            output = model(**encoded_input)
        return output.last_hidden_state.mean(axis=1)

    cos = nn.CosineSimilarity(dim=1)

    # Target embeddings do not depend on the gendered sentence, so compute them once
    # instead of once per gendered sentence.
    target_embeddings = [_embed(tt) for tt in target_text]

    result = {'Target_Texts': target_text}
    for gt in gendered_text:
        gt_embedding = _embed(gt)
        result[gt] = [
            cos(gt_embedding, tt_embedding).item()
            for tt_embedding in target_embeddings
        ]

    # Wrapping the frame in a single-key dict makes pd.concat add the
    # 'Gendered_Texts' level above the per-sentence column labels.
    temp = {'Gendered_Texts': pd.DataFrame(result).set_index('Target_Texts')}
    return pd.concat(temp, axis=1)

In [17]:
# Same checkpoint name and tokenizer as in the masked-LM section above; repeated
# here so this section does not depend on that section's cells having run.
MODEL_TYPE = 'distilbert-base-uncased'

tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_TYPE)

In [18]:
# Rebinds the notebook-level `model` (previously the masked-LM variant) to the
# AutoModel variant. generate_similarities reads `last_hidden_state` from this
# global's output, so this cell has to run before that function is called.
model = transformers.AutoModel.from_pretrained(MODEL_TYPE)
# Report the parameter count in millions, rounded to the nearest integer.
print(f"# DistilBert Parameters: {round(model.num_parameters() / 1_000_000)}M (Remember from the lecture that BERT has around 110M parameters)")

# DistilBert Parameters: 66M (Remember from the lecture that BERT has around 110M parameters)


In [22]:
# Two sentences that differ only in the pronoun ...
text1 = "He is walking." 
text2 = "She is walking."
# ... and two sentences that differ only in the occupation noun.
text3 = "The dancer is walking." 
text4 = "The chef is walking." 

In [23]:
# Pronoun sentences become the columns, occupation sentences the rows, of the similarity tables.
gendered_text = [text1,text2]
target_text = [text3,text4]

In [24]:
generate_similarities(gendered_text,target_text)

Unnamed: 0_level_0,Gendered_Texts,Gendered_Texts
Unnamed: 0_level_1,He is walking.,She is walking.
Target_Texts,Unnamed: 1_level_2,Unnamed: 2_level_2
The dancer is walking.,0.889623,0.907225
The chef is walking.,0.879,0.875073


## Using Sentence-BERT

In [25]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the sequence axis, weighted by the attention mask.

    Args:
        model_output: Indexable model output whose element 0 holds the token embeddings.
        attention_mask: Mask with one entry per token; it is expanded across the
            embedding axis and used as a multiplicative weight
            (assumes 1 = real token, 0 = padding).

    Returns:
        The mask-weighted sum of the token embeddings along dim 1 divided by the
        mask total along dim 1, where the divisor is clamped to at least 1e-9 so
        that an all-zero mask does not divide by zero.
    """
    hidden = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
    weighted_sum = (hidden * weights).sum(1)
    token_counts = weights.sum(1).clamp(min=1e-9)
    return weighted_sum / token_counts

In [37]:
def generate_similarities_sentBERT(gendered_text,target_text,tokenizer,model):
    """Tabulate cosine similarities between attention-mask-weighted mean embeddings.

    Each sentence is tokenized, passed through `model` with autograd disabled,
    and pooled with `mean_pooling` using the tokenizer's attention mask. Every
    gendered sentence is then compared with every target sentence.

    Args:
        gendered_text: Iterable of sentences; each becomes one column of the table.
        target_text: List of sentences; each becomes one row of the table.
        tokenizer: Callable invoked as
            ``tokenizer(sentence, padding=True, truncation=True, return_tensors='pt')``;
            its result must contain 'attention_mask' and is unpacked into ``model(**...)``.
        model: Callable returning an output accepted by `mean_pooling`.

    Returns:
        pandas.DataFrame indexed by 'Target_Texts', with two-level columns
        ('Gendered_Texts', <gendered sentence>) holding the similarities as floats.
    """
    def _embed(sentence):
        # One sentence -> one pooled embedding.
        encoded_input = tokenizer(sentence, padding=True, truncation=True, return_tensors='pt')
        # Compute token embeddings without tracking gradients.
        with torch.no_grad():
            model_output = model(**encoded_input)
        # Perform pooling. In this case, mean pooling weighted by the attention mask.
        return mean_pooling(model_output, encoded_input['attention_mask'])

    cos = nn.CosineSimilarity(dim=1)

    # Target embeddings do not depend on the gendered sentence, so compute them once
    # instead of once per gendered sentence.
    target_embeddings = [_embed(tt) for tt in target_text]

    result = {'Target_Texts': target_text}
    for gt in gendered_text:
        gt_embedding = _embed(gt)
        result[gt] = [
            cos(gt_embedding, tt_embedding).item()
            for tt_embedding in target_embeddings
        ]

    # Wrapping the frame in a single-key dict makes pd.concat add the
    # 'Gendered_Texts' level above the per-sentence column labels.
    temp = {'Gendered_Texts': pd.DataFrame(result).set_index('Target_Texts')}
    return pd.concat(temp, axis=1)

In [35]:
# Load the Sentence-BERT tokenizer and model from the HuggingFace Hub.
# NOTE: this rebinds the notebook-level `tokenizer` and `model` names that earlier
# cells bound to DistilBERT objects; any code that reads those globals afterwards
# (e.g. generate_similarities) will use the Sentence-BERT objects instead.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')

In [38]:
generate_similarities_sentBERT(gendered_text,target_text,tokenizer,model)

Unnamed: 0_level_0,Gendered_Texts,Gendered_Texts
Unnamed: 0_level_1,He is walking.,She is walking.
Target_Texts,Unnamed: 1_level_2,Unnamed: 2_level_2
The dancer is walking.,0.818181,0.845284
The chef is walking.,0.741682,0.655532


## Fine-Tuning a Pre-trained Model

In [4]:
# Load the "yelp_review_full" dataset by name; its "train" and "test" splits are used below.
dataset = load_dataset("yelp_review_full")

In [5]:
dataset["train"][100]

{'label': 0,
 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. 

In [23]:
dataset["train"][10]

{'label': 0,
 'text': "Owning a driving range inside the city limits is like a license to print money.  I don't think I ask much out of a driving range.  Decent mats, clean balls and accessible hours.  Hell you need even less people now with the advent of the machine that doles out the balls.  This place has none of them.  It is april and there are no grass tees yet.  BTW they opened for the season this week although it has been golfing weather for a month.  The mats look like the carpet at my 107 year old aunt Irene's house.  Worn and thread bare.  Let's talk about the hours.  This place is equipped with lights yet they only sell buckets of balls until 730.  It is still light out.  Finally lets you have the pit to hit into.  When I arrived I wasn't sure if this was a driving range or an excavation site for a mastodon or a strip mining operation.  There is no grass on the range. Just mud.  Makes it a good tool to figure out how far you actually are hitting the ball.  Oh, they are cash 

In [6]:
# Rebinds the notebook-level `tokenizer` to the one for "bert-base-cased", the same
# checkpoint the classification model is loaded from below. tokenize_function reads this global.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [7]:
def tokenize_function(examples):
    """Tokenize the "text" field of `examples` with the notebook-level `tokenizer`.

    Args:
        examples: Mapping with a "text" entry holding the text(s) to tokenize.

    Returns:
        Whatever `tokenizer` returns when called with padding="max_length" and
        truncation=True.
    """
    review_texts = examples["text"]
    return tokenizer(review_texts, truncation=True, padding="max_length")

In [8]:
# Apply tokenize_function to the whole dataset; batched=True hands the function
# batches of examples rather than one example per call.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [18]:
# Shuffle each split with a fixed seed (42) and keep the first 100 examples,
# giving small reproducible subsets so that fine-tuning finishes quickly.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(100))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(100))

In [19]:
from transformers import AutoModelForSequenceClassification

# Rebinds the notebook-level `model` to a sequence classifier built on "bert-base-cased"
# with 5 output labels. The load message below reports that the classifier weights are
# newly initialised, i.e. they still have to be trained.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
from transformers import TrainingArguments

# Training configuration that only sets the output directory.
# NOTE(review): a later cell rebinds `training_args` with evaluation_strategy="epoch".
# Run top to bottom, that later value is what the Trainer receives; the saved
# execution counts show this cell was run last instead, so the recorded training
# output (no evaluation columns) does not match a clean Restart & Run All.
training_args = TrainingArguments(output_dir="test_trainer")

In [13]:
import evaluate

# Load the "accuracy" metric; compute_metrics calls `metric.compute` on this global.
metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [14]:
def compute_metrics(eval_pred):
    """Score predictions with the notebook-level `metric`.

    Args:
        eval_pred: Pair that unpacks into (logits, labels).

    Returns:
        The result of `metric.compute`, called with the arg-max of the logits over
        the last axis as predictions and the labels as references.
    """
    scores, gold_labels = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(references=gold_labels, predictions=predicted_classes)

In [15]:
from transformers import TrainingArguments, Trainer

# Rebinds `training_args`, this time also requesting evaluation at the end of every epoch.
# NOTE(review): recent transformers releases spell this argument `eval_strategy` —
# confirm against the installed version before re-running.
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

In [21]:
# Assemble the Trainer from the notebook-level objects defined above: the classifier,
# whichever `training_args` was bound most recently, the two 100-example subsets,
# and compute_metrics for scoring evaluation runs.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

In [22]:
# Run fine-tuning; the returned TrainOutput (displayed below) reports the step count,
# training loss and runtime metrics.
trainer.train()

Step,Training Loss


TrainOutput(global_step=39, training_loss=1.5139410556891026, metrics={'train_runtime': 1740.6311, 'train_samples_per_second': 0.172, 'train_steps_per_second': 0.022, 'total_flos': 78935442739200.0, 'train_loss': 1.5139410556891026, 'epoch': 3.0})