In [2]:
import torch
import torch.nn as nn

import transformers

from transformers import pipeline
from datasets import load_dataset

import random

In [75]:
import pandas as pd

## DistilBERT for Masked Language Modelling

In [3]:
# Checkpoint name; the same constant is passed to the model loader in the next cell.
MODEL_TYPE = 'distilbert-base-uncased'
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_TYPE)

Downloading (…)okenizer_config.json: 100%|██████████████████████████████████████████████████| 28.0/28.0 [00:00<?, ?B/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading (…)lve/main/config.json: 100%|████████████████████████████████████████████████████| 483/483 [00:00<?, ?B/s]
Downloading (…)solve/main/vocab.txt: 100%|███████████████████████████████████████████| 232k/232k [00:00<00:00, 799kB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████████████████████████████████████| 466k/466k [00:00<00:00, 1.17MB/s]


In [4]:
# Load the checkpoint through the masked-LM auto class; its forward output exposes `.logits` (used below).
model = transformers.AutoModelForMaskedLM.from_pretrained(MODEL_TYPE)

Downloading model.safetensors: 100%|████████████████████████████████████████████████| 268M/268M [02:24<00:00, 1.85MB/s]


In [18]:
# Build the prompt from the tokenizer's own mask token (the later cells also use
# tokenizer.mask_token / mask_token_id), so the prompt stays valid if MODEL_TYPE changes.
text = f"A thesis should be {tokenizer.mask_token}."

In [19]:
# Encode the sentence, requesting PyTorch tensors (return_tensors="pt").
inputs = tokenizer(text, return_tensors="pt")

In [20]:
# Show the encoding: `input_ids` and `attention_mask`, each a 1 x 8 tensor for this sentence.
inputs

{'input_ids': tensor([[ 101, 1037, 9459, 2323, 2022,  103, 1012,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

In [21]:
# Column (sequence-position) coordinates of every mask token in the 2-D id tensor.
mask_token_index = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]

In [22]:
# Position of the mask token within the sequence (5 for this sentence).
mask_token_index

tensor([5])

In [23]:
# Vocabulary id of the tokenizer's mask token (103 here, the id at position 5 of `input_ids`).
tokenizer.mask_token_id

103

In [24]:
# Without the trailing [1], torch.where yields one index tensor per dimension: (rows, columns).
torch.where(inputs["input_ids"] == tokenizer.mask_token_id)

(tensor([0]), tensor([5]))

In [25]:
# Inference only: disable autograd so the forward pass does not record a graph
# and `logits` does not carry gradient history.
with torch.no_grad():
    logits = model(**inputs).logits

In [26]:
# Expect [1, 8, 30522]: one sequence, eight token positions, and one score per
# token id (the last axis is what topk indexes and the tokenizer decodes below).
logits.shape

torch.Size([1, 8, 30522])

In [27]:
# Scores at the masked position(s) of the first (and only) sequence in the batch.
mask_token_logits = logits[0][mask_token_index]

In [28]:
# Ids of the three highest-scoring entries at the (first) masked position.
top_3_tokens = mask_token_logits.topk(k=3, dim=1).indices[0].tolist()

# Print the sentence once per candidate, with the mask token replaced by the decoded id.
for token in top_3_tokens:
    print(text.replace(tokenizer.mask_token, tokenizer.decode([token])))

A thesis should be considered.
A thesis should be defended.
A thesis should be written.


## DistilBERT for Contextual Embeddings

In [90]:
def generate_similarities(gendered_text, target_text):
    """Score every gendered sentence against every target sentence.

    Each sentence is encoded on its own with the notebook-level ``tokenizer``
    and ``model``. Its embedding is the mean of the model's
    ``last_hidden_state`` over dimension 1 (the token dimension, assuming the
    usual ``(batch, tokens, hidden)`` layout). Every (gendered, target) pair
    is then scored with cosine similarity.

    Parameters
    ----------
    gendered_text : iterable of str
        Sentences that become the columns of the result.
    target_text : iterable of str
        Sentences that become the rows (index) of the result.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Target_Texts``. Columns form a two-level index whose top
        level is ``Gendered_Texts`` and whose second level holds the gendered
        sentences. Cells are cosine similarities stored as Python floats.
    """
    def embed(sentence):
        # One sentence per forward pass, so no padding tokens enter the mean.
        encoded_input = tokenizer(sentence, return_tensors='pt')
        return model(**encoded_input).last_hidden_state.mean(dim=1)

    target_texts = list(target_text)
    cos = nn.CosineSimilarity(dim=1)
    result = {'Target_Texts': target_texts}

    # Inference only: skip autograd bookkeeping for all forward passes.
    with torch.no_grad():
        # Target embeddings do not depend on the gendered sentence, so they
        # are computed once rather than once per gendered sentence.
        target_embeddings = [embed(tt) for tt in target_texts]
        for gt in gendered_text:
            gt_embedding = embed(gt)
            result[gt] = [
                cos(gt_embedding, tt_embedding).item()
                for tt_embedding in target_embeddings
            ]

    similarities = pd.DataFrame(result).set_index('Target_Texts')
    # Passing a one-key dict to concat adds the 'Gendered_Texts' header level
    # above the per-sentence columns.
    return pd.concat({'Gendered_Texts': similarities}, axis=1)

In [30]:
# Same checkpoint name as in the masked-LM section; the tokenizer is loaded again from it.
MODEL_TYPE = 'distilbert-base-uncased'

tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_TYPE)

In [31]:
# Rebinds the global `model` to the plain AutoModel variant: generate_similarities reads
# `last_hidden_state` from this global, whereas the earlier masked-LM object was used for `.logits`.
model = transformers.AutoModel.from_pretrained(MODEL_TYPE)
print(f"# DistilBert Parameters: {round(model.num_parameters() / 1_000_000)}M (Remember from the lecture that BERT has around 110M parameters)")

# DistilBert Parameters: 66M (Remember from the lecture that BERT has around 110M parameters)


In [113]:
# Pronoun subjects (compared against the noun subjects below; same predicate throughout).
text1, text2 = "He is walking.", "She is walking."
# Noun subjects.
text3, text4 = "The boy is walking.", "The girl is walking."

In [114]:
# Pronoun sentences become the columns of the similarity table; noun sentences become its rows.
gendered_text = [text1, text2]
target_text = [text3, text4]

In [115]:
# Cosine-similarity table: rows are target sentences, columns are gendered sentences.
generate_similarities(gendered_text=gendered_text, target_text=target_text)

Unnamed: 0_level_0,Gendered_Texts,Gendered_Texts
Unnamed: 0_level_1,He is walking.,She is walking.
Target_Texts,Unnamed: 1_level_2,Unnamed: 2_level_2
The boy is walking.,0.942332,0.924348
The girl is walking.,0.919521,0.955328
