In [1]:
from transformers import BertTokenizer, BertForMaskedLM
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
import pandas as pd

In [3]:
df1 = pd.read_json('small_training_data.json')

In [4]:
df1

Unnamed: 0,id,title,publish_date,number,offset,length,magnitude,masked,title_sci_10E,title_char,title_sci_10E_char
0,4271,Ryan O'Neal to guest star on '90210',20100107,90210,30,5,5,Ryan O'Neal to guest star on '[Num]',Ryan O'Neal to guest star on '[Num]',Ryan O'Neal to guest star on '[Num]',Ryan O'Neal to guest star on '[Num]'
1,278032,Fact sheet and screens released for God of War...,20110606,3,72,1,1,Fact sheet and screens released for God of War...,Fact sheet and screens released for God of War...,Fact sheet and screens released for God of War...,Fact sheet and screens released for God of War...
2,281843,Reality Steve names remaining three 'mystery m...,20110614,2,60,1,1,Reality Steve names remaining three 'mystery m...,Reality Steve names remaining three 'mystery m...,Reality Steve names remaining three 'mystery m...,Reality Steve names remaining three 'mystery m...
3,298868,'Kate Plus 8': Kate Gosselin brings out the ab...,20110730,8,11,1,1,'Kate Plus [Num]': Kate Gosselin brings out th...,'Kate Plus [Num]': Kate Gosselin brings out th...,'Kate Plus [Num]': Kate Gosselin brings out th...,'Kate Plus [Num]': Kate Gosselin brings out th...
4,366413,ISL Releases FY 2011-2012 Illinois Public Libr...,20120113,2011,16,4,4,ISL Releases FY [Num]-2012 Illinois Public Lib...,ISL Releases FY [Num]-2.0120000000E+03 Illinoi...,ISL Releases FY [Num]- 2 0 1 2 Illinois Public...,ISL Releases FY [Num]- 2 . 0 1 2 0 0 0 0 0 0 0...


In [5]:
X = df1[['title', 'masked', 'length']]
X

Unnamed: 0,title,masked,length
0,Ryan O'Neal to guest star on '90210',Ryan O'Neal to guest star on '[Num]',5
1,Fact sheet and screens released for God of War...,Fact sheet and screens released for God of War...,1
2,Reality Steve names remaining three 'mystery m...,Reality Steve names remaining three 'mystery m...,1
3,'Kate Plus 8': Kate Gosselin brings out the ab...,'Kate Plus [Num]': Kate Gosselin brings out th...,1
4,ISL Releases FY 2011-2012 Illinois Public Libr...,ISL Releases FY [Num]-2012 Illinois Public Lib...,4


In [6]:
y = df1[['number']]
y

Unnamed: 0,number
0,90210
1,3
2,2
3,8
4,2011


In [7]:
text = list(X['title'])

In [8]:
text

["Ryan O'Neal to guest star on '90210'",
 'Fact sheet and screens released for God of War: Origins Collection on PS3',
 "Reality Steve names remaining three 'mystery men' on Season 2 of 'Bachelor Pad'",
 "'Kate Plus 8': Kate Gosselin brings out the abs with Jon around",
 'ISL Releases FY 2011-2012 Illinois Public Library Annual Report']

In [9]:
#with open('clean.txt', 'r') as fp:
 #   text = fp.read().split('\n')

In [10]:
text[:5]

["Ryan O'Neal to guest star on '90210'",
 'Fact sheet and screens released for God of War: Origins Collection on PS3',
 "Reality Steve names remaining three 'mystery men' on Season 2 of 'Bachelor Pad'",
 "'Kate Plus 8': Kate Gosselin brings out the abs with Jon around",
 'ISL Releases FY 2011-2012 Illinois Public Library Annual Report']

In [11]:
inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')

In [12]:
inputs

{'input_ids': tensor([[ 101, 4575, 1051,  ...,    0,    0,    0],
        [ 101, 2755, 7123,  ...,    0,    0,    0],
        [ 101, 4507, 3889,  ...,    0,    0,    0],
        [ 101, 1005, 5736,  ...,    0,    0,    0],
        [ 101, 2003, 2140,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [13]:
inputs['labels'] = inputs.input_ids.detach().clone()

In [14]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [15]:
mask_array = []
for sentence in X['masked']:
    temp = []
    for inp in sentence.split():
        temp.append(inp == '[Num]')
    mask_array.append(temp)

In [16]:
mask_array

[[False, False, False, False, False, False, False],
 [False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False],
 [False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  True,
  False,
  False,
  False],
 [False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False],
 [False, False, False, False, False, False, False, False, False]]

In [17]:
import numpy as np
selection = []

for i in range(len(mask_array)):
    selection.append(
        np.nonzero(mask_array[i])[0].tolist()
    )

In [18]:
selection[:5]

[[], [], [9], [], []]

In [19]:
for i in range(inputs.input_ids.shape[0]):
    inputs.input_ids[i, selection[i]] = 103

In [20]:
inputs.input_ids

tensor([[ 101, 4575, 1051,  ...,    0,    0,    0],
        [ 101, 2755, 7123,  ...,    0,    0,    0],
        [ 101, 4507, 3889,  ...,    0,    0,    0],
        [ 101, 1005, 5736,  ...,    0,    0,    0],
        [ 101, 2003, 2140,  ...,    0,    0,    0]])

In [21]:
class MeditationsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [22]:
dataset = MeditationsDataset(inputs)

In [23]:
loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

In [24]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# and move our model over to the selected device
model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [25]:
from transformers import AdamW

# activate training mode
model.train()
# initialize optimizer
optim = AdamW(model.parameters(), lr=5e-5)



In [26]:
from tqdm import tqdm  # for our progress bar

epochs = 2

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|████████████████████████████████████████████████████████████████| 1/1 [00:33<00:00, 33.33s/it, loss=18.9]
Epoch 1: 100%|████████████████████████████████████████████████████████████████| 1/1 [00:33<00:00, 33.30s/it, loss=13.5]


In [27]:
from transformers import pipeline




In [28]:
fill = pipeline('fill-mask', model = 'bert-base-uncased', tokenizer ='bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [29]:
list = fill(f'Iron Man {fill.tokenizer.mask_token}: Robert Downey Jr. talks retirement and new tech preview.')
(list[0]['token_str'])

'2'

In [30]:
# **TESTING DATA FROM HERE ON**

In [84]:
list = fill(f'Roberto Firmino used to wear jersey number {fill.tokenizer.mask_token} for Liverpool')
list

[{'score': 0.06222548708319664,
  'token': 1023,
  'token_str': '9',
  'sequence': 'roberto firmino used to wear jersey number 9 for liverpool'},
 {'score': 0.04768068715929985,
  'token': 1015,
  'token_str': '1',
  'sequence': 'roberto firmino used to wear jersey number 1 for liverpool'},
 {'score': 0.045771580189466476,
  'token': 1021,
  'token_str': '7',
  'sequence': 'roberto firmino used to wear jersey number 7 for liverpool'},
 {'score': 0.043786484748125076,
  'token': 2184,
  'token_str': '10',
  'sequence': 'roberto firmino used to wear jersey number 10 for liverpool'},
 {'score': 0.03595929592847824,
  'token': 1022,
  'token_str': '8',
  'sequence': 'roberto firmino used to wear jersey number 8 for liverpool'}]

In [32]:
df2 = pd.read_json('test_it_out_data.json')

In [33]:
X = df2['number']
X


0    90210
1        3
2        2
3        8
4     2011
Name: number, dtype: int64

In [34]:
Y = df2['masked'].tolist()


In [35]:
for i in range(len(Y)):
    Y[i] = Y[i].replace("[Num]","{}")
    Y[i] = Y[i].replace("'","")

In [36]:
Y[3]

'Kate Plus {}: Kate Gosselin brings out the abs with Jon around'

In [37]:
MASK_TOKEN = tokenizer.mask_token

fill(Y[0].format(MASK_TOKEN))[0]['token_str']

'.'

In [38]:
Z=[]
for i in range(len(Y)):
    MASK_TOKEN = tokenizer.mask_token
    Z.append(fill(Y[i].format(MASK_TOKEN))[0]['token_str'])
    

In [39]:
Z

['.', '.', '2', 'one', '2011']

In [40]:
for i in range(len(Z)):
    try:
        Z[i] = int(Z[i])
        print(Z[i])
    except ValueError:
        Z[i] = 0
   

2
2011


In [41]:
Z

[0, 0, 2, 0, 2011]

In [42]:
X = X.tolist()

In [43]:
X

[90210, 3, 2, 8, 2011]

In [44]:
from sklearn.metrics import accuracy_score, f1_score


# Calculate accuracy
accuracy = accuracy_score(X, Z)
f1 = f1_score(X, Z, average='macro')



print(f"Accuracy: {accuracy:.2f}")
print("F1 Score:", f1)

Accuracy: 0.40
F1 Score: 0.3333333333333333


In [45]:
from sklearn.metrics import classification_report

# Assuming 'y_true' is the true class labels and 'y_pred' is the predicted class labels
report = classification_report(X, Z)

print(report)


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           2       1.00      1.00      1.00         1
           3       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         1
        2011       1.00      1.00      1.00         1
       90210       0.00      0.00      0.00         1

    accuracy                           0.40         5
   macro avg       0.33      0.33      0.33         5
weighted avg       0.40      0.40      0.40         5



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
