# MLM
we input a sentence and ask that BERT outputs the same sentence.

- Tokenize our text. 
- Create a labels tensor. We’re training our model here, so we need a labels tensor to calculate loss against — and optimize towards.
- Mask tokens in input_ids. The BERT paper uses a 15% probability of masking each token during model pre-training, with a few additional rules — we’ll use a simplified version of this and assign a 15% probability of each word being masked.
- Calculate loss. pred=softmax(logits)

In [25]:
import json
import pandas as pd
import numpy as np
import torch
from pathlib import Path
pd.set_option('max_colwidth',300)

In [26]:
python_files = sorted(Path('./datasets/python/final/jsonl').glob('**/*.gz'))

columns_long_list = ['repo', 'path', 'url', 'code', 
                     'code_tokens', 'docstring', 'docstring_tokens', 
                     'language', 'partition']

columns_short_list = ['code_tokens', 'docstring_tokens', 
                      'language', 'partition']

columns_for_token = ['code_tokens', 'partition']

def jsonl_list_to_dataframe(file_list, columns=columns_long_list):
    """Load a list of jsonl.gz files into a pandas DataFrame."""
    return pd.concat([pd.read_json(f, 
                                   orient='records', 
                                   compression='gzip',
                                   lines=True)[columns] 
                      for f in file_list], sort=False)

pydf = jsonl_list_to_dataframe(python_files, columns_for_token)

In [27]:
train = pydf.groupby("partition").filter(lambda df : pd.Series(['train']).isin(df['partition']).all())
valid = pydf.groupby("partition").filter(lambda df : pd.Series(['valid']).isin(df['partition']).all())
test  = pydf.groupby("partition").filter(lambda df : pd.Series(['test']).isin(df['partition']).all())
print(train.shape[0], valid.shape[0], test.shape[0])
train.head(3)

412178 23107 22176


Unnamed: 0,code_tokens,partition
0,"[def, train, (, train_dir, ,, model_save_path, =, None, ,, n_neighbors, =, None, ,, knn_algo, =, 'ball_tree', ,, verbose, =, False, ), :, X, =, [, ], y, =, [, ], # Loop through each person in the training set, for, class_dir, in, os, ., listdir, (, train_dir, ), :, if, not, os, ., path, ., isdir...",train
1,"[def, predict, (, X_img_path, ,, knn_clf, =, None, ,, model_path, =, None, ,, distance_threshold, =, 0.6, ), :, if, not, os, ., path, ., isfile, (, X_img_path, ), or, os, ., path, ., splitext, (, X_img_path, ), [, 1, ], [, 1, :, ], not, in, ALLOWED_EXTENSIONS, :, raise, Exception, (, ""Invalid im...",train
2,"[def, show_prediction_labels_on_image, (, img_path, ,, predictions, ), :, pil_image, =, Image, ., open, (, img_path, ), ., convert, (, ""RGB"", ), draw, =, ImageDraw, ., Draw, (, pil_image, ), for, name, ,, (, top, ,, right, ,, bottom, ,, left, ), in, predictions, :, # Draw a box around the face u...",train


In [28]:
train['code_tokens'].iloc[0][:5]

['def', 'train', '(', 'train_dir', ',']

In [29]:
train_samples = train['code_tokens'].iloc[:20048] # 取2048个mini样本作为train demo
valid_samples = valid['code_tokens'].iloc[:256] # 取256个mini样本作为valid demo
test_samples = valid['code_tokens'].iloc[:64] # 取64个mini样本作为test demo

# Remove the comment

In [30]:
from typing import List, Iterable

def is_comment_token(language: str, token: str) -> bool:
    len_token = len(token)

    if language in ['python', 'ruby', 'php'] and len_token >= 1 and token.startswith('#'):
        return True
    if language in ['java', 'javascript', 'go', 'php'] \
            and len_token >= 2 and (token.startswith('//') or token.startswith('/*')):
        return True

    return False

def remove_inline_comments(language: str, code_tokens: List[str]) -> List[str]:
    return [token for token in code_tokens if not is_comment_token(language, token)]

In [31]:
new_train_samples = train_samples.apply(lambda tokens: remove_inline_comments('python', tokens))

In [32]:
print("Origin token of 30th is a comment: ", train_samples[0][30])
print("New token after removing the comment is the keyword \'for\': ", new_train_samples[0][30])

Origin token of 30th is a comment:  # Loop through each person in the training set
New token after removing the comment is the keyword 'for':  for


# Add new key words extracted from the code projects, as special token

In [41]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [43]:
for name, param in model.named_parameters():
    print(name, param.shape)
    break
print(tokenizer.tokenize('!='), tokenizer.convert_tokens_to_ids(['!=']))
print(tokenizer.tokenize('model_save_path'), tokenizer.convert_tokens_to_ids(['model_save_path']))

bert.embeddings.word_embeddings.weight torch.Size([30522, 768])
['!', '='] [100]
['model', '_', 'save', '_', 'path'] [100]


In [44]:
# add new tokens
for new_tokens in new_train_samples:
    num_added_tokens = tokenizer.add_tokens(new_tokens) # (132702, 768)
    break
model.resize_token_embeddings(len(tokenizer))

Embedding(30551, 768)

In [45]:
for name, param in model.named_parameters():
    print(name, param.shape)
    break
print(tokenizer.tokenize('!='), tokenizer.convert_tokens_to_ids(['!=']))
print(tokenizer.tokenize('model_save_path'), tokenizer.convert_tokens_to_ids(['model_save_path']))

bert.embeddings.word_embeddings.weight torch.Size([30551, 768])
['!='] [30537]
['model_save_path'] [30523]


In [46]:
def save_model(net, tokenizer):
    net.save_pretrained('results/bert/')
    tokenizer.save_pretrained('results/tokenizer/')
# def save_model(net):
#     model_name = './results/bert.net'
#     checkpoint = {
#         'state_dict': net.state_dict()
#     }
    

# #     checkpoint = {'n_hidden': net.n_hidden,
# #                 'n_layers': net.n_layers,
# #                 'state_dict': net.state_dict(),
# #                 'tokens': dictionary,
# #                 'int2token': int2token,
# #                 'token2int': token2int}

#     with open(model_name, 'wb') as f:
#         torch.save(checkpoint, f)

In [47]:
save_model(model, tokenizer)

In [48]:
tokenizer = BertTokenizer.from_pretrained('./results/tokenizer/')
bert = BertForMaskedLM.from_pretrained('./results/bert/')

# Split into fixed Chunks.

In [102]:
def split_method_to_fixed_length(samples, seq_length_max=510):
    chunks = []
    for i, method in enumerate(samples):
#         print(method[:10])
        chunks.extend([method[i:i+seq_length_max] for i in range(0, len(method), seq_length_max)]) # split method into chunks
#         print(len(chunks)) 
    return chunks

In [103]:
chunks = split_method_to_fixed_length(new_train_samples)

In [104]:
print(len(chunks))
print(len(chunks[0]), len(chunks[1]), len(chunks[2]), len(chunks[3]), len(chunks[4]), len(chunks[-1]))

20548
234 208 169 30 54 31


In [105]:
max_len = 100
min_len = 100

for seq in chunks:
        # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
#     input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(seq))
    
    min_len = min(min_len, len(seq))
    
print('Max chunk length: ', max_len)
print('Min chunk length: ', min_len)

Max chunk length:  510
Min chunk length:  1


In [70]:
# inputs = tokenizer(text, 
#                     return_tensors='pt', 
#                     max_length=512, 
#                     truncation=True, 
#                     padding='max_length')

In [114]:
encodings = {"input_ids": [], "token_type_ids": [], "attention_mask": [], "labels": []}

for chunk in chunks:
    
    encoding = tokenizer.encode_plus(
      chunk,
      add_special_tokens=True,
      max_length=512, # 510 + [CLS] + [PAD]
      return_token_type_ids=True,
      padding="max_length",
      return_attention_mask=True,
      return_tensors='pt',
    )
    
#     encoding['labels'] = encoding.input_ids.detach().clone()
#     encodings.append(encoding)

    encodings["input_ids"].append(encoding["input_ids"].squeeze())
    encodings["token_type_ids"].append(encoding["token_type_ids"].squeeze())
    encodings["attention_mask"].append(encoding["attention_mask"].squeeze())
    encodings["labels"].append(encoding["input_ids"].squeeze())
    
# inputs["input_ids"] = torch.cat(encodings["input_ids"], dim=0)
encodings["input_ids"] = torch.stack(encodings["input_ids"])
encodings["token_type_ids"] = torch.stack(encodings["token_type_ids"])
encodings["attention_mask"] = torch.stack(encodings["attention_mask"])
encodings["labels"] = torch.stack(encodings["labels"])
encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [118]:
print(encodings["input_ids"].shape, encodings["token_type_ids"].shape, encodings["attention_mask"].shape, encodings["labels"].shape)
print(type(encodings['input_ids']), type(encodings['token_type_ids']), type(encodings['attention_mask']), type(encodings['labels']))

torch.Size([20548, 512]) torch.Size([20548, 512]) torch.Size([20548, 512]) torch.Size([20548, 512])
<class 'torch.Tensor'> <class 'torch.Tensor'> <class 'torch.Tensor'> <class 'torch.Tensor'>


In [117]:
print(encoding["input_ids"].squeeze()[:100])
print(encoding["attention_mask"].squeeze()[:100])

tensor([  101, 13366,   100,  1006,  2969,  1010,  5950,  1027,   100,  1006,
         1007,  1007,  1024,  2065,  2969,  1012,   100,   100,  2969,  1012,
          100,  1024,  2709,  2969,  1012,   100,  2842,  1024,  2709,  2969,
         1012,   100,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

# Construct dataset

In [121]:
# create random array of floats with equal dimensions to input_ids tensor
rand = torch.rand(encodings["input_ids"].shape)
# create mask array for mlm task
mask_arr = (rand < 0.15) * (encodings["input_ids"] != 101) * \
           (encodings["input_ids"] != 102) * (encodings["input_ids"] != 0)
selection = []

for i in range(encodings["input_ids"].shape[0]):
    selection.append(
        torch.flatten(mask_arr[i].nonzero(as_tuple=False)).tolist()
    )
    
for i in range(encodings["input_ids"].shape[0]):
    encodings["input_ids"][i, selection[i]] = 103

In [128]:
for key, val in encodings.items():
    print(key)
    print(val.shape)

input_ids
torch.Size([20548, 512])
token_type_ids
torch.Size([20548, 512])
attention_mask
torch.Size([20548, 512])
labels
torch.Size([20548, 512])


In [127]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings["input_ids"])

In [129]:
dataset = MyDataset(inputs)
loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

In [133]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# and move our model over to the selected device
model.to(device)
# activate training mode
model.train()

print("done")

done


In [134]:
from transformers import AdamW
optim = AdamW(model.parameters(), lr=5e-5)

In [None]:
from tqdm import tqdm  # for our progress bar

epochs = 1

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=labels)
#         outputs = model(input_ids, attention_mask=attention_mask)
        
#         prediction_logits = outputs.last_hidden_state
#         pooler_output = outputs.pooler_output

#         print(prediction_logits.shape) # (batch_size, sequence_length, hidden_size)
# #         print(pooler_output.shape) # ((batch_size, hidden_size)
#         print(labels.shape) # (batch_size, sequence_length)
        # extract loss
        loss = outputs.loss
        print(loss)
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  0%|          | 0/32 [00:00<?, ?it/s]

tensor(14.8045, grad_fn=<NllLossBackward>)


Epoch 0:   3%|▎         | 1/32 [01:50<57:02, 110.40s/it, loss=14.8]

tensor(11.3474, grad_fn=<NllLossBackward>)


Epoch 0:   6%|▋         | 2/32 [03:40<55:06, 110.21s/it, loss=11.3]

tensor(9.6161, grad_fn=<NllLossBackward>)


Epoch 0:   9%|▉         | 3/32 [05:24<52:22, 108.36s/it, loss=9.62]

tensor(8.3105, grad_fn=<NllLossBackward>)


Epoch 0:  12%|█▎        | 4/32 [07:13<50:41, 108.63s/it, loss=8.31]

tensor(7.5944, grad_fn=<NllLossBackward>)


Epoch 0:  16%|█▌        | 5/32 [08:59<48:34, 107.95s/it, loss=7.59]

tensor(6.5973, grad_fn=<NllLossBackward>)


Epoch 0:  19%|█▉        | 6/32 [10:46<46:39, 107.67s/it, loss=6.6] 

tensor(6.2084, grad_fn=<NllLossBackward>)


Epoch 0:  22%|██▏       | 7/32 [12:33<44:45, 107.41s/it, loss=6.21]

tensor(5.7379, grad_fn=<NllLossBackward>)


Epoch 0:  25%|██▌       | 8/32 [14:21<43:00, 107.54s/it, loss=5.74]

tensor(5.1174, grad_fn=<NllLossBackward>)


Epoch 0:  28%|██▊       | 9/32 [16:11<41:31, 108.33s/it, loss=5.12]

tensor(4.5005, grad_fn=<NllLossBackward>)


Epoch 0:  31%|███▏      | 10/32 [18:01<39:52, 108.73s/it, loss=4.5]

tensor(4.0221, grad_fn=<NllLossBackward>)


Epoch 0:  34%|███▍      | 11/32 [19:55<38:37, 110.38s/it, loss=4.02]

tensor(3.6693, grad_fn=<NllLossBackward>)


Epoch 0:  38%|███▊      | 12/32 [21:43<36:31, 109.59s/it, loss=3.67]

tensor(3.1024, grad_fn=<NllLossBackward>)


Epoch 0:  41%|████      | 13/32 [23:31<34:34, 109.16s/it, loss=3.1] 

tensor(2.8592, grad_fn=<NllLossBackward>)


Epoch 0:  44%|████▍     | 14/32 [25:20<32:42, 109.04s/it, loss=2.86]

tensor(2.4726, grad_fn=<NllLossBackward>)


Epoch 0:  47%|████▋     | 15/32 [27:07<30:46, 108.62s/it, loss=2.47]

tensor(2.2463, grad_fn=<NllLossBackward>)


Epoch 0:  50%|█████     | 16/32 [28:53<28:45, 107.83s/it, loss=2.25]

tensor(2.0031, grad_fn=<NllLossBackward>)


Epoch 0:  53%|█████▎    | 17/32 [30:43<27:04, 108.28s/it, loss=2]   

tensor(1.7557, grad_fn=<NllLossBackward>)


Epoch 0:  56%|█████▋    | 18/32 [32:28<25:05, 107.51s/it, loss=1.76]