In [1]:
# Mount Google Drive so the project files under /content/drive/ are reachable.
from google.colab import drive
# NOTE(review): `_mount` is an underscore-prefixed (private) helper; the public
# entry point is `drive.mount(...)` — confirm the private call is intentional.
drive._mount('/content/drive/')

Mounted at /content/drive/


In [None]:
#!pip install -q keras

In [2]:
!pip install -q pydrive

In [3]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user and build a PyDrive client from the
# application-default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): this rebinds the name `drive`, shadowing the `google.colab.drive`
# module imported in the first cell; re-import it before calling Colab drive
# functions again.
drive = GoogleDrive(gauth)

In [None]:
import tensorflow as tf

# Ask TensorFlow which GPU device it can see.
device_name = tf.test.gpu_device_name()

# Fail fast unless the reported name is exactly the first GPU device.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

In [4]:
import torch

# Choose the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report what was selected.
if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


In [None]:
!nvidia-smi

from tensorflow.python.client import device_lib
device_lib.list_local_devices()

Sat Dec  4 22:29:28 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   73C    P8    32W / 149W |      3MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 14819889134611883572
 xla_global_id: -1, name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 10843127808
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 4864828189585404777
 physical_device_desc: "device: 0, name: Tesla K80, pci bus id: 0000:00:04.0, compute capability: 3.7"
 xla_global_id: 416903419]

In [5]:
#!pip install transformers
# NOTE(review): this installs the unpinned development head of transformers, so a
# fresh run may pull a different version than the one that produced the recorded
# outputs; consider pinning a release tag or commit.
!pip install git+https://github.com/huggingface/transformers

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-4ymo8yd5
  Running command git clone -q https://github.com/huggingface/transformers /tmp/pip-req-build-4ymo8yd5
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 5.3 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 32.8 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 529 kB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp3

In [6]:
# Root of the project on the mounted Drive. Later cells build pickle, model and
# log paths by plain string concatenation onto it, so the trailing slash is required.
project_path='/content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/'

In [None]:
#cd '/content/drive/My Drive/'

/content/drive/My Drive


In [None]:
#cd $project_path

In [None]:
#ls

In [7]:
import numpy as np
import pandas as pd

In [8]:
def tensorify(lst):
    """
    Convert a (possibly nested) list of numbers or tensors into a single tensor.

    Every sub-list at the same depth must have the same length (and every
    tensor at the same depth the same shape), otherwise stacking fails.
    Nested list with lengths [D1, D2, ..., DN] -> tensor of shape (D1, D2, ..., DN).

    The input is never modified: converted sub-lists are collected in a new
    list instead of being written back into ``lst``.

    :param lst: a tensor (returned unchanged), a flat sequence of numbers,
        a flat sequence of tensors, or a list nesting any of those.
    :return: a ``torch.Tensor`` holding the contents of ``lst``.
    """
    # A tensor is already the final result; check this before indexing so that
    # 0-dim tensors (which cannot be indexed) are handled too.
    if isinstance(lst, torch.Tensor):
        return lst

    # Empty sequence: nothing to inspect or stack, yield an empty tensor.
    if len(lst) == 0:
        return torch.tensor(lst)

    # Base case: the elements are not lists any more.
    if not isinstance(lst[0], list):
        if isinstance(lst[0], torch.Tensor):
            return torch.stack(lst, dim=0)
        # Elements are plain numbers (or something torch.tensor understands).
        return torch.tensor(lst)

    # Recursive case: convert each sub-list, then stack along a new leading axis.
    return torch.stack([tensorify(sub) for sub in lst], dim=0)

In [9]:
import torch

class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to an indexable collection with
    one entry per example; ``labels`` is an indexable collection of the same
    length. Each item is a dict of tensors: one per encoding field plus a
    ``'labels'`` entry.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        # Scratch attribute; not read by any method of this class.
        self.temp = None

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [10]:
######## Loading the pre-shuffled training dataset for Plausibility Classification ######################
#########################################################################

import pickle

# Load the shuffled training dataset from its pickle on Drive.
# (Presumably unpickling needs the CustomDataset class to be defined first — confirm.)
shuffled_train_pickle = project_path+'Pickles/Plausibility_Classification_PT_Bert_FT_1/pickle_shuffled_train_clarification_single_filler_dataset_1_1.pickle'
with open(shuffled_train_pickle, 'rb') as f:
    shuffled_train_dataset = pickle.load(f)


In [None]:
import pickle

# Load the (unshuffled) training dataset from its pickle on Drive.
train_pickle = project_path+'Pickles/Plausibility_Classification_PT_Bert_FT_1/pickle_train_clarification_single_filler_dataset_1_1.pickle'
with open(train_pickle, 'rb') as f:
    train_dataset = pickle.load(f)

In [None]:
import pickle

# Load the validation dataset from its pickle on Drive.
valid_pickle = project_path+'Pickles/Plausibility_Classification_PT_Bert_FT_1/pickle_valid_clarification_single_filler_dataset_1_1.pickle'
with open(valid_pickle, 'rb') as f:
    val_dataset = pickle.load(f)

In [None]:
'''
import pickle

with open(project_path+'Pickles/pickle_valid_puzzle_shuffled_1000_dataset_1.pickle', 'rb') as f:
  shuffled_val_dataset = pickle.load(f)
  #pickle.dump((test_df['labels'],predictions),f)
'''

In [None]:
import pickle

# Load the small validation dataset from its pickle on Drive.
valid_small_pickle = project_path+'Pickles/Plausibility_Classification_PT_Bert_FT_1/pickle_valid_small_clarification_single_filler_dataset_1_1.pickle'
with open(valid_small_pickle, 'rb') as f:
    val_small_dataset = pickle.load(f)

In [11]:

import pickle

# Load the test dataset from its pickle on Drive.
test_pickle = project_path+'Pickles/Plausibility_Classification_PT_Bert_FT_1/pickle_test_clarification_single_filler_dataset_1_1.pickle'
with open(test_pickle, 'rb') as f:
    test_dataset = pickle.load(f)


In [12]:
from transformers import AutoTokenizer,BertTokenizerFast

# Load the fast BERT tokenizer for "bert-base-uncased".
# `model_max_length` (not `max_length`) is the init keyword that sets the
# tokenizer's length limit; 512 matches this checkpoint's usual limit — TODO confirm.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", model_max_length=512) #, skip_special_tokens=True)

#special_tokens_dict = {'additional_special_tokens': ['[SEP]','[FILLER]','[\FILLER]']}
#num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
'''
decoder_tokenizer.cls_token = decoder_tokenizer.bos_token
decoder_tokenizer.sep_token = decoder_tokenizer.eos_token
decoder_tokenizer.pad_token = decoder_tokenizer.eos_token
encoder_tokenizer.pad_token = decoder_tokenizer.eos_token
'''

print(tokenizer.cls_token,tokenizer.sep_token,tokenizer.bos_token,tokenizer.eos_token,tokenizer.pad_token,tokenizer.sep_token_id,tokenizer.pad_token_id)
print(tokenizer.special_tokens_map)
print(tokenizer.vocab_size)
#print(decoder_tokenizer.cls_token,decoder_tokenizer.sep_token,decoder_tokenizer.bos_token,decoder_tokenizer.eos_token,decoder_tokenizer.pad_token,decoder_tokenizer.sep_token_id,decoder_tokenizer.pad_token_id)

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


[CLS] [SEP] None None [PAD] 102 0
{'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}
30522


In [None]:
'''
from transformers import AutoModelForSequenceClassification

roberta_new = AutoModelForSequenceClassification.from_pretrained("roberta-base",num_labels=3)


print(roberta_new.get_input_embeddings())

roberta_new.resize_token_embeddings(len(tokenizer))

print(roberta_new.get_input_embeddings())

roberta_new.save_pretrained(project_path+'Saved_Models/PLAUSIBILITY_CLASSIFICATION_ROBERTA_1/')
'''


In [13]:
from transformers import AutoModelForSequenceClassification
# Load the saved checkpoint as a sequence classifier with three output labels.
# The loader log recorded for this cell lists the checkpoint's `cls.predictions.*`
# weights as unused; presumably the classification head is therefore freshly
# initialised and only learned during the fine-tuning below — confirm.
model = AutoModelForSequenceClassification.from_pretrained(project_path+'Saved_Models/Training_1/Plausibility_Classification_Bert_1/Final_Bert_Not_Scratch',num_labels=3) #########

Some weights of the model checkpoint at /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Plausibility_Classification_Bert_1/Final_Bert_Not_Scratch were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification 

In [None]:
#model.config.decoder.add_cross_attention=True
#print(model.config.decoder)
print(model.get_input_embeddings())
print(model.config.pad_token_id)
#model.save_pretrained(project_path+'Saved_Models/BertGPT_2_decoder_cross_attention')

Embedding(30522, 768, padding_idx=0)
0


In [None]:
#led_tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384")
#led_model = AutoModelForSeq2SeqLM.from_pretrained("allenai/led-base-16384", gradient_checkpointing=True, use_cache=False,)

In [14]:
# Pinned `datasets` release; `load_metric` is imported from it in the metrics cell.
!pip install datasets==1.2.1
# NOTE(review): unpinned, and no ROUGE metric is computed in the visible cells —
# confirm this install is still needed.
!pip install rouge_score

Collecting datasets==1.2.1
  Downloading datasets-1.2.1-py3-none-any.whl (159 kB)
[K     |████████████████████████████████| 159 kB 5.1 MB/s 
[?25hCollecting tqdm<4.50.0,>=4.27
  Downloading tqdm-4.49.0-py2.py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 7.2 MB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 42.1 MB/s 
Installing collected packages: xxhash, tqdm, datasets
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.62.3
    Uninstalling tqdm-4.62.3:
      Successfully uninstalled tqdm-4.62.3
Successfully installed datasets-1.2.1 tqdm-4.49.0 xxhash-2.0.2


Collecting rouge_score
  Downloading rouge_score-0.0.4-py2.py3-none-any.whl (22 kB)
Installing collected packages: rouge-score
Successfully installed rouge-score-0.0.4


In [15]:
#from transformers import logging ################
#logging.set_verbosity_info() #####################

from sklearn.metrics import f1_score, precision_score, recall_score

from datasets import load_dataset, load_metric
from transformers import (
    Trainer,
    TrainingArguments,
    AutoTokenizer,
)

#tokenizer = tokenizer

# load the accuracy metric
metric = load_metric("accuracy")

# compute accuracy and F1 scores during evaluation
def compute_metrics(pred):
    """Turn a (logits, labels) evaluation pair into a dict of scalar metrics.

    The predicted class of each example is the argmax over the last axis of
    the logits. Accuracy comes from the module-level `metric`; micro, macro
    and weighted F1 come from scikit-learn and are rounded to 4 decimals
    (accuracy is returned unrounded).

    :param pred: object that unpacks into ``(logits, labels)``.
    :return: dict with keys "Accuracy", "micro_F1", "macro_F1", "weighted_F1".
    """
    logits, labels = pred
    predictions = np.argmax(logits, axis=-1)

    scores = {
        "Accuracy": metric.compute(predictions=predictions, references=labels)['accuracy'],
    }
    # One F1 variant per averaging mode, in the same order as the result keys.
    for key, averaging in (("micro_F1", 'micro'), ("macro_F1", 'macro'), ("weighted_F1", 'weighted')):
        scores[key] = round(f1_score(labels, predictions, average=averaging), 4)
    return scores


Downloading:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

In [16]:
!pip install wandb

import wandb
wandb.login()

%env WANDB_PROJECT=Plausibility_Clarification_PT_Bert_FT_Training_2_2

Collecting wandb
  Downloading wandb-0.12.7-py2.py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 5.3 MB/s 
[?25hCollecting shortuuid>=0.5.0
  Downloading shortuuid-1.0.8-py3-none-any.whl (9.5 kB)
Collecting GitPython>=1.0.0
  Downloading GitPython-3.1.24-py3-none-any.whl (180 kB)
[K     |████████████████████████████████| 180 kB 52.4 MB/s 
[?25hCollecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.5.0-py2.py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 48.8 MB/s 
[?25hCollecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
Collecting yaspin>=1.0.0
  Downloading yaspin-2.1.0-py3-none-any.whl (18 kB)
Collecting configparser>=3.8.1
  Downloading configparser-5.2.0-py3-none-any.whl (19 kB)
Collecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting subprocess32>=3.5.3
  Downloading subprocess32-3.5.4.tar.gz (97 kB)
[K     |████████████████████████████████| 97 kB 5.4 MB/s 

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


env: WANDB_PROJECT=Plausibility_Clarification_PT_Bert_FT_Training_2_2


In [17]:
# Per-device batch size, used for both training and evaluation below.
batch_size = 8 ####################



# Configuration for the HuggingFace Trainer used by the training cells.
training_args = TrainingArguments(
    # Evaluate on a step schedule (every `eval_steps` optimizer steps).
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    #fp16=True,
    #fp16_backend="apex",
    # Checkpoints and logs are written to Drive under the project root.
    output_dir=project_path+'Saved_Models/Training_1/Plausibility_Classification_PT_Bert_FT_More_Grad_Acc_2_2/',
    logging_dir = project_path+'Saved_Logs/Training_1/Plausibility_Classification_PT_Bert_FT_More_Grad_Acc_2_2/',
    num_train_epochs = 20,  ##########################################################
    logging_steps=10, #250,
    eval_steps=50,  # 200, #5000,
    # A checkpoint is saved every 50 steps; with `save_total_limit` commented out
    # below, presumably none are pruned — watch Drive usage.
    save_steps=50, #200, #500,
    warmup_steps=1, #1500,
    #save_total_limit=2,
    # 8 examples per device x 32 accumulation steps = 256 examples per optimizer
    # step on one device (the recorded training log reports a total batch of 256).
    gradient_accumulation_steps=32, ############################################################
    # The best checkpoint is not restored automatically at the end of training.
    load_best_model_at_end = False,
    #resume_from_checkpoint = project_path+'Saved_Models/Training_1/checkpoint-1500',
    report_to="wandb",  # enable logging to W&B
    run_name="plausibility-classification-PT-Bert-FT-run-2-2",  # name of the W&B run (optional)
)


# BEST TO ME by Accuracy (project_path+'Saved_Logs/Training_1/Plausibility_Classification_Roberta_1/checkpoint-150')
# BEST TO ME by Validation Loss 

In [None]:
# instantiate trainer BATCH SIZE = 8 ############
# Builds a Trainer around the global `model`, `tokenizer` and `training_args`.
# NOTE(review): `eval_dataset` is the test set, so the step-wise metrics logged
# during training (and any checkpoint chosen from them) are measured on test
# data — confirm this is intended rather than `val_dataset`.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    #data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset= shuffled_train_dataset,  #train_dataset,  ########################################
    eval_dataset=test_dataset,  # val_dataset, 
)

# start training
#trainer.train()
# Resume from the saved checkpoint; the recorded log reports continuing from
# global step 400.
trainer.train(project_path+'Saved_Models/Training_1/Plausibility_Classification_PT_Bert_FT_More_Grad_Acc_2_2/checkpoint-400')

Loading model from /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Plausibility_Classification_PT_Bert_FT_More_Grad_Acc_2_2/checkpoint-400).
***** Running training *****
  Num examples = 17960
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 32
  Total optimization steps = 1400
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 5
  Continuing training from global step 400
  Will skip the first 5 epochs then the first 1600 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


  0%|          | 0/1600 [00:00<?, ?it/s]

Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mtawkat[0m (use `wandb login --relogin` to force relogin)


Step,Training Loss,Validation Loss,Accuracy,Micro F1,Macro F1,Weighted F1
450,0.5268,1.572097,0.4332,0.4332,0.4195,0.4528
500,0.4214,1.540317,0.4644,0.4644,0.4396,0.481
550,0.4367,1.54196,0.4688,0.4688,0.4359,0.4776
600,0.3665,1.804356,0.4572,0.4572,0.4255,0.4631


***** Running Evaluation *****
  Num examples = 2500
  Batch size = 8
Saving model checkpoint to /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Plausibility_Classification_PT_Bert_FT_More_Grad_Acc_2_2/checkpoint-450
Configuration saved in /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Plausibility_Classification_PT_Bert_FT_More_Grad_Acc_2_2/checkpoint-450/config.json
Model weights saved in /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Plausibility_Classification_PT_Bert_FT_More_Grad_Acc_2_2/checkpoint-450/pytorch_model.bin
tokenizer config file saved in /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Plausibility_Classification_PT_Bert_FT_More_Grad_Acc_2_2/checkpoint-450/tokenizer_config.json
Special tokens file saved in /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Pl

In [None]:
# instantiate trainer BATCH SIZE = 8 ############
# Builds a Trainer around the global `model`, `tokenizer` and `training_args`.
# NOTE(review): `eval_dataset` is the test set, so the step-wise metrics logged
# during training are measured on test data — confirm this is intended rather
# than `val_dataset`.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    #data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset= shuffled_train_dataset,  #train_dataset,  ########################################
    eval_dataset=test_dataset,  # val_dataset, 
)

# start training
#trainer.train()
# Resume from the saved checkpoint; the recorded log reports continuing from
# global step 200.
trainer.train(project_path+'Saved_Models/Training_1/Plausibility_Classification_PT_Bert_FT_More_Grad_Acc_2_2/checkpoint-200')

Loading model from /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Plausibility_Classification_PT_Bert_FT_More_Grad_Acc_2_2/checkpoint-200).
***** Running training *****
  Num examples = 17960
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 32
  Total optimization steps = 1400
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 2
  Continuing training from global step 200
  Will skip the first 2 epochs then the first 1920 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


  0%|          | 0/1920 [00:00<?, ?it/s]

Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mtawkat[0m (use `wandb login --relogin` to force relogin)


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss,Accuracy,Micro F1,Macro F1,Weighted F1
250,0.789,1.177341,0.4452,0.4452,0.4252,0.4635
300,0.6461,1.36765,0.4984,0.4984,0.4514,0.4948
350,0.6816,1.245178,0.4672,0.4672,0.4465,0.4857
400,0.5589,1.41059,0.4728,0.4728,0.4361,0.4776


***** Running Evaluation *****
  Num examples = 2500
  Batch size = 8
Saving model checkpoint to /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Plausibility_Classification_PT_Bert_FT_More_Grad_Acc_2_2/checkpoint-250
Configuration saved in /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Plausibility_Classification_PT_Bert_FT_More_Grad_Acc_2_2/checkpoint-250/config.json
Model weights saved in /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Plausibility_Classification_PT_Bert_FT_More_Grad_Acc_2_2/checkpoint-250/pytorch_model.bin
tokenizer config file saved in /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Plausibility_Classification_PT_Bert_FT_More_Grad_Acc_2_2/checkpoint-250/tokenizer_config.json
Special tokens file saved in /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Pl

In [None]:
# instantiate trainer BATCH SIZE = 8 ############
# Builds a Trainer around the global `model`, `tokenizer` and `training_args`.
# NOTE(review): `eval_dataset` is the test set, so the step-wise metrics logged
# during training are measured on test data — confirm this is intended rather
# than `val_dataset`.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    #data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset= shuffled_train_dataset,  #train_dataset,  ########################################
    eval_dataset=test_dataset,  # val_dataset, 
)

# start training
# No checkpoint argument: training starts from the currently loaded `model`.
trainer.train()
#trainer.train(project_path+'Saved_Models/Training_1/Plausibility_Classification_PT_Bert_FT_2/checkpoint-950')

***** Running training *****
  Num examples = 17960
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 32
  Total optimization steps = 1400
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,Accuracy,Micro F1,Macro F1,Weighted F1
50,1.0844,1.134654,0.2792,0.2792,0.2455,0.246
100,1.0621,1.089177,0.4524,0.4524,0.3447,0.4014
150,0.9507,1.14936,0.5128,0.5128,0.4039,0.4744
200,0.9174,1.107851,0.4524,0.4524,0.4202,0.4628


***** Running Evaluation *****
  Num examples = 2500
  Batch size = 8
Saving model checkpoint to /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Plausibility_Classification_PT_Bert_FT_More_Grad_Acc_2_2/checkpoint-50
Configuration saved in /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Plausibility_Classification_PT_Bert_FT_More_Grad_Acc_2_2/checkpoint-50/config.json
Model weights saved in /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Plausibility_Classification_PT_Bert_FT_More_Grad_Acc_2_2/checkpoint-50/pytorch_model.bin
tokenizer config file saved in /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Plausibility_Classification_PT_Bert_FT_More_Grad_Acc_2_2/checkpoint-50/tokenizer_config.json
Special tokens file saved in /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Plausi

In [None]:
# instantiate trainer BATCH SIZE = 8 ############
# Builds a Trainer around the global `model`, `tokenizer` and `training_args`.
# NOTE(review): `eval_dataset` is the test set — confirm this is intended rather
# than `val_dataset`.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    #data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset= shuffled_train_dataset,  #train_dataset,  ########################################
    eval_dataset=test_dataset,  # val_dataset, 
)

# start training
#trainer.train()
# NOTE(review): this resumes from a `Plausibility_Classification_Roberta_1`
# checkpoint, which is a different run directory from the `output_dir` set in the
# TrainingArguments cell, and the recorded log shows gradient accumulation 8
# rather than 32. The cell and its output appear to predate the current
# configuration — confirm before re-running.
trainer.train(project_path+'Saved_Models/Training_1/Plausibility_Classification_Roberta_1/checkpoint-700')

Loading model from /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Plausibility_Classification_Roberta_1/checkpoint-700).
***** Running training *****
  Num examples = 17960
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 8
  Total optimization steps = 5600
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 2
  Continuing training from global step 700
  Will skip the first 2 epochs then the first 1120 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


  0%|          | 0/1120 [00:00<?, ?it/s]

Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mtawkat[0m (use `wandb login --relogin` to force relogin)


Step,Training Loss,Validation Loss,Accuracy,Micro F1,Macro F1,Weighted F1
750,1.0757,1.117875,0.3572,0.3572,0.3492,0.3771
800,1.0326,1.170667,0.2812,0.2812,0.2818,0.2825
850,1.0111,1.122727,0.436,0.436,0.3635,0.4191


***** Running Evaluation *****
  Num examples = 2500
  Batch size = 8
Saving model checkpoint to /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Plausibility_Classification_Roberta_1/checkpoint-750
Configuration saved in /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Plausibility_Classification_Roberta_1/checkpoint-750/config.json
Model weights saved in /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Plausibility_Classification_Roberta_1/checkpoint-750/pytorch_model.bin
tokenizer config file saved in /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Plausibility_Classification_Roberta_1/checkpoint-750/tokenizer_config.json
Special tokens file saved in /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Plausibility_Classification_Roberta_1/checkpoint-750/special_tokens_map.json
*

Step,Training Loss,Validation Loss,Accuracy,Micro F1,Macro F1,Weighted F1
750,1.0757,1.117875,0.3572,0.3572,0.3492,0.3771
800,1.0326,1.170667,0.2812,0.2812,0.2818,0.2825
850,1.0111,1.122727,0.436,0.436,0.3635,0.4191
900,0.9556,1.234073,0.32,0.32,0.3209,0.3302
950,0.9409,1.18974,0.3716,0.3716,0.3618,0.3915
1000,0.9836,1.288924,0.282,0.282,0.2656,0.266


***** Running Evaluation *****
  Num examples = 2500
  Batch size = 8
Saving model checkpoint to /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Plausibility_Classification_Roberta_1/checkpoint-900
Configuration saved in /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Plausibility_Classification_Roberta_1/checkpoint-900/config.json
Model weights saved in /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Plausibility_Classification_Roberta_1/checkpoint-900/pytorch_model.bin
tokenizer config file saved in /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Plausibility_Classification_Roberta_1/checkpoint-900/tokenizer_config.json
Special tokens file saved in /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Plausibility_Classification_Roberta_1/checkpoint-900/special_tokens_map.json
*

OSError: ignored

In [None]:
# instantiate trainer BATCH SIZE = 8 ############
# Builds a Trainer around the global `model`, `tokenizer` and `training_args`.
# NOTE(review): `eval_dataset` is the test set — confirm this is intended rather
# than `val_dataset`.
# NOTE(review): the recorded output of this cell shows gradient accumulation 8 and
# checkpoints under `Plausibility_Classification_Roberta_1`, which does not match
# the TrainingArguments cell as it stands — the output appears to come from an
# earlier configuration.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    #data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset= shuffled_train_dataset,  #train_dataset,  ########################################
    eval_dataset=test_dataset,  # val_dataset, 
)

# start training
# No checkpoint argument: training starts from the currently loaded `model`.
trainer.train()
#trainer.train(project_path+'Saved_Logs/Training_1/Plausibility_Classification_Roberta_1/checkpoint-850')

***** Running training *****
  Num examples = 17960
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 8
  Total optimization steps = 5600
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mtawkat[0m (use `wandb login --relogin` to force relogin)


Step,Training Loss,Validation Loss,Accuracy,Micro F1,Macro F1,Weighted F1
50,1.108,1.125301,0.3332,0.3332,0.2819,0.2901
100,1.0973,1.162198,0.178,0.178,0.1063,0.0607
150,1.1051,1.140031,0.2204,0.2204,0.1765,0.1546
200,1.0916,1.099588,0.4376,0.4376,0.2344,0.2953
250,1.0882,1.091331,0.422,0.422,0.2346,0.2852
300,1.0899,1.208503,0.2288,0.2288,0.1919,0.1782
350,1.0984,1.130554,0.2768,0.2768,0.2678,0.2677
400,1.0862,1.120622,0.3996,0.3996,0.3322,0.3727
450,1.0938,1.138465,0.3276,0.3276,0.308,0.3261
500,1.0698,1.185204,0.2336,0.2336,0.2213,0.2065


***** Running Evaluation *****
  Num examples = 2500
  Batch size = 8
Saving model checkpoint to /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Plausibility_Classification_Roberta_1/checkpoint-50
Configuration saved in /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Plausibility_Classification_Roberta_1/checkpoint-50/config.json
Model weights saved in /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Plausibility_Classification_Roberta_1/checkpoint-50/pytorch_model.bin
tokenizer config file saved in /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Plausibility_Classification_Roberta_1/checkpoint-50/tokenizer_config.json
Special tokens file saved in /content/drive/My Drive/Machine Learning/UBC/CPSC_503/SemEval22_T7/Saved_Models/Training_1/Plausibility_Classification_Roberta_1/checkpoint-50/special_tokens_map.json
***** 

# **Testing**

In [None]:
# Class names listed in label-id order; LABEL_DICT maps each name to its integer id.
class_names = ["IMPLAUSIBLE", "NEUTRAL", "PLAUSIBLE"]

LABEL_DICT = {name: label_id for label_id, name in enumerate(class_names)}

In [None]:
# Decode one training example's input ids, then its labels, back to text.
# NOTE(review): the recorded output of this cell contains `</s>` and `<pad>` tokens,
# which do not match the BERT tokenizer loaded earlier in this notebook; the cell
# looks like a leftover from a different experiment. `train_dataset` is also loaded
# in a cell with no execution count — confirm it is defined before running.
print(tokenizer.batch_decode(train_dataset[499:500]['input_ids']))

labels_ids = train_dataset[499:500]['labels']#[test_dataset[499:500]['labels'] == -100] = tokenizer.pad_token_id
# Replace the -100 placeholder with the pad token id so the ids can be decoded.
labels_ids[labels_ids == -100] = tokenizer.pad_token_id
#print(labels_ids)
output = tokenizer.batch_decode(labels_ids, skip_special_tokens=False)
print(output[0])

["solve: def sat(x: List[int], a: int=-165, r: int=1, l: int=42): assert type(x) is list and all(type(a) is int for a in x), 'x must be of type List[int]' return x[0] == a and len(x) == l and all([x[i] * r == x[i + 1] for i in range(len(x) - 1)])</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>

In [None]:
print(tokenizer.special_tokens_map)

{'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': "['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>', '<extra_id_44>', '<extra_id_45>', '<extra_id_46>', '<extra_id_47>', '<extra_id_48>', '<extra_id_49>', '<extra_id_50>', '<extra_id_51>', '<extra_id_52>', '<extra_id_5

In [None]:
# NOTE(review): `Accuracy`, `_bleu`, `actual_str` and `output_str` are not defined in
# any visible cell of this notebook; on a fresh kernel this cell would raise
# NameError unless they come from code outside this view — confirm or remove.
accuracy = Accuracy(actual_str,output_str)
# BLEU score rounded to 2 decimals ("sommth" is presumably a typo for "smooth";
# the variable name is kept as-is).
sommth_bleu_score = round(_bleu(actual_str, output_str),2)

print(accuracy)
print(sommth_bleu_score)

In [None]:
from transformers import AutoModelForSequenceClassification
# Load a 3-label sequence-classification model from the saved `checkpoint-850` directory
# and move it to the selected `device`.
model = AutoModelForSequenceClassification.from_pretrained(project_path+'Saved_Models/Training_1/Plausibility_Classification_Roberta_1/checkpoint-850',num_labels=3)
model.to(device)

In [None]:
# Spot-check the classifier on a small slice [start, end) of the test set.
start = 200
end = 210
model.eval()  # put the module in evaluation mode before inference

# Slice the dataset once and reuse the result for both tensors.
eval_batch = test_dataset[start:end]
# Inference only: no gradients are needed, so do not build the autograd graph.
with torch.no_grad():
  outputs = model(input_ids=eval_batch['input_ids'].to(device),attention_mask=eval_batch['attention_mask'].to(device))

In [None]:
# Compare the predicted class (argmax over the logits of each row) with the gold labels
# of the same slice.
# NOTE(review): in the recorded output every prediction is class 2 while the labels vary --
# worth checking the checkpoint and the label mapping.
print(np.argmax(outputs.logits.detach().cpu().numpy(),axis=1))
print(test_dataset[start:end]['labels'])

[2 2 2 2 2 2 2 2 2 2]
tensor([0, 0, 1, 2, 1, 0, 1, 0, 0, 2])


In [None]:
# Generate predictions for the whole test set in fixed-size batches and collect the
# decoded strings in `total_output_str`, in the same order as `test_dataset`.
total_output_str=[]
batch_size = 100

for batch_start in range(0,len(test_dataset),batch_size):
  # Clamp the last batch to the dataset length. A dedicated name is used so the global
  # `end` set by the classifier spot-check cell is not overwritten.
  batch_end = min(batch_start+batch_size,len(test_dataset))

  # Slice the dataset once and reuse the result for both tensors.
  batch = test_dataset[batch_start:batch_end]
  code_tokens = ed.generate(input_ids=batch['input_ids'].to(device),attention_mask=batch['attention_mask'].to(device))

  # Turn the generated ids back into text, dropping special tokens.
  codes = decoder_tokenizer.batch_decode(code_tokens,skip_special_tokens=True)
  total_output_str.extend(codes)

  # Progress indicator: index of the first example in the batch just processed.
  print(batch_start)


In [None]:
# Number of generated predictions; it should match the number of reference strings.
print(len(total_output_str))

6545


In [None]:
class Example:
    """A single training/test example.

    Plain record holding three attributes, stored exactly as passed in:
    `idx`, `source` and `target`.
    """

    def __init__(self, idx, source, target):
        self.idx, self.source, self.target = idx, source, target

In [None]:
import pickle
# Load the pickled test examples from the project folder.
# NOTE: unpickling can execute arbitrary code -- only load pickle files you produced yourself.
# NOTE(review): the stored objects are presumably `Example` instances, in which case the
# class must be defined before this cell runs -- confirm.
with open(project_path+'Pickles/pickle_test_buggy_fixed_1.pickle', 'rb') as f:
  test_examples = pickle.load(f)

In [None]:
# Reference strings: the `target` attribute of every test example, in dataset order.
total_actual_str = [example.target for example in test_examples]

# Number of references collected.
print(len(total_actual_str))

6545


In [None]:
import pickle
# Persist the references and predictions together as one (actual, output) tuple so the
# scoring cells can be re-run without regenerating the predictions.
with open(project_path+'Pickles/pickle_pred_test_buggy_fixed_13000_1.pickle', 'wb') as f:
  pickle.dump((total_actual_str,total_output_str),f)

In [None]:
import pickle
# Reload the (actual, output) tuple written by the saving cell.
# NOTE: unpickling can execute arbitrary code -- only load pickle files you produced yourself.
with open(project_path+'Pickles/pickle_pred_test_buggy_fixed_13000_1.pickle', 'rb') as f:
  total_actual_str,total_output_str = pickle.load(f)

In [None]:
# Eyeball one reference / prediction pair (index 100): reference first, prediction second.
print(total_actual_str[100])
print(total_output_str[100])

public void METHOD_1 ( ) { TYPE_1 query = new TYPE_1 ( ) ; TYPE_2 VAR_1 = TYPE_3 . METHOD_2 ( VAR_2 class ) ; TYPE_3 . METHOD_3 ( VAR_1 . getId ( ) ) . METHOD_4 ( 1 ) ; TYPE_3 . METHOD_5 ( VAR_1 ) ; java.lang.Long count = VAR_3 . METHOD_6 ( VAR_1 , query ) ; TYPE_4 . assertEquals ( INT_2 , count . METHOD_7 ( ) ) ; }
public void METHOD_1 ( ) { TYPE_1 query = new TYPE_1 ( ) ; TYPE_2 VAR_1 = TYPE_3. METHOD_2 ( VAR_2 class ) ; TYPE_3. METHOD_3 ( VAR_1. getId ( ) ). METHOD_4 ( INT_1 ) ; TYPE_3. METHOD_5 ( VAR_1 ) ; java.lang.Long count = VAR_3. METHOD_6 ( VAR_1, query ) ; TYPE_4. assertEquals ( INT_2, count. METHOD_7 ( ) ) ; }


In [None]:
# Score all predictions against all references.
# NOTE(review): `Accuracy` and `_bleu` are defined outside this section; presumably an
# exact-match accuracy and a smoothed BLEU score -- confirm against their definitions.
accuracy = Accuracy(total_actual_str,total_output_str)
# The BLEU value is rounded to two decimals.
sommth_bleu_score = round(_bleu(total_actual_str, total_output_str),2)

print(accuracy)
print(sommth_bleu_score)

[['public', 'java.lang.String', 'METHOD_1', '(', ')', '{', 'if', '(', '(', 'METHOD_2', '(', ')', ')', '&&', '(', 'METHOD_3', '(', 'VAR_1', '.', 'METHOD_4', '(', ')', ')', ')', ')', '{', 'return', 'VAR_1', '.', 'METHOD_4', '(', ')', ';', '}', 'else', 'if', '(', 'METHOD_3', '(', 'VAR_3', '.', 'METHOD_5', '(', ')', '.', 'METHOD_6', '(', ')', ')', ')', '{', 'return', 'VAR_3', '.', 'METHOD_5', '(', ')', '.', 'METHOD_6', '(', ')', ';', '}', 'else', '{', 'return', 'VAR_4', '.', 'METHOD_4', '(', ')', ';', '}', '}'], ['private', 'void', 'METHOD_1', '(', 'TYPE_1', 'index', ',', 'java.util.Collection', '<', 'TYPE_2', '>', 'VAR_1', ')', '{', 'TYPE_1', 'VAR_2', '=', 'index', '.', 'METHOD_2', '(', 'VAR_3', ')', ';', 'for', '(', 'TYPE_3', '<', 'TYPE_2', '>', 'VAR_4', ':', 'this', '.', 'VAR_1', '.', 'values', '(', ')', ')', '{', 'VAR_4', '.', 'METHOD_3', '(', 'VAR_2', ',', 'null', ')', ';', '}', 'METHOD_4', '(', 'index', ',', 'VAR_1', ')', ';', '}'], ['public', 'void', 'remove', '(', 'int', 'id', ')',

In [None]:
# Sanity check of the metrics: the references are scored against themselves, so both
# arguments are identical.
# NOTE: this overwrites `accuracy` and `sommth_bleu_score` from the real evaluation.
accuracy = Accuracy(total_actual_str,total_actual_str)
sommth_bleu_score = round(_bleu(total_actual_str, total_actual_str),2)

print(accuracy)
print(sommth_bleu_score)

[['public', 'java.lang.String', 'METHOD_1', '(', ')', '{', 'if', '(', '(', 'METHOD_2', '(', ')', ')', '&&', '(', 'METHOD_3', '(', 'VAR_1', '.', 'METHOD_4', '(', ')', ')', ')', ')', '{', 'return', 'VAR_1', '.', 'METHOD_4', '(', ')', ';', '}', 'else', 'if', '(', 'METHOD_3', '(', 'VAR_3', '.', 'METHOD_5', '(', ')', '.', 'METHOD_6', '(', ')', ')', ')', '{', 'return', 'VAR_3', '.', 'METHOD_5', '(', ')', '.', 'METHOD_6', '(', ')', ';', '}', 'else', '{', 'return', 'VAR_4', '.', 'METHOD_4', '(', ')', ';', '}', '}'], ['private', 'void', 'METHOD_1', '(', 'TYPE_1', 'index', ',', 'java.util.Collection', '<', 'TYPE_2', '>', 'VAR_1', ')', '{', 'TYPE_1', 'VAR_2', '=', 'index', '.', 'METHOD_2', '(', 'VAR_3', ')', ';', 'for', '(', 'TYPE_3', '<', 'TYPE_2', '>', 'VAR_4', ':', 'this', '.', 'VAR_1', '.', 'values', '(', ')', ')', '{', 'VAR_4', '.', 'METHOD_3', '(', 'VAR_2', ',', 'null', ')', ';', '}', 'METHOD_4', '(', 'index', ',', 'VAR_1', ')', ';', '}'], ['public', 'void', 'remove', '(', 'int', 'id', ')',

In [None]:
# Print the `max_length` stored on the encoder config (the recorded run shows 20).
# NOTE(review): this is presumably the default generation length rather than the maximum
# input length -- confirm before relying on it.
print(ed.config.encoder.max_length)

20


In [None]:
import torch
# Decode the first label row back to text. Positions holding -100 are replaced by the
# tokenizer's pad id first, so that every id can be decoded.
first_label_row = train_labels[0]
is_ignored = first_label_row == -100
output_ids_padding = torch.where(is_ignored, led_tokenizer.pad_token_id, first_label_row)

decoded_labels = led_tokenizer.decode(output_ids_padding)
print(decoded_labels)

<s>[problem_tags]greedy, math, sortings[problem_difficulty]1100[req_time]218 ms[req_memory]400 KB[code_text]#include<bits/stdc++.h>
using namespace std;

int main()
{
	int t; cin>>t; while(t-->0){
		int n; cin>>n; int a[n]; for(int i=0;i<n;i++) cin>>a[i];
		int p=n-1; sort(a,a+n);
		for(int i=0;i<p;i++){
			if(a[i+1]-a[i] < a[p]){
				p--; i--;
			}
		}
		cout << p+1 << endl;
	}
}</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pa

In [None]:
import torch

from datasets import load_dataset, load_metric
from transformers import LEDTokenizer, LEDForConditionalGeneration

# Load the test split of the PubMed configuration of `scientific_papers`.
pubmed_test = load_dataset(
    "scientific_papers",
    "pubmed",
    ignore_verifications=True,
    split="test",
)

# Tokenizer and model come from the same checkpoint; the model is moved to the GPU and
# converted to half precision.
tokenizer = LEDTokenizer.from_pretrained("patrickvonplaten/led-large-16384-pubmed")
model = (
    LEDForConditionalGeneration
    .from_pretrained("patrickvonplaten/led-large-16384-pubmed")
    .to("cuda")
    .half()
)


def generate_answer(batch, max_length=8192):
  """Generate an abstract for every article in `batch`.

  Uses the module-level `tokenizer` and `model`.

  Args:
    batch: mapping with an "article" entry holding the input texts of the batch.
    max_length: length every article is padded / truncated to before it is fed
      to the model (defaults to the previously hard-coded 8192).

  Returns:
    The same `batch`, with the decoded summaries added under "predicted_abstract".
  """
  inputs_dict = tokenizer(batch["article"], padding="max_length", max_length=max_length, return_tensors="pt", truncation=True)

  # Follow the model's device instead of assuming "cuda", so the function keeps
  # working wherever the model has been placed.
  target_device = model.device
  input_ids = inputs_dict.input_ids.to(target_device)
  attention_mask = inputs_dict.attention_mask.to(target_device)

  global_attention_mask = torch.zeros_like(attention_mask)
  # put global attention on <s> token
  global_attention_mask[:, 0] = 1

  predicted_abstract_ids = model.generate(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)
  batch["predicted_abstract"] = tokenizer.batch_decode(predicted_abstract_ids, skip_special_tokens=True)
  return batch


# Generate an abstract for every test article, four articles per batch.
result = pubmed_test.map(generate_answer, batched=True, batch_size=4)

# load rouge
rouge = load_metric("rouge")

# Compute ROUGE-2 only and report its `mid` aggregate.
rouge_scores = rouge.compute(
    predictions=result["predicted_abstract"],
    references=result["abstract"],
    rouge_types=["rouge2"],
)
print("Result:", rouge_scores["rouge2"].mid)


In [None]:
# Print every label id that lies outside the closed range [0, 50277].
# NOTE(review): 50277 is a hard-coded bound, while `len(led_tokenizer)` prints 50265
# elsewhere in this notebook -- confirm which vocabulary this bound refers to.
for label_row in train_labels:
  for token_id in label_row:
    if not (0 <= token_id <= 50277):
      print(token_id)

In [None]:
# Feed every id of the first label row that is not 1 through the LED decoder, one id at a time.
# NOTE(review): id 1 is presumably the pad token -- confirm. `tensorify` is defined
# outside this section.
for x in train_labels[0]:
  if(x!=1):
    #print(x)
    #s = led_tokenizer.convert_ids_to_tokens(torch.tensor([x]))
    #x = led_model.get_decoder().embed_tokens(train_labels[0])[0]
    #print(s)
    m = led_model.get_decoder()
    # `ss` is overwritten on each pass, so only the result for the last id is kept.
    ss = m(tensorify([[x]]))

In [None]:
# All-zero tensor with the same shape and dtype as the first label row; its length is
# printed (4096 in the recorded run).
arr = torch.zeros_like(train_labels[0])
print(len(arr))

4096


In [None]:
# Inspect the decoder's configured position-embedding limit and its token-embedding module.
print(led_model.config.max_decoder_position_embeddings)
dec = led_model.get_decoder()
print(led_model.get_decoder().embed_tokens)
# Make the decoder use the encoder's token-embedding module as its input embeddings.
dec.set_input_embeddings(led_model.get_encoder().embed_tokens)

In [None]:
# Run the decoder of `edm` on the first 1024 entries of `arr`.
# NOTE(review): the recorded run of this cell raised a ValueError. `edm` is only created
# in a cell further down, so this cell does not work on a fresh top-to-bottom run.
x = edm.get_decoder()
ss = x(tensorify([arr[:1024]]))

ValueError: ignored

In [None]:
# Show the most recent decoder output stored in `ss`.
print(ss)

In [None]:
# Size of the LED tokenizer (50265 in the recorded run).
print(len(led_tokenizer))

50265


In [None]:
# Dump the module structure of the LED model for inspection.
print(led_model)

In [None]:
# Take the decoder sub-module of the LED model and set its `max_target_positions`
# attribute to 2048.
# NOTE(review): this rebinds the global `model`, which other cells use for different models.
# Whether assigning the attribute also changes the position embeddings is not visible here -- confirm.
model = led_model.get_decoder()
#x.from_pretrained("allenai/led-base-16384")
model.max_target_positions = 2048

In [None]:
from transformers import EncoderDecoderModel,AutoModelForCausalLM

# Load the Longformer checkpoint through the causal-LM auto class and save a local copy
# under the project folder.
# NOTE(review): the variable is called `encoder_model` although it is loaded via
# `AutoModelForCausalLM` -- confirm this is intended.
encoder_model = AutoModelForCausalLM.from_pretrained('allenai/longformer-base-4096')
#encoder_model.resize_token_embeddings(len(encoder_tokenizer))

encoder_model.save_pretrained(project_path+'Saved_Models/Longformer_Encoder_Init_2')



In [None]:
from transformers import EncoderDecoderModel,AutoModelForSeq2SeqLM, ReformerModel,ReformerForMaskedLM

# Combine the saved encoder (`asd_E`) and decoder (`asd_D`) into one encoder-decoder model.
# NOTE(review): both directories are written by cells that appear further down in this
# notebook, so those cells must have been run first.
edm = EncoderDecoderModel.from_encoder_decoder_pretrained(project_path+'Saved_Models/asd_E',project_path+'Saved_Models/asd_D')

In [None]:
# Show the decoder's configured maximum number of positions (65536 in the recorded run).
print(edm.config.decoder.max_position_embeddings)

# Save the combined encoder-decoder model so it can be reloaded from `asd_ED`.
edm.save_pretrained(project_path+'Saved_Models/asd_ED')

65536


In [None]:
from transformers import ReformerModel, ReformerConfig
# Start from the enwik8 Reformer configuration and override both attention chunk lengths.
# NOTE(review): 16386 is not a power of two (16384 is) -- confirm the value is intended.
configuration = ReformerConfig.from_pretrained(
    "google/reformer-enwik8",
    lsh_attn_chunk_length=16386,
    local_attn_chunk_length=16386,
)

# The model is constructed from the configuration object (not via `from_pretrained`)
# and then written to the `asd_E` directory.
enc_model = ReformerModel(configuration)
enc_model.save_pretrained(project_path+'Saved_Models/asd_E')

In [None]:
from transformers import ReformerForMaskedLM, ReformerConfig
# Start from the enwik8 Reformer configuration and override both attention chunk lengths.
# NOTE(review): 16386 is not a power of two (16384 is) -- confirm the value is intended.
configuration = ReformerConfig.from_pretrained(
    "google/reformer-enwik8",
    lsh_attn_chunk_length=16386,
    local_attn_chunk_length=16386,
)
# The masked-LM head is built with the decoder flag switched off.
configuration.is_decoder = False

# The model is constructed from the configuration object (not via `from_pretrained`)
# and then written to the `asd_D` directory.
dec_model = ReformerForMaskedLM(configuration)
dec_model.save_pretrained(project_path+'Saved_Models/asd_D')

In [None]:
# Reload the saved encoder-decoder model (`asd_ED`) through the seq2seq auto class.
red = AutoModelForSeq2SeqLM.from_pretrained(project_path+'Saved_Models/asd_ED')

In [None]:
# Check the `is_decoder` flag on the reloaded model's decoder config (True in the recorded run).
print(red.config.decoder.is_decoder)

True


In [None]:
# Split the label sequences into two buckets by the position of the first id 1:
#   cnt_1024 -- the first id 1 occurs early enough that (position + 1) <= 1023
#   cnt_4096 -- everything else, including sequences that contain no id 1 at all
# NOTE(review): id 1 is presumably the pad token, which would make the position the
# unpadded length -- confirm against the tokenizer.
cnt_1024 = 0
cnt_4096 = 0

for x in train_labels:
  try:
    first_one_pos = x.tolist().index(1)
  except ValueError:
    # `list.index` raises ValueError when the value is absent. Only that case is
    # counted as "long"; any other error now surfaces instead of being swallowed.
    cnt_4096 += 1
    continue

  if first_one_pos + 1 <= 1023:
    cnt_1024 += 1
  else:
    cnt_4096 += 1

print(cnt_1024)
print(cnt_4096)

16863
12389


In [None]:
# Number of encoded training inputs (29252 in the recorded run).
print(len(train_encoding['input_ids']))

29252


In [None]:
# Position of the first id 1 in the first encoded input (626 in the recorded run).
# NOTE(review): id 1 is presumably the pad token -- confirm.
print(train_encoding['input_ids'][0].tolist().index(1))

626


In [None]:
#python_code = "def convert(x): return x"
# Sample PHP method containing a `<mask>` placeholder.
# The backslash before `InvalidArgumentException` is written as `\\`: a lone `\I` is not
# a valid escape sequence and makes Python emit a warning. The resulting string is
# exactly the same as before.
PHP_CODE = """
public static <mask> set(string $key, $value) {
    if (!in_array($key, self::$allowedKeys)) {
        throw new \\InvalidArgumentException('Invalid key given');
    }
    self::$storedValues[$key] = $value;
}
""".lstrip()

from transformers import pipeline, EncoderDecoderModel

#summarizer = pipeline("summarization")

# Build an encoder-decoder model with `codebert-base-mlm` as the encoder and
# `codebert-base` as the decoder.
ed_model = EncoderDecoderModel.from_encoder_decoder_pretrained("microsoft/codebert-base-mlm","microsoft/codebert-base")

# Disabled experiment. It is kept as comments instead of a bare string literal, so the
# cell no longer echoes the text as its output.
# fill_mask = pipeline(
#     "summarization",
#     model=model,
#     tokenizer="microsoft/codebert-base-mlm"
# )
#
# print(fill_mask(PHP_CODE))


Some weights of the model checkpoint at microsoft/codebert-base-mlm were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=498.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=498627950.0, style=ProgressStyle(descri…




Some weights of RobertaForCausalLM were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['encoder.layer.1.crossattention.output.LayerNorm.weight', 'encoder.layer.4.crossattention.output.dense.weight', 'encoder.layer.2.crossattention.output.dense.bias', 'encoder.layer.4.crossattention.self.query.bias', 'encoder.layer.1.crossattention.self.value.bias', 'lm_head.layer_norm.bias', 'encoder.layer.7.crossattention.self.value.weight', 'encoder.layer.5.crossattention.self.key.bias', 'encoder.layer.8.crossattention.output.dense.bias', 'encoder.layer.3.crossattention.output.LayerNorm.weight', 'encoder.layer.1.crossattention.output.dense.bias', 'lm_head.bias', 'encoder.layer.7.crossattention.self.query.weight', 'encoder.layer.7.crossattention.output.dense.weight', 'encoder.layer.5.crossattention.self.value.bias', 'encoder.layer.10.crossattention.output.LayerNorm.weight', 'encoder.layer.4.crossattention.self.key.bias', 'encoder.layer.2.crossattention

'\nfill_mask = pipeline(\n    "summarization",\n    model=model,\n    tokenizer="microsoft/codebert-base-mlm"\n)\n\nprint(fill_mask(PHP_CODE))\n'

In [None]:
# Try `microsoft/codebert-base` in a TensorFlow ("tf") text-generation pipeline and
# complete a Python function header.
# NOTE(review): the recorded run of this cell ended in a PipelineException; presumably
# this checkpoint / framework combination is not supported for text generation -- confirm.
# The variable is called `summarizer` although the task is "text-generation".
summarizer = pipeline("text-generation", model="microsoft/codebert-base", tokenizer="microsoft/codebert-base", framework="tf")
summarizer("def convert_int_to_str(x):", min_length=5, max_length=50)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898822.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=150.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=25.0, style=ProgressStyle(description_w…




PipelineException: ignored