In [None]:
!pip install transformers torch

In [2]:
from transformers import BertForMaskedLM, BertTokenizer, RobertaForMaskedLM, RobertaTokenizer, DistilBertForMaskedLM, DistilBertTokenizer
import torch

In [3]:
# Checkpoint identifiers of the three masked-language models used in this notebook.
bert_base, roberta_base, distilbert_base = (
    "google-bert/bert-base-uncased",
    "FacebookAI/roberta-base",
    "distilbert/distilbert-base-uncased",
)

In [None]:
# Load each masked-LM checkpoint together with its matching tokenizer.
# The identifiers come from the constants defined in the configuration cell
# (bert_base / roberta_base / distilbert_base) so that every checkpoint name
# is written in exactly one place.
bert_model = BertForMaskedLM.from_pretrained(bert_base)
bert_tokenizer = BertTokenizer.from_pretrained(bert_base)

roberta_model = RobertaForMaskedLM.from_pretrained(roberta_base)
roberta_tokenizer = RobertaTokenizer.from_pretrained(roberta_base)

distilbert_model = DistilBertForMaskedLM.from_pretrained(distilbert_base)
distilbert_tokenizer = DistilBertTokenizer.from_pretrained(distilbert_base)

In [5]:
def mask_model_language(model, tokenizer):
    """Build a fill-mask predictor bound to one model/tokenizer pair.

    Args:
        model: Masked-language model. It is called as ``model(**inputs)`` and
            the result must expose a ``logits`` attribute that can be indexed
            as ``logits[0, position, :]``.
        tokenizer: Tokenizer matching ``model``. It is called as
            ``tokenizer(sentence, return_tensors="pt")`` and must provide
            ``mask_token``, ``mask_token_id`` and ``decode``.

    Returns:
        A function ``masked_prediction(sentence, top_k=5)`` that returns the
        decoded ``top_k`` highest-scoring tokens for the masked position.
    """

    def masked_prediction(sentence, top_k=5):
        """Return the ``top_k`` most likely fillers for the mask in ``sentence``.

        Only the first mask position found is scored; any further mask tokens
        in the sentence are ignored.

        Raises:
            ValueError: If ``sentence`` contains no mask token for this
                tokenizer (for example the wrong spelling or casing of the
                mask string was used).
        """
        # Tokenize the input text and locate the masked token's position(s).
        inputs = tokenizer(sentence, return_tensors="pt")
        mask_positions = torch.where(
            inputs["input_ids"] == tokenizer.mask_token_id
        )[1]

        # Fail early with an actionable message instead of an opaque
        # IndexError after an unnecessary forward pass.
        if mask_positions.numel() == 0:
            raise ValueError(
                f"No mask token ({tokenizer.mask_token!r}) found in sentence: "
                f"{sentence!r}"
            )

        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model(**inputs)

        # Vocabulary scores at the first masked position of the first sequence.
        first_mask_logits = outputs.logits[0, int(mask_positions[0]), :]
        top_token_ids = torch.topk(first_mask_logits, top_k).indices.tolist()

        # Convert token ids to words.
        return [tokenizer.decode([token_id]) for token_id in top_token_ids]

    return masked_prediction

In [6]:
# Bind every (model, tokenizer) pair to its own fill-mask predictor.
(
    bert_base_prediction,
    roberta_base_prediction,
    distilbert_base_prediction,
) = [
    mask_model_language(paired_model, paired_tokenizer)
    for paired_model, paired_tokenizer in (
        (bert_model, bert_tokenizer),
        (roberta_model, roberta_tokenizer),
        (distilbert_model, distilbert_tokenizer),
    )
]

In [7]:
# Mount Google Drive at /content/drive so files under it can be read and
# written from this runtime (requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
import os
import json

In [9]:
# dataset_path1 = "/content/drive/MyDrive/Temporal Chatbot BTP/BTP Datasets/temporal_ordering_changed2.json"
# dataset_path = "/content/drive/MyDrive/Temporal Chatbot BTP/BTP Datasets/temp_temporal_graph_data.json"
# dataset_path2 = "/content/drive/MyDrive/Temporal Chatbot BTP/BTP Datasets/temporal_ordering_split_changed.json"

In [20]:
# with open(dataset_path, "r") as f:
#   dataset = json.load(f)

In [49]:
# new_dataset = dict()
# for i in dataset:
#     new_dataset[i] = dict()
#     for j in dataset[i]:
#         new_dataset[i][j] = dataset[i][j] if j == 'ngbs' else {"time_value": dataset[i][j][0], "time_duration": dataset[i][j][1]}

In [51]:
# with open("/content/drive/MyDrive/Temporal Chatbot BTP/BTP Datasets/temp_temporal_graph_data1.json", "w") as f:
#   json.dump(new_dataset, f)

In [62]:
# lines = []
# for current_event in new_dataset:
#     current_line = f"{current_event} -> "
#     next_events = new_dataset[current_event]['ngbs']
#     for next_event in next_events:
#         current_line += f"{next_event}, "
#     for time_values in new_dataset[current_event]['time_point']:
#         if time_values == 'time_value':
#             current_line += f": Time value = {new_dataset[current_event]['time_point'][time_values][1]} "
#         else:
#             current_line += f": Time duration = {new_dataset[current_event]['time_point'][time_values][1]} "
#     lines.append(current_line)

In [93]:
# bert_masked_time_value_lines = []
# for current_event in new_dataset:
#     current_data = dict()

#     current_line = f"{current_event} -> "
#     for next_event in new_dataset[current_event]['ngbs']:
#         current_line += f"{next_event}, "

#     time_value = new_dataset[current_event]['time_point']['time_value'][1]
#     time_duration = new_dataset[current_event]['time_point']['time_duration'][1]
#     current_line += f": Time value =  [MASK] "
#     current_line += f": Time duration = {time_duration} "

#     current_data[current_event] = current_line
#     current_data['mask_value'] = time_value
#     bert_masked_time_value_lines.append(current_data)

# bert_masked_time_duration_lines = []
# for current_event in new_dataset:
#     current_data = dict()

#     current_line = f"{current_event} -> "
#     for next_event in new_dataset[current_event]['ngbs']:
#         current_line += f"{next_event}, "

#     time_value = new_dataset[current_event]['time_point']['time_value'][1]
#     time_duration = new_dataset[current_event]['time_point']['time_duration'][1]
#     current_line += f": Time value = {time_value} "
#     current_line += f": Time duration =  [MASK] "

#     current_data[current_event] = current_line
#     current_data['mask_value'] = time_duration
#     bert_masked_time_duration_lines.append(current_data)

# bert_masked_next_event = []
# for current_event in new_dataset:
#     time_value = new_dataset[current_event]['time_point']['time_value'][1]
#     time_duration = new_dataset[current_event]['time_point']['time_duration'][1]

#     for i in range(len(new_dataset[current_event]['ngbs'])):
#         current_line = \
#         f"{current_event} -> " +\
#         ", ".join([next_event for next_event in new_dataset[current_event]['ngbs'][:i]]) +\
#         ", [MASK], " +\
#         ", ".join([next_event for next_event in new_dataset[current_event]['ngbs'][i+1:]]) +\
#         f": Time value = {time_value} " +\
#         f": Time duration = {time_duration} "
#         event = new_dataset[current_event]['ngbs'][i]
#         bert_masked_next_event.append({
#             current_event: current_line,
#             'mask_value': event
#         })

# roberta_masked_time_value_lines = []
# for current_event in new_dataset:
#     current_data = dict()

#     current_line = f"{current_event} -> "
#     for next_event in new_dataset[current_event]['ngbs']:
#         current_line += f"{next_event}, "

#     time_value = new_dataset[current_event]['time_point']['time_value'][1]
#     time_duration = new_dataset[current_event]['time_point']['time_duration'][1]
#     current_line += f": Time value =  <mask> "
#     current_line += f": Time duration = {time_duration} "

#     current_data[current_event] = current_line
#     current_data['mask_value'] = time_value
#     roberta_masked_time_value_lines.append(current_data)

# roberta_masked_time_duration_lines = []
# for current_event in new_dataset:
#     current_data = dict()

#     current_line = f"{current_event} -> "
#     for next_event in new_dataset[current_event]['ngbs']:
#         current_line += f"{next_event}, "

#     time_value = new_dataset[current_event]['time_point']['time_value'][1]
#     time_duration = new_dataset[current_event]['time_point']['time_duration'][1]
#     current_line += f": Time value = {time_value} "
#     current_line += f": Time duration =  <mask> "

#     current_data[current_event] = current_line
#     current_data['mask_value'] = time_duration
#     roberta_masked_time_duration_lines.append(current_data)

# roberta_masked_next_event = []
# for current_event in new_dataset:
#     time_value = new_dataset[current_event]['time_point']['time_value'][1]
#     time_duration = new_dataset[current_event]['time_point']['time_duration'][1]

#     for i in range(len(new_dataset[current_event]['ngbs'])):
#         current_line = \
#         f"{current_event} -> " +\
#         ", ".join([next_event for next_event in new_dataset[current_event]['ngbs'][:i]]) +\
#         ", <mask>, " +\
#         ", ".join([next_event for next_event in new_dataset[current_event]['ngbs'][i+1:]]) +\
#         f": Time value = {time_value} " +\
#         f": Time duration = {time_duration} "
#         event = new_dataset[current_event]['ngbs'][i]
#         roberta_masked_next_event.append({
#             current_event: current_line,
#             'mask_value': event
#         })

In [94]:
# masked_dataset = {
#     "bert_masked_time_value": bert_masked_time_value_lines,
#     "bert_masked_time_duration": bert_masked_time_duration_lines,
#     "roberta_masked_time_value": roberta_masked_time_value_lines,
#     "roberta_masked_time_duration": roberta_masked_time_duration_lines,
#     "bert_masked_next_event": bert_masked_next_event,
#     "roberta_masked_next_event": roberta_masked_next_event
# }

In [95]:
# with open("/content/drive/MyDrive/Temporal Chatbot BTP/BTP Datasets/masked_dataset.json", "w") as f:
#   json.dump(masked_dataset, f)