In [40]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns


import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

import random

import gc
import os

In [2]:
# Use the first CUDA GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("the device is:", device)

the device is: cpu


In [25]:
class Dataset():
    """Indexable view over a CSV file that has an ``inputs`` and an ``outputs`` column.

    Each ``inputs`` cell is parsed as a bracketed, comma-separated list of
    integers (for example ``"[1, 2, 3]"``) and each ``outputs`` cell is
    converted to a single ``int``.
    """

    def __init__(self, path):
        """Read the CSV at ``path`` and keep only the two columns that are used.

        Raises:
            KeyError: if the file lacks an ``inputs`` or ``outputs`` column.
        """
        loaded_data = pd.read_csv(path)
        # Copy so later changes to this frame never touch the full table.
        self.data = loaded_data[["inputs", "outputs"]].copy()

    def __len__(self):
        """Return the number of rows."""
        return len(self.data)

    def __getitem__(self, id):
        """Return ``(X, Y)`` for the row at position ``id``.

        ``X`` is the parsed list of integers from ``inputs`` and ``Y`` is the
        integer value of ``outputs``. The parameter keeps its original name
        (which shadows the builtin ``id``) so keyword callers keep working.
        """
        row = self.data.iloc[id]
        return self.string2list(row["inputs"]), int(row["outputs"])

    def string2list(self, string):
        """Parse a string such as ``"[1, 2, 3]"`` into ``[1, 2, 3]``.

        Surrounding whitespace is ignored and an empty list (``"[]"``) yields
        ``[]`` instead of failing on ``int("")``.

        Raises:
            ValueError: if an element between the brackets is not an integer.
        """
        # Drop the enclosing brackets, then skip empty pieces left by "[]" or a trailing comma.
        inner = string.strip()[1:-1]
        return [int(item) for item in inner.split(",") if item.strip()]


In [26]:
# Load the (inputs, outputs) rows from the CSV file.
# NOTE(review): relative path that presumably points into a mounted Google Drive
# folder - confirm the working directory / mount before running.
data = Dataset("drive/My Drive/QCM GENERATION AI/squad3.csv")

In [44]:
def batch_data_loader(dataset):
    """Placeholder for a helper that serves ``dataset`` in batches.

    The signature was left without a colon or a body, which made the cell a
    SyntaxError. The intended batching behaviour is not specified anywhere in
    the notebook, so this stub fails loudly instead of guessing.

    Raises:
        NotImplementedError: always, until the batching logic is written.
    """
    # TODO: implement the batching logic (for example on top of torch.utils.data.DataLoader).
    raise NotImplementedError("batch_data_loader is not implemented yet")

In [None]:
# Install the `transformers` package that the following cells import.
# NOTE(review): the version is not pinned, so the release that gets installed
# depends on when this cell is run - consider pinning the version you tested with.
!pip install transformers

In [31]:
from transformers import BertConfig, BertModel, BertTokenizer, get_linear_schedule_with_warmup

# `modeling_bert` lives under `transformers.models.bert` since transformers 4.0;
# fall back to the flat pre-4.0 module path on older installs.
try:
    from transformers.models.bert.modeling_bert import BertOnlyMLMHead
except ImportError:
    from transformers.modeling_bert import BertOnlyMLMHead

# Prefer the transformers AdamW (keeps the original behaviour); releases that no
# longer export it fall back to the PyTorch optimizer. Note that
# torch.optim.AdamW defaults to weight_decay=0.01 whereas the transformers
# implementation defaulted to 0.0.
try:
    from transformers import AdamW
except ImportError:
    from torch.optim import AdamW

In [32]:
def _backward_and_step(model, optimizer, loss_sum, n_terms):
    """Average ``loss_sum`` over ``n_terms``, back-propagate it and apply one clipped optimizer step."""
    mean_loss = loss_sum / n_terms
    mean_loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    # Clear the gradients that were just applied so a later chunk does not add to them.
    optimizer.zero_grad()


def train_epoch_v2(model, data, loss_fn, optimizer, device, forced_teaching_rate):
    """Train ``model`` on one (context, answer, question) sample, one question token at a time.

    The question is rebuilt token by token. At every step the partial question
    is fed to the model and the loss against the next expected token is
    accumulated. With probability ``forced_teaching_rate`` the partial question
    is built from the reference tokens (teacher forcing); otherwise it is built
    from the model's own previous predictions.

    Relies on three names from the notebook scope that are not defined in this
    cell: ``tokenizer``, ``get_question_x_y`` and ``get_bert_input``.

    Args:
        model: module called as ``model(input_ids, token_type_ids=...)``.
        data: mapping with string values under "answer", "question", "context".
        loss_fn: callable ``loss_fn(scores, target)`` returning a scalar tensor.
        optimizer: optimizer over ``model``'s parameters.
        device: device the inputs and targets are moved to.
        forced_teaching_rate: probability in [0, 1] of using teacher forcing.

    Returns:
        ``(loss_val, forced_teaching)`` where ``loss_val`` is the mean
        per-token loss of the sample and ``forced_teaching`` is the bool that
        was drawn, or ``(-1, -1)`` when the sample is skipped (empty question
        or more than 500 tokens).
    """
    optimizer.zero_grad()

    answer   = data["answer"]
    question = data["question"]
    context  = data["context"]

    # Tokenize the whole sequence only to measure its length: BERT accepts at
    # most 512 positions, and 500 leaves some headroom.
    tokens = tokenizer.tokenize(" [CLS] "+context+" [SEP] "+answer+" [SEP] "+question+" [SEP]" )
    if len(tokens) > 500 or question == "":
        return (-1, -1)

    # Example:
    #   question  : How are you doing ?
    #   question_x: [""   , "How", "are", "you"  , "doing", "?"    ]
    #   question_y: ["How", "are", "you", "doing", "?"    , "[SEP]"]
    question_x, question_y = get_question_x_y(question)

    model = model.train()

    forced_teaching = random.random() < forced_teaching_rate

    if forced_teaching:
        steps = zip(question_x, question_y)
    else:
        # Without teacher forcing the input token comes from the previous prediction.
        steps = ((None, q_y) for q_y in question_y)

    formed_question = ""
    predicted_token = ""   # model's previous prediction (free-running mode only)
    chunk_loss = 0         # sum of the losses not yet back-propagated
    chunk_terms = 0        # number of losses summed in chunk_loss
    total_loss = 0         # detached sum over every step, used only for reporting
    total_terms = 0

    for q_x, q_y in steps:
        if not forced_teaching:
            q_x = predicted_token

        # Append the next input token to the partial question.
        formed_question = formed_question + " " + q_x

        # NOTE(review): unlike evaluate(), no " [MASK]" suffix is appended here -
        # confirm that get_bert_input adds the mask token itself.
        X = get_bert_input(context, answer, formed_question,)
        # Move the inputs to the same device as the targets, as evaluate() does.
        X_input_ids      = X["input_ids"].to(device)
        X_token_type_ids = X["token_type_ids"].to(device)
        score_prediction = model(X_input_ids,
                                 token_type_ids=X_token_type_ids)

        target = q_y.view(1).to(device)
        step_loss = loss_fn(score_prediction.view(1, -1), target)
        chunk_loss = chunk_loss + step_loss
        chunk_terms += 1
        total_loss = total_loss + step_loss.detach()
        total_terms += 1

        if not forced_teaching:
            # The next input token is the token predicted at this step.
            predicted_token = tokenizer.convert_ids_to_tokens(score_prediction.argmax().item())

        # When GPU memory runs high (> 8 GB allocated), back-propagate what has
        # been accumulated so the graphs built so far can be freed.
        if torch.cuda.memory_allocated() > 8000000000:
            _backward_and_step(model, optimizer, chunk_loss, chunk_terms)
            chunk_loss = 0
            chunk_terms = 0

    # Back-propagate whatever is still pending, including a single-term chunk.
    if chunk_terms > 0:
        _backward_and_step(model, optimizer, chunk_loss, chunk_terms)

    loss_val = (total_loss / total_terms).item() if total_terms > 0 else 0.0

    gc.collect()

    return (loss_val, forced_teaching)

In [33]:
def evaluate(model, data, device, max_length=20):
    """Greedily generate a question for one sample and print it next to the reference.

    Starting from an empty question, the model is asked for the token that
    follows the current partial question; the highest-scoring token is
    appended and the process repeats until "[SEP]" is predicted or
    ``max_length`` tokens have been produced.

    Relies on ``tokenizer`` and ``get_bert_input`` from the notebook scope;
    neither is defined in this cell.

    Args:
        model: module called as ``model(input_ids, token_type_ids=...)``.
        data: mapping with "answer", "context" and "question" entries.
        device: device the model inputs are moved to.
        max_length: maximum number of tokens to generate.

    Returns:
        The generated question: the predicted tokens, each followed by one space.
    """
    answer   = data["answer"]
    context  = data["context"]
    question = data["question"]

    model = model.eval()
    formed_question = ""

    # Inference only: skip building autograd graphs to save memory and time.
    with torch.no_grad():
        for _ in range(max_length):
            X = get_bert_input(context, answer, formed_question + " [MASK]")

            X_input_ids = X["input_ids"].to(device)
            X_token_type_ids = X["token_type_ids"].to(device)

            score_prediction = model(X_input_ids,
                                     token_type_ids=X_token_type_ids)

            token_id = score_prediction.argmax().item()
            word = tokenizer.convert_ids_to_tokens(token_id)

            # "[SEP]" marks the end of the generated question.
            if word == "[SEP]":
                break
            formed_question += word + " "

    torch.cuda.empty_cache()
    gc.collect()

    print("context           :",context)
    print("answer            :",answer)
    print("Predicted question:",formed_question)
    print("Real question     :",question)

    return formed_question

In [34]:
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'

# Load the configuration that belongs to the checkpoint. A bare BertConfig()
# carries the library defaults (vocab_size=30522), while bert-base-cased has a
# 28996-entry vocabulary, so an MLM head built from the defaults would score
# token ids that do not exist in the checkpoint's vocabulary.
config = BertConfig.from_pretrained(PRE_TRAINED_MODEL_NAME)
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME) #load the pre trained model

# NOTE: the cell defining Bert_QG must have been run before this line.
model = Bert_QG(bert_model, config).to(device) #creating the model

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…




In [None]:
# Cross-entropy over the vocabulary scores, and AdamW over every model parameter.
loss_fn   = nn.CrossEntropyLoss().to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

EPOCHS = 5
# `dataset` is never defined in this notebook; the Dataset instance created
# earlier is bound to `data`, so its length is used for the step count.
total_steps = EPOCHS * len(data)

In [82]:
class Bert_QG(nn.Module):
    """BERT encoder followed by a masked-language-model head that scores the next question token."""

    def __init__(self, bert_model, config):
        """Wrap ``bert_model`` and create an MLM head from ``config``.

        Args:
            bert_model: the (pre-trained) BERT encoder to call in ``forward``.
            config: configuration passed to ``BertOnlyMLMHead``.
        """
        super().__init__()
        # the pre-trained bert model
        self.bert = bert_model
        self.cls = BertOnlyMLMHead(config)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        output_attentions=None,
        output_hidden_states=None):
        """Return the MLM head's scores for the last position of the first sequence.

        All arguments except ``labels`` are forwarded unchanged to the wrapped
        encoder; ``labels`` is accepted but not used. Only the first sequence
        of the batch is read, so any further batch items are ignored.
        """
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        # Take the hidden state at the last position of the first sequence: the
        # input is expected to end with the [MASK] token, whose output
        # represents the next token of the question.
        # NOTE(review): confirm that get_bert_input really places [MASK] last.
        mask_out = outputs[0][0][-1]
        prediction_score = self.cls(mask_out)

        return prediction_score

# Wrap the loaded BERT encoder in Bert_QG and move the resulting model to `device`.
model = Bert_QG(bert_model, config).to(device) #creating the model

In [84]:
# Smoke test: run the model on a random batch of 5 sequences of 50 token ids drawn from [0, 100).
# NOTE(review): the saved "torch.Size([5, 50])" output is not printed by the current
# forward(); it presumably comes from a debug print that has since been removed.
out = model(torch.randint(0, 100, size=(5,50)))

torch.Size([5, 50])


In [81]:
# Inspect the shape of the first element of `out`.
# NOTE(review): this cell's execution count (81) is lower than that of the cell
# defining Bert_QG (82), so the saved output was presumably produced by an
# earlier version of forward() - re-run to refresh it.
out[0].shape

torch.Size([5, 50, 768])

In [58]:
# Scratch check: a 1x5 tensor of random integers in [0, 5).
torch.randint(0, 5, size=(1,5))

tensor([[2, 4, 2, 0, 1]])

In [None]:
# NOTE(review): binds the torch.tensor function itself to `L` (it is not called);
# this looks like an unfinished scratch cell.
L = torch.tensor