In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns


import torch
import torch.nn as nn
import torch.nn.functional as F

import random

import gc
import os

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# force_remount=True is passed so that re-running this cell remounts the drive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
# `device` is a notebook-level global read by the helper functions defined below.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("the device is:",device)

the device is: cuda:0


## Prepare the dataset ( SQuAD )

In [None]:
#The data class
class Dataset():
    """
        Thin wrapper around a CSV file of (answer, question, context) rows.

        The CSV is expected to provide the columns "answers", "questions"
        and "contexts"; any other column is dropped. The selected columns
        are kept in `self.qac_data` (a pandas DataFrame).
    """

    def __init__(self, qac_path):
        """Read the CSV located at `qac_path` and keep the three useful columns."""
        self.qac_path = qac_path

        raw_frame = pd.read_csv(qac_path)
        # Rebuild a frame holding only the columns we need, in a fixed order.
        self.qac_data = pd.DataFrame(
            {column: raw_frame[column] for column in ("answers", "questions", "contexts")}
        )

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.qac_data)

    def __getitem__(self, id):
        """Return the row labelled `id` as {"answer", "question", "context"}."""
        row = self.qac_data.loc[id]
        return {
            "answer": row["answers"],
            "question": row["questions"],
            "context": row["contexts"],
        }


In [None]:
# Path of the CSV file holding the (question, answer, context) rows.
# NOTE(review): this is a relative path while the drive is mounted at '/content/drive';
# it presumably relies on the working directory being /content - confirm.
qac_path       = "drive/My Drive/QCM GENERATION AI/squad2.csv"

In [None]:
# Build the dataset from the CSV, report its size and preview the first rows.
dataset = Dataset(qac_path=qac_path)
print("length:",len(dataset))
dataset.qac_data.head()

length: 111623


Unnamed: 0,answers,questions,contexts
0,in the late 1990s,When did Beyonce start becoming popular?,"Born and raised in Houston, Texas, she perform..."
1,singing and dancing,What areas did Beyonce compete in when she was...,"Born and raised in Houston, Texas, she perform..."
2,2003,When did Beyonce leave Destiny's Child and bec...,Their hiatus saw the release of Beyoncé's debu...
3,"Houston, Texas",In what city and state did Beyonce grow up?,"Born and raised in Houston, Texas, she perform..."
4,late 1990s,In which decade did Beyonce become famous?,"Born and raised in Houston, Texas, she perform..."


### IMPORTANT !!
We are planning to use the pre-trained BERT model, which accepts a maximum of 512 tokens per input.

The problem is that some inputs ( [CLS], context, [SEP], answer, [SEP], question, [MASK] ) may contain more than 512 tokens.

**I'm not going to change the dataset** (I will not delete the answer, question, context combinations that have more than 512 tokens), **but during training we first check the length of the input** and only run the forward propagation when it is short enough. The limit is len(input) <= 512; the training code uses a conservative threshold of 500 tokens.


# Preparing the model ( BERT )

In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/3a/83/e74092e7f24a08d751aa59b37a9fc572b2e4af3918cb66f7766c3affb1b4/transformers-3.5.1-py3-none-any.whl (1.3MB)
[K     |▎                               | 10kB 21.2MB/s eta 0:00:01[K     |▌                               | 20kB 26.3MB/s eta 0:00:01[K     |▊                               | 30kB 30.5MB/s eta 0:00:01[K     |█                               | 40kB 29.2MB/s eta 0:00:01[K     |█▎                              | 51kB 27.6MB/s eta 0:00:01[K     |█▌                              | 61kB 22.3MB/s eta 0:00:01[K     |█▊                              | 71kB 22.9MB/s eta 0:00:01[K     |██                              | 81kB 20.1MB/s eta 0:00:01[K     |██▎                             | 92kB 19.2MB/s eta 0:00:01[K     |██▌                             | 102kB 20.0MB/s eta 0:00:01[K     |██▊                             | 112kB 20.0MB/s eta 0:00:01[K     |███                             | 

In [None]:
from transformers import BertTokenizer

# Name of the pre-trained checkpoint; the same name is used later to load the BERT weights.
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'

# Notebook-level tokenizer, read as a global by the helper functions defined below.
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




In [None]:
# Sanity check: show how the tokenizer splits the first sample of the dataset.
def _tokenize_and_report(label, text):
    """Tokenize `text`, print the text / tokens / ids under `label`, return (tokens, ids)."""
    tokens = tokenizer.tokenize(text)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    print(f'{label}: {text}')
    print(f'   Tokens: {tokens}')
    print(f'Token IDs: {token_ids}', end="\n\n")
    return tokens, token_ids


first_sample = dataset[0]
answer_   = first_sample["answer"]
question_ = first_sample["question"]
context_  = first_sample["context"]

answer_tokens, answer_token_ids     = _tokenize_and_report('   Answer', answer_)
question_tokens, question_token_ids = _tokenize_and_report(' Question', question_)
context_tokens, context_token_ids   = _tokenize_and_report('  Context', context_)

   Answer: in the late 1990s
   Tokens: ['in', 'the', 'late', '1990s']
Token IDs: [1107, 1103, 1523, 3281]

 Question: When did Beyonce start becoming popular?
   Tokens: ['When', 'did', 'Bey', '##on', '##ce', 'start', 'becoming', 'popular', '?']
Token IDs: [1332, 1225, 24896, 1320, 2093, 1838, 2479, 1927, 136]

  Context: Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child
   Tokens: ['Born', 'and', 'raised', 'in', 'Houston', ',', 'Texas', ',', 'she', 'performed', 'in', 'various', 'singing', 'and', 'dancing', 'competitions', 'as', 'a', 'child', ',', 'and', 'rose', 'to', 'fame', 'in', 'the', 'late', '1990s', 'as', 'lead', 'singer', 'of', 'R', '&', 'B', 'girl', '-', 'group', 'Destiny', "'", 's', 'Child']
Token IDs: [3526, 1105, 2120, 1107, 4666, 117, 2245, 117, 1131, 1982, 1107, 1672, 4241, 1105, 5923, 6025, 1112, 170, 2027, 117, 1105, 3152, 1106, 8408

In [None]:
def get_bert_input(context, answer, question="", max_length=512):
    """
        Build the BERT input used to predict the next token of a question.

        The token sequence is
            [CLS], context, [SEP], answer, [SEP], question, [MASK]
        and is split into two segments:
            segment_A (token type 0): [CLS], context, [SEP], answer, [SEP]
            segment_B (token type 1): question, [MASK]

        Each special token is inserted exactly once, by id. When the sequence
        would be longer than `max_length`, only the context is shortened (from
        its end), so the trailing [MASK] whose output is read by the model is
        never cut off.

        parameters:
            context    : str, the passage containing the answer
            answer     : str, the answer span
            question   : str, the part of the question generated so far
            max_length : int, maximum number of tokens of the returned input

        returns a dict of:
            input          : the untruncated input as text
            input_ids      : long tensor of shape (1, n_tokens), on `device`
            question_x     : list of question tokens, prefixed with ""
            question_y     : long tensor of question token ids, followed by [SEP]
            token_type_ids : long tensor of shape (n_tokens,), on `device`

        raises:
            ValueError if answer + question + special tokens alone exceed
            `max_length` (there is then no room left for the [MASK]).
    """
    #the input as a text (kept for inspection / debugging only)
    txt = "[CLS] " + context + " [SEP] " + answer + " [SEP] " + question + " [MASK]"

    #tokenize every part on its own so the segment boundary is known exactly
    context_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(context))
    answer_ids  = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(answer))

    question_tokens    = tokenizer.tokenize(question)
    question_token_ids = tokenizer.convert_tokens_to_ids(question_tokens)

    cls_id  = tokenizer.cls_token_id
    sep_id  = tokenizer.sep_token_id
    mask_id = tokenizer.mask_token_id

    #everything except the context: [CLS] [SEP] answer [SEP] question [MASK]
    fixed_length = 4 + len(answer_ids) + len(question_token_ids)
    if fixed_length > max_length:
        raise ValueError(
            f"answer and question need {fixed_length} tokens, "
            f"which is more than max_length={max_length}"
        )
    #shorten the context only, never the question or the [MASK]
    context_ids = context_ids[:max_length - fixed_length]

    #create the segment_A = [CLS] context [SEP] answer [SEP]
    #create the segment_B = question [MASK]
    segment_A_ids = [cls_id] + context_ids + [sep_id] + answer_ids + [sep_id]
    segment_B_ids = question_token_ids + [mask_id]

    input_ids      = torch.tensor([segment_A_ids + segment_B_ids])
    token_type_ids = torch.tensor([0] * len(segment_A_ids) + [1] * len(segment_B_ids))

    #shifted view of the question: question_x[i] is the token fed before
    #predicting question_y[i]; the last target is [SEP] (end of the question)
    question_x = [""] + question_tokens
    question_y = question_token_ids + [sep_id]

    return {"input":txt,
            "input_ids":input_ids.long().to(device),
            "question_x":question_x,
            "question_y":torch.tensor(question_y),
            "token_type_ids":token_type_ids.long().to(device)}



In [None]:
class Bert_QG(nn.Module):
    """
        Question-generation model: a BERT encoder followed by a masked-LM head.

        The head is applied to the encoder output of the LAST position of the
        FIRST sequence in the batch only, i.e. the model handles one input at
        a time and expects the token to predict ([MASK]) to be the last one.
    """

    def __init__(self, bert_model, config):
        """
            bert_model : the (pre-trained) BERT encoder
            config     : configuration used to build the BertOnlyMLMHead
        """
        super().__init__()
        self.bert = bert_model
        self.cls = BertOnlyMLMHead(config)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        output_attentions=None,
        output_hidden_states=None):
        """
            Run the encoder and score the vocabulary for the last position.

            All arguments except `labels` (accepted but not used) are
            forwarded to the encoder unchanged.

            returns the output of the MLM head for the last position of the
            first sequence.
        """
        encoder_kwargs = dict(
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        encoder_outputs = self.bert(input_ids, **encoder_kwargs)

        #first encoder output, first sequence, last position: this is the
        #output at the [MASK] token, which stands for the next token of the question
        first_sequence = encoder_outputs[0][0]
        mask_hidden = first_sequence[-1]

        return self.cls(mask_hidden)

Creating the model:

# Training:

In [None]:
from transformers import BertConfig, BertModel
from transformers.modeling_bert import BertOnlyMLMHead
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup

In [None]:
def get_question_x_y(question):
    """
        Build the shifted (input, target) pair used to train on a question.

        example, for the question "How are you doing ?":
            question_x: [""   , "How", "are", "you"  , "doing", "?"    ]
            question_y: ["How", "are", "you", "doing", "?"    , "[SEP]"]  (as ids)

        returns (question_x, question_y):
            question_x : list of tokens, starting with ""
            question_y : long tensor of the token ids followed by the id 102
    """
    tokens = tokenizer.tokenize(question)

    #102 closes the target sequence (shown as "[SEP]" in the example above)
    target_ids = tokenizer.convert_tokens_to_ids(tokens) + [102]
    shifted_tokens = [""] + tokens

    return shifted_tokens, torch.tensor(target_ids).long()

In [None]:
def _backward_and_step(model, optimizer, summed_loss, n_terms):
    """Back-propagate the mean of `n_terms` accumulated losses and update the weights."""
    (summed_loss / n_terms).backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    #start the next accumulation from clean gradients
    optimizer.zero_grad()


def train_epoch_v2(model, data, loss_fn, optimizer, device, forced_teaching_rate,
                   memory_limit_bytes=8000000000):
    """
        Train the model on ONE (answer, question, context) sample.

        The question is generated token by token: at every step the model
        receives [CLS] context [SEP] answer [SEP] prefix [MASK] and must
        predict the next token of the question (the last target is [SEP]).

        parameters:
            model                : the Bert_QG model
            data                 : dict with the keys "answer", "question", "context"
            loss_fn              : loss applied to (1, vocab) scores and a (1,) target
            optimizer            : optimizer updating the model parameters
            device               : device the targets are moved to
            forced_teaching_rate : probability of using teacher forcing, i.e. of
                                   building the prefix from the real question
                                   instead of the model's own predictions
            memory_limit_bytes   : when the allocated CUDA memory exceeds this
                                   value, the losses accumulated so far are
                                   back-propagated early to release the graph

        returns:
            (mean loss per predicted token, forced_teaching) when the sample was used
            (-1, -1) when the sample was skipped (non-text field, empty question
            or more than 500 tokens)
    """
    answer   = data["answer"]
    question = data["question"]
    context  = data["context"]

    #skip rows where a field is not text (e.g. a missing value read from the csv)
    if not all(isinstance(field, str) for field in (answer, question, context)):
        return (-1,-1)
    if question == "":
        return (-1,-1)

    #here to get the length of the whole input
    #we keep a margin: 500 tokens at most (BERT accepts up to 512)
    tokens = tokenizer.tokenize(" [CLS] "+context+" [SEP] "+answer+" [SEP] "+question+" [SEP]" )
    if len(tokens) > 500:
        return (-1,-1)

    #exemple:
    #question: How are you doing ?
    #question_x: [""   , "How", "are", "you"  ,"doing", "?""]
    #               |       |     |       |       |       |
    #question_y: ["How", "are", "you", "doing", "?"   , "[SEP]"]
    question_x, question_y = get_question_x_y(question)

    model = model.train()
    optimizer.zero_grad()

    forced_teaching = random.random() < forced_teaching_rate

    predicted_tokens = []   #model outputs so far (used without teacher forcing)
    pending_loss  = 0       #sum of the losses not back-propagated yet
    pending_terms = 0       #number of losses summed in pending_loss
    total_loss    = 0.0     #sum of all the token losses of this sample
    total_terms   = 0

    for position, q_y in enumerate(question_y):
        if forced_teaching:
            #the real tokens preceding the target (question_x[0] is the "" placeholder)
            prefix_tokens = question_x[1:position + 1]
        else:
            prefix_tokens = predicted_tokens

        #join the word pieces back into text ("Bey ##on ##ce" -> "Beyonce"),
        #otherwise the "##" markers would be tokenized again as punctuation
        formed_question = tokenizer.convert_tokens_to_string(prefix_tokens)

        #we get the full input = [CLS], context, [SEP], answer, [SEP], formed_question, [MASK]
        X = get_bert_input(context, answer, formed_question)
        score_prediction = model(X["input_ids"],
                                 token_type_ids=X["token_type_ids"])

        target = q_y.view(1).to(device)
        token_loss = loss_fn(score_prediction.view(1, -1), target)

        pending_loss   = pending_loss + token_loss
        pending_terms += 1
        total_loss    += token_loss.item()
        total_terms   += 1

        if not forced_teaching:
            # the next word is the output of this iteration
            predicted_id = score_prediction.argmax().item()
            predicted_tokens.append(tokenizer.convert_ids_to_tokens(predicted_id))

        #if too much memory is held by the graph, we backward and update
        #the weights now to free it
        if torch.cuda.is_available() and torch.cuda.memory_allocated() > memory_limit_bytes:
            _backward_and_step(model, optimizer, pending_loss, pending_terms)
            pending_loss  = 0
            pending_terms = 0

    #back-propagate whatever has not been flushed inside the loop
    if pending_terms > 0:
        _backward_and_step(model, optimizer, pending_loss, pending_terms)

    loss_val = total_loss / total_terms

    del target, X, score_prediction, token_loss, pending_loss
    gc.collect()

    return (loss_val, forced_teaching)

In [None]:
def evaluate(model, data, device, max_length=20):
    """
        Generate a question for one sample and print it next to the real one.

        The question is decoded greedily, one token per forward pass, until
        the model predicts [SEP] or `max_length` tokens have been produced.
        The input is built exactly as during training (get_bert_input appends
        the single [MASK] itself).

        parameters:
            model      : the Bert_QG model (left in eval mode on return)
            data       : dict with the keys "answer", "question", "context"
            device     : device the inputs are moved to
            max_length : maximum number of generated tokens

        returns the predicted question as a string.
    """
    answer   = data["answer"]
    context  = data["context"]
    question = data["question"]

    model = model.eval()
    predicted_tokens = []

    #no gradients are needed here: avoids building (and keeping) the graph
    with torch.no_grad():
        for _ in range(max_length):
            #join the word pieces back into text ("Bey ##on ##ce" -> "Beyonce")
            formed_question = tokenizer.convert_tokens_to_string(predicted_tokens)
            X = get_bert_input(context, answer, formed_question)

            X_input_ids = X["input_ids"].to(device)
            X_token_type_ids = X["token_type_ids"].to(device)

            score_prediction = model(X_input_ids,
                                    token_type_ids=X_token_type_ids)

            predicted_id = score_prediction.argmax().item()
            word = tokenizer.convert_ids_to_tokens(predicted_id)

            if word == "[SEP]":
                break
            predicted_tokens.append(word)

    formed_question = tokenizer.convert_tokens_to_string(predicted_tokens)

    torch.cuda.empty_cache()
    gc.collect()

    print("context           :",context)
    print("answer            :",answer)
    print("Predicted question:",formed_question)
    print("Real question     :",question)

    return formed_question

In [None]:
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'

# NOTE(review): BertConfig() is built from the library defaults, not from the
# 'bert-base-cased' checkpoint, and it is what sizes the MLM head in Bert_QG.
# Verify that its vocab_size matches the tokenizer's vocabulary; switching to the
# checkpoint's own config would presumably change the head shape and make the
# already saved checkpoints unloadable - confirm before changing.
config = BertConfig() #the standard configuration of BERT
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME) #load the pre trained model

# Wrap the pre-trained encoder with a freshly initialised MLM head and move it to `device`.
model = Bert_QG(bert_model, config).to(device) #creating the model

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…




In [None]:
# Loss applied to the vocabulary scores of each predicted token, and the optimizer.
loss_fn   = nn.CrossEntropyLoss().to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

# NOTE(review): EPOCHS is reassigned to 1 in the training cell below; the value set
# here is only used to size the schedule.
EPOCHS = 5
total_steps = EPOCHS * len(dataset)

# NOTE(review): `scheduler` is created here but scheduler.step() is not called
# anywhere in the visible notebook, so the schedule has no effect on the
# learning rate as the code stands.
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

### Start the training:

In [None]:
# Resume from the most recent checkpoint.
# Checkpoints are saved as "model_checkpoint<N>" by the training cell; os.listdir()
# returns names in arbitrary order (and "model_checkpoint10" sorts before
# "model_checkpoint2" as text), so the newest file is selected by its number.
checkpoint_dir = "drive/My Drive/QCM GENERATION AI/check_points_v2/"

def _checkpoint_number(file_name):
    """Number at the end of "model_checkpoint<N>", or -1 for any other file name."""
    suffix = file_name.replace("model_checkpoint", "", 1)
    return int(suffix) if suffix.isdigit() else -1

check_point = max(os.listdir(checkpoint_dir), key=_checkpoint_number)
print("loaded checkpoint:", check_point)
# map_location lets a checkpoint saved on GPU be loaded on a CPU-only runtime too
model.load_state_dict(torch.load(checkpoint_dir + check_point, map_location=device))

In [None]:
# Show the training step stored with the last checkpoint.
# The file is read inside a `with` block so that the handle is closed.
with open("drive/My Drive/QCM GENERATION AI/steps.txt") as steps_file:
    saved_step = steps_file.read()
saved_step

'2500'

In [None]:
EPOCHS = 1
PRINT_PER = 100   # print the mean loss every PRINT_PER steps
EVAL_PER = 1000   # generate a question for a random sample every EVAL_PER steps
SAVE_PER = 1000   # save a checkpoint (and the current step) every SAVE_PER steps
losses = []

CHECKPOINT_DIR = "drive/My Drive/QCM GENERATION AI/check_points_v2/"
STEPS_FILE = "drive/My Drive/QCM GENERATION AI/steps.txt"

# Resume the counters from what is already stored on the drive.
save_n = len(os.listdir(CHECKPOINT_DIR))
with open(STEPS_FILE) as steps_in:
    step = int(steps_in.read())

# NOTE(review): the `scheduler` created earlier is never stepped here, so the
# learning rate stays constant - confirm whether that is intended.
while step < len(dataset):

    # randrange excludes len(dataset); randint(0, len(dataset)) could return
    # len(dataset) itself, which is not a valid row.
    i = random.randrange(len(dataset))
    data = dataset[i]
    loss, forced_teaching = train_epoch_v2(model, data, loss_fn, optimizer, device, forced_teaching_rate=1)

    # -1 means the sample was skipped
    if loss != -1:
        losses.append(loss)

    if step % PRINT_PER == 0:
        # every sample of the window may have been skipped: avoid dividing by zero
        mean_loss = sum(losses)/len(losses) if losses else float("nan")
        print("[CHECK POINT] CP:",save_n,"    [STEP] Step:",step,"/",len(dataset),"    [LOSS] loss:",mean_loss)
        losses = []

    if step % EVAL_PER == 0:
        evaluate(model, dataset[random.randrange(len(dataset))], device)

    if step % SAVE_PER == 0:
        print("SAVING MODEL..")
        torch.save(model.state_dict(), CHECKPOINT_DIR+"model_checkpoint"+str(save_n))
        with open(STEPS_FILE, "w") as steps_out:
            steps_out.write(str(step))
        print("MODEL SAVED ", end="\n\n")
        save_n = save_n + 1

    step += 1


[CHECK POINT] CP: 4     [STEP] Step: 2500 / 111623     [LOSS] loss: 4.81884765625
[CHECK POINT] CP: 4     [STEP] Step: 2600 / 111623     [LOSS] loss: 4.9817125248908996
[CHECK POINT] CP: 4     [STEP] Step: 2700 / 111623     [LOSS] loss: 4.885774617195129
[CHECK POINT] CP: 4     [STEP] Step: 2800 / 111623     [LOSS] loss: 4.852754747867584
[CHECK POINT] CP: 4     [STEP] Step: 2900 / 111623     [LOSS] loss: 4.898598160743713
[CHECK POINT] CP: 4     [STEP] Step: 3000 / 111623     [LOSS] loss: 4.746620047092438
context           : The Standard Model groups matter particles into three generations, where each generation consists of two quarks and two leptons
answer            : The Standard Model
Predicted question: is what is the name of the first time the first time the first time the first is used ? 
Real question     : What model has two generations?
SAVING MODEL..
MODEL SAVED 

[CHECK POINT] CP: 5     [STEP] Step: 3100 / 111623     [LOSS] loss: 5.154781892299652
[CHECK POINT] CP: 5     