In [4]:
import torch
# Choose the compute device once; later cells move the model and the input
# tensors onto whatever is selected here.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: NVIDIA RTX A4000


In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Tokenizer and weights are loaded from the same Hub checkpoint id.
gpt2_checkpoint = "p208p2002/gpt2-squad-qg-hl"
tokenizer = AutoTokenizer.from_pretrained(gpt2_checkpoint)

model = AutoModelForCausalLM.from_pretrained(gpt2_checkpoint)
model.to(device)
# Report the device that was actually selected instead of a hard-coded "cuda":
# on a CPU-only machine the old message was simply wrong.
print("moved model to", device)

moved model to cuda


In [6]:
from datasets import load_dataset
# Load SQuAD (served from the local Hugging Face cache when present) and show
# the resulting DatasetDict so the available splits and row counts are visible.
dataset = load_dataset(path="squad")
dataset

Found cached dataset squad (C:/Users/NIT/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [12]:
import logging
import warnings

from tqdm import tqdm

# Keep the progress bar readable: ignore every Python warning and let only
# ERROR-or-worse records through the "transformers" logger.
warnings.filterwarnings("ignore")
logging.getLogger("transformers").setLevel(logging.ERROR)

# Decoding settings; gpt2_gen_sentences reads these module-level names.
temperature, k, p = 1.0, 0, 0.9

# Batches are padded on the left, with the EOS token doubling as the pad token.
tokenizer.padding_side = 'left'
tokenizer.pad_token = tokenizer.eos_token


def gpt2_gen_sentences(words_list, max_length=1024, batch_size=32):
    """Generate one question per highlighted prompt with the loaded causal LM.

    Prompts are tokenized in batches, passed to ``model.generate`` and decoded.
    The question is taken to be the text that follows the second ``[HL]``
    marker in the decoded sequence.

    Relies on the module-level ``tokenizer``, ``model``, ``device``,
    ``temperature``, ``k`` and ``p``.

    Args:
        words_list: Sequence of prompt strings (sliced into batches).
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        batch_size: Number of prompts per ``generate`` call.

    Returns:
        A list with one string per prompt, in input order. An entry is ``''``
        when the decoded text holds fewer than two ``[HL]`` markers, or when
        only whitespace follows the second one.
    """
    hl_marker = '[HL]'
    # Ceiling division: no phantom empty batch when the input length is an
    # exact multiple of batch_size.
    num_batches = (len(words_list) + batch_size - 1) // batch_size
    predictions = []

    with tqdm(total=num_batches, desc='Generating Sentences') as pbar:
        for start_idx in range(0, len(words_list), batch_size):
            batch_texts = words_list[start_idx:start_idx + batch_size]

            features = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True)
            input_ids = features['input_ids'].to(device)
            attention_mask = features['attention_mask'].to(device)
            # NOTE(review): temperature / top_k / top_p are sampling settings;
            # do_sample=True is not passed here, so they are presumably ignored
            # in favour of the default decoding strategy - confirm the intent.
            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_length,
                                     temperature=temperature, repetition_penalty=1.0,
                                     top_k=k, top_p=p, num_return_sequences=1)

            for output in outputs:
                decoded_output = tokenizer.decode(output, skip_special_tokens=True,
                                                  clean_up_tokenization_spaces=True)
                # The previous token loop broke out as soon as it reached the
                # second marker, so nothing after it was ever collected and
                # every prediction came back empty. It also compared
                # whitespace-split tokens with '[HL]', which never matches when
                # the marker is glued to its neighbours. Splitting on the
                # marker itself handles both.
                parts = decoded_output.split(hl_marker, 2)
                question = parts[2].strip() if len(parts) == 3 else ''
                predictions.append(question)

            pbar.update(1)

    return predictions


# One prompt per validation example: the context followed by the first
# reference answer wrapped in [HL] markers. Iterating the split yields each row
# once, instead of looking the same row up twice by integer index.
inputs = [
    example['context'] + '[HL]' + example['answers']['text'][0] + '[HL]'
    for example in dataset['validation']
]

predictions = gpt2_gen_sentences(inputs)

Generating Sentences: 100%|██████████████████████████████████████████████████████████| 331/331 [13:20<00:00,  2.42s/it]


In [8]:
import logging
import warnings

from tqdm import tqdm

# Quiet the notebook output: suppress all Python warnings and restrict the
# "transformers" logger to ERROR-level records and above.
warnings.filterwarnings("ignore")
logging.getLogger("transformers").setLevel(logging.ERROR)

# Decoding settings picked up as globals by gpt2_gen_sentences.
temperature, k, p = 1.0, 0, 0.9

# Pad on the left and reuse the EOS token as the padding token.
tokenizer.padding_side = 'left'
tokenizer.pad_token = tokenizer.eos_token


def gpt2_gen_sentences(words_list, max_length=512, batch_size=32):
    """Generate one question per highlighted prompt with the loaded causal LM.

    Prompts are tokenized in batches, passed to ``model.generate`` and decoded.
    The question is the text after the last ``[HL]`` marker of the decoded
    sequence.

    Relies on the module-level ``tokenizer``, ``model``, ``device``,
    ``temperature``, ``k`` and ``p``.

    Args:
        words_list: Sequence of prompt strings (sliced into batches).
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        batch_size: Number of prompts per ``generate`` call.

    Returns:
        A list with one stripped string per prompt, in input order. When the
        decoded text contains no ``[HL]`` marker the whole decoded text is
        returned (stripped).
    """
    hl_marker = '[HL]'
    # Ceiling division: no phantom empty batch when the input length is an
    # exact multiple of batch_size.
    num_batches = (len(words_list) + batch_size - 1) // batch_size
    predictions = []

    with tqdm(total=num_batches, desc='Generating Sentences') as pbar:
        for start_idx in range(0, len(words_list), batch_size):
            batch_texts = words_list[start_idx:start_idx + batch_size]

            features = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True)
            input_ids = features['input_ids'].to(device)
            attention_mask = features['attention_mask'].to(device)
            # NOTE(review): temperature / top_k / top_p are sampling settings;
            # do_sample=True is not passed here, so they are presumably ignored
            # - confirm the intent.
            # NOTE(review): max_length presumably bounds prompt plus
            # continuation, leaving little or no room for a question after a
            # long prompt - consider max_new_tokens.
            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_length,
                                     temperature=temperature, repetition_penalty=1.0,
                                     top_k=k, top_p=p, num_return_sequences=1)

            for output in outputs:
                decoded_output = tokenizer.decode(output, skip_special_tokens=True,
                                                  clean_up_tokenization_spaces=True)
                marker_pos = decoded_output.rfind(hl_marker)
                if marker_pos == -1:
                    # rfind reports a missing marker as -1; slicing from
                    # -1 + len('[HL]') silently chopped the first three
                    # characters off the text. Keep the text intact instead.
                    question = decoded_output.strip()
                else:
                    question = decoded_output[marker_pos + len(hl_marker):].strip()
                predictions.append(question)

            pbar.update(1)
    return predictions





In [None]:
# One prompt per validation example: the context followed by the first
# reference answer wrapped in [HL] markers. Iterating the split yields each row
# once, instead of looking the same row up twice by integer index.
inputs = [
    example['context'] + '[HL]' + example['answers']['text'][0] + '[HL]'
    for example in dataset['validation']
]

predictions = gpt2_gen_sentences(inputs)

In [27]:
from rouge import Rouge

# Scorer used to compare the generated questions with the reference questions.
rouge_scorer = Rouge()

# Swap every empty generation for "?" before scoring.
predictions = [text or "?" for text in predictions]

# One score dict per (prediction, reference question) pair.
rouge_scores = rouge_scorer.get_scores(predictions, dataset['validation']['question'])

# Pull the F-measure of each ROUGE variant out of the per-pair dicts.
rouge_1_scores, rouge_2_scores, rouge_l_scores = (
    [pair_scores[variant]['f'] for pair_scores in rouge_scores]
    for variant in ('rouge-1', 'rouge-2', 'rouge-l')
)

# Arithmetic mean over all pairs.
avg_rouge_1 = sum(rouge_1_scores) / len(rouge_1_scores)
avg_rouge_2 = sum(rouge_2_scores) / len(rouge_2_scores)
avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)

print("Average ROUGE-1 Score:", avg_rouge_1)
print("Average ROUGE-2 Score:", avg_rouge_2)
print("Average ROUGE-L Score:", avg_rouge_l)

Average ROUGE-1 Score: 0.2615455413346295
Average ROUGE-2 Score: 0.08492763221916563
Average ROUGE-L Score: 0.24912288623178772


In [17]:
from datasets import load_from_disk
from rouge import Rouge



pertNames = ["noNouns", "noVerbs", "noFirst", "noLast", "swapText", "addText", "changeChar", "bias"]

# The scorer does not depend on the perturbation, so build it once rather than
# once per loop iteration.
rouge_scorer = Rouge()

for pert in pertNames:
    # Each perturbed validation set is read from its own on-disk directory.
    valDs = load_from_disk('../Datasets/'+'squad'+'validation'+pert)

    # Same prompt layout as the unperturbed run. Iterating the dataset yields
    # each row once; the old loop materialised the whole 'context' column just
    # to get a length and then looked every row up twice by index.
    inputs = [
        example['context'] + '[HL]' + example['answers']['text'][0] + '[HL]'
        for example in valDs
    ]

    predictions = gpt2_gen_sentences(inputs)
    # Replace empty predictions with "?"
    predictions = [prediction if prediction else "?" for prediction in predictions]

    # Calculate ROUGE scores for each sentence
    rouge_scores = rouge_scorer.get_scores(predictions, valDs['question'])

    # Calculate average ROUGE scores (mean F-measure per variant)
    rouge_1_scores = [scores['rouge-1']['f'] for scores in rouge_scores]
    rouge_2_scores = [scores['rouge-2']['f'] for scores in rouge_scores]
    rouge_l_scores = [scores['rouge-l']['f'] for scores in rouge_scores]

    avg_rouge_1 = sum(rouge_1_scores) / len(rouge_1_scores)
    avg_rouge_2 = sum(rouge_2_scores) / len(rouge_2_scores)
    avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)

    print("Perturbation:", pert)
    print("Average ROUGE-1 Score:", avg_rouge_1)
    print("Average ROUGE-2 Score:", avg_rouge_2)
    print("Average ROUGE-L Score:", avg_rouge_l)
    print()

Generating Sentences: 100%|██████████████████████████████████████████████████████████| 331/331 [20:57<00:00,  3.80s/it]


Perturbation: noNouns
Average ROUGE-1 Score: 0.20557141750747882
Average ROUGE-2 Score: 0.05415378444200015
Average ROUGE-L Score: 0.1968495103745301



Generating Sentences: 100%|██████████████████████████████████████████████████████████| 331/331 [07:45<00:00,  1.41s/it]


Perturbation: noVerbs
Average ROUGE-1 Score: 0.23397444026505781
Average ROUGE-2 Score: 0.07318150639206751
Average ROUGE-L Score: 0.2231810095808056



Generating Sentences: 100%|██████████████████████████████████████████████████████████| 331/331 [07:10<00:00,  1.30s/it]


Perturbation: noFirst
Average ROUGE-1 Score: 0.2609917437456836
Average ROUGE-2 Score: 0.08400809879570323
Average ROUGE-L Score: 0.24857285896995027



Generating Sentences: 100%|██████████████████████████████████████████████████████████| 331/331 [07:22<00:00,  1.34s/it]


Perturbation: noLast
Average ROUGE-1 Score: 0.2602802079521063
Average ROUGE-2 Score: 0.08423108952140376
Average ROUGE-L Score: 0.24798403683678524



Generating Sentences: 100%|██████████████████████████████████████████████████████████| 331/331 [07:06<00:00,  1.29s/it]


Perturbation: swapText
Average ROUGE-1 Score: 0.2385366578621786
Average ROUGE-2 Score: 0.07921936254384164
Average ROUGE-L Score: 0.22703071850693443



Generating Sentences: 100%|██████████████████████████████████████████████████████████| 331/331 [09:13<00:00,  1.67s/it]


Perturbation: addText
Average ROUGE-1 Score: 0.25353139937892866
Average ROUGE-2 Score: 0.07909825314128967
Average ROUGE-L Score: 0.2421560476183898



Generating Sentences: 100%|██████████████████████████████████████████████████████████| 331/331 [12:13<00:00,  2.21s/it]


Perturbation: changeChar
Average ROUGE-1 Score: 0.22282465527808878
Average ROUGE-2 Score: 0.058804071114475207
Average ROUGE-L Score: 0.21446997729919226



Generating Sentences: 100%|██████████████████████████████████████████████████████████| 331/331 [07:14<00:00,  1.31s/it]


Perturbation: bias
Average ROUGE-1 Score: 0.2520320926154008
Average ROUGE-2 Score: 0.08224004497217495
Average ROUGE-L Score: 0.23946002818860276



In [16]:
# Trainable parameter count: element counts of every parameter tensor that has
# requires_grad set, added together.
total_params = sum(
    weight.numel()
    for weight in model.parameters()
    if weight.requires_grad
)
print("Total Parameters:", total_params)

Total Parameters: 124442112


In [28]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# From this cell on, `tokenizer` and `model` refer to this seq2seq checkpoint
# rather than the causal LM loaded earlier in the notebook.
tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap")

# .to() hands back the module it was called on, so loading and moving the
# weights can be chained into one assignment.
model = AutoModelForSeq2SeqLM.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap").to(device)
print("moved model to ", device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

moved model to  cuda


In [31]:
def gen_sentences(words_list, max_length=128, batch_size=32):
    """Run the loaded model over ``words_list`` in batches and decode the outputs.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        words_list: Sequence of input strings (sliced into batches).
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        batch_size: Number of inputs per ``generate`` call.

    Returns:
        A list holding one decoded string per generated sequence, in order.
    """
    # Floor division plus one: a trailing slice may be empty and is skipped below.
    batch_count = len(words_list) // batch_size + 1
    generated = []

    with tqdm(total=batch_count, desc='Generating Sentences') as progress:
        for batch_no in range(batch_count):
            lower = batch_no * batch_size
            chunk = words_list[lower:lower + batch_size]

            # Nothing to generate for an empty slice; still tick the bar.
            if len(chunk) == 0:
                progress.update(1)
                continue

            encoded = tokenizer(chunk, return_tensors='pt', padding=True, truncation=True)
            sequences = model.generate(
                input_ids=encoded['input_ids'].to(device),
                attention_mask=encoded['attention_mask'].to(device),
                max_length=max_length,
            )
            generated.extend(
                tokenizer.decode(sequence, skip_special_tokens=True)
                for sequence in sequences
            )
            progress.update(1)

    return generated


# One prompt per validation example, built from the first reference answer and
# the context. Iterating the split yields each row once, instead of looking the
# same row up twice by integer index.
# NOTE(review): the literal 'context' carries no colon or surrounding spaces,
# so the answer text runs straight into it. The checkpoint's model card appears
# to format prompts as "answer: <answer>  context: <context>" - confirm before
# comparing against published numbers. Left unchanged so that this run stays
# comparable with the perturbation run further down, which uses the same string.
inputs = [
    'answer: ' + example['answers']['text'][0] + 'context' + example['context']
    for example in dataset['validation']
]

predictions = gen_sentences(inputs)

Generating Sentences: 100%|██████████████████████████████████████████████████████████| 331/331 [05:34<00:00,  1.01s/it]


In [33]:
from rouge import Rouge
# Initialize ROUGE scorer
rouge_scorer = Rouge()

# Replace empty predictions with "?", exactly as the GPT-2 evaluation cells do:
# the scorer is expected to reject an empty hypothesis, so a single empty
# generation would otherwise abort the whole evaluation.
predictions = [prediction if prediction else "?" for prediction in predictions]

# Calculate ROUGE scores for each sentence
rouge_scores = rouge_scorer.get_scores(predictions, dataset['validation']['question'])

# Calculate average ROUGE scores (mean F-measure per variant)
rouge_1_scores = [scores['rouge-1']['f'] for scores in rouge_scores]
rouge_2_scores = [scores['rouge-2']['f'] for scores in rouge_scores]
rouge_l_scores = [scores['rouge-l']['f'] for scores in rouge_scores]

avg_rouge_1 = sum(rouge_1_scores) / len(rouge_1_scores)
avg_rouge_2 = sum(rouge_2_scores) / len(rouge_2_scores)
avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)

print("Average ROUGE-1 Score:", avg_rouge_1)
print("Average ROUGE-2 Score:", avg_rouge_2)
print("Average ROUGE-L Score:", avg_rouge_l)

Average ROUGE-1 Score: 0.41241320036432605
Average ROUGE-2 Score: 0.22388853611313164
Average ROUGE-L Score: 0.3963288135971544


In [41]:
from datasets import load_from_disk
## Perturbations applied on the context and not on the answers
pertNames = ["noNouns", "noVerbs", "noFirst", "noLast", "swapText", "addText", "changeChar", "bias"]

# The scorer does not depend on the perturbation, so build it once rather than
# once per loop iteration.
rouge_scorer = Rouge()

for pert in pertNames:
    # Each perturbed validation set is read from its own on-disk directory.
    valDs = load_from_disk('../../Datasets/'+'squad'+'validation'+pert)

    # Same prompt layout as the unperturbed run. Iterating the dataset yields
    # each row once; the old loop materialised the whole 'context' column just
    # to get a length and then looked every row up twice by index.
    # NOTE(review): 'context' has no colon or surrounding spaces; the model
    # card appears to use "answer: <answer>  context: <context>" - confirm.
    inputs = [
        'answer: ' + example['answers']['text'][0] + 'context' + example['context']
        for example in valDs
    ]

    predictions = gen_sentences(inputs)
    # Replace empty predictions with "?", as the GPT-2 perturbation loop does,
    # so that one empty generation cannot abort scoring for the whole set.
    predictions = [prediction if prediction else "?" for prediction in predictions]

    # Calculate ROUGE scores for each sentence
    rouge_scores = rouge_scorer.get_scores(predictions, valDs['question'])

    # Calculate average ROUGE scores (mean F-measure per variant)
    rouge_1_scores = [scores['rouge-1']['f'] for scores in rouge_scores]
    rouge_2_scores = [scores['rouge-2']['f'] for scores in rouge_scores]
    rouge_l_scores = [scores['rouge-l']['f'] for scores in rouge_scores]

    avg_rouge_1 = sum(rouge_1_scores) / len(rouge_1_scores)
    avg_rouge_2 = sum(rouge_2_scores) / len(rouge_2_scores)
    avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)

    print("Perturbation:", pert)
    print("Average ROUGE-1 Score:", avg_rouge_1)
    print("Average ROUGE-2 Score:", avg_rouge_2)
    print("Average ROUGE-L Score:", avg_rouge_l)
    print()

Generating Sentences: 100%|██████████████████████████████████████████████████████████| 331/331 [05:32<00:00,  1.01s/it]


Perturbation: noNouns
Average ROUGE-1 Score: 0.262085939512435
Average ROUGE-2 Score: 0.08663498674685655
Average ROUGE-L Score: 0.2515729685356585



Generating Sentences: 100%|██████████████████████████████████████████████████████████| 331/331 [05:30<00:00,  1.00it/s]


Perturbation: noVerbs
Average ROUGE-1 Score: 0.36504205758650093
Average ROUGE-2 Score: 0.17391520343838945
Average ROUGE-L Score: 0.34955748519491414



Generating Sentences: 100%|██████████████████████████████████████████████████████████| 331/331 [05:27<00:00,  1.01it/s]


Perturbation: noFirst
Average ROUGE-1 Score: 0.38857225890797503
Average ROUGE-2 Score: 0.20274750801523092
Average ROUGE-L Score: 0.37337256117193396



Generating Sentences: 100%|██████████████████████████████████████████████████████████| 331/331 [05:28<00:00,  1.01it/s]


Perturbation: noLast
Average ROUGE-1 Score: 0.4108130805428246
Average ROUGE-2 Score: 0.22309482329888874
Average ROUGE-L Score: 0.3948261226061751



Generating Sentences: 100%|██████████████████████████████████████████████████████████| 331/331 [05:43<00:00,  1.04s/it]


Perturbation: swapText
Average ROUGE-1 Score: 0.40446087630407324
Average ROUGE-2 Score: 0.2156343039723088
Average ROUGE-L Score: 0.3884155529947696



Generating Sentences: 100%|██████████████████████████████████████████████████████████| 331/331 [06:22<00:00,  1.16s/it]


Perturbation: addText
Average ROUGE-1 Score: 0.3984876562845445
Average ROUGE-2 Score: 0.20681129802158715
Average ROUGE-L Score: 0.383188876285867



Generating Sentences: 100%|██████████████████████████████████████████████████████████| 331/331 [09:04<00:00,  1.64s/it]


Perturbation: changeChar
Average ROUGE-1 Score: 0.28476758165095445
Average ROUGE-2 Score: 0.10962850787422557
Average ROUGE-L Score: 0.27533632405897207



Generating Sentences: 100%|██████████████████████████████████████████████████████████| 331/331 [12:36<00:00,  2.29s/it]


Perturbation: bias
Average ROUGE-1 Score: 0.41199576036413355
Average ROUGE-2 Score: 0.2236740628904512
Average ROUGE-L Score: 0.39602025233789434

