In [1]:
import torch
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA RTX A4000


In [4]:
from transformers import AutoModelWithLMHead, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-common_gen")
model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-common_gen")
model.to(device)
print('Moved model to device')
def gen_sentence(words, max_length=32):
    input_text = words
    features = tokenizer([input_text], return_tensors='pt')
    input_ids = features['input_ids'].to(device)
    attention_mask = features['attention_mask'].to(device)
    output = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_length)
    return tokenizer.decode(output[0], skip_special_tokens=True)



Moved model to device


In [4]:
from datasets import load_dataset
dataset = load_dataset('common_gen')
dataset

Found cached dataset common_gen (C:/Users/NIT/.cache/huggingface/datasets/common_gen/default/2020.5.30/1a9e8bdc026c41ce7a9e96260debf7d2809cb7fd63fa02b017e4fac1b00c6b23)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['concept_set_idx', 'concepts', 'target'],
        num_rows: 67389
    })
    validation: Dataset({
        features: ['concept_set_idx', 'concepts', 'target'],
        num_rows: 4018
    })
    test: Dataset({
        features: ['concept_set_idx', 'concepts', 'target'],
        num_rows: 1497
    })
})

In [8]:
from transformers import AutoModelWithLMHead, AutoTokenizer
from tqdm import tqdm

tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-common_gen")
model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-common_gen")
model.to(device)
print('Moved model to device')

def gen_sentences(words_list, max_length=32, batch_size=32):
    input_texts = words_list
    num_batches = len(input_texts) // batch_size + 1
    predictions = []
    
    with tqdm(total=num_batches, desc='Generating Sentences') as pbar:
        for i in range(num_batches):
            start_idx = i * batch_size
            end_idx = (i + 1) * batch_size
            batch_texts = input_texts[start_idx:end_idx]
            
            if len(batch_texts) > 0:
                features = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True)
                input_ids = features['input_ids'].to(device)
                attention_mask = features['attention_mask'].to(device)
                outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_length)
                batch_predictions = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
                predictions.extend(batch_predictions)
            
            pbar.update(1)
    
    return predictions

concept_sentences = []
for concepts in dataset['validation']['concepts']:
    concept_sentence = " ".join(concepts)
    concept_sentences.append(concept_sentence)

predictions = gen_sentences(concept_sentences)


Moved model to device


Generating Sentences: 100%|██████████████████████████████████████████████████████████| 126/126 [01:05<00:00,  1.93it/s]


In [13]:
!pip install rouge

Defaulting to user installation because normal site-packages is not writeable
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1



[notice] A new release of pip is available: 23.0 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [16]:
from rouge import Rouge
# Initialize ROUGE scorer
rouge_scorer = Rouge()

# Calculate ROUGE scores for each sentence
rouge_scores = rouge_scorer.get_scores(predictions, dataset['validation']['target'])

# Calculate average ROUGE scores
rouge_1_scores = [scores['rouge-1']['f'] for scores in rouge_scores]
rouge_2_scores = [scores['rouge-2']['f'] for scores in rouge_scores]
rouge_l_scores = [scores['rouge-l']['f'] for scores in rouge_scores]

avg_rouge_1 = sum(rouge_1_scores) / len(rouge_1_scores)
avg_rouge_2 = sum(rouge_2_scores) / len(rouge_2_scores)
avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)

print("Average ROUGE-1 Score:", avg_rouge_1)
print("Average ROUGE-2 Score:", avg_rouge_2)
print("Average ROUGE-L Score:", avg_rouge_l)

Average ROUGE-1 Score: 0.43081504149069927
Average ROUGE-2 Score: 0.15452907351747325
Average ROUGE-L Score: 0.3703312074539739


In [28]:
from datasets import load_from_disk
pertNames = ["noNouns", "noVerbs", "noFirst", "noLast", "swapText", "addText", "changeChar", "bias"]

for pert in pertNames:
    valDs = load_from_disk('../../Datasets/'+'common_gen'+'validation'+pert)
    concept_sentences = []
    for concepts in valDs['concepts']:
        concept_sentence = " ".join(concepts)
        concept_sentences.append(concept_sentence)

    predictions = gen_sentences(concept_sentences)
    
    rouge_scorer = Rouge()

    # Calculate ROUGE scores for each sentence
    rouge_scores = rouge_scorer.get_scores(predictions, valDs['target'])

    # Calculate average ROUGE scores
    rouge_1_scores = [scores['rouge-1']['f'] for scores in rouge_scores]
    rouge_2_scores = [scores['rouge-2']['f'] for scores in rouge_scores]
    rouge_l_scores = [scores['rouge-l']['f'] for scores in rouge_scores]

    avg_rouge_1 = sum(rouge_1_scores) / len(rouge_1_scores)
    avg_rouge_2 = sum(rouge_2_scores) / len(rouge_2_scores)
    avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)

    print("Perturbation:", pert)
    print("Average ROUGE-1 Score:", avg_rouge_1)
    print("Average ROUGE-2 Score:", avg_rouge_2)
    print("Average ROUGE-L Score:", avg_rouge_l)
    print()

Generating Sentences: 100%|██████████████████████████████████████████████████████████| 126/126 [00:57<00:00,  2.18it/s]


Perturbation: noNouns
Average ROUGE-1 Score: 0.08085281230749881
Average ROUGE-2 Score: 0.0035266703605457573
Average ROUGE-L Score: 0.07841141460672997



Generating Sentences: 100%|██████████████████████████████████████████████████████████| 126/126 [01:05<00:00,  1.94it/s]


Perturbation: noVerbs
Average ROUGE-1 Score: 0.4253911810419506
Average ROUGE-2 Score: 0.14974445729833707
Average ROUGE-L Score: 0.3653882315170545



Generating Sentences: 100%|██████████████████████████████████████████████████████████| 126/126 [01:01<00:00,  2.05it/s]


Perturbation: noFirst
Average ROUGE-1 Score: 0.35562313547585467
Average ROUGE-2 Score: 0.10729990828499364
Average ROUGE-L Score: 0.3089223558872054



Generating Sentences: 100%|██████████████████████████████████████████████████████████| 126/126 [01:01<00:00,  2.04it/s]


Perturbation: noLast
Average ROUGE-1 Score: 0.35561257443926336
Average ROUGE-2 Score: 0.10868197691316023
Average ROUGE-L Score: 0.3092190595420525



Generating Sentences: 100%|██████████████████████████████████████████████████████████| 126/126 [01:12<00:00,  1.74it/s]


Perturbation: swapText
Average ROUGE-1 Score: 0.43183945325225037
Average ROUGE-2 Score: 0.15510133678348315
Average ROUGE-L Score: 0.37290191500315895



Generating Sentences: 100%|██████████████████████████████████████████████████████████| 126/126 [01:34<00:00,  1.33it/s]


Perturbation: addText
Average ROUGE-1 Score: 0.38991934742357637
Average ROUGE-2 Score: 0.12700927838026813
Average ROUGE-L Score: 0.3305986696114022



Generating Sentences: 100%|██████████████████████████████████████████████████████████| 126/126 [01:44<00:00,  1.21it/s]


Perturbation: changeChar
Average ROUGE-1 Score: 0.2956996085224905
Average ROUGE-2 Score: 0.0860855014962293
Average ROUGE-L Score: 0.25866376338279934



Generating Sentences: 100%|██████████████████████████████████████████████████████████| 126/126 [01:09<00:00,  1.82it/s]


Perturbation: bias
Average ROUGE-1 Score: 0.42771324642964964
Average ROUGE-2 Score: 0.15322178306512263
Average ROUGE-L Score: 0.3684103062235944



### GPT-2 for text generation

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("mrm8488/GPT-2-finetuned-common_gen")
model = AutoModelForCausalLM.from_pretrained("mrm8488/GPT-2-finetuned-common_gen")
model.to(device)
print("moved model to cuda")

moved model to cuda


In [33]:
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

import logging 
# Set the logging level to suppress warning messages
logging.getLogger("transformers").setLevel(logging.ERROR)



temperature = 1.0
k = 0
p = 0.9
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side='left'


def gpt2_gen_sentences(words_list, max_length=25, batch_size=32):
    input_texts = words_list
    num_batches = len(input_texts) // batch_size + 1
    predictions = []
    
    with tqdm(total=num_batches, desc='Generating Sentences') as pbar:
        for i in range(num_batches):
            start_idx = i * batch_size
            end_idx = (i + 1) * batch_size
            batch_texts = input_texts[start_idx:end_idx]
            
            if len(batch_texts) > 0:
                features = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True)
                input_ids = features['input_ids'].to(device)
                attention_mask = features['attention_mask'].to(device)
                outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_length,
                                         temperature = temperature, repetition_penalty = 1.0,
                                         top_k = k, top_p = p, num_return_sequences=1)
                batch_predictions = [tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True).split(': ', 1)[-1] for output in outputs]
                batch_predictions = [prediction.replace('\n', '') for prediction in batch_predictions]
                predictions.extend(batch_predictions)
            pbar.update(1)
    
    return predictions


concept_sentences = []
for concepts in dataset['validation']['concepts']:
    concept_sentence = "<|endoftext|> " +  ", ".join(concepts) + ":"
    concept_sentences.append(concept_sentence)

predictions = gpt2_gen_sentences(concept_sentences)




Generating Sentences: 100%|██████████████████████████████████████████████████████████| 126/126 [00:38<00:00,  3.24it/s]


In [None]:
import logging 
# Set the logging level to suppress warning messages
logging.getLogger("transformers").setLevel(logging.ERROR)

temperature = 1.0
k = 0
p = 0.9
repetition_penalty = 1.0
num_return_sequences = 1
tokenizer.pad_token = tokenizer.eos_token
stop_token = '.'

def gpt2_gen_sentence(prompt, max_length=20):
    encoded_prompt = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
    encoded_prompt = encoded_prompt.to(device)
    input_ids = encoded_prompt
    output_sequences = model.generate(
        input_ids=input_ids,
        max_length=max_length +  len(encoded_prompt[0]),
        temperature=temperature,
        top_k=k,
        top_p=p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        num_return_sequences=num_return_sequences,
    )
    
    # Decode text
    text = tokenizer.decode(output_sequences[0], clean_up_tokenization_spaces=True)

    # Remove all text after the stop token
    text = text[: text.find(stop_token) if stop_token else None]
    text = text[len(tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True)) :]
    return text



concept_sentences = []
for concepts in dataset['validation']['concepts']:
    concept_sentence = "<|endoftext|> " +  ", ".join(concepts) + ":"
    concept_sentences.append(concept_sentence)

predictions = [gpt2_gen_sentence(sentence) for sentence in concept_sentences]

In [34]:
from rouge import Rouge
# Initialize ROUGE scorer
rouge_scorer = Rouge()

# Calculate ROUGE scores for each sentence
rouge_scores = rouge_scorer.get_scores(predictions, dataset['validation']['target'])

# Calculate average ROUGE scores
rouge_1_scores = [scores['rouge-1']['f'] for scores in rouge_scores]
rouge_2_scores = [scores['rouge-2']['f'] for scores in rouge_scores]
rouge_l_scores = [scores['rouge-l']['f'] for scores in rouge_scores]

avg_rouge_1 = sum(rouge_1_scores) / len(rouge_1_scores)
avg_rouge_2 = sum(rouge_2_scores) / len(rouge_2_scores)
avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)

print("Average ROUGE-1 Score:", avg_rouge_1)
print("Average ROUGE-2 Score:", avg_rouge_2)
print("Average ROUGE-L Score:", avg_rouge_l)

Average ROUGE-1 Score: 0.32566055197067684
Average ROUGE-2 Score: 0.08837040134761721
Average ROUGE-L Score: 0.277551081859734


In [38]:
from datasets import load_from_disk
pertNames = ["noNouns", "noVerbs", "noFirst", "noLast", "swapText", "addText", "changeChar", "bias"]

for pert in pertNames:
    valDs = load_from_disk('../../Datasets/'+'common_gen'+'validation'+pert)
    concept_sentences = []
    for concepts in valDs['concepts']:
        concept_sentence = "<|endoftext|> " +  ", ".join(concepts) + ":"
        concept_sentences.append(concept_sentence)

    predictions = gpt2_gen_sentences(concept_sentences)
    
    rouge_scorer = Rouge()

    # Calculate ROUGE scores for each sentence
    rouge_scores = rouge_scorer.get_scores(predictions, valDs['target'])

    # Calculate average ROUGE scores
    rouge_1_scores = [scores['rouge-1']['f'] for scores in rouge_scores]
    rouge_2_scores = [scores['rouge-2']['f'] for scores in rouge_scores]
    rouge_l_scores = [scores['rouge-l']['f'] for scores in rouge_scores]

    avg_rouge_1 = sum(rouge_1_scores) / len(rouge_1_scores)
    avg_rouge_2 = sum(rouge_2_scores) / len(rouge_2_scores)
    avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)

    print("Perturbation:", pert)
    print("Average ROUGE-1 Score:", avg_rouge_1)
    print("Average ROUGE-2 Score:", avg_rouge_2)
    print("Average ROUGE-L Score:", avg_rouge_l)
    print()

Generating Sentences: 100%|██████████████████████████████████████████████████████████| 126/126 [00:47<00:00,  2.64it/s]


Perturbation: noNouns
Average ROUGE-1 Score: 0.0883921871462191
Average ROUGE-2 Score: 0.0013126678622518576
Average ROUGE-L Score: 0.08108837060758858



Generating Sentences: 100%|██████████████████████████████████████████████████████████| 126/126 [00:38<00:00,  3.23it/s]


Perturbation: noVerbs
Average ROUGE-1 Score: 0.32290851642811197
Average ROUGE-2 Score: 0.0858385425352512
Average ROUGE-L Score: 0.2752751857756849



Generating Sentences: 100%|██████████████████████████████████████████████████████████| 126/126 [00:44<00:00,  2.81it/s]


Perturbation: noFirst
Average ROUGE-1 Score: 0.2760757508885975
Average ROUGE-2 Score: 0.06539975876515017
Average ROUGE-L Score: 0.24106411400256167



Generating Sentences: 100%|██████████████████████████████████████████████████████████| 126/126 [00:43<00:00,  2.86it/s]


Perturbation: noLast
Average ROUGE-1 Score: 0.27566074818658265
Average ROUGE-2 Score: 0.0638344633310915
Average ROUGE-L Score: 0.23759388201509557



Generating Sentences: 100%|██████████████████████████████████████████████████████████| 126/126 [00:39<00:00,  3.20it/s]


Perturbation: swapText
Average ROUGE-1 Score: 0.32372083752967357
Average ROUGE-2 Score: 0.08676805878262825
Average ROUGE-L Score: 0.27669085592756903



Generating Sentences: 100%|██████████████████████████████████████████████████████████| 126/126 [00:25<00:00,  4.90it/s]


Perturbation: addText
Average ROUGE-1 Score: 0.27810935588016633
Average ROUGE-2 Score: 0.0712584204753092
Average ROUGE-L Score: 0.24529013884494635



Generating Sentences: 100%|██████████████████████████████████████████████████████████| 126/126 [00:26<00:00,  4.83it/s]


Perturbation: changeChar
Average ROUGE-1 Score: 0.2225393904716752
Average ROUGE-2 Score: 0.049786641357941315
Average ROUGE-L Score: 0.1983627015361544



Generating Sentences: 100%|██████████████████████████████████████████████████████████| 126/126 [00:39<00:00,  3.16it/s]


Perturbation: bias
Average ROUGE-1 Score: 0.3044201464840175
Average ROUGE-2 Score: 0.08014594554088858
Average ROUGE-L Score: 0.25811103365960136

