In [1]:
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda

In [2]:
import json
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset

# Load the data from the intents file
with open('Intents-Pegasus-Updated.json', 'r') as file:
    data = json.load(file)

# Prepare the dataset
inputs = [json.dumps(entry['input']) for entry in data]
outputs = [entry['output'] for entry in data]
dataset = Dataset.from_dict({'input': inputs, 'output': outputs})


# Load the Pegasus tokenizer and model
model_name = "google/pegasus-xsum"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

In [3]:
def preprocess_function(examples):
    model_inputs = tokenizer(examples['input'], max_length=1024, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['output'], max_length=1024, truncation=True)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/560 [00:00<?, ? examples/s]



In [4]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,0.548761
2,1.152100,0.385922
3,1.152100,0.346852


Non-default generation parameters: {'max_length': 64, 'num_beams': 8, 'length_penalty': 0.6, 'forced_eos_token_id': 1}
Non-default generation parameters: {'max_length': 64, 'num_beams': 8, 'length_penalty': 0.6, 'forced_eos_token_id': 1}


TrainOutput(global_step=840, training_loss=0.8969276246570406, metrics={'train_runtime': 761.2341, 'train_samples_per_second': 2.207, 'train_steps_per_second': 1.103, 'total_flos': 425930757292032.0, 'train_loss': 0.8969276246570406, 'epoch': 3.0})

In [5]:
model.save_pretrained("./pegasus_intents_model")
tokenizer.save_pretrained("./pegasus_intents_tokenizer")

Non-default generation parameters: {'max_length': 64, 'num_beams': 8, 'length_penalty': 0.6, 'forced_eos_token_id': 1}


('./pegasus_intents_tokenizer/tokenizer_config.json',
 './pegasus_intents_tokenizer/special_tokens_map.json',
 './pegasus_intents_tokenizer/spiece.model',
 './pegasus_intents_tokenizer/added_tokens.json')

In [6]:
def generate_output(input_json):
    input_text = json.dumps(input_json)
    inputs = tokenizer(input_text, return_tensors='pt', truncation=True, padding=True)
    # Move input tensors to GPU if available
    inputs = {k: v.to(model.device) for k, v in inputs.items()}  # Add this line
    summary_ids = model.generate(inputs['input_ids'])
    output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return output

# Example usage
new_input =  {'blood_group': 'AB+'}

output = generate_output(new_input)
print(output)

The Blood Group is AB+.


In [None]:
import joblib

joblib.dump(model, 'modelpega.joblib')
joblib.dump(tokenizer, 'tokenizerpega.joblib')

['tokenizerpega.joblib']

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
!cp -r /content/pegasus_intents_tokenizer /content/drive/MyDrive/


In [9]:
!cp -r /content/pegasus_intents_model /content/drive/MyDrive/

In [None]:

# Import necessary libraries
import nltk
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from sklearn.metrics import precision_score

# Download necessary NLTK data files
nltk.download('punkt')

# Sample test data
test_data = [
    {'input': {'blood_group': 'AB+'}, 'output': 'Blood Group: AB+'},
    {'input': {'contact': {'phone': '001-492-070-7295'}}, 'output': 'The phone number is 001-492-070-7295'},
    {'input': {'emergency_contact': {'name': 'Stephanie Stevens'}}, 'output': 'The Emergency Contact Name is Stephanie Stevens.'},
    {'input': {'contact': {'address': {'street': '37238 Allison Locks Suite 777', 'city': 'Lake Jose', 'state': 'KY', 'zip': '04806', 'country': 'Qatar'}}}, 'output': 'The Contact Address is 37238 Allison Locks Suite 777, Lake Jose, KY, 04806, Qatar.'},
    {'input': {'insurance': {'claimed_insurance': [{'claim_id': 'C00446', 'date': '2021-10-06', 'amount': '770 USD', 'status': 'Approved'}]}}, 'output': 'The Insurance Claim with ID C00446 was made on 2021-10-06 for an amount of 770 USD and the status is Approved.'}
]




# Initialize metrics
bleu_scores = []
rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
precisions = []

# Initialize the ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Evaluation
for data in test_data:
    generated_query = generate_output(data['input'])

    # Calculate BLEU score
    reference = nltk.word_tokenize(data['output'])
    hypothesis = nltk.word_tokenize(generated_query)
    bleu_score = sentence_bleu([reference], hypothesis)
    bleu_scores.append(bleu_score)

    rouge_score = scorer.score(data['output'], generated_query)
    rouge_scores['rouge1'].append(rouge_score['rouge1'].fmeasure)
    rouge_scores['rouge2'].append(rouge_score['rouge2'].fmeasure)
    rouge_scores['rougeL'].append(rouge_score['rougeL'].fmeasure)

    # Calculate Precision
    reference_set = set(reference)
    hypothesis_set = set(hypothesis)
    true_positives = len(reference_set.intersection(hypothesis_set))
    precision = true_positives / len(hypothesis_set) if hypothesis_set else 0
    precisions.append(precision)

# Average metrics
avg_bleu = sum(bleu_scores) / len(bleu_scores)
avg_rouge1 = sum(rouge_scores['rouge1']) / len(rouge_scores['rouge1'])
avg_rouge2 = sum(rouge_scores['rouge2']) / len(rouge_scores['rouge2'])
avg_rougeL = sum(rouge_scores['rougeL']) / len(rouge_scores['rougeL'])
avg_precision = sum(precisions) / len(precisions)

print(f'Average BLEU Score: {avg_bleu}')
print(f'Average ROUGE-1 Score: {avg_rouge1}')
print(f'Average ROUGE-2 Score: {avg_rouge2}')
print(f'Average ROUGE-L Score: {avg_rougeL}')
print(f'Average Precision: {avg_precision}')

