In [None]:
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1

In [None]:
import json
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset

# Load the training records from the intents file. Each record is expected to
# carry an 'input' (a JSON-serialisable object) and an 'output' (target text).
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
with open('datasets.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Prepare the dataset: the structured input is serialised to a JSON string
# (the same serialisation generate_output applies at inference time) and the
# target text is kept as-is.
inputs = [json.dumps(entry['input']) for entry in data]
outputs = [entry['output'] for entry in data]
dataset = Dataset.from_dict({'input': inputs, 'output': outputs})


# Load the Pegasus tokenizer and model
# Both checkpoints are read from Google Drive, so Drive has to be mounted
# before this point; without it a fresh "Restart & Run All" fails here because
# the paths below do not exist yet. Calling mount again when Drive is already
# mounted is harmless.
from google.colab import drive

drive.mount('/content/drive')

tokenizer = PegasusTokenizer.from_pretrained('/content/drive/MyDrive/pegasus_intents_tokenizer_new')
model = PegasusForConditionalGeneration.from_pretrained('/content/drive/MyDrive/pegasus_intents_model_new')

In [None]:
def preprocess_function(examples, max_input_length=1024, max_target_length=1024):
    """Tokenize a batch of examples for sequence-to-sequence training.

    Args:
        examples: A batch mapping with an 'input' list (source strings) and an
            'output' list (target strings), as passed by ``Dataset.map`` with
            ``batched=True``.
        max_input_length: Token limit applied (with truncation) to the sources.
        max_target_length: Token limit applied (with truncation) to the targets.

    Returns:
        The tokenizer output for the sources, with the target token ids stored
        under the 'labels' key.
    """
    model_inputs = tokenizer(
        examples['input'],
        max_length=max_input_length,
        truncation=True,
    )
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager for encoding the targets.
    labels = tokenizer(
        text_target=examples['output'],
        max_length=max_target_length,
        truncation=True,
    )
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Run the tokenization over the whole dataset, handing examples to
# preprocess_function in batches rather than one at a time.
tokenized_dataset = dataset.map(
    function=preprocess_function,
    batched=True,
)

Map:   0%|          | 0/1084 [00:00<?, ? examples/s]



In [None]:
# Hold out part of the data for evaluation. Evaluating on the very same
# examples the model is trained on makes the reported "validation loss" a
# second training loss and hides overfitting. The seed keeps the split
# reproducible across runs.
dataset_splits = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

training_args = TrainingArguments(
    output_dir='./results',
    # Evaluate once at the end of every epoch.
    # NOTE(review): newer transformers releases spell this `eval_strategy`;
    # confirm against the installed version.
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    weight_decay=0.01,
)

# Collator that batches the tokenized inputs and labels for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_splits['train'],
    eval_dataset=dataset_splits['test'],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.8929,0.301542
2,0.4014,0.239874
3,0.2994,0.209787
4,0.2658,0.194611
5,0.2476,0.189451


Non-default generation parameters: {'max_length': 64, 'num_beams': 8, 'length_penalty': 0.6, 'forced_eos_token_id': 1}
Non-default generation parameters: {'max_length': 64, 'num_beams': 8, 'length_penalty': 0.6, 'forced_eos_token_id': 1}
Non-default generation parameters: {'max_length': 64, 'num_beams': 8, 'length_penalty': 0.6, 'forced_eos_token_id': 1}
Non-default generation parameters: {'max_length': 64, 'num_beams': 8, 'length_penalty': 0.6, 'forced_eos_token_id': 1}
Non-default generation parameters: {'max_length': 64, 'num_beams': 8, 'length_penalty': 0.6, 'forced_eos_token_id': 1}
Non-default generation parameters: {'max_length': 64, 'num_beams': 8, 'length_penalty': 0.6, 'forced_eos_token_id': 1}


TrainOutput(global_step=2710, training_loss=0.408373057798266, metrics={'train_runtime': 1465.4943, 'train_samples_per_second': 3.698, 'train_steps_per_second': 1.849, 'total_flos': 611008853901312.0, 'train_loss': 0.408373057798266, 'epoch': 5.0})

In [None]:
# Persist the fine-tuned model and its tokenizer to local directories so they
# can be copied elsewhere or reloaded later with from_pretrained.
model.save_pretrained(save_directory="./pegasus_intents_model_new1")
tokenizer.save_pretrained(save_directory="./pegasus_intents_tokenizer_new1")

Non-default generation parameters: {'max_length': 64, 'num_beams': 8, 'length_penalty': 0.6, 'forced_eos_token_id': 1}


('./pegasus_intents_tokenizer_new1/tokenizer_config.json',
 './pegasus_intents_tokenizer_new1/special_tokens_map.json',
 './pegasus_intents_tokenizer_new1/spiece.model',
 './pegasus_intents_tokenizer_new1/added_tokens.json')

In [None]:
def generate_output(input_json):
    """Generate a natural-language description for one structured record.

    Args:
        input_json: A JSON-serialisable object. It is serialised with
            ``json.dumps`` before being tokenized.

    Returns:
        The decoded text of the first generated sequence, with special tokens
        stripped.
    """
    input_text = json.dumps(input_json)
    encoded = tokenizer(input_text, return_tensors='pt', truncation=True, padding=True)
    # Move every input tensor to the device the model lives on.
    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}
    # Pass the whole encoding (not just input_ids) so generate() also receives
    # the attention mask produced by the tokenizer.
    summary_ids = model.generate(**encoded)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Demo: run the fine-tuned model on one hand-written address record and print
# the sentence it produces.
new_input = {
    "address": dict(
        street="2217 Price Pines Suite 902",
        city="Daltonton",
        state="UT",
        zip="87392",
        country="Burkina Faso",
    )
}
output = generate_output(new_input)
print(output)

The address of the patient is 2217 Price Pines Suite 902 in Daltonton, UT 87392, Burkina Faso.


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files under /content/drive/MyDrive
# are reachable from this runtime.
# NOTE(review): the cell that loads the tokenizer and model reads from
# /content/drive/MyDrive and appears earlier in the notebook, so on a fresh
# kernel this mount has to run before that cell.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the saved tokenizer directory from the local /content disk into the
# mounted Google Drive folder (requires Drive to be mounted at /content/drive).
!cp -r /content/pegasus_intents_tokenizer_new1 /content/drive/MyDrive/


In [None]:
# Copy the saved model directory from the local /content disk into the
# mounted Google Drive folder (requires Drive to be mounted at /content/drive).
!cp -r /content/pegasus_intents_model_new1 /content/drive/MyDrive/

In [None]:


# Import necessary libraries
import nltk
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from sklearn.metrics import precision_score

# Download the tokenizer data used by nltk.word_tokenize. Older NLTK releases
# read the 'punkt' resource, newer ones look up 'punkt_tab' instead, so both
# are fetched to keep this cell working regardless of the installed version.
nltk.download('punkt')
nltk.download('punkt_tab')

# Sample test data: each case pairs a structured 'input' record with the
# sentence the model is expected to produce for it. The expected sentence must
# describe the values in its own input; the two emergency-contact cases
# previously named a different person than their input did.
test_data = [
    {
        'input': {
            'address': {
                'street': '2217 Price Pines Suite 902',
                'city': 'Daltonton',
                'state': 'UT',
                'zip': '87392',
                'country': 'Burkina Faso'
            }
        },
        'expected_output': 'The address of the patient is 2217 Price Pines Suite 902, Daltonton, UT 87392, Burkina Faso.'
    },
    {
        'input': {
            'address': {
                'street': '2444 Smith Brooks',
                'city': 'South Kathy',
                'state': 'KY',
                'zip': '04207',
                'country': 'Slovenia'
            }
        },
        'expected_output': 'The address of the patient is 2444 Smith Brooks, South Kathy, KY 04207, Slovenia.'
    },
    {
        'input': {
            "appointments": [
                {
                    "doctor": {
                        "name": "Emily Clark"
                    }
                }
            ]
        },
        'expected_output': "The doctor for the appointment is Emily Clark."
    },
    {
        'input': {
            "appointments": [
                {
                    "doctor": {
                        "specialty": "Endocrinology"
                    }
                }
            ]
        },
        'expected_output': "The doctor's specialty for the appointment is Endocrinology."
    },
    {
        'input': {
            "appointments": [
                {
                    "notes": "Check-up on diabetes management."
                }
            ]
        },
        'expected_output': "The appointment notes are given as Check-up on diabetes management."
    },
    {
        'input': {
            "emergency_contact": {
                "name": "Prathiksha"
            }
        },
        'expected_output': "The Emergency Contact Name is Prathiksha"
    },
    {
        'input': {
            "emergency_contact": {
                "name": "Christopher Williams"
            }
        },
        'expected_output': "The Emergency Contact Name is Christopher Williams."
    },
]

# Per-example metric accumulators; one entry is appended per test case.
bleu_scores = []
rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
precisions = []

# Initialize the ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Evaluation
# The loop variable is deliberately not called `data`: that name holds the
# training records loaded from datasets.json, and rebinding it here would break
# any later re-run of the dataset-preparation cell.
for example in test_data:
    generated_query = generate_output(example['input'])

    # Calculate BLEU score on word-level tokens of the reference and the output
    reference = nltk.word_tokenize(example['expected_output'])
    hypothesis = nltk.word_tokenize(generated_query)
    bleu_score = sentence_bleu([reference], hypothesis)
    bleu_scores.append(bleu_score)

    # Calculate ROUGE scores (F-measure of each variant)
    rouge_score = scorer.score(example['expected_output'], generated_query)
    rouge_scores['rouge1'].append(rouge_score['rouge1'].fmeasure)
    rouge_scores['rouge2'].append(rouge_score['rouge2'].fmeasure)
    rouge_scores['rougeL'].append(rouge_score['rougeL'].fmeasure)

    # Calculate Precision: share of distinct generated tokens that also occur
    # in the reference (0 when nothing was generated).
    reference_set = set(reference)
    hypothesis_set = set(hypothesis)
    true_positives = len(reference_set.intersection(hypothesis_set))
    precision = true_positives / len(hypothesis_set) if hypothesis_set else 0
    precisions.append(precision)

# Collapse the per-example scores into one mean per metric and report them.
def _mean(values):
    """Return the arithmetic mean of a list of scores."""
    return sum(values) / len(values)


avg_bleu = _mean(bleu_scores)
avg_rouge1, avg_rouge2, avg_rougeL = (
    _mean(rouge_scores[variant]) for variant in ('rouge1', 'rouge2', 'rougeL')
)
avg_precision = _mean(precisions)

print(f'Average BLEU Score: {avg_bleu}')
print(f'Average ROUGE-1 Score: {avg_rouge1}')
print(f'Average ROUGE-2 Score: {avg_rouge2}')
print(f'Average ROUGE-L Score: {avg_rougeL}')
print(f'Average Precision: {avg_precision}')





[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Average BLEU Score: 0.8591090330390496
Average ROUGE-1 Score: 0.9388944388944388
Average ROUGE-2 Score: 0.9186426476749058
Average ROUGE-L Score: 0.9388944388944388
Average Precision: 0.9159505907626209
