# model training

In [None]:
from datasets import load_dataset

dataset = load_dataset("json", data_files="train.jsonl", split="train")

In [None]:
inputs = [entry["input"] for entry in dataset]
outputs = [entry["output"] for entry in dataset]
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small")

def tokenize_function(examples):
    model_inputs = tokenizer(examples["input"], padding="max_length", truncation=True)
    labels = tokenizer(examples["output"], padding="max_length", truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [28]:
from transformers import T5ForConditionalGeneration

# Load the T5 model
model = T5ForConditionalGeneration.from_pretrained("t5-small")

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
from transformers import Trainer, TrainingArguments

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,   
    eval_dataset=tokenized_dataset,    
)


In [33]:
trainer.train()


Epoch,Training Loss,Validation Loss
1,0.6218,0.026678
2,0.1663,0.039884
3,0.0927,0.041378
4,0.0709,0.038004
5,0.0675,0.036844


TrainOutput(global_step=160, training_loss=0.27615029737353325, metrics={'train_runtime': 79.8276, 'train_samples_per_second': 15.659, 'train_steps_per_second': 2.004, 'total_flos': 169177251840000.0, 'train_loss': 0.27615029737353325, 'epoch': 5.0})

In [35]:
save_directory = './complaince-latest'
model.save_pretrained(save_directory)

# Save the tokenizer
tokenizer.save_pretrained(save_directory)

('./complaince-latest/tokenizer_config.json',
 './complaince-latest/special_tokens_map.json',
 './complaince-latest/spiece.model',
 './complaince-latest/added_tokens.json')

# testing

In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Model
from torch import nn

# Load the T5 model and tokenizer
model = T5ForConditionalGeneration.from_pretrained('./complaince-latest')
tokenizer = T5Tokenizer.from_pretrained("./complaince-latest")  # Replace with your tokenizer path

def preprocess_input_for_comp_llm(conversation):
    intro = (
        """Given the following conversation, determine whether the agent has shared sensitive information (like balance in either words or numbers or account details) without verifying the customer's identity (such as date of birth, address, or Social Security Number). Ensure that the name does not come under verification. Respond with "True" if a violation has occurred, otherwise "False".\n\nConversation:\n"""
    )

    dialogue = ""
    for turn in conversation:
        dialogue += f"{turn['speaker']}: {turn['text']}\n"

    final_input = intro + dialogue
    return final_input

def predict_conversation(conversation):
    input_text = preprocess_input_for_comp_llm(conversation)
    
    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
   
  
    with torch.no_grad():
        outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=5,
        decoder_start_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )
        
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    if decoded_output.lower() == 'true':
        return "True"
    else:
        return "False"

result = predict_conversation(a)
print(result)

False


In [89]:
a = [
    {
        "speaker": "Agent",
        "text": "Hello, this is Mark calling from XYZ Collections. How are you today?",
        "stime": 0,
        "etime": 7
    },
    {
        "speaker": "Customer",
        "text": "I'm doing alright, thanks. What\u2019s this about?",
        "stime": 7.5,
        "etime": 12
    },
    {
        "speaker": "Agent",
        "text": "I'm calling regarding an outstanding balance on your account with Mega Credit. Can I get you to confirm your name for security purposes?",
        "stime": 11,
        "etime": 19
    },
    {
        "speaker": "Customer",
        "text": "Sure, it's Jessica Brown.",
        "stime": 18,
        "etime": 21
    },
    {
        "speaker": "Agent",
        "text": "Thank you, Jessica. You currently have a balance of two thousand five hundred dollars.",
        "stime": 21.5,
        "etime": 30
    },
    {
        "speaker": "Customer",
        "text": "That's quite a bit. Can you give me more details on that?",
        "stime": 29,
        "etime": 32
    },
    {
        "speaker": "Agent",
        "text": "Of course! This amount includes your last payment and the missed payments since then.",
        "stime": 31,
        "etime": 39
    },
    {
        "speaker": "Customer",
        "text": "I see. I wasn't aware of the total. What options do I have?",
        "stime": 38.5,
        "etime": 44
    },
    {
        "speaker": "Agent",
        "text": "You can make a payment plan or pay the full amount today. What works best for you?",
        "stime": 43,
        "etime": 52
    },
    {
        "speaker": "Customer",
        "text": "I need some time to think about that.",
        "stime": 51,
        "etime": 55
    },
    {
        "speaker": "Agent",
        "text": "No problem! Please feel free to reach out if you have any questions, but just remember the balance is accumulating interest.",
        "stime": 54,
        "etime": 64
    },
    {
        "speaker": "Customer",
        "text": "Got it, thank you.",
        "stime": 63,
        "etime": 65
    },
    {
        "speaker": "Agent",
        "text": "Thank you for your time, Jessica. Have a great day!",
        "stime": 66,
        "etime": 73
    },
    {
        "speaker": "Customer",
        "text": "You too, bye!",
        "stime": 72,
        "etime": 75
    }
]