In [1]:
# =========================================
# 1) Imports
# =========================================
import os
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader


  from .autonotebook import tqdm as notebook_tqdm
  warn(





In [2]:
# =========================================
# 2) Setup Dataset
# =========================================
class ConversationDataset(Dataset):
    def __init__(self, tokenizer, data, max_len=512):
        self.tokenizer = tokenizer
        self.data = data
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        partial_text = item['partial_text']
        full_text = item['full_text']
        encoded_partial = self.tokenizer(partial_text, truncation=True, padding='max_length', max_length=self.max_len, return_tensors="pt")
        encoded_full = self.tokenizer(full_text, truncation=True, padding='max_length', max_length=self.max_len, return_tensors="pt")

        return {
            'input_ids': encoded_partial.input_ids.flatten(),
            'attention_mask': encoded_partial.attention_mask.flatten(),
            'labels': encoded_full.input_ids.flatten()
        }

def load_data(tokenizer, file_path, partial_ratio=0.5):
    df = pd.read_csv(file_path)
    dataset = []
    for _, group in df.groupby('CONVERSATION_ID'):
        group = group.sort_values(by='CONVERSATION_STEP')
        full_text = ' '.join(group['TEXT'].tolist())
        partial_text = ' '.join(group['TEXT'].iloc[:int(len(group) * partial_ratio)].tolist())
        dataset.append({'partial_text': partial_text, 'full_text': full_text})
    return dataset


In [3]:
# =========================================
# 3) Load Tokenizer and Model
# =========================================
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')
model.to('cuda' if torch.cuda.is_available() else 'cpu')


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [4]:
# =========================================
# 4) Prepare Training Data
# =========================================
csv_path = "C:/Users/DELL/Desktop/VOIP_Phishing_Attacks/Repos/convoPredict/conversation-prediction/FINAL_DATASET2.csv"
data = load_data(tokenizer, csv_path)
training_set = ConversationDataset(tokenizer, data)


In [5]:
# =========================================
# 5) Set Up Training
# =========================================
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training_set
)

In [6]:
# =========================================
# 6) Train Model
# =========================================
trainer.train()


  0%|          | 0/57 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
 18%|█▊        | 10/57 [05:19<28:00, 35.76s/it]

{'loss': 13.1637, 'grad_norm': 64.6446762084961, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.53}


 35%|███▌      | 20/57 [11:36<23:26, 38.02s/it]

{'loss': 13.4144, 'grad_norm': 61.89200210571289, 'learning_rate': 2.0000000000000003e-06, 'epoch': 1.05}


 53%|█████▎    | 30/57 [18:21<18:09, 40.35s/it]

{'loss': 12.8732, 'grad_norm': 62.35877227783203, 'learning_rate': 3e-06, 'epoch': 1.58}


 70%|███████   | 40/57 [24:53<10:06, 35.69s/it]

{'loss': 12.1277, 'grad_norm': 97.1176986694336, 'learning_rate': 4.000000000000001e-06, 'epoch': 2.11}


 88%|████████▊ | 50/57 [30:39<04:01, 34.55s/it]

{'loss': 11.6697, 'grad_norm': 164.0145721435547, 'learning_rate': 5e-06, 'epoch': 2.63}


100%|██████████| 57/57 [34:36<00:00, 36.42s/it]

{'train_runtime': 2076.1282, 'train_samples_per_second': 0.11, 'train_steps_per_second': 0.027, 'train_loss': 12.415448640522204, 'epoch': 3.0}





TrainOutput(global_step=57, training_loss=12.415448640522204, metrics={'train_runtime': 2076.1282, 'train_samples_per_second': 0.11, 'train_steps_per_second': 0.027, 'total_flos': 30857930735616.0, 'train_loss': 12.415448640522204, 'epoch': 3.0})

In [7]:

# =========================================
# 7) Save Model
# =========================================
model_path = "./t5_finetuned_conversation"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('./t5_finetuned_conversation\\tokenizer_config.json',
 './t5_finetuned_conversation\\special_tokens_map.json',
 './t5_finetuned_conversation\\spiece.model',
 './t5_finetuned_conversation\\added_tokens.json')